Пример #1
0
def summarize(thistask, club_id, sources, status, summaryfile, detailfile, resultsurl, minage=12, minagegrade=20, minraces=3 , mintrend=2, numyears=3, begindate=None, enddate=None):
#----------------------------------------------------------------------
    '''
    render collected results

    :param thistask: this is required for task thistask.update_state()
    :param club_id: identifies club for which results are to be stored
    :param sources: list of sources / services we're keeping status for
    :param summaryfile: summary file name (.csv)
    :param detailfile: detail file name (.csv)
    :param resultsurl: base url to send results to, for link in summary table
    :param minage: minimum age to keep track of stats
    :param minagegrade: minimum age grade
    :param minraces: minimum races in the same year as enddate
    :param mintrend: minimum races over the full period for trendline
    :param begindate: render races between begindate and enddate, datetime
    :param enddate: render races between begindate and enddate, datetime
    '''
    
    # get club slug and location for later
    club = Club.query.filter_by(id=club_id).first()
    clubslug = club.shname
    locsvr = LocationServer()
    clublocation = locsvr.getlocation(club.location)

    # get maxdistance by service
    services = RaceResultService.query.filter_by(club_id=club_id).join(ApiCredentials).all()
    maxdistance = {}
    for service in services:
        attrs = ServiceAttributes(club_id, service.apicredentials.name)
        # app.logger.debug('service {} attrs {}'.format(service, attrs.__dict__))
        if attrs.maxdistance:
            maxdistance[service.apicredentials.name] = attrs.maxdistance
        else:
            maxdistance[service.apicredentials.name] = None
    maxdistance[productname] = None

    # set up date range. begindate and enddate take precedence, else use numyears from today
    if not (begindate and enddate):
        etoday = time.time()
        today = timeu.epoch2dt(etoday)
        begindate = datetime(today.year-numyears+1,1,1)
        enddate = datetime(today.year,12,31)

    firstyear = begindate.year
    lastyear = enddate.year
    yearrange = range(firstyear,lastyear+1)
    
    # get all the requested result data from the database and save in a data structure indexed by runner
    ## first get the data from the database
    results = RaceResult.query.join(Race).join(Runner).filter(RaceResult.club_id==club_id, 
                Race.date.between(ftime.dt2asc(begindate), ftime.dt2asc(enddate)), Runner.member==True, Runner.active==True).order_by(Runner.lname, Runner.fname).all()

    ## then set up our status and pass to the front end
    for source in sources:
        status[source]['status'] = 'summarizing'
        status[source]['lastname'] = ''
        status[source]['processed'] = 0
        status[source]['total'] = sum([1 for result in results if result.source==source])
    thistask.update_state(state='PROGRESS', meta={'progress':status})
    
    ## prepare to save detail file, for debugging
    detlfields = 'runnername,runnerid,dob,gender,resultid,racename,racedate,series,distmiles,distkm,time,timesecs,agpercent,source,sourceid'.split(',')
    detailfname = detailfile
    _DETL = open(detailfname,'wb')
    DETL = csv.DictWriter(_DETL,detlfields)
    DETL.writeheader()

    ## then fill in data structure to hold AnalyzeAgeGrade objects
    ## use OrderedDict to force aag to be in same order as DETL file, for debugging
    aag = collections.OrderedDict()
    for result in results:
        # skip results which are too far away, if a maxdistance is defined for this source
        if maxdistance[result.source]:
            locationid = result.race.locationid
            if not locationid: continue
            racelocation = Location.query.filter_by(id=locationid).first()
            distance = get_distance(clublocation, racelocation)
            if distance == None or distance > maxdistance[result.source]: continue

        thisname = (result.runner.name.lower(), result.runner.dateofbirth)
        initaagrunner(aag, thisname, result.runner.fname, result.runner.lname, result.runner.gender, ftime.asc2dt(result.runner.dateofbirth), result.runner.id)
        
        # determine location name. any error gets null string
        locationname = ''
        if result.race.locationid:
            location = Location.query.filter_by(id=result.race.locationid).first()
            if location: 
                locationname = location.name

        thisstat = aag[thisname].add_stat(ftime.asc2dt(result.race.date), result.race.distance*METERSPERMILE, result.time, race=result.race.name,
                               loc=locationname, fuzzyage=result.fuzzyage,
                               source=result.source, priority=priority[result.source])

        ### TODO: store result's agpercent, in AgeGrade.crunch() skip agegrade calculation if already present
        DETL.writerow(dict(
                runnername = result.runner.name,
                runnerid = result.runner.id,
                dob = result.runner.dateofbirth,
                gender = result.runner.gender,
                resultid = result.id,
                racename = result.race.name,
                racedate = result.race.date,
                series = result.series.name if result.seriesid else None,
                distmiles = result.race.distance,
                distkm = result.race.distance*(METERSPERMILE/1000),
                timesecs = result.time,
                time = rendertime(result.time,0),
                agpercent = result.agpercent,
                source = result.source,
                sourceid = result.sourceid,
            ))

    ## close detail file
    _DETL.close()

    # initialize summary file
    summfields = ['name', 'lname', 'fname', 'age', 'gender']
    datafields = copy(summfields)
    distcategories = ['overall'] + [TRENDLIMITS[tlimit][0] for tlimit in TRENDLIMITS]
    datacategories = ['overall'] + [TRENDLIMITS[tlimit][1] for tlimit in TRENDLIMITS]
    stattypes = ['1yr agegrade','avg agegrade','trend','numraces','stderr','r-squared','pvalue']
    statdatatypes = ['1yr-agegrade','avg-agegrade','trend','numraces','stderr','r-squared','pvalue']
    for stattype, statdatatype in zip(stattypes, statdatatypes):
        for distcategory, datacategory in zip(distcategories, datacategories):
            summfields.append('{}\n{}'.format(stattype, distcategory))
            datafields.append('{}-{}'.format(statdatatype, datacategory))
        if stattype == 'numraces':
            for year in yearrange:
                summfields.append('{}\n{}'.format(stattype, year))
                datafields.append('{}-{}'.format(statdatatype, lastyear-year))

    # save summary file columns for resultsanalysissummary
    dtcolumns = json.dumps([{ 'data':d, 'name':d, 'label':l } for d,l in zip(datafields, summfields)])
    columnsfilename = summaryfile + '.cols'
    with open(columnsfilename, 'w') as cols:
        cols.write(dtcolumns)

    # set up summary file
    summaryfname = summaryfile
    _SUMM = open(summaryfname,'wb')
    SUMM = csv.DictWriter(_SUMM,summfields)
    SUMM.writeheader()
    
    # loop through each member we've recorded information about
    for thisname in aag:
        fullname, fname, lname, gender, dob, runnerid = aag[thisname].get_runner()
        rendername = fullname.title()
        
        # check stats before deduplicating
        statcount = {}
        stats = aag[thisname].get_stats()
        for source in sources:
            statcount[source] = sum([1 for s in stats if s.source == source])

        # remove duplicate entries
        aag[thisname].deduplicate()   
        
        # crunch the numbers
        aag[thisname].crunch()    # calculate age grade for each result
        stats = aag[thisname].get_stats()
        
        jan1 = ftime.asc2dt('{}-1-1'.format(lastyear))
        runnerage = timeu.age(jan1, dob)
        
        # filter out runners younger than allowed
        if runnerage < minage: continue

        # filter out runners who have not run enough races
        stats = aag[thisname].get_stats()
        if enddate:
            lastyear = enddate.year
        else:
            lastyear = timeu.epoch2dt(time.time()).year
        lastyearstats = [s for s in stats if s.date.year==lastyear]
        if len(lastyearstats) < minraces: continue
        
        # fill in row for summary output
        summout = {}

        # get link for this runner's results chart
        # see http://stackoverflow.com/questions/2506379/add-params-to-given-url-in-python
        url_parts = list(urlparse(resultsurl))
        query = dict(parse_qsl(url_parts[4]))
        query.update({'club': clubslug, 'runnerid': runnerid, 'begindate': ftime.dt2asc(begindate), 'enddate': ftime.dt2asc(enddate)})
        url_parts[4] = urlencode(query)
        resultslink = urlunparse(url_parts)

        summout['name'] = '<a href={} target=_blank>{}</a>'.format(resultslink, rendername)
        summout['fname'] = fname
        summout['lname'] = lname
        summout['age'] = runnerage
        summout['gender'] = gender
        
        # set up to collect averages
        avg = collections.OrderedDict()

        # draw trendlines, write output
        allstats = aag[thisname].get_stats()
        if len(allstats) > 0:
            avg['overall'] = mean([s.ag for s in allstats])
        trend = aag[thisname].get_trendline()

        oneyrstats = [s.ag for s in allstats if s.date.year == lastyear]
        if len(oneyrstats) > 0:
            summout['1yr agegrade\noverall'] = mean(oneyrstats)
        if len(allstats) > 0:
            summout['avg agegrade\noverall'] = avg['overall']
        if len(allstats) >= mintrend and allstats[0].date != allstats[-1].date:
            summout['trend\noverall'] = trend.improvement
            summout['stderr\noverall'] = trend.stderr
            summout['r-squared\noverall'] = trend.r2**2
            summout['pvalue\noverall'] = trend.pvalue
        summout['numraces\noverall'] = len(allstats)
        for year in yearrange:
            summout['numraces\n{}'.format(year)] = len([s for s in allstats if s.date.year==year])
        for tlimit in TRENDLIMITS:
            distcategory,distcolor = TRENDLIMITS[tlimit]
            tstats = [s for s in allstats if s.dist >= tlimit[0] and s.dist < tlimit[1]]
            if len(tstats) > 0:
                avg[distcategory] = mean([s.ag for s in tstats])
                summout['avg agegrade\n{}'.format(distcategory)] = avg[distcategory]
            summout['numraces\n{}'.format(distcategory)] = len(tstats)
            oneyrcategory = [s.ag for s in tstats if s.date.year == lastyear]
            if len(oneyrcategory) > 0:
                summout['1yr agegrade\n{}'.format(distcategory)] = mean(oneyrcategory)
            if len(tstats) >= mintrend and tstats[0].date != tstats[-1].date:
                try:
                    trend = aag[thisname].get_trendline(thesestats=tstats)
                except ZeroDivisionError:
                    app.logger.debug('ZeroDivisionError - processing {}'.format(rendername))
                    trend = None
                # ignore trends which can't be calculated
                if trend:
                    summout['trend\n{}'.format(distcategory)] = trend.improvement
                    summout['stderr\n{}'.format(distcategory)] = trend.stderr
                    summout['r-squared\n{}'.format(distcategory)] = trend.r2
                    summout['pvalue\n{}'.format(distcategory)] = trend.pvalue
        SUMM.writerow(summout)

        # update status
        for source in sources:
            status[source]['processed'] += statcount[source]
            status[source]['lastname'] = rendername
        thistask.update_state(state='PROGRESS', meta={'progress':status})

        
    _SUMM.close()
Пример #2
0
class AthlinksCollect(CollectServiceResults):
########################################################################

    #----------------------------------------------------------------------
    def __init__(self):
    #----------------------------------------------------------------------
        '''
        initialize object instance

        may be overridden when ResultsCollect is instantiated, but overriding method must call
        `super(<subclass>, self).__init__(servicename, resultfilehdr, resultattrs)`

        '''
        
        super(AthlinksCollect, self).__init__('athlinks', resultfilehdr, resultattrs)


    #----------------------------------------------------------------------
    def openservice(self, club_id):
    #----------------------------------------------------------------------
        '''
        initialize service
        recommended that the overriding method save service instance in `self.service`

        must be overridden when ResultsCollect is instantiated

        :param club_id: club.id for club this service is operating on
        '''
        # create location server
        self.locsvr = LocationServer()

        # remember club id we're working on
        self.club_id = club_id

        # debug file for races saved
        # set debugrace to False if not debugging
        debugrace = True
        if debugrace:
            clubslug = Club.query.filter_by(id=club_id).first().shname
            self.racefile = '{}/{}-athlinks-race.csv'.format(app.config['MEMBERSHIP_DIR'], clubslug)
        else:
            self.racefile = None

        if self.racefile:
            self._RACE = open(self.racefile, 'wb')
            self.racefields = 'id,name,date,distmiles,status,runner'.split(',')
            self.RACE = csv.DictWriter(self._RACE, self.racefields)
            self.RACE.writeheader()

        # open service
        key = ApiCredentials.query.filter_by(name=self.servicename).first().key
        self.service = athlinks.Athlinks(debug=True, key=key)


    #----------------------------------------------------------------------
    def getresults(self, name, fname, lname, gender, dt_dob, begindate, enddate):
    #----------------------------------------------------------------------
        '''
        retrieves a list of results for a single name

        must be overridden when ResultsCollect is instantiated

        use dt_dob to filter errant race results, based on age of runner on race day

        :param name: name of participant for which results are to be returned
        :param fname: first name of participant
        :param lname: last name of participant
        :param gender: 'M' or 'F'
        :param dt_dob: participant's date of birth, as datetime 
        :param begindate: epoch time for start of results, 00:00:00 on date to begin
        :param end: epoch time for end of results, 23:59:59 on date to finish
        :rtype: list of serviceresults, each of which can be processed by convertresult
        '''
        
        # remember participant data
        self.name = name
        self.fname = fname
        self.lname = lname
        self.gender = gender
        self.dt_dob = dt_dob
        self.dob = ftime.dt2asc(dt_dob)

        # get results for this athlete
        allresults = self.service.listathleteresults(name)

        # filter by date and by age
        filteredresults = []
        for result in allresults:
            e_racedate = athlinks.gettime(result['Race']['RaceDate'])
            
            # skip result if outside the desired time window
            if e_racedate < begindate or e_racedate > enddate: continue

            # skip result if wrong gender
            resultgen = result['Gender'][0]
            if resultgen != gender: continue

            # skip result if runner's age doesn't match the age within the result
            # sometimes athlinks stores the age group of the runner, not exact age,
            # so also check if this runner's age is within the age group, and indicate if so
            dt_racedate = timeu.epoch2dt(e_racedate)
            racedateage = timeu.age(dt_racedate,dt_dob)
            resultage = int(result['Age'])
            result['fuzzyage'] = False
            if resultage != racedateage:
                # if results are not stored as age group, skip this result
                if (resultage/5)*5 != resultage:
                    continue
                # result's age might be age group, not exact age
                else:
                    # if runner's age consistent with race age, use result, but mark "fuzzy"
                    if (racedateage/5)*5 == resultage:
                        result['fuzzyage'] = True
                    # otherwise skip result
                    else:
                        continue

            # if we reach here, the result is ok, and is added to filteredresults
            filteredresults.append(result)

        # back to caller
        return filteredresults


    #----------------------------------------------------------------------
    def convertserviceresult(self, result):
    #----------------------------------------------------------------------
        '''
        converts a single service result to dict suitable to be saved in resultfile

        result must be converted to dict with keys in `resultfilehdr` provided at instance creation

        must be overridden when ResultsCollect is instantiated

        use return value of None for cases when results could not be filtered by `:meth:getresults`

        :param fname: participant's first name
        :param lname: participant's last name
        :param result: single service result, from list retrieved through `getresults`
        :rtype: dict with keys matching `resultfilehdr`, or None if result is not to be saved
        '''

        # create output record and copy common fields
        outrec = {}

        # copy participant information
        outrec['name'] = self.name
        outrec['GivenName'] = self.fname
        outrec['FamilyName'] = self.lname
        outrec['DOB'] = self.dob
        outrec['Gender'] = self.gender


        # some debug items - assume everything is cached
        coursecached = True
        racecached = True

        # get course used for this result
        courseid = '{}/{}'.format(result['Race']['RaceID'], result['CourseID'])
        course = Course.query.filter_by(club_id=self.club_id, source='athlinks', sourceid=courseid).first()

        # cache course if not done already
        race = None
        if not course:
            coursecached = False

            coursedata = self.service.getcourse(result['Race']['RaceID'], result['CourseID'])

            distmiles = athlinks.dist2miles(coursedata['Courses'][0]['DistUnit'],coursedata['Courses'][0]['DistTypeID'])
            distkm = athlinks.dist2km(coursedata['Courses'][0]['DistUnit'],coursedata['Courses'][0]['DistTypeID'])
            if distkm < 0.050: return None # skip timed events, which seem to be recorded with 0 distance

            # skip result if not Running or Trail Running race
            thiscategory = coursedata['Courses'][0]['RaceCatID']
            if thiscategory not in race_category: return None
        
            course = Course()
            course.club_id = self.club_id
            course.source = 'athlinks'
            course.sourceid = courseid

            # strip racename and coursename here to make sure detail file matches what is stored in database
            racename = csvu.unicode2ascii(coursedata['RaceName']).strip()
            coursename = csvu.unicode2ascii(coursedata['Courses'][0]['CourseName']).strip()
            course.name = '{} / {}'.format(racename,coursename)

            # maybe truncate to FIRST part of race name
            if len(course.name) > MAX_RACENAME_LEN:
                course.name = course.name[:MAX_RACENAME_LEN]
            
            course.date = ftime.epoch2asc(athlinks.gettime(coursedata['RaceDate']))
            course.location = csvu.unicode2ascii(coursedata['Home'])
            # maybe truncate to LAST part of location name, to keep most relevant information (state, country)
            if len(course.location) > MAX_LOCATION_LEN:
                course.location = course.location[-MAX_LOCATION_LEN:]

            # TODO: adjust marathon and half marathon distances?
            course.distkm =distkm
            course.distmiles = distmiles

            course.surface = race_category[thiscategory]

            # retrieve or add race
            # flush should allow subsequent query per http://stackoverflow.com/questions/4201455/sqlalchemy-whats-the-difference-between-flush-and-commit
            # Race has uniqueconstraint for club_id/name/year/fixeddist. It's been seen that there are additional races in athlinks, 
            # but just assume the first is the correct one.
            raceyear = ftime.asc2dt(course.date).year
            race = Race.query.filter_by(club_id=self.club_id, name=course.name, year=raceyear, fixeddist=race_fixeddist(course.distmiles)).first()
            ### TODO: should the above be .all() then check for first race within epsilon distance?
            if not race:
                racecached = False
                race = Race(self.club_id, raceyear)
                race.name = course.name
                race.distance = course.distmiles
                race.fixeddist = race_fixeddist(race.distance)
                race.date = course.date
                race.active = True
                race.external = True
                race.surface = course.surface
                loc = self.locsvr.getlocation(course.location)
                race.locationid = loc.id
                db.session.add(race)
                db.session.flush()  # force id to be created

            course.raceid = race.id
            db.session.add(course)
            db.session.flush()      # force id to be created

        # maybe course was cached but location of race wasn't
        # update location of result race, if needed, and if supplied
        # this is here to clean up old database data
        if not race:
            race = Race.query.filter_by(club_id=self.club_id, name=course.name, year=ftime.asc2dt(course.date).year, fixeddist=race_fixeddist(course.distmiles)).first()
        if not race.locationid and course.location:
            # app.logger.debug('updating race with location {}'.format(course.location))
            loc = self.locsvr.getlocation(course.location)
            race.locationid = loc.id
            insert_or_update(db.session, Race, race, skipcolumns=['id'], 
                             club_id=self.club_id, name=course.name, year=ftime.asc2dt(course.date).year, fixeddist=race_fixeddist(course.distmiles))
        # else:
        #     app.logger.debug('race.locationid={} course.location="{}"'.format(race.locationid, course.location))

        # debug races
        if self.racefile:
            racestatusl = []
            if not coursecached: racestatusl.append('addcourse')
            if not racecached: racestatusl.append('addrace')
            if not racestatusl: racestatusl.append('cached')
            racestatus = '-'.join(racestatusl)
            racerow = {'status': racestatus, 'runner': self.name}

            for racefield in self.racefields:
                if racefield in ['status', 'runner']: continue
                racerow[racefield] = getattr(course,racefield)
            self.RACE.writerow(racerow)


        # fill in output record fields from result, course
        # combine name, get age
        outrec['age'] = result['Age']
        outrec['fuzzyage'] = result['fuzzyage']

        # leave athlid blank if result not from an athlink member
        athlmember = result['IsMember']
        if athlmember:
            outrec['athlid'] = result['RacerID']

        # remember the entryid, high water mark of which can be used to limit the work here
        outrec['entryid'] = result['EntryID']

        # race name, location; convert from unicode if necessary
        # TODO: make function to do unicode translation -- apply to runner name as well (or should csv just store unicode?)
        outrec['race'] = course.name
        outrec['date'] = course.date
        outrec['loc'] = course.location
        
        outrec['miles'] = course.distmiles
        outrec['km'] = course.distkm
        outrec['category'] = course.surface
        resulttime = result['TicksString']

        # strange case of TicksString = ':00'
        if resulttime[0] == ':':
            resulttime = '0'+resulttime
        while resulttime.count(':') < 2:
            resulttime = '0:'+resulttime
        outrec['time'] = resulttime
        
        # strange case of 0 time, causes ZeroDivisionError and is clearly not valid
        if timeu.timesecs(resulttime) == 0: return None

        # leave out age grade if exception occurs, skip results which have outliers
        try:
            # skip result if runner's age doesn't match the age within the result
            # sometimes athlinks stores the age group of the runner, not exact age,
            # so also check if this runner's age is within the age group, and indicate if so
            e_racedate = athlinks.gettime(result['Race']['RaceDate'])
            resultgen = result['Gender'][0]
            dt_racedate = timeu.epoch2dt(e_racedate)
            racedateage = timeu.age(dt_racedate,self.dt_dob)
            agpercent,agresult,agfactor = ag.agegrade(racedateage,resultgen,course.distmiles,timeu.timesecs(resulttime))
            outrec['ag'] = agpercent
            if agpercent < 15 or agpercent >= 100: return None # skip obvious outliers
        except:
            app.logger.warning(traceback.format_exc())
            pass

        # and we're done
        return outrec

    #----------------------------------------------------------------------
    def closeservice(self):
    #----------------------------------------------------------------------
        '''
        closes service, if necessary

        may be overridden when ResultsCollect is instantiated
        '''
        if self.racefile:
            self._RACE.close()