def _summarycolumns(yearrange, lastyear):
#----------------------------------------------------------------------
    '''
    build the parallel column lists for the summary file

    :param yearrange: iterable of the years covered by the summary
    :param lastyear: last year covered, per-year data names are relative to this
    :rtype: (summfields, datafields) - summfields are the csv headings,
        datafields are the matching data names saved in the .cols file
    '''
    summfields = ['name', 'lname', 'fname', 'age', 'gender']
    datafields = list(summfields)

    distcategories = ['overall'] + [TRENDLIMITS[tlimit][0] for tlimit in TRENDLIMITS]
    datacategories = ['overall'] + [TRENDLIMITS[tlimit][1] for tlimit in TRENDLIMITS]
    stattypes = ['1yr agegrade','avg agegrade','trend','numraces','stderr','r-squared','pvalue']
    statdatatypes = ['1yr-agegrade','avg-agegrade','trend','numraces','stderr','r-squared','pvalue']

    for stattype, statdatatype in zip(stattypes, statdatatypes):
        for distcategory, datacategory in zip(distcategories, datacategories):
            summfields.append('{}\n{}'.format(stattype, distcategory))
            datafields.append('{}-{}'.format(statdatatype, datacategory))

        # numraces additionally gets one column per year
        if stattype == 'numraces':
            for year in yearrange:
                summfields.append('{}\n{}'.format(stattype, year))
                datafields.append('{}-{}'.format(statdatatype, lastyear-year))

    return summfields, datafields


def _resultslink(resultsurl, clubslug, runnerid, begindate, enddate):
#----------------------------------------------------------------------
    '''
    build the url for a runner's results chart, keeping any query
    parameters already present in resultsurl

    see http://stackoverflow.com/questions/2506379/add-params-to-given-url-in-python

    :param resultsurl: base url to send results to
    :param clubslug: club short name, sent as 'club'
    :param runnerid: runner id, sent as 'runnerid'
    :param begindate: datetime, sent as 'begindate'
    :param enddate: datetime, sent as 'enddate'
    :rtype: url string
    '''
    url_parts = list(urlparse(resultsurl))
    query = dict(parse_qsl(url_parts[4]))
    query.update({
        'club': clubslug,
        'runnerid': runnerid,
        'begindate': ftime.dt2asc(begindate),
        'enddate': ftime.dt2asc(enddate),
    })
    url_parts[4] = urlencode(query)
    return urlunparse(url_parts)


def _runnerstats(runneraag, lastyear, yearrange, mintrend, rendername):
#----------------------------------------------------------------------
    '''
    collect the overall and per distance category statistics for one runner

    :param runneraag: object from aag holding the runner's (crunched) stats
    :param lastyear: year used for the '1yr agegrade' columns
    :param yearrange: iterable of years for the per-year numraces columns
    :param mintrend: minimum number of races required to report a trendline
    :param rendername: runner name, only used for debug logging
    :rtype: dict keyed by summary file heading
    '''
    statcols = {}
    allstats = runneraag.get_stats()

    # overall averages
    if len(allstats) > 0:
        statcols['avg agegrade\noverall'] = mean([s.ag for s in allstats])
    oneyrstats = [s.ag for s in allstats if s.date.year == lastyear]
    if len(oneyrstats) > 0:
        statcols['1yr agegrade\noverall'] = mean(oneyrstats)

    # overall trend. only request the trendline when it will be reported, and
    # tolerate ZeroDivisionError the same way the distance categories below do
    if allstats and len(allstats) >= mintrend and allstats[0].date != allstats[-1].date:
        try:
            trend = runneraag.get_trendline()
        except ZeroDivisionError:
            app.logger.debug('ZeroDivisionError - processing {}'.format(rendername))
            trend = None

        # ignore trends which can't be calculated
        if trend:
            statcols['trend\noverall'] = trend.improvement
            statcols['stderr\noverall'] = trend.stderr
            # NOTE(review): overall stores trend.r2**2 while the distance categories
            # store trend.r2 -- behavior kept as found, confirm which is intended
            statcols['r-squared\noverall'] = trend.r2**2
            statcols['pvalue\noverall'] = trend.pvalue

    # race counts, overall and by year
    statcols['numraces\noverall'] = len(allstats)
    for year in yearrange:
        statcols['numraces\n{}'.format(year)] = len([s for s in allstats if s.date.year==year])

    # same statistics for each distance category
    for tlimit in TRENDLIMITS:
        distcategory, distcolor = TRENDLIMITS[tlimit]
        tstats = [s for s in allstats if s.dist >= tlimit[0] and s.dist < tlimit[1]]

        if len(tstats) > 0:
            statcols['avg agegrade\n{}'.format(distcategory)] = mean([s.ag for s in tstats])
        statcols['numraces\n{}'.format(distcategory)] = len(tstats)

        oneyrcategory = [s.ag for s in tstats if s.date.year == lastyear]
        if len(oneyrcategory) > 0:
            statcols['1yr agegrade\n{}'.format(distcategory)] = mean(oneyrcategory)

        if tstats and len(tstats) >= mintrend and tstats[0].date != tstats[-1].date:
            try:
                trend = runneraag.get_trendline(thesestats=tstats)
            except ZeroDivisionError:
                app.logger.debug('ZeroDivisionError - processing {}'.format(rendername))
                trend = None

            # ignore trends which can't be calculated
            if trend:
                statcols['trend\n{}'.format(distcategory)] = trend.improvement
                statcols['stderr\n{}'.format(distcategory)] = trend.stderr
                statcols['r-squared\n{}'.format(distcategory)] = trend.r2
                statcols['pvalue\n{}'.format(distcategory)] = trend.pvalue

    return statcols


def summarize(thistask, club_id, sources, status, summaryfile, detailfile, resultsurl, minage=12, minagegrade=20, minraces=3 , mintrend=2, numyears=3, begindate=None, enddate=None):
#----------------------------------------------------------------------
    '''
    render collected results

    writes three files: detailfile (one row per result considered), summaryfile
    (one row per runner who passes the minage / minraces filters) and
    summaryfile + '.cols' (json description of the summary columns)

    :param thistask: this is required for task thistask.update_state()
    :param club_id: identifies club for which results are to be stored
    :param sources: list of sources / services we're keeping status for
    :param status: dict keyed by source, each value is a dict whose 'status',
        'lastname', 'processed' and 'total' items are updated here and sent through
        thistask.update_state()
    :param summaryfile: summary file name (.csv)
    :param detailfile: detail file name (.csv)
    :param resultsurl: base url to send results to, for link in summary table
    :param minage: minimum age to keep track of stats
    :param minagegrade: minimum age grade (accepted but not referenced by this function)
    :param minraces: minimum races in the same year as enddate
    :param mintrend: minimum races over the full period for trendline
    :param numyears: number of years to render, ending with the current year, used
        unless both begindate and enddate are supplied
    :param begindate: render races between begindate and enddate, datetime
    :param enddate: render races between begindate and enddate, datetime
    '''

    # get club slug and location for later
    club = Club.query.filter_by(id=club_id).first()
    clubslug = club.shname
    locsvr = LocationServer()
    clublocation = locsvr.getlocation(club.location)

    # get maxdistance by service
    services = RaceResultService.query.filter_by(club_id=club_id).join(ApiCredentials).all()
    maxdistance = {}
    for service in services:
        attrs = ServiceAttributes(club_id, service.apicredentials.name)
        if attrs.maxdistance:
            maxdistance[service.apicredentials.name] = attrs.maxdistance
        else:
            maxdistance[service.apicredentials.name] = None
    maxdistance[productname] = None

    # set up date range. begindate and enddate take precedence, else use numyears from today
    if not (begindate and enddate):
        today = timeu.epoch2dt(time.time())
        begindate = datetime(today.year-numyears+1,1,1)
        enddate = datetime(today.year,12,31)

    firstyear = begindate.year
    lastyear = enddate.year
    yearrange = range(firstyear,lastyear+1)

    # get all the requested result data from the database and save in a data structure indexed by runner
    ## first get the data from the database
    results = RaceResult.query.join(Race).join(Runner).filter(
        RaceResult.club_id==club_id,
        Race.date.between(ftime.dt2asc(begindate), ftime.dt2asc(enddate)),
        Runner.member==True,
        Runner.active==True,
    ).order_by(Runner.lname, Runner.fname).all()

    ## then set up our status and pass to the front end
    for source in sources:
        status[source]['status'] = 'summarizing'
        status[source]['lastname'] = ''
        status[source]['processed'] = 0
        status[source]['total'] = sum([1 for result in results if result.source==source])
    thistask.update_state(state='PROGRESS', meta={'progress':status})

    # the same race location is referenced by many results, so look each one up only once
    locations = {}
    def getracelocation(locationid):
        if locationid not in locations:
            locations[locationid] = Location.query.filter_by(id=locationid).first()
        return locations[locationid]

    ## prepare to save detail file, for debugging
    detlfields = 'runnername,runnerid,dob,gender,resultid,racename,racedate,series,distmiles,distkm,time,timesecs,agpercent,source,sourceid'.split(',')

    ## then fill in data structure to hold AnalyzeAgeGrade objects
    ## use OrderedDict to force aag to be in same order as DETL file, for debugging
    aag = collections.OrderedDict()

    # 'with' makes sure the detail file is closed even if a result fails to process
    with open(detailfile,'wb') as _DETL:
        DETL = csv.DictWriter(_DETL,detlfields)
        DETL.writeheader()

        for result in results:
            # skip results which are too far away, if a maxdistance is defined for this source
            maxdist = maxdistance[result.source]
            if maxdist:
                locationid = result.race.locationid
                if not locationid:
                    continue
                distance = get_distance(clublocation, getracelocation(locationid))
                if distance is None or distance > maxdist:
                    continue

            thisname = (result.runner.name.lower(), result.runner.dateofbirth)
            initaagrunner(aag, thisname, result.runner.fname, result.runner.lname, result.runner.gender, ftime.asc2dt(result.runner.dateofbirth), result.runner.id)

            # determine location name. any error gets null string
            locationname = ''
            if result.race.locationid:
                location = getracelocation(result.race.locationid)
                if location:
                    locationname = location.name

            aag[thisname].add_stat(ftime.asc2dt(result.race.date), result.race.distance*METERSPERMILE, result.time, race=result.race.name,
                                   loc=locationname, fuzzyage=result.fuzzyage,
                                   source=result.source, priority=priority[result.source])

            ### TODO: store result's agpercent, in AgeGrade.crunch() skip agegrade calculation if already present
            DETL.writerow(dict(
                runnername = result.runner.name,
                runnerid = result.runner.id,
                dob = result.runner.dateofbirth,
                gender = result.runner.gender,
                resultid = result.id,
                racename = result.race.name,
                racedate = result.race.date,
                series = result.series.name if result.seriesid else None,
                distmiles = result.race.distance,
                distkm = result.race.distance*(METERSPERMILE/1000),
                timesecs = result.time,
                time = rendertime(result.time,0),
                agpercent = result.agpercent,
                source = result.source,
                sourceid = result.sourceid,
            ))

    # initialize summary file
    summfields, datafields = _summarycolumns(yearrange, lastyear)

    # save summary file columns for resultsanalysissummary
    dtcolumns = json.dumps([{ 'data':d, 'name':d, 'label':l } for d,l in zip(datafields, summfields)])
    columnsfilename = summaryfile + '.cols'
    with open(columnsfilename, 'w') as cols:
        cols.write(dtcolumns)

    # runner age is taken as of jan 1 of the last year rendered
    jan1 = ftime.asc2dt('{}-1-1'.format(lastyear))

    # set up summary file. 'with' makes sure it is closed even if a runner fails to process
    with open(summaryfile,'wb') as _SUMM:
        SUMM = csv.DictWriter(_SUMM,summfields)
        SUMM.writeheader()

        # loop through each member we've recorded information about
        for thisname in aag:
            runneraag = aag[thisname]
            fullname, fname, lname, gender, dob, runnerid = runneraag.get_runner()
            rendername = fullname.title()

            # check stats before deduplicating
            stats = runneraag.get_stats()
            statcount = {}
            for source in sources:
                statcount[source] = sum([1 for s in stats if s.source == source])

            # remove duplicate entries
            runneraag.deduplicate()

            # crunch the numbers
            runneraag.crunch()    # calculate age grade for each result
            stats = runneraag.get_stats()

            # filter out runners younger than allowed, and runners who have not run
            # enough races in the last year
            runnerage = timeu.age(jan1, dob)
            eligible = (runnerage >= minage
                        and len([s for s in stats if s.date.year==lastyear]) >= minraces)

            if eligible:
                # fill in row for summary output
                resultslink = _resultslink(resultsurl, clubslug, runnerid, begindate, enddate)
                summout = {
                    'name': '<a href={} target=_blank>{}</a>'.format(resultslink, rendername),
                    'fname': fname,
                    'lname': lname,
                    'age': runnerage,
                    'gender': gender,
                }
                summout.update(_runnerstats(runneraag, lastyear, yearrange, mintrend, rendername))
                SUMM.writerow(summout)

            # update status. this is done for filtered runners as well, because their
            # results have been handled and are included in status[source]['total']
            for source in sources:
                status[source]['processed'] += statcount[source]
                status[source]['lastname'] = rendername
            thistask.update_state(state='PROGRESS', meta={'progress':status})
class AthlinksCollect(CollectServiceResults):
########################################################################
    '''
    collects results from the athlinks service, caching the courses and
    races it encounters in the database
    '''

    #----------------------------------------------------------------------
    def __init__(self):
    #----------------------------------------------------------------------
        '''
        initialize object instance

        may be overridden when ResultsCollect is instantiated, but overriding method must call
        `super(<subclass>, self).__init__(servicename, resultfilehdr, resultattrs)`
        '''

        super(AthlinksCollect, self).__init__('athlinks', resultfilehdr, resultattrs)

    #----------------------------------------------------------------------
    def openservice(self, club_id):
    #----------------------------------------------------------------------
        '''
        initialize service
        recommended that the overriding method save service instance in `self.service`

        must be overridden when ResultsCollect is instantiated

        the debug race file opened here stays open until :meth:`closeservice` is called

        :param club_id: club.id for club this service is operating on
        '''
        # create location server
        self.locsvr = LocationServer()

        # remember club id we're working on
        self.club_id = club_id

        # debug file for races saved
        # set debugrace to False if not debugging
        debugrace = True
        if debugrace:
            clubslug = Club.query.filter_by(id=club_id).first().shname
            self.racefile = '{}/{}-athlinks-race.csv'.format(app.config['MEMBERSHIP_DIR'], clubslug)
        else:
            self.racefile = None

        if self.racefile:
            self._RACE = open(self.racefile, 'wb')
            self.racefields = 'id,name,date,distmiles,status,runner'.split(',')
            self.RACE = csv.DictWriter(self._RACE, self.racefields)
            self.RACE.writeheader()

        # open service
        key = ApiCredentials.query.filter_by(name=self.servicename).first().key
        self.service = athlinks.Athlinks(debug=True, key=key)

    #----------------------------------------------------------------------
    def getresults(self, name, fname, lname, gender, dt_dob, begindate, enddate):
    #----------------------------------------------------------------------
        '''
        retrieves a list of results for a single name

        must be overridden when ResultsCollect is instantiated

        use dt_dob to filter errant race results, based on age of runner on race day

        each result kept has a 'fuzzyage' item added: False when the age in the result
        matches the runner's age on race day, True when the result only matched as an
        age group

        :param name: name of participant for which results are to be returned
        :param fname: first name of participant
        :param lname: last name of participant
        :param gender: 'M' or 'F'
        :param dt_dob: participant's date of birth, as datetime
        :param begindate: epoch time for start of results, 00:00:00 on date to begin
        :param enddate: epoch time for end of results, 23:59:59 on date to finish
        :rtype: list of serviceresults, each of which can be processed by convertresult
        '''

        # remember participant data
        self.name = name
        self.fname = fname
        self.lname = lname
        self.gender = gender
        self.dt_dob = dt_dob
        self.dob = ftime.dt2asc(dt_dob)

        # get results for this athlete
        allresults = self.service.listathleteresults(name)

        # filter by date and by age
        filteredresults = []
        for result in allresults:
            e_racedate = athlinks.gettime(result['Race']['RaceDate'])

            # skip result if outside the desired time window
            if e_racedate < begindate or e_racedate > enddate:
                continue

            # skip result if wrong gender
            resultgen = result['Gender'][0]
            if resultgen != gender:
                continue

            # skip result if runner's age doesn't match the age within the result
            # sometimes athlinks stores the age group of the runner, not exact age,
            # so also check if this runner's age is within the age group, and indicate if so
            dt_racedate = timeu.epoch2dt(e_racedate)
            racedateage = timeu.age(dt_racedate,dt_dob)
            resultage = int(result['Age'])
            result['fuzzyage'] = False
            if resultage != racedateage:
                # a result age which is not a multiple of 5 can't be an age group, skip this result
                if resultage % 5 != 0:
                    continue

                # result's age might be age group, not exact age. skip result unless
                # runner's age falls into that group
                # (// keeps this integer arithmetic; assumes timeu.age returns an int -- TODO confirm)
                if (racedateage // 5) * 5 != resultage:
                    continue

                # runner's age consistent with race age, use result, but mark "fuzzy"
                result['fuzzyage'] = True

            # if we reach here, the result is ok, and is added to filteredresults
            filteredresults.append(result)

        # back to caller
        return filteredresults

    #----------------------------------------------------------------------
    def convertserviceresult(self, result):
    #----------------------------------------------------------------------
        '''
        converts a single service result to dict suitable to be saved in resultfile

        result must be converted to dict with keys in `resultfilehdr` provided at instance creation

        must be overridden when ResultsCollect is instantiated

        use return value of None for cases when results could not be filtered by `:meth:getresults`

        participant information comes from the attributes saved by the most recent
        call to `:meth:getresults`

        :param result: single service result, from list retrieved through `getresults`
        :rtype: dict with keys matching `resultfilehdr`, or None if result is not to be saved
        '''

        # create output record and copy common fields
        outrec = {}

        # copy participant information
        outrec['name'] = self.name
        outrec['GivenName'] = self.fname
        outrec['FamilyName'] = self.lname
        outrec['DOB'] = self.dob
        outrec['Gender'] = self.gender

        # some debug items - assume everything is cached
        coursecached = True
        racecached = True

        # get course used for this result
        courseid = '{}/{}'.format(result['Race']['RaceID'], result['CourseID'])
        course = Course.query.filter_by(club_id=self.club_id, source='athlinks', sourceid=courseid).first()

        # cache course if not done already
        race = None
        if not course:
            coursecached = False

            coursedata = self.service.getcourse(result['Race']['RaceID'], result['CourseID'])
            coursedetail = coursedata['Courses'][0]

            distmiles = athlinks.dist2miles(coursedetail['DistUnit'],coursedetail['DistTypeID'])
            distkm = athlinks.dist2km(coursedetail['DistUnit'],coursedetail['DistTypeID'])
            if distkm < 0.050:
                return None # skip timed events, which seem to be recorded with 0 distance

            # skip result if not Running or Trail Running race
            thiscategory = coursedetail['RaceCatID']
            if thiscategory not in race_category:
                return None

            course = Course()
            course.club_id = self.club_id
            course.source = 'athlinks'
            course.sourceid = courseid

            # strip racename and coursename here to make sure detail file matches what is stored in database
            racename = csvu.unicode2ascii(coursedata['RaceName']).strip()
            coursename = csvu.unicode2ascii(coursedetail['CourseName']).strip()
            course.name = '{} / {}'.format(racename,coursename)

            # maybe truncate to FIRST part of race name
            if len(course.name) > MAX_RACENAME_LEN:
                course.name = course.name[:MAX_RACENAME_LEN]

            course.date = ftime.epoch2asc(athlinks.gettime(coursedata['RaceDate']))
            course.location = csvu.unicode2ascii(coursedata['Home'])

            # maybe truncate to LAST part of location name, to keep most relevant information (state, country)
            if len(course.location) > MAX_LOCATION_LEN:
                course.location = course.location[-MAX_LOCATION_LEN:]

            # TODO: adjust marathon and half marathon distances?
            course.distkm = distkm
            course.distmiles = distmiles
            course.surface = race_category[thiscategory]

            # retrieve or add race
            # flush should allow subsequent query per http://stackoverflow.com/questions/4201455/sqlalchemy-whats-the-difference-between-flush-and-commit
            # Race has uniqueconstraint for club_id/name/year/fixeddist. It's been seen that there are additional races in athlinks,
            # but just assume the first is the correct one.
            raceyear = ftime.asc2dt(course.date).year
            race = Race.query.filter_by(club_id=self.club_id, name=course.name, year=raceyear, fixeddist=race_fixeddist(course.distmiles)).first()
            ### TODO: should the above be .all() then check for first race within epsilon distance?
            if not race:
                racecached = False
                race = Race(self.club_id, raceyear)
                race.name = course.name
                race.distance = course.distmiles
                race.fixeddist = race_fixeddist(race.distance)
                race.date = course.date
                race.active = True
                race.external = True
                race.surface = course.surface
                loc = self.locsvr.getlocation(course.location)
                race.locationid = loc.id
                db.session.add(race)
                db.session.flush()  # force id to be created

            course.raceid = race.id
            db.session.add(course)
            db.session.flush()      # force id to be created

        # maybe course was cached but location of race wasn't
        # update location of result race, if needed, and if supplied
        # this is here to clean up old database data
        if not race:
            race = Race.query.filter_by(club_id=self.club_id, name=course.name, year=ftime.asc2dt(course.date).year, fixeddist=race_fixeddist(course.distmiles)).first()
        # the lookup above can come back empty for a cached course, in which case
        # there is no race to clean up
        if race and not race.locationid and course.location:
            loc = self.locsvr.getlocation(course.location)
            race.locationid = loc.id
            insert_or_update(db.session, Race, race, skipcolumns=['id'],
                             club_id=self.club_id, name=course.name, year=ftime.asc2dt(course.date).year, fixeddist=race_fixeddist(course.distmiles))

        # debug races
        if self.racefile:
            racestatusl = []
            if not coursecached:
                racestatusl.append('addcourse')
            if not racecached:
                racestatusl.append('addrace')
            if not racestatusl:
                racestatusl.append('cached')
            racestatus = '-'.join(racestatusl)
            racerow = {'status': racestatus, 'runner': self.name}

            # remaining debug fields are copied from the course
            for racefield in self.racefields:
                if racefield in ['status', 'runner']:
                    continue
                racerow[racefield] = getattr(course,racefield)
            self.RACE.writerow(racerow)

        # fill in output record fields from result, course
        # combine name, get age
        outrec['age'] = result['Age']
        outrec['fuzzyage'] = result['fuzzyage']

        # leave athlid blank if result not from an athlink member
        athlmember = result['IsMember']
        if athlmember:
            outrec['athlid'] = result['RacerID']

        # remember the entryid, high water mark of which can be used to limit the work here
        outrec['entryid'] = result['EntryID']

        # race name, location; convert from unicode if necessary
        # TODO: make function to do unicode translation -- apply to runner name as well (or should csv just store unicode?)
        outrec['race'] = course.name
        outrec['date'] = course.date
        outrec['loc'] = course.location

        outrec['miles'] = course.distmiles
        outrec['km'] = course.distkm
        outrec['category'] = course.surface

        # normalize time to h:mm:ss form
        resulttime = result['TicksString']

        # strange case of TicksString = ':00'
        if resulttime[0] == ':':
            resulttime = '0'+resulttime
        while resulttime.count(':') < 2:
            resulttime = '0:'+resulttime
        outrec['time'] = resulttime

        # strange case of 0 time, causes ZeroDivisionError and is clearly not valid
        if timeu.timesecs(resulttime) == 0:
            return None

        # leave out age grade if exception occurs, skip results which have outliers
        try:
            # age grade is based on the runner's age on race day and the gender within the result
            e_racedate = athlinks.gettime(result['Race']['RaceDate'])
            resultgen = result['Gender'][0]
            dt_racedate = timeu.epoch2dt(e_racedate)
            racedateage = timeu.age(dt_racedate,self.dt_dob)
            agpercent, _agresult, _agfactor = ag.agegrade(racedateage,resultgen,course.distmiles,timeu.timesecs(resulttime))
            outrec['ag'] = agpercent
            if agpercent < 15 or agpercent >= 100:
                return None # skip obvious outliers
        # best effort: result is still saved without age grade. Exception rather than a
        # bare except, so KeyboardInterrupt / SystemExit are not swallowed
        except Exception:
            app.logger.warning(traceback.format_exc())

        # and we're done
        return outrec

    #----------------------------------------------------------------------
    def closeservice(self):
    #----------------------------------------------------------------------
        '''
        closes service, if necessary

        may be overridden when ResultsCollect is instantiated
        '''
        # only the debug race file opened by openservice needs to be released
        if self.racefile:
            self._RACE.close()