def get_chronotrack_results(events=pd.DataFrame({
                                "event_name": [],
                                "chronotrack_id": [],
                                "date": []}),
                            race_overrides=pd.DataFrame({
                                "race_name": ['2K Para Nordic Cup', '5K Hauska Heikki Ski'],
                                "discipline": ['sitski', 'freestyle'],
                                "distance": [2.0, 5.0]
                            })
                            ):
    """Scrape ChronoTrack results for every event and return one combined table.

    For each row of ``events`` the results are fetched with
    ``scrape_chronotrack_results``; distance and discipline are derived from
    the race name, gaps are filled from ``race_overrides``, rows that still
    have no discipline are dropped, and placements are attached.

    Args:
        events: DataFrame with ``event_name``, ``chronotrack_id`` and ``date``
            columns, one row per event to scrape.
        race_overrides: DataFrame with ``race_name``, ``discipline`` and
            ``distance`` columns. A value is used only where the one derived
            from the race name is null.

    Returns:
        DataFrame with the columns overall_place, gender_place, name,
        location, time, gender, distance, discipline, event_name, date, age.
        It is empty (columns only) when ``events`` has no rows.

    Note:
        The DataFrame defaults are built once at definition time; this
        function only reads them.
    """
    output_columns = ['overall_place', 'gender_place', 'name', 'location', 'time', 'gender',
                      'distance', 'discipline', 'event_name', 'date', 'age']

    total_results = []
    for _, event in events.iterrows():
        results = scrape_chronotrack_results(event.chronotrack_id)
        results['distance'] = [extract_distance_from_race_name(n) for n in results.race_name]
        results['discipline'] = [extract_discipline_from_race_name(n) for n in results.race_name]

        # Overrides must be applied before the null-discipline filter below,
        # otherwise the rows they are meant to rescue are already gone.
        results = _apply_race_overrides(results, race_overrides)
        # Copy so the column assignments below act on an independent frame.
        results = results[~pd.isnull(results.discipline)].copy()

        results['event_name'] = event.event_name
        results['date'] = event.date
        results['duration'] = [parse_time_millis(t) for t in results.time]
        # Placements are computed after the overrides so that overridden rows
        # are ranked under their final discipline.
        results = attach_placements(results)
        total_results.append(results)

    # pd.concat raises ValueError on an empty list (e.g. the default events).
    if not total_results:
        return pd.DataFrame(columns=output_columns)

    return pd.concat(total_results, ignore_index=True)[output_columns]


def _apply_race_overrides(results, race_overrides):
    """Fill null distance/discipline values from the per-race override table."""
    merged = results.merge(race_overrides, how='left', on=['race_name'],
                           suffixes=('', '_override'))
    for column in ('distance', 'discipline'):
        merged[column] = np.where(pd.isnull(merged[column]),
                                  merged[column + '_override'],
                                  merged[column])
    return merged
def get_myraceresults_results(events=pd.DataFrame({
                                  "event_name": [],
                                  "url": [],
                                  "date": []})):
    """Collect results for every race of every event and return one table.

    Races are listed with ``mrrs.get_mrr_races(event.url)`` and each race's
    rows are fetched with ``mrrs.get_mrr_results``. Rows whose discipline
    cannot be derived from the race name, or whose time cannot be parsed,
    are dropped before placements are attached per race.

    Args:
        events: DataFrame with ``event_name``, ``url`` and ``date`` columns,
            one row per event.

    Returns:
        DataFrame with the columns overall_place, gender_place, name,
        location, time, gender, distance, discipline, event_name, date.
        It is empty (columns only) when no race was processed.

    Note:
        The DataFrame default is built once at definition time; this
        function only reads it.
    """
    output_columns = ['overall_place', 'gender_place', 'name', 'location', 'time', 'gender',
                      'distance', 'discipline', 'event_name', 'date']

    total_results = []
    for _, event in events.iterrows():
        races = mrrs.get_mrr_races(event.url)
        for contest_number, list_name, race_name, event_id, event_key in races:
            # The second return value (column names) is not needed here.
            results, _ = mrrs.get_mrr_results(event_id, event_key, list_name, contest_number,
                                              race_name=race_name)
            results_df = pd.DataFrame(
                results, columns=['name', 'location', 'age_group', 'time', 'race_name'])

            # Age groups starting with 'M' are male, everything else female.
            # NOTE(review): rows with a missing age_group are not handled
            # explicitly - confirm how they should be classified.
            results_df['gender'] = np.where(
                results_df.age_group.str.startswith('M'), 'male', 'female')
            results_df['discipline'] = [
                extract_discipline_from_race_name(n) for n in results_df.race_name]

            # ignore undetectable disciplines and non-ski disciplines
            # (copy so later column assignments act on an independent frame)
            results_df = results_df[~pd.isnull(results_df.discipline)].copy()

            results_df['distance'] = [
                extract_distance_from_race_name(n) for n in results_df.race_name]
            results_df['duration'] = [parse_time_millis(t) for t in results_df.time]

            # ignore DNFs and borked time formats
            results_df = results_df[~pd.isnull(results_df.duration)].copy()

            results_df['date'] = event.date
            results_df['event_name'] = event.event_name
            results_df = attach_placements(results_df)
            total_results.append(results_df)

    # pd.concat raises ValueError on an empty list (e.g. the default events).
    if not total_results:
        return pd.DataFrame(columns=output_columns)

    return pd.concat(total_results)[output_columns]
def get_myraceresults_results(events=pd.DataFrame({
                                  "event_name": ['Vasaloppet USA', 'Noquemanon Ski Marathon',
                                                 'Pepsi Challenge', 'Great Bear Chase'],
                                  "url": ['https://my3.raceresult.com/117060/',
                                          'https://my5.raceresult.com/115565/',
                                          'https://my2.raceresult.com/118903/',
                                          'https://my1.raceresult.com/118905/'],
                                  "date": ['2019-02-09', '2019-01-26',
                                           '2019-03-02', '2019-03-09']})):
    """Collect results for every race of every event and return one table.

    Races are listed with ``mrrs.get_mrr_races(event.url)`` and each race's
    rows are fetched with ``mrrs.get_mrr_results``. Rows whose discipline
    cannot be derived from the race name, or whose time cannot be parsed,
    are dropped before placements are attached per race.

    Args:
        events: DataFrame with ``event_name``, ``url`` and ``date`` columns,
            one row per event. Defaults to four events dated 2019.

    Returns:
        DataFrame with the columns overall_place, gender_place, name,
        location, time, gender, distance, discipline, event_name, date.
        It is empty (columns only) when no race was processed.

    Note:
        The DataFrame default is built once at definition time; this
        function only reads it.
    """
    output_columns = ['overall_place', 'gender_place', 'name', 'location', 'time', 'gender',
                      'distance', 'discipline', 'event_name', 'date']

    total_results = []
    for _, event in events.iterrows():
        races = mrrs.get_mrr_races(event.url)
        for contest_number, list_name, race_name, event_id, event_key in races:
            # The second return value (column names) is not needed here.
            results, _ = mrrs.get_mrr_results(event_id, event_key, list_name, contest_number,
                                              race_name=race_name)
            results_df = pd.DataFrame(
                results, columns=['name', 'location', 'age_group', 'time', 'race_name'])

            # Age groups starting with 'M' are male, everything else female.
            # NOTE(review): rows with a missing age_group are not handled
            # explicitly - confirm how they should be classified.
            results_df['gender'] = np.where(
                results_df.age_group.str.startswith('M'), 'male', 'female')
            results_df['discipline'] = [
                extract_discipline_from_race_name(n) for n in results_df.race_name]

            # ignore undetectable disciplines and non-ski disciplines
            # (copy so later column assignments act on an independent frame)
            results_df = results_df[~pd.isnull(results_df.discipline)].copy()

            results_df['distance'] = [
                extract_distance_from_race_name(n) for n in results_df.race_name]
            results_df['duration'] = [parse_time_millis(t) for t in results_df.time]

            # ignore DNFs and borked time formats
            results_df = results_df[~pd.isnull(results_df.duration)].copy()

            results_df['date'] = event.date
            results_df['event_name'] = event.event_name
            results_df = attach_placements(results_df)
            total_results.append(results_df)

    # pd.concat raises ValueError on an empty list (no events / no races).
    if not total_results:
        return pd.DataFrame(columns=output_columns)

    return pd.concat(total_results)[output_columns]
def get_gopher_state_results(event_names_to_distance=pd.DataFrame({
                                 'event_name': ['Turtle River Pursuit', 'Big Island and Back'],
                                 'distance': [11, 7.5]}),
                             *, year='2019'):
    """Collect results via ``gss`` for one year and return a normalised table.

    Events are taken from ``gss.get_events()`` and kept when ``year`` occurs
    in the second field of the event entry. Their races and results are
    loaded through ``gss``, bike and team races are removed, rows without a
    detectable discipline are dropped, and placements are attached.

    Args:
        event_names_to_distance: DataFrame with ``event_name`` and
            ``distance`` columns. The final inner merge keeps only results
            whose event name appears in this table.
        year: Text that must occur in the event entry's second field;
            defaults to ``'2019'``. Non-string values are converted with
            ``str``.

    Returns:
        DataFrame with the columns overall_place, gender_place, name,
        location, time, gender, distance, discipline, event_name, date.

    Note:
        The DataFrame default is built once at definition time; this
        function only reads it.
    """
    year = str(year)
    # assumes e[1] is the event's label/title text - TODO confirm against gss
    events = [e for e in gss.get_events() if year in e[1]]
    races = gss.get_races_from_events(events)
    results = gss.get_results_from_races(races)

    # remove bike and team results
    # (copy so later column assignments act on an independent frame)
    is_bike = results.race_name.str.contains('Bike')
    is_team = results.race_name.str.contains('Tm')
    results = results[~is_bike & ~is_team].copy()

    # The date is read from the raw event_name, so this must happen before
    # event_name is replaced by its cleaned-up form further down.
    results['date'] = [gss.extract_date_from_race_name(n) for n in results.event_name]
    results['discipline'] = [extract_discipline_from_race_name(n) for n in results.race_name]
    results = results[~pd.isnull(results.discipline)].copy()

    results['event_name'] = [gss.extract_event(n) for n in results.event_name]
    results['name'] = results.first_name + " " + results.last_name
    # 'M' maps to male; every other value maps to female.
    results['gender'] = np.where(results.gender == 'M', 'male', 'female')
    results['duration'] = [parse_time_millis(t) for t in results.time]
    # This source provides no location; the column exists for a uniform schema.
    results['location'] = None

    placed = attach_placements(results).merge(event_names_to_distance, how="inner",
                                              on=['event_name'])
    return placed[['overall_place', 'gender_place', 'name', 'location', 'time', 'gender',
                   'distance', 'discipline', 'event_name', 'date']]
]] # I spot checked these are - they are cases of malformed names (e.g. last or first name only) all_results = all_results[~pd.isnull(all_results.name)] # a handful of yet misaligned columns - gross but too lazy to fix at source all_results['duration'] = np.where(all_results.duration.str.len() < 7, all_results.age, all_results.duration) # a few additional have borked ages all_results['age'] = np.where(all_results.age.str.len() <= 3, all_results.age, None) all_results.to_csv(STORAGE_DIRECTORY + 'pdf_birkie.csv') ########################## # with results (mostly) parsed out, we need to derive gender agnostic overall place ########################## all_results['time'] = all_results.duration all_results['duration'] = [parse_time_millis(t) for t in all_results.duration] def attach_placements(results): time_ordered_results = results.sort_values('duration') time_ordered_results['gender_place'] = time_ordered_results\ .groupby(['date', 'discipline', 'gender'])\ .cumcount() + 1 time_ordered_results['overall_place'] = time_ordered_results\ .groupby(['date', 'discipline'])\ .cumcount() + 1 return time_ordered_results
############################## # start control flow ############################## results = pd.concat([ get_gopher_state_results(), get_itiming_results(), get_chronotrack_results(), get_mtec_vasa_results(), get_mtec_mob_results(), get_mrr_results(), get_orr_results(), ]) results = results[~pd.isnull(results.time)] results['time_parsed'] = [parse_time_millis(t) for t in results.time] # note, this knocks off a handful of labelled DNFs & DQs results = results[~pd.isnull(results.time_parsed)] results['event_name_enumeration'] = [ enumerate_event_name(name) for name in results.event_name ] results['date'] = pd.to_datetime(results.date) results['distance'] = pd.to_numeric(results.distance / 1000) results = attach_placements(results) con = None try: con = get_connection() cursor = con.cursor()