def to_code(name):
    """Resolve *name* to a country code.

    Tries the main lookup (``to_code_3``) first, then falls back to the
    module-level ``extra_codes`` mapping.  Returns ``None`` when neither
    source knows the name.
    """
    code = to_code_3(name)
    if code is None:
        # dict.get(name) returns None on a miss, replacing the
        # try/except KeyError dance with the idiomatic lookup.
        code = extra_codes.get(name)
    return code
def _create_db(self):
    """Scrape The Gymternet's 2017-2020 WAG score spreadsheets and meet
    calendars and load Event, Country, Meet, Gymnast and Score rows into
    the Django database.

    Intended to be idempotent: every object is looked up before it is
    created.  NOTE(review): network-dependent (Google Sheets and
    thegymter.net) -- confirm the URLs are still live before running.
    """
    # ----------------------------------------------------------------
    # Create all event instances if they don't exist already
    # ----------------------------------------------------------------
    for event in ["VT", "UB", "BB", "FX"]:
        for junior in [True, False]:
            try:
                event_test = Event.objects.get(name=event, junior=junior)
            except Event.DoesNotExist:
                event_instance = Event(name=event, junior=junior)
                event_instance.save()

    # ================================================================
    # 2017 scores
    # ================================================================

    # Read in The Gymternet's score spreadsheet
    # Totals
    totals = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/1fg3pFV1KGUCfUH7lHq_8UHdS0uH4O0yHHk4qtCV_QH4/export?gid=0&format=csv"
    )
    totals.head()
    totals.columns = [
        "gymnast", "country", "meet_name", "vt1", "ub", "bb", "fx", "aa",
        "vt_avg"
    ]
    totals.vt_avg = pd.to_numeric(totals.vt_avg, errors='coerce')
    # D scores
    dscore = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/1fg3pFV1KGUCfUH7lHq_8UHdS0uH4O0yHHk4qtCV_QH4/export?gid=1144828878&format=csv"
    )
    dscore.head()
    dscore.columns = [
        "gymnast", "country", "meet_name", "vt1_d", "ub_d", "bb_d", "fx_d",
        "vt_total_d"
    ]

    # Clean the scores data
    # Get Vault 2 scores from Vault 1 and Vault Average
    totals["vt2"] = totals.vt_avg * 2 - totals.vt1
    # Get Vault 2 d score from Vault 1 d score and total
    dscore["vt2_d"] = dscore.vt_total_d - dscore.vt1_d
    # Change some meet names for merging
    dscore.meet_name.replace("Brestyan's Qualifier",
                             "Brestyan's National Qualifier",
                             inplace=True)
    # Merge totals and d scores
    scores = pd.merge(totals,
                      dscore,
                      how="outer",
                      on=["gymnast", "meet_name"],
                      indicator=True)
    scores._merge.value_counts()
    # Check cases that didn't merge - where we have d scores but no totals
    print(scores.loc[scores._merge == "right_only",
                     ["gymnast", "meet_name", "_merge"]])
    print(scores.loc[scores._merge == "right_only", ].meet_name.value_counts())
    # Delete these cases (also drops the _merge indicator column)
    scores = scores.loc[scores._merge != "right_only", scores.columns[:-1]]
    # Drop country names from the D score sheet
    scores["country_x"] = np.where(scores.country_x == "", scores.country_y,
                                   scores.country_x)
    # NOTE(review): drop() result is discarded (not inplace), so country_y
    # actually survives in `scores` -- presumably harmless but verify.
    scores.drop(["country_y"], axis=1)
    scores = scores.rename(columns={'country_x': 'country'})
    # Drop vault averages and totals
    # NOTE(review): result discarded here too (see above).
    scores.drop(["vt_total_d", "vt_avg"], axis=1)
    # Clean historic data (gymnast-name spelling fixes)
    scores["gymnast"] = scores.gymnast.str.replace("De Jesus dos Santos",
                                                   "de Jesus dos Santos")
    scores["gymnast"] = scores.gymnast.str.replace("De Jesus Dos Santos",
                                                   "de Jesus dos Santos")
    scores["gymnast"] = scores.gymnast.str.replace("Laurie Denommee",
                                                   "Laurie Dénommée")

    # Clean the meet type: extract QF/TF/AA/EF day markers embedded in
    # the meet name into their own column
    scores["meet_day"] = ""
    day_types = ["QF", "TF", "AA", "EF"]
    for day in day_types:
        scores["meet_day"] = np.where(scores.meet_name.str.contains(day), day,
                                      scores.meet_day)
    scores.meet_day.value_counts()
    # Clean meet names to remove the type
    for day in day_types:
        scores["meet_name"] = scores.meet_name.str.replace(day, "")
    # Expand shorthand meet names to their full, canonical forms
    scores["meet_name"] = scores.meet_name.str.replace(
        "Champs", "Championships")
    scores["meet_name"] = scores.meet_name.str.replace(
        "FIT", "Flanders International Team")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Euros", "European Championships")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Euro Youth Olympic Festival", "European Youth Olympic Festival")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Gymnix", "International Gymnix")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Universiade", "Summer Universiade")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Jesolo", "City of Jesolo Trophy")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Top Gym", "Top Gym Tournament")
    scores["meet_name"] = scores.meet_name.str.replace(
        "DTB Pokal", "DTB Pokal Team Challenge")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Gymnova", "Gymnova Cup")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Unni & Haralds", "Unni & Haralds Trophy")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Hungarian Masters", "Hungarian Master Championships")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Austrian Open", "Austrian Team Open")
    scores["meet_name"] = scores.meet_name.str.replace(
        "2nd Norwegian FIG", "2nd Norwegian FIG Meet")
    # Note: replacement deliberately switches to a typographic apostrophe
    scores["meet_name"] = scores.meet_name.str.replace(
        "Brestyan's National Qualifier", "Brestyan’s National Qualifier")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Mediterranean", "Mediterranean Junior Championships")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Stella Zakharova", "Stella Zakharova Cup")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Pan Am Championships", "Pan American Championships")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Reykjavik International", "Reykjavik International Games")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Dutch Invitational", "Dutch Women’s Invitational")
    scores["meet_name"] = scores.meet_name.str.replace(
        "Junior Japan", "Junior Japan International")
    for city in ["Melbourne", "Baku", "Cottbus", "Doha"]:
        scores["meet_name"] = scores.meet_name.str.replace(
            city, city + " World Cup")
    for city in ["Paris", "Osijek", "Koper", "Szombathely", "Varna"]:
        scores["meet_name"] = scores.meet_name.str.replace(
            city, city + " Challenge Cup")
    for meet in ["South American Junior", "France Top 12"]:
        scores["meet_name"] = scores.meet_name.str.replace(
            meet, meet + " Championships")

    # Mark juniors: a "*" suffix on the gymnast name marks a junior
    scores["junior2017"] = False
    scores["junior2017"] = np.where(scores.gymnast.str.contains("\*"), True,
                                    scores.junior2017)
    scores["gymnast"] = scores.gymnast.str.replace("\*", "")

    # Get meet start and end dates
    # Download the HTML from TheGymternet's list of meets
    response = get("https://thegymter.net/2017-gymnastics-calendar/")
    soup = BeautifulSoup(response.text, 'html.parser')
    meets = soup.find("table").findAll("tr")
    # Set up arrays to store the meet data
    start_date = []
    end_date = []
    meet_name = []
    meet_loc = []
    # Regex keeping only alphabetic characters -- used to separate
    # the month from the day in a date cell
    regex = re.compile('[^a-zA-Z]')
    # Loop through the meets (skipping the first row which has headings)
    for meet in meets[1:]:
        # Clean start and end date
        date = meet.findAll("td")[0].text
        date = date.split("-")
        start_date.append(date[0] + " 2017")
        # Cases where the meet is only one day
        if len(date) == 1:
            end_date.append(date[0] + " 2017")
        # Many days, but both dates in the same month
        elif regex.sub('', date[1]) == "":
            month = regex.sub('', date[0])
            end_date.append(month + " " + date[1] + " 2017")
        # Many days spanning different months
        else:
            end_date.append(date[1] + " 2017")
        # Pull meet name
        meet_name.append(meet.findAll("td")[1].find("a").contents[0])
        # Pull meet location (text after the first comma, if any)
        meet_loc_try = meet.findAll("td")[1].text.split(",", maxsplit=1)
        if len(meet_loc_try) > 1:
            meet_loc.append(meet_loc_try[1])
        else:
            meet_loc.append("")
    # Combine results in data frame
    meets = pd.DataFrame({
        'meet_name': meet_name,
        'start_date': start_date,
        'end_date': end_date,
        'meet_loc': meet_loc
    })
    # Normalise month names to the "%b" abbreviations used for parsing
    meets.start_date = meets.start_date.str.replace("June", "Jun")
    meets.start_date = meets.start_date.str.replace("July", "Jul")
    meets.end_date = meets.end_date.str.replace("July", "Jul")
    meets.end_date = meets.end_date.str.replace("June", "Jun")
    # Merge in the meets
    scores["meet_name"] = scores.meet_name.str.strip()
    meets["meet_name"] = meets.meet_name.str.strip()
    scores = pd.merge(scores,
                      meets,
                      how="outer",
                      on=["meet_name"],
                      indicator=True)
    print(scores._merge.value_counts())
    # Check cases that didn't merge - not that many. Fine for now.
    scores = scores.loc[scores._merge != "right_only", scores.columns[:-1]]
    # Add the year to the meet name (meets recur every year)
    scores['meet_name'] = scores['meet_name'].astype(str) + " (2017)"

    # Load countries in
    # Clean some countries with typos
    scores.country.replace("Chia", "China", inplace=True)
    countries_df = scores.country.drop_duplicates()
    for country in countries_df:
        try:
            country_test = Country.objects.get(name=country)
        except Country.DoesNotExist:
            if country != "Chinese Taipei" and country != "Taiwan":
                country_instance = Country(
                    name=country, iso3c=countrynames.to_code_3(country))
                country_instance.save()
            else:
                # Chinese Taipei / Taiwan are stored once, as "Taiwan"
                try:
                    country_test = Country.objects.get(name="Taiwan")
                except Country.DoesNotExist:
                    country_instance = Country(name="Taiwan", iso3c="TWN")
                    country_instance.save()

    # Load meets in
    meets_df = scores[["meet_name", "start_date", "end_date",
                       "meet_loc"]].drop_duplicates()
    meets_df["start_date_fmt"] = pd.to_datetime(meets_df.start_date,
                                                format="%b %d %Y")
    meets_df["end_date_fmt"] = pd.to_datetime(meets_df.end_date,
                                              format="%b %d %Y")
    for meet in meets_df.itertuples():
        try:
            meet_test = Meet.objects.get(name=meet.meet_name)
        except Meet.DoesNotExist:
            if pd.isnull(meet.start_date_fmt) == False:
                meet_instance = Meet(name=meet.meet_name,
                                     start_date=meet.start_date_fmt,
                                     end_date=meet.end_date_fmt)
            else:
                # Unknown dates: create the meet with name only
                meet_instance = Meet(name=meet.meet_name)
            meet_instance.save()
            print(meet_instance)

    # Load gymnasts in
    gymnasts_df = scores[["gymnast", "country"]].drop_duplicates()
    gymnasts_df.country.replace("Chinese Taipei", "Taiwan", inplace=True)
    for person in gymnasts_df.itertuples():
        try:
            gymnast_test = Gymnast.objects.get(name=person.gymnast)
        except Gymnast.DoesNotExist:
            gymnast_instance = Gymnast(
                name=person.gymnast,
                country=Country.objects.get(name=person.country))
            gymnast_instance.save()

    # Load scores in (one Score per event; skip rows already loaded)
    for row in scores.itertuples():
        score_test = Score.objects.filter(
            gymnast=Gymnast.objects.get(name=row.gymnast),
            meet=Meet.objects.get(name=row.meet_name),
            meet_day=row.meet_day)
        if score_test.count() == 0:
            # Vault 1
            if pd.isnull(row.vt1) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="VT", junior=row.junior2017),
                    score=row.vt1,
                    d_score=row.vt1_d,
                    score_num=1)
                score_instance.save()
            # Bars
            if pd.isnull(row.ub) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="UB", junior=row.junior2017),
                    score=row.ub,
                    d_score=row.ub_d,
                    score_num=1)
                score_instance.save()
            # Balance beam
            if pd.isnull(row.bb) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="BB", junior=row.junior2017),
                    score=row.bb,
                    d_score=row.bb_d,
                    score_num=1)
                score_instance.save()
            # Floor
            if pd.isnull(row.fx) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="FX", junior=row.junior2017),
                    score=row.fx,
                    d_score=row.fx_d,
                    score_num=1)
                score_instance.save()
            # Vault 2
            if pd.isnull(row.vt2) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="VT", junior=row.junior2017),
                    score=row.vt2,
                    d_score=row.vt2_d,
                    score_num=2)
                score_instance.save()

    # Add dates for some meets without dates
    # ("Massila" in the third name reproduces the source spreadsheet's typo)
    for meet_name in [
            "Elite Gym Massilia Masters (2017)",
            "Elite Gym Massilia Open (2017)",
            "Elite Gym Massila Espoir (2017)"
    ]:
        meet = Meet.objects.get(name=meet_name)
        meet.start_date = datetime.date(2017, 11, 17)
        meet.end_date = datetime.date(2017, 11, 19)
        meet.save()
    meet = Meet.objects.get(name="Czech European Championships Test (2017)")
    meet.start_date = datetime.date(2017, 3, 18)
    meet.save()
    meet = Meet.objects.get(name="Brazilian Selection (2017)")
    meet.start_date = datetime.date(2017, 7, 22)
    meet.end_date = datetime.date(2017, 7, 24)
    meet.save()
    meet = Meet.objects.get(name="Stuttgart World Cup (2017)")
    meet.start_date = datetime.date(2017, 3, 18)
    meet.end_date = datetime.date(2017, 3, 19)
    meet.save()
    meet = Meet.objects.get(name="France Top 12 Championships (2017)")
    meet.start_date = datetime.date(2017, 3, 11)
    meet.end_date = datetime.date(2017, 3, 12)
    meet.save()
    meet = Meet.objects.get(name="German Junior Friendly (2017)")
    meet.start_date = datetime.date(2017, 7, 8)
    meet.save()

    # ================================================================
    # 2018 scores (same pipeline as 2017, different sheet/calendar)
    # ================================================================

    # Read in The Gymternet's score spreadsheet
    # Totals
    totals = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/1HI0tOSgjIS8rFjbTwCTlhG1LxP6sttzDMN3-4u0B0u4/export?gid=0&format=csv"
    )
    totals.head()
    totals.columns = [
        "gymnast", "country", "meet_name", "vt1", "ub", "bb", "fx", "aa",
        "vt_avg"
    ]
    # D scores
    dscore = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/1HI0tOSgjIS8rFjbTwCTlhG1LxP6sttzDMN3-4u0B0u4/export?gid=1212101599&format=csv"
    )
    dscore.head()
    #dscore.drop(dscore.columns[len(dscore.columns)-1], axis=1, inplace=True)
    dscore.columns = [
        "gymnast", "country", "meet_name", "vt1_d", "ub_d", "bb_d", "fx_d",
        "vt_total_d"
    ]

    # Clean the scores data
    # Get Vault 2 scores from Vault 1 and Vault Average
    totals["vt2"] = totals.vt_avg * 2 - totals.vt1
    # Get Vault 2 d score from Vault 1 d score and total
    dscore["vt2_d"] = dscore.vt_total_d - dscore.vt1_d
    # Merge totals and d scores
    scores = pd.merge(totals,
                      dscore,
                      how="outer",
                      on=["gymnast", "meet_name"],
                      indicator=True)
    scores._merge.value_counts()
    # Check cases that didn't merge - where we have d scores but no totals
    scores.loc[scores._merge == "right_only",
               ["gymnast", "meet_name", "_merge"]]
    scores.loc[scores._merge == "right_only", ].meet_name.value_counts()
    # Delete these cases
    scores = scores.loc[scores._merge != "right_only", scores.columns[:-1]]
    # Drop country names from the D score sheet
    scores["country_x"] = np.where(scores.country_x == "", scores.country_y,
                                   scores.country_x)
    # NOTE(review): drop() results discarded here too (see 2017 section)
    scores.drop(["country_y"], axis=1)
    scores = scores.rename(columns={'country_x': 'country'})
    # Drop vault averages and totals
    scores.drop(["vt_total_d", "vt_avg"], axis=1)
    # Clean some typos
    # NOTE(review): bare except silently ignores *any* failure; the intent
    # is only to tolerate the spreadsheet typo having been fixed upstream.
    try:
        scores["ub_d"] = scores.ub_d.str.replace(".4.3", "4.3")
    except:
        print("I guess the score typo was fixed...")
    scores["gymnast"] = scores.gymnast.str.replace("De Jesus dos Santos",
                                                   "de Jesus dos Santos")
    scores["gymnast"] = scores.gymnast.str.replace("De Jesus Dos Santos",
                                                   "de Jesus dos Santos")
    scores["gymnast"] = scores.gymnast.str.replace("Laurie Denommee",
                                                   "Laurie Dénommée")

    # Clean the meet type
    scores["meet_day"] = ""
    day_types = ["QF", "TF", "AA", "EF"]
    for day in day_types:
        scores["meet_day"] = np.where(scores.meet_name.str.contains(day), day,
                                      scores.meet_day)
    scores.meet_day.value_counts()
    # Clean meet names to remove the type
    for day in day_types:
        scores["meet_name"] = scores.meet_name.str.replace(day, "")

    # Mark juniors
    scores["junior2018"] = False
    scores["junior2018"] = np.where(scores.gymnast.str.contains("\*"), True,
                                    scores.junior2018)
    scores["gymnast"] = scores.gymnast.str.replace("\*", "")

    # Get meet start and end dates
    # Download the HTML from TheGymternet's list of meets
    response = get("https://thegymter.net/2018-gymnastics-calendar/")
    soup = BeautifulSoup(response.text, 'html.parser')
    meets = soup.find("table").findAll("tr")
    # Set up arrays to store the meet data
    start_date = []
    end_date = []
    meet_name = []
    meet_loc = []
    regex = re.compile('[^a-zA-Z]')
    # NOTE(review): unlike 2017 this loop does NOT skip the first (header)
    # row -- confirm the 2018 table layout makes that safe.
    for meet in meets:
        # Clean start and end date
        date = meet.findAll("td")[0].text
        date = date.split("-")
        start_date.append(date[0] + " 2018")
        if len(date) == 1:
            end_date.append(date[0] + " 2018")
        elif regex.sub('', date[1]) == "":
            month = regex.sub('', date[0])
            end_date.append(month + " " + date[1] + " 2018")
        else:
            end_date.append(date[1] + " 2018")
        # Pull meet name
        meet_name.append(meet.findAll("td")[1].text)
        # Pull meet location (own column in the 2018 table)
        meet_loc.append(meet.findAll("td")[2].text)
    # Combine results in data frame
    meets = pd.DataFrame({
        'meet_name': meet_name,
        'start_date': start_date,
        'end_date': end_date,
        'meet_loc': meet_loc
    })
    # Drop MAG meets
    meets = meets.loc[~meets.meet_name.str.contains("MAG"), :]
    # Merge in the meets
    scores["meet_name"] = scores.meet_name.str.strip()
    meets["meet_name"] = meets.meet_name.str.strip()
    scores = pd.merge(scores,
                      meets,
                      how="outer",
                      on=["meet_name"],
                      indicator=True)
    scores._merge.value_counts()
    # Check cases that didn't merge - not that many. Fine for now.
    scores = scores.loc[scores._merge != "right_only", scores.columns[:-1]]
    # Add the year to the meet name (because some meets occur every year)
    scores['meet_name'] = scores['meet_name'].astype(str) + " (2018)"

    # Load countries in
    countries_df = scores.country.drop_duplicates()
    for country in countries_df:
        try:
            country_test = Country.objects.get(name=country)
        except Country.DoesNotExist:
            if country != "Chinese Taipei" and country != "Taiwan":
                country_instance = Country(
                    name=country, iso3c=countrynames.to_code_3(country))
                country_instance.save()
            else:
                try:
                    country_test = Country.objects.get(name="Taiwan")
                except Country.DoesNotExist:
                    country_instance = Country(name="Taiwan", iso3c="TWN")
                    country_instance.save()

    # Load meets in
    meets_df = scores[["meet_name", "start_date", "end_date",
                       "meet_loc"]].drop_duplicates()
    meets_df["start_date_fmt"] = pd.to_datetime(meets_df.start_date,
                                                format="%b %d %Y")
    meets_df["end_date_fmt"] = pd.to_datetime(meets_df.end_date,
                                              format="%b %d %Y")
    for meet in meets_df.itertuples():
        try:
            meet_test = Meet.objects.get(name=meet.meet_name)
        except Meet.DoesNotExist:
            if pd.isnull(meet.start_date_fmt) == False:
                meet_instance = Meet(name=meet.meet_name,
                                     start_date=meet.start_date_fmt,
                                     end_date=meet.end_date_fmt)
            else:
                meet_instance = Meet(name=meet.meet_name)
            meet_instance.save()
            print(meet_instance)

    # Load gymnasts in
    gymnasts_df = scores[["gymnast", "country"]].drop_duplicates()
    gymnasts_df.country.replace("Chinese Taipei", "Taiwan", inplace=True)
    for person in gymnasts_df.itertuples():
        try:
            gymnast_test = Gymnast.objects.get(name=person.gymnast)
        except Gymnast.DoesNotExist:
            gymnast_instance = Gymnast(
                name=person.gymnast,
                country=Country.objects.get(name=person.country))
            gymnast_instance.save()

    # Load scores in
    for row in scores.itertuples():
        score_test = Score.objects.filter(
            gymnast=Gymnast.objects.get(name=row.gymnast),
            meet=Meet.objects.get(name=row.meet_name),
            meet_day=row.meet_day)
        if score_test.count() == 0:
            # Vault 1
            if pd.isnull(row.vt1) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="VT", junior=row.junior2018),
                    score=row.vt1,
                    d_score=row.vt1_d,
                    score_num=1)
                score_instance.save()
            # Bars
            if pd.isnull(row.ub) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="UB", junior=row.junior2018),
                    score=row.ub,
                    d_score=row.ub_d,
                    score_num=1)
                score_instance.save()
            # Balance beam
            if pd.isnull(row.bb) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="BB", junior=row.junior2018),
                    score=row.bb,
                    d_score=row.bb_d,
                    score_num=1)
                score_instance.save()
            # Floor
            if pd.isnull(row.fx) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="FX", junior=row.junior2018),
                    score=row.fx,
                    d_score=row.fx_d,
                    score_num=1)
                score_instance.save()
            # Vault 2
            if pd.isnull(row.vt2) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="VT", junior=row.junior2018),
                    score=row.vt2,
                    d_score=row.vt2_d,
                    score_num=2)
                score_instance.save()

    # Add dates for some meets without dates
    meet = Meet.objects.get(name="U.S. Verification (April) (2018)")
    meet.start_date = datetime.date(2018, 4, 8)
    meet.end_date = datetime.date(2018, 4, 8)
    meet.save()
    meet = Meet.objects.get(name="Top 12 Final (2018)")
    meet.start_date = datetime.date(2018, 3, 17)
    meet.end_date = datetime.date(2018, 3, 17)
    meet.save()
    meet = Meet.objects.get(name="Brestyan's National Qualifier (2018)")
    meet.start_date = datetime.date(2018, 6, 23)
    meet.end_date = datetime.date(2018, 6, 24)
    meet.save()
    meet = Meet.objects.get(name="Desert Lights Qualifier (2018)")
    meet.start_date = datetime.date(2018, 1, 27)
    meet.end_date = datetime.date(2018, 1, 28)
    meet.save()
    meet = Meet.objects.get(name="Orlando Qualifier (2018)")
    meet.start_date = datetime.date(2018, 2, 9)
    meet.end_date = datetime.date(2018, 2, 11)
    meet.save()
    meet = Meet.objects.get(name="President's Cup (2018)")
    meet.start_date = datetime.date(2018, 2, 12)
    meet.end_date = datetime.date(2018, 2, 16)
    meet.save()
    meet = Meet.objects.get(name="Klaverblad Championships (2018)")
    meet.start_date = datetime.date(2018, 6, 9)
    meet.end_date = datetime.date(2018, 6, 10)
    meet.save()
    meet = Meet.objects.get(name="Buckeye Qualifier (2018)")
    meet.start_date = datetime.date(2018, 2, 1)
    meet.end_date = datetime.date(2018, 2, 2)
    meet.save()
    meet = Meet.objects.get(name="Swiss Duel (2018)")
    meet.start_date = datetime.date(2018, 9, 23)
    meet.end_date = datetime.date(2018, 9, 23)
    meet.save()
    meet = Meet.objects.get(name="German Worlds Trial (2018)")
    meet.start_date = datetime.date(2018, 9, 15)
    meet.end_date = datetime.date(2018, 9, 15)
    meet.save()

    # ================================================================
    # 2019 scores -- the 2019 sheet already contains meet day, dates,
    # location and junior status, so no scraping/cleaning is needed
    # ================================================================

    # Read in The Gymternet's score spreadsheet
    scores = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/1213cgQJaKzzpwoO46m5ihT7F6poyhAzimpsu7VEgTWA/export?gid=1358682386&format=csv"
    )
    scores.head()
    scores.columns = [
        "gymnast", "country", "meet_name", "meet_day", "vt1", "vt2", "ub",
        "bb", "fx", "vt1_d", "vt2_d", "ub_d", "bb_d", "fx_d", "meet_loc",
        "start_date", "end_date", "junior2019"
    ]

    # Mark juniors (coerce the sheet column to a strict boolean)
    scores["junior2019"] = (scores["junior2019"] == True)

    # Add the year to the meet name (because some meets occur every year)
    scores['meet_name'] = scores['meet_name'].astype(str) + " (2019)"

    # Load countries in
    countries_df = scores.country.drop_duplicates()
    for country in countries_df:
        try:
            country_test = Country.objects.get(name=country)
        except Country.DoesNotExist:
            if country != "Chinese Taipei" and country != "Taiwan":
                country_instance = Country(
                    name=country, iso3c=countrynames.to_code_3(country))
                country_instance.save()
            else:
                try:
                    country_test = Country.objects.get(name="Taiwan")
                except Country.DoesNotExist:
                    country_instance = Country(name="Taiwan", iso3c="TWN")
                    country_instance.save()

    # Load meets in
    meets_df = scores[["meet_name", "start_date", "end_date",
                       "meet_loc"]].drop_duplicates()
    meets_df["start_date_fmt"] = pd.to_datetime(meets_df.start_date,
                                                format="%b %d %Y")
    meets_df["end_date_fmt"] = pd.to_datetime(meets_df.end_date,
                                              format="%b %d %Y")
    for meet in meets_df.itertuples():
        try:
            meet_test = Meet.objects.get(name=meet.meet_name)
        except Meet.DoesNotExist:
            if pd.isnull(meet.start_date_fmt) == False:
                meet_instance = Meet(name=meet.meet_name,
                                     start_date=meet.start_date_fmt,
                                     end_date=meet.end_date_fmt)
            else:
                meet_instance = Meet(name=meet.meet_name)
            meet_instance.save()
            print(meet_instance)

    # Load gymnasts in
    gymnasts_df = scores[["gymnast", "country"]].drop_duplicates()
    gymnasts_df.country.replace("Chinese Taipei", "Taiwan", inplace=True)
    for person in gymnasts_df.itertuples():
        try:
            gymnast_test = Gymnast.objects.get(name=person.gymnast)
        except Gymnast.DoesNotExist:
            gymnast_instance = Gymnast(
                name=person.gymnast,
                country=Country.objects.get(name=person.country))
            gymnast_instance.save()

    # Load scores in
    for row in scores.itertuples():
        score_test = Score.objects.filter(
            gymnast=Gymnast.objects.get(name=row.gymnast),
            meet=Meet.objects.get(name=row.meet_name),
            meet_day=row.meet_day)
        if score_test.count() == 0:
            # Vault 1
            if pd.isnull(row.vt1) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="VT", junior=row.junior2019),
                    score=row.vt1,
                    d_score=row.vt1_d,
                    score_num=1)
                score_instance.save()
            # Bars
            if pd.isnull(row.ub) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="UB", junior=row.junior2019),
                    score=row.ub,
                    d_score=row.ub_d,
                    score_num=1)
                score_instance.save()
            # Balance beam
            if pd.isnull(row.bb) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="BB", junior=row.junior2019),
                    score=row.bb,
                    d_score=row.bb_d,
                    score_num=1)
                score_instance.save()
            # Floor
            if pd.isnull(row.fx) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="FX", junior=row.junior2019),
                    score=row.fx,
                    d_score=row.fx_d,
                    score_num=1)
                score_instance.save()
            # Vault 2
            if pd.isnull(row.vt2) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="VT", junior=row.junior2019),
                    score=row.vt2,
                    d_score=row.vt2_d,
                    score_num=2)
                score_instance.save()

    # ================================================================
    # 2020 scores -- same self-contained sheet layout as 2019
    # ================================================================

    # Read in The Gymternet's score spreadsheet
    #scores = pd.read_csv("https://docs.google.com/spreadsheets/d/1mAZlBhTIPOSZND4Z90ZmHJgobSqU8jv5dGpl54DHWSw/export?gid=0&format=csv") # used to need gid=0, now it causes 400 error
    scores = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/1mAZlBhTIPOSZND4Z90ZmHJgobSqU8jv5dGpl54DHWSw/export?format=csv"
    )
    scores.head()
    scores.columns = [
        "gymnast", "country", "meet_name", "meet_day", "vt1", "vt2", "ub",
        "bb", "fx", "vt1_d", "vt2_d", "ub_d", "bb_d", "fx_d", "meet_loc",
        "start_date", "end_date", "junior2020"
    ]

    # Mark juniors
    scores["junior2020"] = (scores["junior2020"] == True)

    # Add the year to the meet name (because some meets occur every year)
    scores['meet_name'] = scores['meet_name'].astype(str) + " (2020)"

    # Load countries in
    countries_df = scores.country.drop_duplicates()
    for country in countries_df:
        try:
            country_test = Country.objects.get(name=country)
        except Country.DoesNotExist:
            if country != "Chinese Taipei" and country != "Taiwan":
                country_instance = Country(
                    name=country, iso3c=countrynames.to_code_3(country))
                country_instance.save()
            else:
                try:
                    country_test = Country.objects.get(name="Taiwan")
                except Country.DoesNotExist:
                    country_instance = Country(name="Taiwan", iso3c="TWN")
                    country_instance.save()

    # Load meets in
    meets_df = scores[["meet_name", "start_date", "end_date",
                       "meet_loc"]].drop_duplicates()
    meets_df["start_date_fmt"] = pd.to_datetime(meets_df.start_date,
                                                format="%b %d %Y")
    meets_df["end_date_fmt"] = pd.to_datetime(meets_df.end_date,
                                              format="%b %d %Y")
    for meet in meets_df.itertuples():
        try:
            meet_test = Meet.objects.get(name=meet.meet_name)
        except Meet.DoesNotExist:
            if pd.isnull(meet.start_date_fmt) == False:
                meet_instance = Meet(name=meet.meet_name,
                                     start_date=meet.start_date_fmt,
                                     end_date=meet.end_date_fmt)
            else:
                meet_instance = Meet(name=meet.meet_name)
            meet_instance.save()
            print(meet_instance)

    # Load gymnasts in
    gymnasts_df = scores[["gymnast", "country"]].drop_duplicates()
    gymnasts_df.country.replace("Chinese Taipei", "Taiwan", inplace=True)
    for person in gymnasts_df.itertuples():
        try:
            gymnast_test = Gymnast.objects.get(name=person.gymnast)
        except Gymnast.DoesNotExist:
            gymnast_instance = Gymnast(
                name=person.gymnast,
                country=Country.objects.get(name=person.country))
            gymnast_instance.save()

    # Load scores in
    for row in scores.itertuples():
        score_test = Score.objects.filter(
            gymnast=Gymnast.objects.get(name=row.gymnast),
            meet=Meet.objects.get(name=row.meet_name),
            meet_day=row.meet_day)
        if score_test.count() == 0:
            # Vault 1
            if pd.isnull(row.vt1) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="VT", junior=row.junior2020),
                    score=row.vt1,
                    d_score=row.vt1_d,
                    score_num=1)
                score_instance.save()
            # Bars
            if pd.isnull(row.ub) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="UB", junior=row.junior2020),
                    score=row.ub,
                    d_score=row.ub_d,
                    score_num=1)
                score_instance.save()
            # Balance beam
            if pd.isnull(row.bb) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="BB", junior=row.junior2020),
                    score=row.bb,
                    d_score=row.bb_d,
                    score_num=1)
                score_instance.save()
            # Floor
            if pd.isnull(row.fx) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="FX", junior=row.junior2020),
                    score=row.fx,
                    d_score=row.fx_d,
                    score_num=1)
                score_instance.save()
            # Vault 2
            if pd.isnull(row.vt2) == False:
                score_instance = Score(
                    gymnast=Gymnast.objects.get(name=row.gymnast),
                    meet=Meet.objects.get(name=row.meet_name),
                    meet_day=row.meet_day,
                    event=Event.objects.get(name="VT", junior=row.junior2020),
                    score=row.vt2,
                    d_score=row.vt2_d,
                    score_num=2)
                score_instance.save()
# NOTE(review): this chunk operates on `emissions`, `status`, `pd` and
# `rename_eu_countries` defined earlier in the file (not visible here).
emissions = emissions.rename(index=rename_eu_countries)
emissions = emissions.drop("Total")
# Rename Czechia
status.Name = status.Name.replace("Czech Republic", "Czechia")
# EU figures listed in the footnotes of the table for the purpose of
# Article 21, so they are inserted manually.
emissions.at["European Union", "Emissions"] = 4488404
emissions.at["European Union", "Percentage"] = 12.10
emissions.at["European Union", "Year"] = 2013
# Re-key the emissions table on ISO3 codes (fuzzy name matching)
emissions["Name"] = emissions.index
emissions.index = [to_code_3(item, fuzzy=True) for item in emissions.Name]
emissions.index.name = "Code"
# Names of parties not yet having signed.
missing = pd.DataFrame(emissions.Name[~emissions.index.isin(status.index)])
status = status.append(missing, sort=False)
# Join signature status with the first three emissions columns
export = status.join(emissions.iloc[:, :3], how="outer")
export = export[[
    "Name", "Signature", "Ratification", "Kind", "Date-Of-Effect",
    "Emissions", "Percentage", "Year"
]]
export = export.sort_values(by="Name")
print("\nData summary:\n")
# EU28 aggregate is subtracted so member states are not double-counted
print("Emissions sum w/o EU28: {:d} GgCO₂-equiv.".format(
    int(export.Emissions.sum() - export.Emissions.loc['EUU'].sum())))
def test_non_standard_codes():
    """Names outside ISO 3166 resolve through the extra-codes table."""
    cases = [
        ("European Union", "EU", "EUU"),
        ("Kosovo", "XK", "XKX"),
    ]
    for name, alpha2, alpha3 in cases:
        assert to_code(name) == alpha2
        assert to_code_3(name) == alpha3
def test_to_code_3():
    """Known names map to ISO alpha-3 codes; unknown names yield None."""
    assert to_code_3("Germany") == "DEU"
    assert to_code_3("UK") == "GBR"
    # Compare to None with identity, not equality (PEP 8 / E711).
    assert to_code_3("Nothing") is None
from countrynames import to_code_3

emissions_transfers_csv = root / "data/emissions-transfers.csv"

# Emissions transfers (GCB sheet): the sheet is a wide Year x country matrix;
# reshape it into a tidy (Code, Year, Emissions-Transfers) table.
emissions_transfers = pd.read_excel(
    excel_national,
    sheet_name="Emissions Transfers GCB",
    skiprows=8,
    index_col=0,
)
emissions_transfers.index.name = "Year"
# Countries become rows after transposing.
emissions_transfers = emissions_transfers.T
emissions_transfers.index.rename("Name", inplace=True)
# Key rows by alpha-3 code, keeping the raw name where lookup fails.
emissions_transfers["Code"] = [
    to_code_3(name) or name for name in emissions_transfers.index
]
emissions_transfers = (
    emissions_transfers.reset_index()
    .drop("Name", axis=1)
    .melt(id_vars="Code", var_name="Year", value_name="Emissions-Transfers")
    .sort_values(["Code", "Year"])
    .dropna()
)
def test_fuzzy_matching():
    """Alternative or slightly mangled spellings still resolve fuzzily."""
    expected_alpha2 = {
        "Rossiyskaya Federacia": "RU",
        "Falklands Islands": "FK",
        "TGermany": "DE",
    }
    for name, code in expected_alpha2.items():
        assert to_code(name, fuzzy=True) == code
    assert to_code_3("State of Palestine", fuzzy=True) == "PSE"
from countrynames import to_code_3

emissions_transfers_csv = root / "data/emissions-transfers.csv"

# Emissions transfers: reshape the wide Year x country sheet into a tidy
# (Code, Year, Emissions-Transfers) table keyed by alpha-3 country code.
emissions_transfers = pd.read_excel(
    excel_national,
    sheet_name="Emissions Transfers",
    skiprows=8,
    index_col=0,
)
emissions_transfers.index.name = "Year"
emissions_transfers = emissions_transfers.T
emissions_transfers.index.rename("Name", inplace=True)
# Fall back to the spreadsheet name when no ISO code is known.
emissions_transfers["Code"] = [
    to_code_3(country) or country for country in emissions_transfers.index
]
emissions_transfers = emissions_transfers.reset_index().drop("Name", axis=1)
emissions_transfers = emissions_transfers.melt(
    id_vars="Code", var_name="Year", value_name="Emissions-Transfers"
)
emissions_transfers = emissions_transfers.sort_values(["Code", "Year"])
emissions_transfers = emissions_transfers.dropna()
territorial_gcb_csv = root / "data/territorial-emissions.csv"

# Territorial GCB emissions: tidy the wide sheet into (Code, Year, Emissions)
# rows and tag each row with its upstream data source.
territorial_gcb = pd.read_excel(
    excel_national,
    sheet_name="Territorial Emissions",
    skiprows=16,
    index_col=0,
    usecols="A:HW",
)
territorial_gcb.index.name = "Year"
territorial_gcb = territorial_gcb.T
territorial_gcb.index.rename("Name", inplace=True)
# Prefer the ISO alpha-3 code; keep the raw name when lookup fails.
territorial_gcb["Code"] = [
    to_code_3(name) or name for name in territorial_gcb.index
]
territorial_gcb = (
    territorial_gcb.reset_index()
    .drop("Name", axis=1)
    .melt(id_vars="Code", var_name="Year", value_name="Emissions")
    .sort_values(["Code", "Year"])
)
# Years before 2017 come from CDIAC, later ones from BP.
territorial_gcb["Source"] = np.where(
    territorial_gcb.Year < 2017, "CDIAC", "BP"
)
# The Global Carbon Budget doesn't list EU as having source UNFCCC and Monaco
# is included with France (as in CDIAC)
territorial_cdiac_csv = root / "data/territorial-emissions-cdiac.csv" # Territorial CDIAC emissions territorial_cdiac = pd.read_excel(excel_national, sheet_name="Territorial Emissions CDIAC", skiprows=13, index_col=0) territorial_cdiac.index.name = "Year" territorial_cdiac = territorial_cdiac.T territorial_cdiac.index.rename("Name", inplace=True) territorial_cdiac["Code"] = [ to_code_3(i) or i for i in territorial_cdiac.index ] territorial_cdiac = territorial_cdiac.reset_index().drop("Name", axis=1) territorial_cdiac = pd.melt(territorial_cdiac, id_vars="Code", var_name="Year", value_name="Emissions") territorial_cdiac.dropna(inplace=True) territorial_cdiac.sort_values(["Code", "Year"], inplace=True) territorial_cdiac.to_csv(territorial_cdiac_csv, encoding="UTF-8",
def to_code(countryname):
    """Resolve *countryname* to a country code by delegating to ``to_code_3``."""
    resolved = to_code_3(countryname)
    return resolved
from util import root, excel_national
from countrynames import to_code_3

consumption_emissions_csv = root / "data/consumption-emissions.csv"
# Consumption emissions
# Wide sheet (Year rows x country columns) -> tidy long table.
consumption_emissions = pd.read_excel(excel_national,
                                      sheet_name="Consumption Emissions",
                                      skiprows=8,
                                      index_col=0)
consumption_emissions.index.name = "Year"
# Transpose so countries become rows.
consumption_emissions = consumption_emissions.T
consumption_emissions.index.rename("Name", inplace=True)
# Key rows by alpha-3 code; keep the raw name when no code is known.
consumption_emissions["Code"] = [
    to_code_3(i) or i for i in consumption_emissions.index
]
consumption_emissions = consumption_emissions.reset_index().drop("Name", axis=1)
consumption_emissions = pd.melt(consumption_emissions,
                                id_vars="Code",
                                var_name="Year",
                                value_name="Consumption-Emissions")
consumption_emissions.sort_values(["Code", "Year"], inplace=True)
consumption_emissions.dropna(inplace=True)
# NOTE(review): call truncated in the extracted source — remaining keyword
# arguments lie outside the visible chunk.
consumption_emissions.to_csv(consumption_emissions_csv,
from countrynames import to_code_3

consumption_emissions_csv = root / "data/consumption-emissions.csv"

# Consumption emissions: wide Year x country sheet -> tidy long table keyed
# by alpha-3 country code.
consumption_emissions = pd.read_excel(
    excel_national,
    sheet_name="Consumption Emissions",
    skiprows=8,
    index_col=0,
)
consumption_emissions.index.name = "Year"
consumption_emissions = consumption_emissions.T
consumption_emissions.index.rename("Name", inplace=True)
# Fall back to the spreadsheet name when no ISO code is known.
consumption_emissions["Code"] = [
    to_code_3(country) or country for country in consumption_emissions.index
]
consumption_emissions = (
    consumption_emissions.reset_index()
    .drop("Name", axis=1)
    .melt(id_vars="Code", var_name="Year", value_name="Consumption-Emissions")
    .sort_values(["Code", "Year"])
    .dropna()
)
# Normalize EU member names (marked with a trailing "*"), attach ISO codes,
# and join the emissions table onto the agreement-status table.
rename_eu_countries = {i: i[:-1] for i in emissions.index if i.endswith("*")}
assert (len(rename_eu_countries) == 27)  # UK already fixed above.
emissions = emissions.rename(index=rename_eu_countries)
emissions = emissions.drop("Total")
# Rename Czechia
status.Name = status.Name.replace("Czech Republic", "Czechia")
# Listed in the footnotes of the table for the purpose of Article 21.
emissions.at["European Union", "Emissions"] = 4488404
emissions.at["European Union", "Percentage"] = 12.10
emissions.at["European Union", "Year"] = 2013
emissions["Name"] = emissions.index
# Fuzzy lookup tolerates spreadsheet spellings that differ from ISO names.
emissions.index = [to_code_3(item, fuzzy=True) for item in emissions.Name]
emissions.index.name = "Code"
# Names of parties not yet having signed.
missing = pd.DataFrame(emissions.Name[~emissions.index.isin(status.index)])
# NOTE(review): DataFrame.append was removed in pandas 2.0 — migrate to
# pd.concat([status, missing]) when upgrading.
status = status.append(missing)
export = status.join(emissions.iloc[:, :3], how="outer")
export = export[[
    "Name", "Signature", "Ratification", "Kind", "Date-Of-Effect",
    "Emissions", "Percentage", "Year"
]]
export = export.sort_values(by="Name")
print("\nData summary:\n")
# NOTE(review): statement truncated in the extracted source — the format()
# argument continues outside the visible chunk.
print("Emissions sum w/o EU28: {:d} GgCO₂-equiv.".format(
# Territorial GCB emissions: reshape the wide sheet into a tidy
# (Code, Year, Emissions) table and record each year's data source.
territorial_gcb = pd.read_excel(
    excel_national,
    sheet_name="Territorial Emissions",
    skiprows=16,
    index_col=0,
    usecols="A:HV",
)
territorial_gcb.index.name = "Year"
territorial_gcb = territorial_gcb.T
territorial_gcb.index.rename("Name", inplace=True)
# Prefer the ISO alpha-3 code; keep the spreadsheet name when unknown.
territorial_gcb["Code"] = [to_code_3(c) or c for c in territorial_gcb.index]
territorial_gcb = territorial_gcb.reset_index().drop("Name", axis=1)
territorial_gcb = territorial_gcb.melt(
    id_vars="Code", var_name="Year", value_name="Emissions"
)
territorial_gcb = territorial_gcb.sort_values(["Code", "Year"])
# Years before 2015 come from CDIAC; BP supplies 2015 onward.
territorial_gcb["Source"] = np.where(
    territorial_gcb.Year < 2015, "CDIAC", "BP"
)