import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas.api.types import CategoricalDtype from loadData import preProcess, filterColumn, filterMedalsOnly # load data hun_df = preProcess('./data/athlete_events.csv') medal_reversed = CategoricalDtype(categories=reversed( hun_df.Medal.cat.categories), ordered=True) hun_df['Medal'] = hun_df['Medal'].astype(medal_reversed) # filter dataframe to contain rows with medals only, then select Summer Games only filtered_df = filterMedalsOnly(hun_df) filtered_df = filterColumn(filtered_df, 'Season', 'Summer') # count medals in team events as one hun_df_no_duplicates = filtered_df.drop_duplicates(['Games', 'Event', 'Medal' ]).reset_index(drop=True) # cross tabulate Sex and Medal columns so that we get medal count by gender df2 = pd.crosstab(hun_df_no_duplicates['Sex'], hun_df_no_duplicates['Medal']) cols = ['#D4AF37', '#BCC6CC', '#cd7f32'] ax = df2.plot.barh(stacked=True, figsize=(8, 4), color=cols) handles, labels = ax.get_legend_handles_labels() ax.legend(handles, labels, bbox_to_anchor=(1.0, 1.0), frameon=False) plt.title('Medals won by Hungary according to gender', size=16,
import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt from loadData import preProcess, filterColumn # Load and preprocess data df = preProcess('./data/athlete_events.csv') # select summer games only df = filterColumn(df, 'Season', 'Summer') # select a subset of data and drop duplicate ID from the same year df = df[['Year', 'ID', 'Age', 'Sex']].drop_duplicates(['Year', 'ID']).reset_index(drop=True) # drop ID column df.drop('ID', axis=1, inplace=True) df2 = df.copy() df.set_index(['Year', 'Sex'], inplace=True, append=True) #group by year and sex columns, and calculate average age for each group df_grouped = df.groupby(level=['Year', 'Sex'])['Age'].mean() # Move 'Sex' level out of row index to columns index avg_age_vs_time = df_grouped.unstack() fig, ax = plt.subplots(figsize=(8, 6)) avg_age_vs_time.plot(ax=ax, linewidth=3) sns.scatterplot(x="Year", y="Age", data=df2, hue="Sex",
"""Box plot of athlete ages per Olympic year, split by sex (Summer Games)."""
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype

from loadData import preProcess, filterColumn

# Read the athlete events file through the project's preprocessing helper,
# then keep the Summer Games rows only.
all_games = preProcess('./data/athlete_events.csv')
summer_only = filterColumn(all_games, 'Season', 'Summer')

# One fixed colour per value of the 'Sex' column so the legend is stable.
sex_palette = {"Male": "#18a1cd", "Female": "#fa8c00"}

figure, axes = plt.subplots(figsize=(14, 6))

# One box per (Year, Sex) pair.
# NOTE(review): rows are not de-duplicated per athlete here, so an athlete
# with several rows in the same year contributes several times to that
# year's box -- confirm this weighting is intended.
sns.boxplot(
    data=summer_only,
    x="Year",
    y="Age",
    hue="Sex",
    palette=sex_palette,
    ax=axes,
)

# Axis labels and title.
axes.set_xlabel('Year', size=14, labelpad=10)
axes.set_ylabel('Age (in years)', size=14)
axes.set_title(
    'Age distribution of Hungarian athletes in Summer Olympics',
    size=16,
    pad=20,
    weight='heavy',
)

plt.show()
"""Pie chart of Summer Games participants broken down by the 'Sex' column."""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from loadData import preProcess, filterColumn

# Load and preprocess data
hun_df = preProcess('./data/athlete_events.csv')

# Select Summer Games first, then keep one row per athlete per year.
# De-duplicating on ('Year', 'ID') *before* filtering keeps only the first
# row of each pair; if that row is a Winter entry, the athlete's Summer row
# is discarded and the athlete silently disappears from the Summer count.
summer_df = filterColumn(hun_df, 'Season', 'Summer')
hun_df_distinct_ids = summer_df.drop_duplicates(['Year', 'ID']).reset_index(drop=True)

colours = ['#18a1cd', '#fa8c00']
fig, ax = plt.subplots(figsize=(6, 4))

# sort=False stops value_counts from ordering the result by frequency.
# NOTE(review): colours are matched to wedges purely by position, so the
# colour each sex receives depends on the order value_counts returns --
# confirm it matches the intended mapping.
sex_counts = hun_df_distinct_ids['Sex'].value_counts(sort=False)

ax.pie(
    sex_counts,
    colors=colours,
    startangle=60,
    # A wedge width below the pie radius leaves a hole in the centre.
    wedgeprops={
        'linewidth': 0.5,
        'edgecolor': 'lightgrey',
        'width': 0.7,
    },
    autopct=' %.0f%%',
    pctdistance=0.6,
    labeldistance=1.1,
)

# Equal aspect ratio ensures that pie is drawn as a circle.