def plot_gender_distributions_over_years(data, title='Gender distribution over the years for the different runnings of Lausanne Marathon'):
    '''
    This function builds and displays stacked bar charts (one subplot by gender) showing the values of each running over the years.

    Parameters
        - data: Dict-like object (iterated with items()) associating each gender with its DataFrame
                (index values are used for x axis, each column is a running among '10 km', 'Semi-marathon' and 'Marathon')
        - title: Title of the graph (by default, 'Gender distribution over the years for the different runnings of Lausanne Marathon')

    Return
        - figure: Plotly figure
    '''

    running_colors = {'10 km': KM_10_COLOR, 'Semi-marathon': KM_21_COLOR, 'Marathon': KM_42_COLOR}

    # Plotly prints information when subplots are created: this output is muted
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(
            rows=1,
            cols=2,
            subplot_titles=[gender.capitalize() + ' runners' for gender in data])

    # One column of the grid by gender, one bar series by running
    for column, (_, frame) in enumerate(data.items(), start=1):
        for running in frame.columns:
            bar = go.Bar(
                x=list(frame.index),
                y=frame[running],
                name=running,
                marker={'color': running_colors[running]},
                legendgroup=running,
                # Legend entries are shared between subplots: only the first subplot declares them
                showlegend=(column == 1))
            figure.append_trace(bar, 1, column)

    figure['layout'].update(title=title, barmode='stack')
    plotly.offline.iplot(figure)
    return figure
def plot_performance_comparison(data, age_category, year, performance_criterion, silent=False):
    '''
    This function builds the performance comparison graph (three subplots on a single row) for a given performance criterion
    and displays it unless silent mode is requested.

    Parameters
        - data: Dict containing all the data used for performance comparison (see generate_performance_comparison).
                data[<criterion in lower case>] associates each subplot title with the traces to draw in that subplot
        - age_category: string representing age category considered (used in title)
        - year: year to compare with all other Lausanne Marathon editions (used in title)
        - performance_criterion: Criterion to use for y axis (lower-cased to select data, used as is for axis title)
        - silent: If True, outputs are not displayed and figure is only returned

    Return
        - figure: Plotly figure
    '''

    criterion_key = performance_criterion.lower()

    # Creation of figure (Plotly output is muted)
    with study_utils.ignore_stdout():
        traces_by_subplot = data[criterion_key]
        figure = tools.make_subplots(rows=1, cols=3, subplot_titles=list(traces_by_subplot))

    # Each entry of the dict fills one column of the grid, in iteration order
    for column, subplot_name in enumerate(traces_by_subplot, start=1):
        for trace in traces_by_subplot[subplot_name]:
            figure.append_trace(trace, 1, column)

    # Times are drawn on date axes so that they are displayed as HH:MM:SS
    if criterion_key == 'time':
        y_axes = [name for name, _ in figure['layout'].items() if 'yaxis' in name]
        for axis_name in y_axes:
            figure['layout'][axis_name].update(type='date', tickformat='%H:%M:%S')

    figure['layout']['yaxis1'].update(title=performance_criterion)

    graph_title = ''.join([
        'Performance comparison between Lausanne Marathon ',
        str(year),
        ' and all Lausanne Marathon<br>(Age category: ',
        age_category,
        ')'
    ])
    figure['layout'].update(title=graph_title)

    if silent:
        return figure

    plotly.offline.iplot(figure)
    return figure
def generate_performance_distribution_figure(data, age_category, sex_category, criterion):
    '''
    This function generates performance distribution figure given a dataset.
    One subplot is created by running (10 km, Semi-marathon, Marathon) and one line is drawn by year (1999 to 2016);
    each line gives the number of rows of data falling in each bin of the selected criterion.

    Parameters
        - data: DataFrame containing data to use (columns 'distance (km)', 'year' and the column named by criterion are read)
        - age_category: Selected age category (only used in title)
        - sex_category: Selected sex category (only used in title)
        - criterion: Selected criterion, either 'time' (bins of 5 minutes) or 'speed (m/s)' (speed rounded to 1 decimal)

    Return
        - figure: Plotly figure

    Raise
        - ValueError: if criterion is neither 'time' nor 'speed (m/s)'
    '''

    # Fail early with an explicit message instead of a NameError raised later while building the first line
    if criterion not in ('time', 'speed (m/s)'):
        raise ValueError("Unsupported criterion: " + repr(criterion) + " (expected 'time' or 'speed (m/s)')")

    # We define the considered runnings and the years interval
    runnings = OrderedDict([(10, {'name': '10 km', 'position': 1}),
                            (21, {'name': 'Semi-marathon', 'position': 2}),
                            (42, {'name': 'Marathon', 'position': 3})])
    years_range = range(1999, 2017)
    years = {year: str(year) for year in years_range}
    colors = study_utils.generate_colors_palette(years)

    # We ignore outputs of Plotly
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(rows=3, subplot_titles=[attributes['name'] for attributes in runnings.values()])

    for km, attributes in runnings.items():
        for year in years_range:
            data_filtered = data[(data['distance (km)'] == km) & (data['year'] == year)]

            if criterion == 'time':
                # pd.Grouper(freq=...) replaces pd.TimeGrouper, which was removed from pandas
                grouped = data_filtered.set_index('time').groupby(pd.Grouper(freq='5Min'))
                bins = list(grouped)
                x_values = [datetime.datetime.strptime(str(name), '%Y-%m-%d %H:%M:%S') for name, _ in bins]
            else:
                grouped = data_filtered.round({'speed (m/s)': 1}).groupby('speed (m/s)')
                bins = list(grouped)
                x_values = [name for name, _ in bins]

            line = go.Scatter(mode='lines',
                              x=x_values,
                              y=[len(group) for _, group in bins],
                              name=str(year),
                              legendgroup=str(year),
                              marker={'color': colors[year]},
                              # Years are shared between subplots: legend entries are declared by first subplot only
                              showlegend=(attributes['position'] == 1))
            figure.append_trace(line, attributes['position'], 1)

    # Format of x axes if time is selected criterion
    if criterion == 'time':
        x_axes = [name for name, _ in figure['layout'].items() if 'xaxis' in name]
        for axis in x_axes:
            figure['layout'][axis].update(type='date', tickformat='%H:%M')

    figure['layout'].update(width=1000, height=600, margin=go.Margin(t=100, b=50, l=50, r=50))
    figure['layout'].update(title='Performance distribution of Lausanne Marathon editions for all runnings<br>(Age category: ' + age_category + ' | Sex category: ' + sex_category + ')')
    figure['layout']['legend'].update(y=0.5)
    return figure
def generate_evolution_and_performance_figure(data, criterion):
    '''
    This function generates a single figure containing two stacked subplots: evolution lines (top) and
    performance boxplots (bottom) (see generate_evolution_and_performance_figures).

    Note: traces contained in data are modified in place (legendgroup and marker are set, and name of boxplots is emptied).

    Parameters
        - data: Data to be used for generation of figure
                (data['evolution']: traces of first subplot, data['performance'][criterion]: traces of second subplot;
                 name of each trace must be '10 km', 'Semi-marathon' or 'Marathon')
        - criterion: Performance criterion to be used (key of data['performance'] and title of second y axis)

    Return
        - figure: Plotly figure representing evolution and performance in same graph
    '''

    tick_values = list(range(1999, 2017))
    tick_labels = list(tick_values)
    palette = {'10 km': KM_10_COLOR, 'Semi-marathon': KM_21_COLOR, 'Marathon': KM_42_COLOR}

    def _group_and_colorize(trace):
        # Legend group and color are both derived from the name of the running stored in the trace
        running_name = trace['name']
        trace['legendgroup'] = running_name
        trace['marker'] = {'color': palette[running_name]}

    # Plotly output is muted during creation of the grid (4 rows, each subplot spans 2 rows)
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(
            rows=4,
            specs=[[{'rowspan': 2}], [None], [{'rowspan': 2}], [None]],
            subplot_titles=('Evolution of runners over the years', 'Performance comparison by years'))

    # First subplot: evolution lines
    for evolution_trace in data['evolution']:
        _group_and_colorize(evolution_trace)
        figure.append_trace(evolution_trace, 1, 1)

    # Second subplot: boxplots (name is emptied once legend group and color are set)
    for performance_trace in data['performance'][criterion]:
        _group_and_colorize(performance_trace)
        performance_trace['name'] = ''
        figure.append_trace(performance_trace, 3, 1)

    # Every x axis displays one tick by year
    x_axes = [name for name, _ in figure['layout'].items() if 'xaxis' in name]
    for axis_name in x_axes:
        figure['layout'][axis_name].update(tickvals=tick_values, ticktext=tick_labels)

    # Format of y axes
    figure['layout']['yaxis1'].update(title='Number of runners', tickformat='f')
    second_y_axis = figure['layout']['yaxis2']
    second_y_axis.update(title=criterion)
    if criterion.lower() == 'time':
        second_y_axis.update(type='date', tickformat='%H:%M')

    # Boxplots are grouped to avoid superposition
    figure['layout'].update(boxmode='group')

    # Size, margins and position of legend
    figure['layout'].update(width=1000, height=600, margin=go.Margin(t=50, b=50, l=50, r=50))
    figure['layout']['legend'].update(y=0.5)
    return figure
def plot_ols_fitted_and_true_values(data, ols_results, x=None, y='Median age (all runnings)', title='Fitted and original values for median ages of participants of Lausanne Marathon editions', groupby_column='Gender', markers_attributes=None):
    '''
    This function plots fitted and true values for a given dataset and associated ols results.
    One subplot is created by entry returned by study_utils.retrieve_ols_predictions_and_errors (two columns are available);
    original values of each group of data are then added in the subplot of the entry having the same key.

    Parameters
        - data: DataFrame containing data to use for original values
        - ols_results: OLS results forwarded to study_utils.retrieve_ols_predictions_and_errors
        - x: Name of the column to use for x axis for original values and fitted values
             (by default None / if None, index will be used for original values and index name for retrieve_ols_predictions_and_errors)
        - y: Name of the column to use for y axis (by default, 'Median age (all runnings)')
        - title: Title of the graph (by default, 'Fitted and original values for median ages of participants of Lausanne Marathon editions')
        - groupby_column: Name of the column to use for grouping data (by default, 'Gender')
        - markers_attributes: Dictionary containing options for each unique value in column groupby_column (by default, None)
                              (at present, 'title' (title of subplot), 'color_true' (color of markers of original values),
                              'color_fitted' (color of markers of fitted values), 'color_errors' (color of errors bars),
                              'name_true' (name of markers' legend of original values) and
                              'name_fitted' (name of markers' legend of fitted values) are supported)

    Return
        - figure: Plotly figure
    '''

    regressor = x if x else data.index.name
    predictions_and_errors = study_utils.retrieve_ols_predictions_and_errors(ols_results=ols_results, regressor=regressor)

    def _option(key, option, default=None):
        # Option defined for the given group, or default value if no options were provided
        if markers_attributes:
            return markers_attributes[key].get(option, default)
        return default

    # We ignore outputs of Plotly
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(rows=1, cols=2, shared_yaxes=True,
                                     subplot_titles=[_option(key, 'title') for key in predictions_and_errors])

    # Fitted values (with error bars): one subplot by key, position is kept to place original values later
    position = {}
    for column, (key, results) in enumerate(predictions_and_errors.items(), start=1):
        position[key] = column
        name = _option(key, 'name_fitted', 'Fitted values')
        predictions = np.array(results['predictions'])
        # Error bars are expressed relatively to predictions (distance to max above, distance to min below)
        errors = {
            'type': 'data',
            'symmetric': False,
            'array': np.subtract(np.array(results['errors']['max']), predictions),
            'arrayminus': abs(np.subtract(np.array(results['errors']['min']), predictions)),
            'color': _option(key, 'color_errors')
        }
        markers = go.Scatter(x=results['x'],
                             y=results['predictions'],
                             mode='markers',
                             hoverinfo='y+text',
                             name=name,
                             text=name,
                             marker={'color': _option(key, 'color_fitted')},
                             legendgroup='fitted',
                             error_y=errors,
                             showlegend=(column == 1))
        figure.append_trace(markers, 1, column)

    # Original values
    # Grouping by the column name (and not by a list of one name) keeps scalar group keys with every pandas version:
    # recent versions return 1-tuples as keys when a list is given, which would break the lookups below
    for key, group in data.groupby(groupby_column):
        x_values = group[x] if x else group.index
        name = _option(key, 'name_true', 'Original values (' + str(key) + ')')
        markers = go.Scatter(x=x_values,
                             y=group[y],
                             mode='markers',
                             hoverinfo='y+text',
                             name=name,
                             text=name,
                             marker={'color': _option(key, 'color_true')})
        figure.append_trace(markers, 1, position[key])

    # Add of title and format of axes
    figure['layout'].update(title=title)
    figure['layout']['yaxis'].update(title=y)
    x_axes = [axis for axis, _ in figure['layout'].items() if 'xaxis' in axis]
    for axis in x_axes:
        figure['layout'][axis].update(title=regressor)

    plotly.offline.iplot(figure)
    return figure
def generate_comparison(data, title, running_type_column_name, x_column_name, y_column_name, colors):
    '''
    This function builds and displays comparison boxplots: one subplot by running of Lausanne Marathon
    (10 km, Semi-marathon, Marathon) and, in each subplot, one boxplot for all rows followed by
    one boxplot by unique value of x_column_name.

    Parameters
        - data: DataFrame containing information on runners
        - title: Title of the graph
        - running_type_column_name: Name of the column containing the runnings (compared with 10, 21 and 42)
        - x_column_name: Name of column whose unique values define the boxplots
        - y_column_name: Name of column to be used for y data
        - colors: Dict containing colors associated to each unique value of x_column_name (and to 'all')

    Return
        - figure: Plotly figure
    '''

    runnings = OrderedDict([
        (10, {'name': '10 km', 'position': 1}),
        (21, {'name': 'Semi-marathon', 'position': 2}),
        (42, {'name': 'Marathon', 'position': 3})
    ])

    # Creation of the grid of subplots (Plotly output is muted)
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(
            rows=1,
            cols=3,
            shared_yaxes=True,
            subplot_titles=[running['name'] for running in runnings.values()])

    for distance, running in runnings.items():
        column = running['position']
        running_df = data[data[running_type_column_name] == distance]

        # 'all' comes first, then every value found for the considered running
        categories = np.concatenate((['all'], running_df[x_column_name].unique()), axis=0)
        for category in categories:
            selection = running_df if category == 'all' else running_df[running_df[x_column_name] == category]

            box = go.Box(
                y=selection[y_column_name],
                name=category.capitalize() + ' runners',
                marker={'color': colors[category]},
                legendgroup=category,
                # Categories are shared between subplots: legend is declared by first subplot only
                showlegend=(column == 1))
            figure.append_trace(box, 1, column)

    # Labels of x ticks are hidden in every subplot
    x_axes = [name for name, _ in figure['layout'].items() if 'xaxis' in name]
    for axis_name in x_axes:
        figure['layout'][axis_name].update(showticklabels=False)

    figure['layout']['yaxis1'].update(title=y_column_name.capitalize(), tickformat='.2f')
    figure['layout'].update(title=title)

    plotly.offline.iplot(figure)
    return figure
def plot_speed_distribution_by_running(data, runnings=None, title='Speed distribution by running', speed_column_name='speed (m/s)', sex_column_name='sex'):
    '''
    This function plots, for each running, the distribution of speeds of runners (stacked histograms with bins of 0.1)
    based on the genders of participants. Statistics on speed (total, max, min, median, SD) are added next to each subplot.

    Parameters
        - data: DataFrame to use during generation of the distribution
        - runnings: Dict containing name of column containing runnings (key: column_name) and
                    set of runnings (key: values, value: dict() read for keys name and position)
                    By default, None. If None, default values will be set by function (10 km, Semi-marathon and Marathon).
        - title: Title of the graph (by default, 'Speed distribution by running')
        - speed_column_name: Name of the column containing speed of participants (by default, 'speed (m/s)')
        - sex_column_name: Name of the column containing sex of participants (by default, 'sex')

    Return
        - figure: Plotly figure
    '''

    runnings = runnings or {
        'column_name': 'distance (km)',
        'values': OrderedDict([
            (10, {'name': '10 km', 'color': KM_10_COLOR, 'position': 1}),
            (21, {'name': 'Semi-marathon', 'color': KM_21_COLOR, 'position': 2}),
            (42, {'name': 'Marathon', 'color': KM_42_COLOR, 'position': 3})
        ])
    }
    gender_colors = {'female': FEMALE_COLOR, 'male': MALE_COLOR, 'all': ALL_GENDERS_COLOR}
    statistics = {}

    # Creation of the grid of subplots (Plotly output is muted)
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(
            rows=3,
            cols=1,
            shared_xaxes=True,
            subplot_titles=[running['name'] for running in runnings['values'].values()])

    for running_key, running in runnings['values'].items():
        row = running['position']
        running_df = data[data[runnings['column_name']] == running_key]
        speeds = running_df[speed_column_name]

        # Text of statistics is indexed by subplot title to be retrieved when annotations are created
        statistics[running['name']] = ''.join([
            'Total: ', str(len(running_df)),
            ' runners<br>Max: ', str(round(np.max(speeds), 2)),
            ' m/s<br>Min: ', str(round(np.min(speeds), 2)),
            ' m/s<br>Median: ', str(round(np.median(speeds), 2)),
            ' m/s | SD: ', str(round(np.std(speeds), 2)),
            ' m/s'
        ])

        # One histogram by gender found for the running, then one for all runners
        genders = np.concatenate((running_df[sex_column_name].unique(), ['all']), axis=0)
        for gender in genders:
            if gender == 'all':
                values = running_df[speed_column_name]
            else:
                values = running_df[running_df[sex_column_name] == gender][speed_column_name]

            # Bins are computed on the whole dataset so that all subplots share the same bins
            bins = {
                'start': math.floor(np.min(data[speed_column_name])),
                'end': math.ceil(np.max(data[speed_column_name])),
                'size': 0.1
            }
            histogram = go.Histogram(
                xbins=bins,
                x=values,
                name=gender.capitalize() + ' runners',
                legendgroup=gender,
                showlegend=(row == 1),
                marker={'color': gender_colors[gender]},
                opacity=0.75)
            figure.append_trace(histogram, row, 1)

    # Format of axes and layout
    figure.layout.xaxis1.update(title='Speed (m/s)', tickformat='.1f')
    figure.layout.yaxis2.update(title='Number of participants')
    figure.layout.update(
        title=title,
        barmode='stack',
        bargroupgap=0.1,
        bargap=0,
        margin=go.Margin(t=100, b=50, l=50, r=50))

    # Add of statistics
    # Trick: annotations of statistics are positioned relatively to annotations of subplot titles
    title_annotations = figure['layout']['annotations']
    statistics_annotations = [
        Annotation(
            y=title_annotation['y'] - 0.12,
            x=1,
            align='left',
            text=statistics[title_annotation['text']],
            xref='paper',
            yref='paper',
            yanchor='bottom',
            showarrow=False)
        for title_annotation in title_annotations
    ]
    figure['layout']['annotations'].extend(statistics_annotations)

    plotly.offline.iplot(figure)
    return figure
def generate_performance_by_age_and_age_category(data, runnings=None, age_column_name='age', age_category_column_name='age category', sex_column_name='sex'):
    '''
    This function generates figures for each running. It displays time distribution according to age (first subplot)
    and age category (second subplot).
    Final Dict has the following pattern:
    {
        <running_1>: <Plotly figure>
        [, <running_2>: <Plotly figure>
        , ...]
    }

    Parameters
        - data: DataFrame containing all the information of Lausanne Marathon 2016 (column 'time' is used for y axis)
        - runnings: Dict containing name of column containing runnings (key: column_name) and
                    set of runnings (key: values, value: dict() with following keys: name)
                    By default, None. If None, default values will be set by function.
        - age_column_name: Name of the column containing age of participants (by default, 'age')
        - age_category_column_name: Name of the column containing age category of participants (by default, 'age category')
        - sex_column_name: Name of the column containing gender of participants (by default, 'sex')

    Return
        - figures: Dict containing all time distribution figures (keys are names of runnings)
    '''

    # We create final dict and we set attributes and runnings (if not given by user)
    figures = {}
    attributes = {
        'colors': {'female': FEMALE_COLOR, 'male': MALE_COLOR, 'all': ALL_GENDERS_COLOR},
        'names': {'female': 'Female runners', 'male': 'Male runners', 'all': 'All runners'},
        'visibility': {'female': 'legendonly', 'male': 'legendonly', 'all': True}
    }

    if not runnings:
        runnings = {
            'column_name': 'distance (km)',
            'values': {
                10: {'name': '10 km'},
                21: {'name': 'Semi-marathon'},
                42: {'name': 'Marathon'}
            }
        }

    # Loop over runnings
    for km, attributes_running in runnings['values'].items():
        filtered_df = data[data[runnings['column_name']] == km]

        # Creation of Plotly figure containing subplots (we ignore outputs here)
        with study_utils.ignore_stdout():
            figure = tools.make_subplots(rows=2, cols=1, vertical_spacing=0.1)

        # Consideration of ages (row 1) and age categories (row 2)
        # Row is derived from the position in the list and not from a comparison with the literal 'age',
        # so that a custom age_column_name is still drawn in the first subplot with its legend
        for row, column_name in enumerate([age_column_name, age_category_column_name], start=1):
            # Legend entries are shared by both subplots: they are declared by the first one only
            show_legend = (row == 1)

            # Creation of boxplots for female and male runners
            boxplots = study_utils.create_plotly_boxplots(data=filtered_df,
                                                          x=column_name,
                                                          y='time',
                                                          hue=sex_column_name,
                                                          hue_names=attributes['names'],
                                                          colors=attributes['colors'],
                                                          visibility=attributes['visibility'],
                                                          use_hue_names=False,
                                                          use_legend_group=True,
                                                          show_legend=show_legend)

            # We add boxplots for all runners (without consideration of sex)
            boxplots.append(go.Box(y=filtered_df['time'],
                                   x=filtered_df[column_name],
                                   name=attributes['names']['all'],
                                   marker={'color': attributes['colors']['all']},
                                   visible=attributes['visibility']['all'],
                                   legendgroup=attributes['names']['all'],
                                   showlegend=show_legend))

            # We add each generated boxplot in the correct subplot
            for boxplot in boxplots:
                figure.append_trace(boxplot, row, 1)

        # Format of y and x axes and modification of layout
        y_axes = [axis for axis, _ in figure['layout'].items() if 'yaxis' in axis]
        for axis in y_axes:
            figure['layout'][axis].update(title='Performance time', type='date', tickformat='%H:%M:%S')
        figure['layout']['xaxis1'].update(title='Age', dtick=5)
        figure['layout']['xaxis2'].update(title='Age category', categoryorder='array', categoryarray=YEAR_CATEGORIES)
        figure['layout'].update(title='Distribution of time performance (' + attributes_running['name'] + ')',
                                height=650,
                                hovermode='closest')

        # We add newly created figure to the final dict
        figures[attributes_running['name']] = figure

    return figures
def plot_distribution_age_distance(data, runnings=None, title='Distribution of runners by age', age_column_name='age', sex_column_name='sex'):
    '''
    This function plots, for each running, the distribution of ages of runners (stacked histograms)
    based on the genders of participants. Mean age and SD are added next to the title of each subplot.

    Parameters
        - data: DataFrame to use during generation of the distribution
        - runnings: Dict containing name of column containing runnings (key: column_name) and
                    set of runnings (key: values, value: dict() read for keys name and position)
                    By default, None. If None, default values will be set by function (10 km, Semi-marathon and Marathon).
        - title: Title of the graph (by default, 'Distribution of runners by age')
        - age_column_name: Name of the column containing age of participants ('age' or 'age category', by default, 'age')
        - sex_column_name: Name of the column containing sex of participants (by default, 'sex')

    Return
        - figure: Plotly figure
    '''

    runnings = runnings or {
        'column_name': 'distance (km)',
        'values': OrderedDict([
            (10, {'name': '10 km', 'color': KM_10_COLOR, 'position': 1}),
            (21, {'name': 'Semi-marathon', 'color': KM_21_COLOR, 'position': 2}),
            (42, {'name': 'Marathon', 'color': KM_42_COLOR, 'position': 3})
        ])
    }
    gender_colors = {'female': FEMALE_COLOR, 'male': MALE_COLOR, 'all': ALL_GENDERS_COLOR}
    statistics = {}

    # Creation of the grid of subplots (Plotly output is muted)
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(
            rows=3,
            cols=1,
            subplot_titles=[running['name'] for running in runnings['values'].values()])

    for running_key, running in runnings['values'].items():
        row = running['position']
        running_df = data[data[runnings['column_name']] == running_key]
        ages = running_df[age_column_name]

        # Text of statistics is indexed by subplot title to be retrieved when annotations are created
        statistics[running['name']] = ''.join([
            'Mean age: ', str(round(np.mean(ages), 2)),
            ' years (SD: ', str(round(np.std(ages), 2)),
            ')'
        ])

        # One histogram by gender found for the running, then one for all runners
        genders = np.concatenate((running_df[sex_column_name].unique(), ['all']), axis=0)
        for gender in genders:
            if gender == 'all':
                values = running_df[age_column_name]
            else:
                values = running_df[running_df[sex_column_name] == gender][age_column_name]

            # One bin by year of age when ages are used, otherwise number of selected rows
            if age_column_name == 'age':
                bins_count = (np.max(values) - np.min(values)) + 1
            else:
                bins_count = len(values)

            histogram = go.Histogram(
                nbinsx=bins_count,
                x=values,
                name=gender.capitalize() + ' runners',
                legendgroup=gender,
                showlegend=(row == 1),
                marker={'color': gender_colors[gender]},
                opacity=0.75)
            figure.append_trace(histogram, row, 1)

    # Format of axes and layout (age categories are ordered according to YEAR_CATEGORIES)
    if age_column_name == 'age category':
        x_axes = [name for name, _ in figure['layout'].items() if 'xaxis' in name]
        for axis_name in x_axes:
            figure['layout'][axis_name].update(categoryorder='array', categoryarray=YEAR_CATEGORIES)

    figure.layout.xaxis3.update(title='Age of participants')
    figure.layout.yaxis2.update(title='Number of participants')
    figure.layout.update(
        title=title,
        barmode='stack',
        bargroupgap=0.1,
        bargap=0,
        margin=go.Margin(t=100, b=50, l=50, r=50))

    # Add of statistics
    # Trick: annotations of statistics reuse vertical position of annotations of subplot titles
    title_annotations = figure['layout']['annotations']
    statistics_annotations = [
        Annotation(
            y=title_annotation['y'],
            x=1,
            text=statistics[title_annotation['text']],
            xref='paper',
            yref='paper',
            yanchor='bottom',
            showarrow=False)
        for title_annotation in title_annotations
    ]
    figure['layout']['annotations'].extend(statistics_annotations)

    plotly.offline.iplot(figure)
    return figure