def plot_gender_distributions_over_years(data, title='Gender distribution over the years for the different runnings of Lausanne Marathon'):
    '''
    This function draws, for each gender, a stacked bar chart of the runnings over the years.

    Parameters
        - data: Dict mapping a gender name to its DataFrame (index is used as x axis, one column by running)
        - title: Title of the graph (by default, 'Gender distribution over the years for the different runnings of Lausanne Marathon')

    Return
        - figure: Plotly figure (also displayed with plotly.offline.iplot)
    '''

    running_colors = {'10 km': KM_10_COLOR, 'Semi-marathon': KM_21_COLOR, 'Marathon': KM_42_COLOR}

    # Outputs of Plotly are ignored during creation of subplots
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(
            rows=1,
            cols=2,
            subplot_titles=[gender.capitalize() + ' runners' for gender in data])

    # One subplot (column) by gender, one bar trace by running
    for column, (_, frame) in enumerate(data.items(), start=1):
        # Legend entries are only shown once (for the first subplot)
        in_first_subplot = (column == 1)

        for running in frame.columns:
            bar = go.Bar(
                x=list(frame.index),
                y=frame[running],
                name=running,
                marker={'color': running_colors[running]},
                legendgroup=running,
                showlegend=in_first_subplot)
            figure.append_trace(bar, 1, column)

    figure['layout'].update(title=title, barmode='stack')
    plotly.offline.iplot(figure)
    return figure
def plot_performance_comparison(data, age_category, year, performance_criterion, silent=False):
    '''
    This function builds (and optionally displays) the performance comparison graph for a given performance criterion.

    Parameters
        - data: Dict containing all the data used for performance comparison (see generate_performance_comparison);
                data[<criterion in lower case>] maps each subplot title to its list of boxplots
        - age_category: string representing age category considered
        - year: year to compare with all other Lausanne Marathon editions
        - performance_criterion: Criterion to use for y axis (lower-cased to select the data, used as is for the axis title)
        - silent: If True, outputs are not displayed and figure is only returned

    Return
        - figure: Plotly figure
    '''

    criterion = performance_criterion.lower()
    boxplots_by_subplot = data[criterion]

    # Creation of figure (output is ignored)
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(
            rows=1,
            cols=3,
            subplot_titles=[subplot_title for subplot_title, _ in boxplots_by_subplot.items()])

    # Each entry of the dict fills one column of the figure
    for column, (_, boxplots) in enumerate(boxplots_by_subplot.items(), start=1):
        for boxplot in boxplots:
            figure.append_trace(boxplot, 1, column)

    # Time values are displayed as dates on every y axis
    if criterion == 'time':
        y_axes = [name for name, _ in figure['layout'].items() if 'yaxis' in name]
        for axis in y_axes:
            figure['layout'][axis].update(type='date', tickformat='%H:%M:%S')

    figure['layout']['yaxis1'].update(title=performance_criterion)

    figure_title = 'Performance comparison between Lausanne Marathon ' + str(year)
    figure_title = figure_title + ' and all Lausanne Marathon<br>(Age category: ' + age_category + ')'
    figure['layout'].update(title=figure_title)

    if silent:
        return figure

    plotly.offline.iplot(figure)
    return figure
def generate_performance_distribution_figure(data, age_category, sex_category, criterion):
    '''
    This function generates performance distribution figure given a dataset.
    For each running (10 km, Semi-marathon, Marathon) and each year between 1999 and 2016,
    a line gives the number of runners by 5-minute interval ('time') or by speed rounded to 0.1 m/s ('speed (m/s)').

    Parameters
        - data: DataFrame containing data to use (columns 'distance (km)', 'year' and 'time' or 'speed (m/s)' are read)
        - age_category: Selected age category (only used in title of the figure)
        - sex_category: Selected sex category (only used in title of the figure)
        - criterion: Selected criterion ('time' or 'speed (m/s)')

    Return
        - figure: Plotly figure

    Raises
        - ValueError: if criterion is neither 'time' nor 'speed (m/s)'
    '''

    # An unknown criterion would otherwise fail later on an undefined variable
    if criterion not in ('time', 'speed (m/s)'):
        raise ValueError("Unsupported criterion {!r} (expected 'time' or 'speed (m/s)')".format(criterion))

    # We define the considered runnings and the years interval
    runnings = OrderedDict([(10, {'name': '10 km', 'position': 1}), (21, {'name': 'Semi-marathon', 'position': 2}), (42, {'name': 'Marathon', 'position': 3})])
    years_range = range(1999, 2017)
    years = {year: str(year) for year in years_range}
    colors = study_utils.generate_colors_palette(years)

    # We ignore outputs of Plotly
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(rows=3, subplot_titles=[attributes['name'] for attributes in runnings.values()])

        for km, attributes in runnings.items():

            for year in years_range:
                data_filtered = data[(data['distance (km)'] == km) & (data['year'] == year)]

                if criterion == 'time':
                    # pd.Grouper replaces pd.TimeGrouper, which no longer exists in recent versions of pandas
                    groups = data_filtered.set_index('time').groupby(pd.Grouper(freq='5Min'))
                    counts = [(datetime.datetime.strptime(str(name), '%Y-%m-%d %H:%M:%S'), len(group)) for name, group in groups]
                else:
                    groups = data_filtered.round({'speed (m/s)': 1}).groupby('speed (m/s)')
                    counts = [(name, len(group)) for name, group in groups]

                line = go.Scatter(
                    mode='lines',
                    x=[x_value for x_value, _ in counts],
                    y=[count for _, count in counts],
                    name=str(year),
                    legendgroup=str(year),
                    marker={'color': colors[year]},
                    # Legend entries are only shown once (for the first subplot)
                    showlegend=(attributes['position'] == 1))

                figure.append_trace(line, attributes['position'], 1)

        # Format of x axes if time is selected criterion
        if criterion == 'time':
            for axis in [name for name, _ in figure['layout'].items() if 'xaxis' in name]:
                figure['layout'][axis].update(type='date', tickformat='%H:%M')

        figure['layout'].update(width=1000, height=600, margin=go.Margin(t=100, b=50, l=50, r=50))
        figure['layout'].update(title='Performance distribution of Lausanne Marathon editions for all runnings<br>(Age category: ' + age_category +'  |  Sex category: ' + sex_category +')')
        figure['layout']['legend'].update(y=0.5)

        return figure
def generate_evolution_and_performance_figure(data, criterion):
    '''
    This function builds a single figure with two stacked subplots: evolution of runners (lines) and performance by years (boxplots).
    See generate_evolution_and_performance_figures for the expected data.

    Note: traces stored in data are modified in place (legend group, marker and, for boxplots, name).

    Parameters
        - data: Data to be used for generation of figure (data['evolution'] contains lines, data['performance'][criterion] contains boxplots)
        - criterion: Performance criterion to be used

    Return
        - figure: Plotly figure representing evolution and performance in same graph
    '''

    running_colors = {'10 km': KM_10_COLOR, 'Semi-marathon': KM_21_COLOR, 'Marathon': KM_42_COLOR}
    tick_values = list(range(1999, 2017))
    tick_labels = list(tick_values)

    # We ignore outputs of Plotly
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(
            rows=4,
            specs=[[{'rowspan': 2}], [None], [{'rowspan': 2}], [None]],
            subplot_titles=('Evolution of runners over the years', 'Performance comparison by years'))

        # First subplot: one line by running
        for line in data['evolution']:
            running = line['name']
            line['legendgroup'] = running
            line['marker'] = {'color': running_colors[running]}
            figure.append_trace(line, 1, 1)

        # Second subplot: boxplots, linked to the legend of the lines through their group
        for boxplot in data['performance'][criterion]:
            running = boxplot['name']
            boxplot['legendgroup'] = running
            boxplot['marker'] = {'color': running_colors[running]}
            boxplot['name'] = ''
            figure.append_trace(boxplot, 3, 1)

        layout = figure['layout']

        # Format of x axes (one tick by year)
        x_axes = [name for name, _ in layout.items() if 'xaxis' in name]
        for axis in x_axes:
            layout[axis].update(tickvals=tick_values, ticktext=tick_labels)

        # Format of y axes
        layout['yaxis1'].update(title='Number of runners', tickformat='f')
        layout['yaxis2'].update(title=criterion)
        if criterion.lower() == 'time':
            layout['yaxis2'].update(type='date', tickformat='%H:%M')

        # Use of group for boxplots to avoid superposition
        layout.update(boxmode='group')

        # Update of size, margins and legend attributes of figure
        layout.update(width=1000, height=600, margin=go.Margin(t=50, b=50, l=50, r=50))
        layout['legend'].update(y=0.5)

        return figure
def plot_ols_fitted_and_true_values(data, ols_results, x=None, y='Median age (all runnings)', title='Fitted and original values for median ages of participants of Lausanne Marathon editions', groupby_column='Gender', markers_attributes=None):
    '''
    This function plots fitted and true values for a given dataset and associated ols results.
    One subplot is created for each key returned by study_utils.retrieve_ols_predictions_and_errors.

    Parameters
        - data: DataFrame containing data to use for original values
        - ols_results: OLS results, forwarded to study_utils.retrieve_ols_predictions_and_errors
        - x: Name of the column to use for x axis for original values and fitted values (by default None / if None, index will be used for original values and index name for retrieve_ols_predictions_and_errors)
        - y: Name of the column to use for y axis (by default, 'Median age (all runnings)')
        - title: Title of the graph (by default, 'Fitted and original values for median ages of participants of Lausanne Marathon editions')
        - groupby_column: Name of the column to use for grouping data (by default, 'Gender')
        - markers_attributes: Dictionary containing options for each unique value in column groupby_column (by default, None)
        (at present, 'title' (title of subplot), 'color_true' (color of markers of original values), 'color_fitted' (color of markers of fitted values), 'color_errors' (color of errors bars), 'name_true' (name of markers' legend of original values) and 'name_fitted' (name of of markers' legend of fitted values) are supported)

    Return
        - figure: Plotly figure (also displayed with plotly.offline.iplot)
    '''

    def option(key, option_name, default=None):
        # Returns the option defined for a group, or default if no options were given / option is missing
        if not markers_attributes:
            return default
        return markers_attributes[key].get(option_name, default)

    regressor = x if x else data.index.name
    predictions_and_errors = study_utils.retrieve_ols_predictions_and_errors(ols_results=ols_results, regressor=regressor)

    # We ignore outputs of Plotly
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(rows=1, cols=2, shared_yaxes=True, subplot_titles=[option(key, 'title') for key in predictions_and_errors])

    # Fitted values: one subplot by key, the column of each key is kept to add original values later
    position = {}
    for column, (key, results) in enumerate(predictions_and_errors.items(), start=1):
        position[key] = column
        name = option(key, 'name_fitted', 'Fitted values')
        predictions = np.array(results['predictions'])
        # Error bars are expressed relatively to the predictions (upper and lower distances)
        errors = {
            'type': 'data',
            'symmetric': False,
            'array': np.subtract(np.array(results['errors']['max']), predictions),
            'arrayminus': abs(np.subtract(np.array(results['errors']['min']), predictions)),
            'color': option(key, 'color_errors')
        }
        markers = go.Scatter(x=results['x'], y=results['predictions'], mode='markers', hoverinfo='y+text', name=name, text=name, marker={'color': option(key, 'color_fitted')}, legendgroup='fitted', error_y=errors, showlegend=(column == 1))
        figure.append_trace(markers, 1, column)

    # Original values
    # Grouping is done on the column name itself (not on a one-element list): with a list,
    # recent versions of pandas return keys as 1-tuples, which cannot be used as keys of position
    for key, group in data.groupby(groupby_column):
        x_values = group[x] if x else group.index
        name = option(key, 'name_true', 'Original values (' + key + ')')
        markers = go.Scatter(x=x_values, y=group[y], mode='markers', hoverinfo='y+text', name=name, text=name, marker={'color': option(key, 'color_true')})
        figure.append_trace(markers, 1, position[key])

    # Add of title and format of axes
    figure['layout'].update(title=title)
    figure['layout']['yaxis'].update(title=y)
    for axis in [axis_name for axis_name, _ in figure['layout'].items() if 'xaxis' in axis_name]:
        figure['layout'][axis].update(title=regressor)

    plotly.offline.iplot(figure)
    return figure
def generate_comparison(data, title, running_type_column_name, x_column_name,
                        y_column_name, colors):
    '''
    This function builds and displays comparison boxplots, with one subplot for each running of Lausanne Marathon (10 km, Semi-marathon, Marathon).
    In each subplot, one boxplot is drawn for all rows ('all') and one for each unique value of x_column_name.

    Parameters
        - data: DataFrame containing information on runners
        - title: Title of the graph
        - running_type_column_name: Name of the column containing the runnings (values 10, 21 and 42 are considered)
        - x_column_name: Name of column whose unique values define the boxplots
        - y_column_name: Name of column to be used for y data
        - colors: Dict containing colors associated to 'all' and to each unique value of x_column_name

    Return
        - figure: Plotly figure (also displayed with plotly.offline.iplot)
    '''

    subplots = OrderedDict([
        (10, {'name': '10 km', 'position': 1}),
        (21, {'name': 'Semi-marathon', 'position': 2}),
        (42, {'name': 'Marathon', 'position': 3}),
    ])
    subplot_titles = [subplot['name'] for _, subplot in subplots.items()]

    # Creation of the figure (outputs of Plotly are ignored)
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(rows=1,
                                     cols=3,
                                     shared_yaxes=True,
                                     subplot_titles=subplot_titles)

    # One subplot by running
    for distance, subplot in subplots.items():
        column = subplot['position']
        running_df = data[data[running_type_column_name] == distance]
        groups = np.concatenate((['all'], running_df[x_column_name].unique()),
                                axis=0)

        # One boxplot for all rows, then one by unique value
        for group in groups:
            if group == 'all':
                group_df = running_df
            else:
                group_df = running_df[running_df[x_column_name] == group]

            boxplot = go.Box(y=group_df[y_column_name],
                             name=group.capitalize() + ' runners',
                             marker={'color': colors[group]},
                             legendgroup=group,
                             showlegend=(column == 1))
            figure.append_trace(boxplot, 1, column)

    # Tick labels of x axes are hidden (names are given by the legend)
    x_axes = [name for name, _ in figure['layout'].items() if 'xaxis' in name]
    for axis in x_axes:
        figure['layout'][axis].update(showticklabels=False)

    figure['layout']['yaxis1'].update(title=y_column_name.capitalize(),
                                      tickformat='.2f')
    figure['layout'].update(title=title)

    plotly.offline.iplot(figure)
    return figure
def plot_speed_distribution_by_running(data,
                                       runnings=None,
                                       title='Speed distribution by running',
                                       speed_column_name='speed (m/s)',
                                       sex_column_name='sex'):
    '''
    This function plots, for each running, the distribution of speeds of runners based on the genders of participants.
    Statistics (total, max, min, median and standard deviation of speed) are added as annotation of each subplot.

    Parameters
        - data: DataFrame to use during generation of the distribution
        - runnings: Dict containing name of column containing runnings (key: column_name) and set of runnings (key: values, value: dict() of which keys name and position are read here)
                    By default, None. If None, default values will be set by function.
        - title: Title of the graph (by default, 'Speed distribution by running')
        - speed_column_name: Name of the column containing speed of participants (by default, 'speed (m/s)')
        - sex_column_name: Name of the column containing sex of participants (by default, 'sex')

    Return
        - figure: Plotly figure (also displayed with plotly.offline.iplot)
    '''

    if not runnings:
        runnings = {
            'column_name': 'distance (km)',
            'values': OrderedDict([
                (10, {'name': '10 km', 'color': KM_10_COLOR, 'position': 1}),
                (21, {'name': 'Semi-marathon', 'color': KM_21_COLOR, 'position': 2}),
                (42, {'name': 'Marathon', 'color': KM_42_COLOR, 'position': 3}),
            ])
        }
    colors = {
        'female': FEMALE_COLOR,
        'male': MALE_COLOR,
        'all': ALL_GENDERS_COLOR
    }

    # Limits of bins only depend on the whole data set: they are computed once
    # (and not for each histogram) so that all histograms share the same bins
    bins_start = math.floor(np.min(data[speed_column_name]))
    bins_end = math.ceil(np.max(data[speed_column_name]))

    statistics = {}
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(
            rows=3,
            cols=1,
            shared_xaxes=True,
            subplot_titles=[
                attributes['name']
                for attributes in runnings['values'].values()
            ])

    for key, attributes in runnings['values'].items():
        filtered_df = data[data[runnings['column_name']] == key]
        speeds = filtered_df[speed_column_name]

        # Text of statistics, indexed by name of running (also title of subplot)
        statistics[attributes['name']] = (
            'Total: ' + str(len(filtered_df)) + ' runners'
            + '<br>Max: ' + str(round(np.max(speeds), 2)) + ' m/s'
            + '<br>Min: ' + str(round(np.min(speeds), 2)) + ' m/s'
            + '<br>Median: ' + str(round(np.median(speeds), 2)) + ' m/s'
            + ' | SD: ' + str(round(np.std(speeds), 2)) + ' m/s')

        # One histogram by sex found in data, then one for all runners
        for sex in np.concatenate(
            (filtered_df[sex_column_name].unique(), ['all']), axis=0):
            if sex == 'all':
                x = speeds
            else:
                x = filtered_df[filtered_df[sex_column_name] ==
                                sex][speed_column_name]
            figure.append_trace(
                go.Histogram(xbins={
                    'start': bins_start,
                    'end': bins_end,
                    'size': 0.1
                },
                             x=x,
                             name=sex.capitalize() + ' runners',
                             legendgroup=sex,
                             # Legend entries are only shown once (for the first subplot)
                             showlegend=(attributes['position'] == 1),
                             marker={'color': colors[sex]},
                             opacity=0.75), attributes['position'], 1)

    # Format of axes and layout
    figure.layout.xaxis1.update(title='Speed (m/s)', tickformat='.1f')
    figure.layout.yaxis2.update(title='Number of participants')
    figure.layout.update(title=title,
                         barmode='stack',
                         bargroupgap=0.1,
                         bargap=0,
                         margin=go.Margin(t=100, b=50, l=50, r=50))

    # Add of statistics
    # Trick: We use position of subtitles annotations to create the ones related to statistics
    annotations_statistics = [
        Annotation(y=annotation['y'] - 0.12,
                   x=1,
                   align='left',
                   text=statistics[annotation['text']],
                   xref='paper',
                   yref='paper',
                   yanchor='bottom',
                   showarrow=False)
        for annotation in figure['layout']['annotations']
    ]
    figure['layout']['annotations'].extend(annotations_statistics)

    plotly.offline.iplot(figure)
    return figure
def generate_performance_by_age_and_age_category(
        data,
        runnings=None,
        age_column_name='age',
        age_category_column_name='age category',
        sex_column_name='sex'):
    '''
    This function generates figures for each running. It displays time distribution according to age (first subplot) and age category (second subplot).
    Final Dict has the following pattern:
    {
        <running_1>: <Plotly figure>
        [, <running_2>: <Plotly figure>
        , ...]
    }

    Parameters
        - data: DataFrame containing all the information of Lausanne Marathon 2016 (column 'time' is used for y axis)
        - runnings: Dict containing name of column containing runnings (key: column_name) and set of runnings (key: values, value: dict() with following keys: name)
                    By default, None. If None, default values will be set by function.
        - age_column_name: Name of the column containing age of participants(by default, 'age')
        - age_category_column_name: Name of the column containing age category of participants(by default, 'age category')
        - sex_column_name: Name of the column containing gender of participants (by default, 'sex')

    Return
        - figures: Dict containing all time distribution figures
    '''

    # We create final dict and we set attributes and runnings (if not given by user)
    figures = {}

    attributes = {
        'colors': {
            'female': FEMALE_COLOR,
            'male': MALE_COLOR,
            'all': ALL_GENDERS_COLOR
        },
        'names': {
            'female': 'Female runners',
            'male': 'Male runners',
            'all': 'All runners'
        },
        'visibility': {
            'female': 'legendonly',
            'male': 'legendonly',
            'all': True
        }
    }

    if not runnings:
        runnings = {
            'column_name': 'distance (km)',
            'values': {
                10: {
                    'name': '10 km'
                },
                21: {
                    'name': 'Semi-marathon'
                },
                42: {
                    'name': 'Marathon'
                }
            }
        }

    # Loop over runnings
    for km, attributes_running in runnings['values'].items():
        filtered_df = data[data[runnings['column_name']] == km]

        # Creation of Plotly figure containing subplots (we ignore outputs here)
        with study_utils.ignore_stdout():
            figure = tools.make_subplots(rows=2, cols=1, vertical_spacing=0.1)

        # Consideration of ages (row 1) and age categories (row 2)
        # Row is given by the position of the column and not by a comparison with the literal 'age':
        # otherwise, a custom age_column_name would send both series to row 2 and hide the legend
        columns = [age_column_name, age_category_column_name]
        for row, column_name in enumerate(columns, start=1):
            # Legend entries are only shown once (for the first subplot)
            show_legend = (row == 1)

            # Creation of boxplots for female and male runners
            boxplots = study_utils.create_plotly_boxplots(
                data=filtered_df,
                x=column_name,
                y='time',
                hue=sex_column_name,
                hue_names=attributes['names'],
                colors=attributes['colors'],
                visibility=attributes['visibility'],
                use_hue_names=False,
                use_legend_group=True,
                show_legend=show_legend)
            # We add boxplots for all runners (without consideration of sex)
            boxplots.append(
                go.Box(y=filtered_df['time'],
                       x=filtered_df[column_name],
                       name=attributes['names']['all'],
                       marker={'color': attributes['colors']['all']},
                       visible=attributes['visibility']['all'],
                       legendgroup=attributes['names']['all'],
                       showlegend=show_legend))
            # We add each generated boxplot in the correct subplot
            for boxplot in boxplots:
                figure.append_trace(boxplot, row, 1)

        # Format of y and x axes and modification of layout
        for axis in [name for name, _ in figure['layout'].items() if 'yaxis' in name]:
            figure['layout'][axis].update(title='Performance time',
                                          type='date',
                                          tickformat='%H:%M:%S')
        figure['layout']['xaxis1'].update(title='Age', dtick=5)
        figure['layout']['xaxis2'].update(title='Age category',
                                          categoryorder='array',
                                          categoryarray=YEAR_CATEGORIES)
        figure['layout'].update(title='Distribution of time performance (' +
                                attributes_running['name'] + ')',
                                height=650,
                                hovermode='closest')

        # We add newly created figure to the final dict
        figures[attributes_running['name']] = figure

    return figures
def plot_distribution_age_distance(data,
                                   runnings=None,
                                   title='Distribution of runners by age',
                                   age_column_name='age',
                                   sex_column_name='sex'):
    '''
    This function plots, for each running, the distribution of ages of runners based on the genders of participants.
    Mean age and standard deviation are added as annotation of each subplot.

    Parameters
        - data: DataFrame to use during generation of the distribution
        - runnings: Dict containing name of column containing runnings (key: column_name) and set of runnings (key: values, value: dict() with following keys: name, color, position)
                    By default, None. If None, default values will be set by function.
        - title: Title of the graph (by default, 'Distribution of runners by age')
        - age_column_name: Name of the column containing age of participants('age' or 'age category', by default, 'age')
        - sex_column_name: Name of the column containing sex of participants (by default, 'sex')

    Return
        - figure: Plotly figure (also displayed with plotly.offline.iplot)
    '''

    if not runnings:
        default_runnings = OrderedDict([
            (10, {'name': '10 km', 'color': KM_10_COLOR, 'position': 1}),
            (21, {'name': 'Semi-marathon', 'color': KM_21_COLOR, 'position': 2}),
            (42, {'name': 'Marathon', 'color': KM_42_COLOR, 'position': 3}),
        ])
        runnings = {'column_name': 'distance (km)', 'values': default_runnings}

    sex_colors = {
        'female': FEMALE_COLOR,
        'male': MALE_COLOR,
        'all': ALL_GENDERS_COLOR
    }
    statistics = {}

    # Creation of the figure (outputs of Plotly are ignored)
    with study_utils.ignore_stdout():
        figure = tools.make_subplots(
            rows=3,
            cols=1,
            subplot_titles=[
                running['name'] for _, running in runnings['values'].items()
            ])

    for distance, running in runnings['values'].items():
        running_df = data[data[runnings['column_name']] == distance]
        row = running['position']

        # Text of statistics, indexed by name of running (also title of subplot)
        mean_age = round(np.mean(running_df[age_column_name]), 2)
        deviation = round(np.std(running_df[age_column_name]), 2)
        statistics[running['name']] = ('Mean age: ' + str(mean_age) +
                                       ' years (SD: ' + str(deviation) + ')')

        # One histogram by sex found in data, then one for all runners
        sexes = np.concatenate((running_df[sex_column_name].unique(), ['all']),
                               axis=0)
        for sex in sexes:
            if sex == 'all':
                x = running_df[age_column_name]
            else:
                x = running_df[running_df[sex_column_name] ==
                               sex][age_column_name]

            # One bin by year of age, or number of values when age categories are used
            if age_column_name == 'age':
                nbinsx = (np.max(x) - np.min(x)) + 1
            else:
                nbinsx = len(x)

            histogram = go.Histogram(nbinsx=nbinsx,
                                     x=x,
                                     name=sex.capitalize() + ' runners',
                                     legendgroup=sex,
                                     showlegend=(row == 1),
                                     marker={'color': sex_colors[sex]},
                                     opacity=0.75)
            figure.append_trace(histogram, row, 1)

    # Format of axes and layout
    if age_column_name == 'age category':
        x_axes = [name for name, _ in figure['layout'].items() if 'xaxis' in name]
        for axis in x_axes:
            figure['layout'][axis].update(categoryorder='array',
                                          categoryarray=YEAR_CATEGORIES)
    figure.layout.xaxis3.update(title='Age of participants')
    figure.layout.yaxis2.update(title='Number of participants')
    figure.layout.update(title=title,
                         barmode='stack',
                         bargroupgap=0.1,
                         bargap=0,
                         margin=go.Margin(t=100, b=50, l=50, r=50))

    # Add of statistics
    # Trick: position of subtitles annotations is reused to place the statistics
    annotations_statistics = [
        Annotation(y=subtitle['y'],
                   x=1,
                   text=statistics[subtitle['text']],
                   xref='paper',
                   yref='paper',
                   yanchor='bottom',
                   showarrow=False)
        for subtitle in figure['layout']['annotations']
    ]
    figure['layout']['annotations'].extend(annotations_statistics)

    plotly.offline.iplot(figure)
    return figure