def refresh_tokens():

    processing_status = read_db('processing_status')

    for index, row in processing_status.iterrows():

        # athlete_id is stored as a string in the db, so cast before comparing
        if int(row['athlete_id']) != 0 and row['status'] == "none":

            params = {
                "client_id": "40695",
                "client_secret": "[client secret]",
                "refresh_token": row['refresh_token'],
                "grant_type": "refresh_token"
            }

            r = requests.post("https://www.strava.com/oauth/token", params)
            tokens = r.json()

            processing_status.at[index, 'bearer_token'] = tokens['access_token']
            processing_status.at[index, 'refresh_token'] = tokens['refresh_token']

    write_db_replace(processing_status, 'processing_status')
    
    # deleting an athlete by index:
    # processing_status = processing_status.drop(21)
    
    return 0
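
# A minimal standalone sketch of the same OAuth refresh exchange for a single
# token, assuming the Strava token endpoint and response shape used above
# (the client id/secret arguments are placeholders, not real credentials):

import requests

def refresh_single_token(client_id, client_secret, refresh_token):
    # POST the refresh grant; Strava's response JSON carries the new tokens
    r = requests.post("https://www.strava.com/oauth/token",
                      data={
                          "client_id": client_id,
                          "client_secret": client_secret,
                          "refresh_token": refresh_token,
                          "grant_type": "refresh_token"
                      })
    r.raise_for_status()  # surface HTTP errors before touching the payload
    tokens = r.json()
    return tokens['access_token'], tokens['refresh_token']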
def get_athlete_data_status(athlete_id):

    processing_status = read_db('processing_status')

    if str(athlete_id) in processing_status["athlete_id"].values:
        ingest_status = processing_status[processing_status["athlete_id"] ==
                                          str(athlete_id)]["status"].values[0]
        return ingest_status

    return "to process"
def queue_athlete_for_processing(athlete_id, bearer_token, refresh_token):

    processing_status = read_db('processing_status')
    processing_status = processing_status.append(
        {
            'athlete_id': athlete_id,
            'status': 'none',
            'bearer_token': bearer_token,
            'refresh_token': refresh_token
        },
        ignore_index=True)
    write_db_replace(processing_status, 'processing_status')

    return "none"
def athletevsbest(athlete_id):
    """
    read initial data, initialise stuff
    """
    athlete_id = int(athlete_id)
    features_blocks = read_db('features_blocks')
    metadata_blocks = read_db('metadata_blocks')
    model_outputs = read_db('model_outputs')
    """
    grab last block by this athlete
    """
    this_athlete_blocks = features_blocks[features_blocks['athlete_id'] ==
                                          athlete_id]
    this_athlete_last_block = this_athlete_blocks.iloc[-1]
    """
    grab stats on this block - start date, end date, vdot, marathon prediction time
    """
    end_date = metadata_blocks[metadata_blocks['block_id'].astype(
        float) == float(
            this_athlete_last_block['block_id'])].iloc[0]['pb_date']
    end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d %H:%M:%S')
    start_date = end_date - datetime.timedelta(days=91)

    # output date: print('Date:', end_date.date())
    """
    get ordered list of nn features from db
    Here, we are looking at y_vdot as the model trainer
    """
    features = model_outputs[model_outputs['y_name'] == 'y_vdot']
    features = features.sort_values(['importance'], ascending=False)
    top_ten_percent = features_blocks.sort_values(
        ['y_vdot'], ascending=False).head(round(0.1 * len(features_blocks)))
    bottom_ten_percent = features_blocks.sort_values(
        ['y_vdot'], ascending=True).head(round(0.1 * len(features_blocks)))

    visualisation_outputs = pd.DataFrame()

    for index, feature in features.head(20).iterrows():

        feature_name = feature['feature_name']
        feature_importance = feature['importance']
        """
        athlete's score for this feature
        and percentile
        """

        athlete_score = round(this_athlete_last_block[feature_name], 2)
        top_ten_percent_value = top_ten_percent[feature_name].mean()
        bottom_ten_percent_value = bottom_ten_percent[feature_name].mean()

        # skipping broken features
        if (feature_name == "f_proportion_other"
                or feature_name == "r_proportion_other"):
            continue

        # last minute data cleaning
        if (top_ten_percent_value == 0.0 and bottom_ten_percent_value == 0.0):
            continue
        if (math.isnan(athlete_score)):
            athlete_score = 0.0
        if (math.isnan(top_ten_percent_value)
                or math.isnan(bottom_ten_percent_value)):
            continue

        perc_compare_top = top_ten_percent_value
        perc_compare_bottom = bottom_ten_percent_value

        if (bottom_ten_percent_value > top_ten_percent_value):
            # lower values are better for this feature, so the bottom-10% mean
            # is the numerically larger bound and the comparisons are inverted
            print(feature_name + ' SWAP')
            perc_compare_top, perc_compare_bottom = (perc_compare_bottom,
                                                     perc_compare_top)

            if (athlete_score > perc_compare_top):
                athlete_percentile = 5
            elif (athlete_score < perc_compare_bottom):
                athlete_percentile = 95
            else:
                athlete_percentile = 100 * ((
                    ((athlete_score - bottom_ten_percent_value) /
                     (top_ten_percent_value - bottom_ten_percent_value)) * 0.8)
                                            + 0.1)

        else:

            if (athlete_score < perc_compare_bottom):
                athlete_percentile = 5
            elif (athlete_score > perc_compare_top):
                athlete_percentile = 95
            else:
                athlete_percentile = 100 * ((
                    ((athlete_score - bottom_ten_percent_value) /
                     (top_ten_percent_value - bottom_ten_percent_value)) * 0.8)
                                            + 0.1)

        athlete_need = feature_importance * (100 - athlete_percentile)

        visualisation_outputs = visualisation_outputs.append(
            {
                'feature_name': feature_name,
                'feature_importance': feature_importance,
                'athlete_score': athlete_score,
                'athlete_percentile': athlete_percentile,
                'athlete_need': athlete_need,
                'tenth': 10,
                'ninetieth': 90,
                'one-hundredth': 100,
                'value_at_tenth': round(bottom_ten_percent_value, 2),
                'value_at_ninetieth': round(top_ten_percent_value, 2)
            },
            ignore_index=True)

    # ascending sort so barh (which plots bottom-up) puts the biggest
    # athlete_need at the top of the chart
    visualisation_outputs = visualisation_outputs.sort_values(
        by=['athlete_need'], ascending=True)
    visualisation_outputs = visualisation_outputs.reset_index()

    plt.style.use('seaborn-darkgrid')
    plt.title(
        "Your performance relative to the best athletes \n For 3 months before your last PB, between "
        + str(start_date.date()) + " and " + str(end_date.date()) +
        "\nOrdered by how much each aspect would help your fitness")

    fig = plt.gcf()
    fig.set_size_inches(12, 12.5)

    labels = []

    for index, feature in visualisation_outputs.iterrows():
        labels.append(feature_labels[feature['feature_name']] + "\n" +
                      str(feature['athlete_score']))

    ax = fig.add_subplot(111)

    ax.barh(labels,
            visualisation_outputs['one-hundredth'],
            tick_label=labels,
            height=0.8,
            color='#afffd3')
    ax.barh(labels,
            visualisation_outputs['ninetieth'],
            tick_label=labels,
            height=0.8,
            color='#bbbbc1')
    ax.barh(labels,
            visualisation_outputs['tenth'],
            tick_label=labels,
            height=0.8,
            color='#ffa4a4')

    ax.plot(visualisation_outputs['athlete_percentile'],
            labels,
            marker=10,
            markersize=15,
            linestyle="",
            label=visualisation_outputs['athlete_score'])

    for index, feature in visualisation_outputs.iterrows():
        ax.text(x=11.0,
                y=index,
                s=feature['value_at_tenth'],
                horizontalalignment="left")
        ax.text(x=89.0,
                y=index,
                s=feature['value_at_ninetieth'],
                horizontalalignment="right")

    plt.xlabel(
        'Percentile. 0% = the worst performing athlete. 100% = the best performing athlete.'
    )
    plt.tight_layout()

    bytes_image = io.BytesIO()
    plt.savefig(bytes_image, format='png')
    bytes_image.seek(0)
    plt.clf()
    plt.cla()
    plt.close()
    return bytes_image
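
# A self-contained sketch of the percentile mapping used in athletevsbest:
# scores are interpolated linearly between the bottom-10% mean (pinned to the
# 10th percentile) and the top-10% mean (pinned to the 90th), clamped to 5/95
# outside that range. The function name and example values are illustrative.

def score_to_percentile(score, bottom_value, top_value):
    if top_value == bottom_value:
        return 50.0  # degenerate bounds; assumption: treat as the median
    lo, hi = sorted((bottom_value, top_value))
    if bottom_value <= top_value:  # higher score is better
        if score < lo:
            return 5.0
        if score > hi:
            return 95.0
    else:  # lower score is better (the 'SWAP' case above)
        if score > hi:
            return 5.0
        if score < lo:
            return 95.0
    return 100 * (((score - bottom_value) /
                   (top_value - bottom_value)) * 0.8 + 0.1)

# e.g. score_to_percentile(7.5, 5.0, 10.0) == 50.0 (midway between the bounds)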
def cron_train_nn():

    import numpy as np
    import numpy.ma as ma
    import pandas as pd
    from datetime import date
    from sklearn.metrics import r2_score
    from sql_methods import read_db, write_db_replace

    model_outputs = pd.DataFrame()

    features_blocks = read_db('features_blocks')

    df = features_blocks  # just to make debugging easier with the same names

    X = df.iloc[:, 2:-2].copy()  # .copy() avoids SettingWithCopyWarning below
    X['r_proportion_alpine_ski'] = 0
    X['r_proportion_crossfit'] = 0

    # impute each column's NaNs with that column's mean
    X = np.where(np.isnan(X), ma.array(X, mask=np.isnan(X)).mean(axis=0), X)

    """
    PREDICT ABSOLUTE VDOT
    """

    y = df.iloc[:, -2]  # -2: absolute vdot (-1 would be change in vdot)

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

    """
    XGBOOST
    
    from xgboost import XGBRegressor
    regressor = XGBRegressor()

    regressor.fit(X_train,y_train)

    # Predicting a new result
    y_pred = regressor.predict(X_test)
    y_actual = y_test

    print('xgboost single score:')
    print(r2_score(y_actual, y_pred)) 
    """

    from sklearn.ensemble import RandomForestRegressor
    regressor = RandomForestRegressor(n_estimators = 50) 
    regressor.fit(X_train,y_train)

    # Predicting a new result
    y_pred = regressor.predict(X_test)
    y_actual = y_test


    model_score = r2_score(y_actual, y_pred)
    features = list(df.columns.values[2:-2])
    importances = regressor.feature_importances_
    indices = np.argsort(importances)

    for index in indices:
        y_name = df.columns.values[-2]
        feature_name = features[index]
        importance = importances[index]
        model_run_date = str(date.today())
        model_outputs = model_outputs.append(
            {
                'y_name': y_name,
                'feature_name': feature_name,
                'importance': importance,
                'model_score': model_score,
                'model_run_date': model_run_date
            },
            ignore_index=True)

    """
    PREDICT CHANGE IN VDOT
    """

    y = df.iloc[:, -1]  # -1: change in vdot (-2 would be absolute vdot)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    """
    XGBOOST
    
    from xgboost import XGBRegressor
    regressor = XGBRegressor()

    regressor.fit(X_train,y_train)

    # Predicting a new result
    y_pred = regressor.predict(X_test)
    y_actual = y_test

    print('xgboost single score:')
    print(r2_score(y_actual, y_pred)) 
    """

    regressor = RandomForestRegressor(n_estimators=50)
    regressor.fit(X_train, y_train)

    # Predicting a new result
    y_pred = regressor.predict(X_test)
    y_actual = y_test


    model_score = r2_score(y_actual, y_pred)
    features = list(df.columns.values[2:-2])
    importances = regressor.feature_importances_
    indices = np.argsort(importances)

    for index in indices:
        y_name = df.columns.values[-1]
        feature_name = features[index]
        importance = importances[index]
        model_run_date = str(date.today())
        model_outputs = model_outputs.append(
            {
                'y_name': y_name,
                'feature_name': feature_name,
                'importance': importance,
                'model_score': model_score,
                'model_run_date': model_run_date
            },
            ignore_index=True)

    """
    SAVE RESULTS
    """
    
    write_db_replace(model_outputs, 'model_outputs')         
    
    
    """ 
    OUTPUTS FOR MY OWN ANALYSIS
    """

    import shap
    explainer = shap.TreeExplainer(regressor)
    shap_values = explainer.shap_values(X)
    shap.summary_plot(shap_values, df.iloc[:, 2:-2])
   
    shap_values = shap.TreeExplainer(regressor).shap_values(X_train)
    shap.summary_plot(shap_values, df.iloc[:, 2:-2], plot_type="bar")
	    
    """
    Temporary measure for building model with small datasets: doing 10 train/test splits and taking average r^2
    """

    r2_total = 0
    best_r2 = -50

    for i in range(1,500):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
        regressor = RandomForestRegressor(n_estimators = 50) 
        regressor.fit(X_train,y_train)
        y_pred = regressor.predict(X_test)
        y_actual = y_test
        r2_total += r2_score(y_actual, y_pred)
        if(r2_score(y_actual, y_pred) > best_r2):
            best_r2 = r2_score(y_actual, y_pred)

    r2 = r2_total / (i-1)
    print('random forest on ' + str(i) + ' train/test splits:' + str(r2))
    print('best random forest on ' + str(i) + ' train/test splits:' + str(best_r2))

    return 0
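
# A sketch of the repeated-split averaging above using scikit-learn's built-in
# cross-validation instead of the manual loop. n_estimators=50 mirrors the code
# above; cv=10 is an arbitrary choice, and k-fold CV differs slightly from
# repeated random splits but serves the same purpose here.

def average_r2(X, y, n_splits=10):
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import cross_val_score

    # one fresh fit per fold, scored with r^2 (the default for regressors)
    scores = cross_val_score(RandomForestRegressor(n_estimators=50),
                             X, y, cv=n_splits, scoring='r2')
    return scores.mean(), scores.max()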
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 31 19:16:48 2019

@author: rian-van-den-ander
"""
"""
CHECK PROCESSING STATUS AND THEN PROCESS ALL ATHLETES
"""

from sql_methods import read_db, write_db_replace, write_db_insert
processing_status = read_db('processing_status')
"""
RYAN: FOR NOW, MANUALLY REFRESH TOKENS - something is funky there
"""
from cron_update_data import refresh_tokens
refresh_tokens()

#process (FROM STRAVA API) all 'none' athletes
from cron_update_data import update_data
from cron_train_nn import cron_train_nn
update_data()
cron_train_nn()
"""
DELETE AN ATHLETE
"""

import pandas as pd
from sql_methods import read_db, write_db_replace, write_db_insert
from cron_train_nn import cron_train_nn
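
# The DELETE AN ATHLETE section above only sets up its imports. A minimal
# sketch of the deletion itself (defined but not called here), following the
# drop comment in refresh_tokens; retraining afterwards is an assumption:

def delete_athlete(athlete_id):
    processing_status = read_db('processing_status')
    processing_status = processing_status[
        processing_status['athlete_id'] != str(athlete_id)]
    write_db_replace(processing_status, 'processing_status')
    cron_train_nn()  # rebuild model_outputs without the removed athlete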
def feature_engineer(athlete_id, athlete_data, populate_all_from_files=0):
    """
    Dataframe description:
        metadata_athletes: just athlete name, zones, etc. not for model training
        metadata_blocks: same, at block level
        
        features_activities: features per activity. trying not to lose any data at this level, so it's not turned into HR zones, etc
        features_weeks: a week of training, losing/averaging some data
        features_blocks: a block of data, the main dataset that predicts an increase in vdot. a lot of data averaging, cleaning etc 
    """

    import ast
    import numpy as np
    import pandas as pd

    if (populate_all_from_files == 1):
        file = './data/' + str(athlete_id) + '.txt'

        with open(file, 'r', encoding="utf8") as f:
            athlete_data = ast.literal_eval(f.read())

    metadata_athletes = pd.DataFrame()
    metadata_blocks = pd.DataFrame()
    all_athlete_activities = pd.DataFrame()
    all_athlete_weeks = pd.DataFrame()
    features_activities = pd.DataFrame()
    features_weeks = pd.DataFrame()
    features_blocks = pd.DataFrame()
    regressors = dict()
    average_paces_and_hrs = pd.DataFrame()

    dict_data = athlete_data
    """
    THEN POPULATE AGAIN, with estimated heart rate data
    """
    """
    DATA QUALITY 
    Unfortunately, heart rate data must be prepopulated from pace in a separate loop  
    A pace -> heart rate regressor for each athlete
    
    TODO Later: per athlete per block, as opposed to one for each athlete
    """
    """ 
    Catch very bad data - 
    """
    try:
        bad_data_test = dict_data['sex']
    except Exception:
        print('bad data')
        # todo: put back - return 0

    activities = dict_data['_Activities']
    error_free_activities = []
    for activity in activities:
        # keep activities with no 'errors' key or an empty error list
        if not activity.get('errors'):
            error_free_activities.append(activity)
    """
    GET ATHLETE ZONES IN A SIMPLE FORMAT     
    z1 = 0-> zones[0], z2 = zones[0]->zones[1], ..., z5 = zones[3] -> inf
    """
    try:

        zones_raw = dict_data['_Zones']['heart_rate']['zones']
        zones = [
            zones_raw[0]['max'], zones_raw[1]['max'], zones_raw[2]['max'],
            zones_raw[3]['max']
        ]

    except Exception:

        # fall back to zones derived from an assumed 190 bpm max heart rate
        zones = [
            round(190 * 0.6, 0),
            round(190 * 0.7, 0),
            round(190 * 0.8, 0),
            round(190 * 0.9, 0)
        ]

    athlete_id = dict_data['id']

    from running_functions import build_pace_to_hr_regressor

    regressor, not_nan_rows = build_pace_to_hr_regressor(
        activities, athlete_id, zones)
    regressors[athlete_id] = regressor
    average_paces_and_hrs = average_paces_and_hrs.append(not_nan_rows)
    # TODO later: could use this for average regression if an athlete has no
    # HR data, but that is currently never the case

    this_athlete_metadata_blocks = pd.DataFrame()
    """
    SAVE ATHLETE METADATA
    """
    metadata_athletes = metadata_athletes.append(
        pd.DataFrame([[
            dict_data['id'], dict_data['sex'], dict_data['weight'],
            dict_data['_Zones']['heart_rate']['zones']
        ]],
                     columns=['id', 'sex', 'weight', 'zones']))
    """
    GET ALL ATHLETE ACTIVITIES AND WEEKS - For the 'average' to compare PB training blocks to
    """

    from search_functions import get_weeks
    from running_functions import extract_activity_features
    from running_functions import extract_week_features

    activities = list(error_free_activities)
    activities.reverse()

    weeks = get_weeks(activities, duration_days=0)

    week_count = 0

    for week in weeks:

        week_id = str(0) + "_" + str(week_count)
        block_id = str(0)

        for activity in week:

            activity_type = activity['type']
            hr_regressor = regressors[athlete_id]
            all_athlete_activities = extract_activity_features(
                all_athlete_activities, activity, zones, activity_type,
                dict_data['id'], block_id, week_id, hr_regressor)

        week_count += 1

    block_id = 0
    athlete_id = dict_data['id']
    block_activities = all_athlete_activities.loc[
        all_athlete_activities['athlete_id'] == athlete_id]
    week_ids = list(set(block_activities['week_id'].values.tolist()))
    """
    Extracting feature data for each week
    """
    for week_id in week_ids:

        week_activities = all_athlete_activities.loc[
            all_athlete_activities['week_id'] == week_id]
        week_runs = week_activities.loc[week_activities['activity_type'] == 2]
        week_non_runs = week_activities.loc[
            week_activities['activity_type'] != 2]

        f_total_runs = len(week_runs)
        to_append = extract_week_features(week_runs, week_non_runs, athlete_id,
                                          block_id, week_id, f_total_runs)
        all_athlete_weeks = all_athlete_weeks.append(to_append,
                                                     ignore_index=True)
    """
    GET SIGNIFICANT PBS
    Significant_pbs: list of activities that were signifcant pb
    list of:
        vdot (if > 0.5 after the last pb)
        predicted marathon time (hours)
        date of pb (at least 30 days after the last one)
        activity id            
    """

    from running_functions import get_pbs

    significant_pbs = get_pbs(activities)
    """
    SPLIT PBS INTO BLOCKS
    - chosen block size of 3 months to analyse as a long enough, but not too long, buildup time
    - discarding those with < 10 activities
    """

    from search_functions import get_block

    _minimum_activities_per_block = 10
    raw_json_blocks = []

    for i, pb in enumerate(significant_pbs):
        activity_date = pb[2]
        block = get_block(activities, activity_date)
        if (len(block) >= _minimum_activities_per_block):
            raw_json_blocks.append(block)

            block_id = pb[3]

            # vdot gained since the previous pb (for the first pb, use the
            # gain to the next one; with a single pb the delta stays 0)
            vdot_delta = 0
            if (i == 0 and len(significant_pbs) > 1):
                vdot_delta = significant_pbs[i + 1][0] - significant_pbs[i][0]
            elif (i > 0):
                vdot_delta = significant_pbs[i][0] - significant_pbs[i - 1][0]

            this_athlete_metadata_blocks = this_athlete_metadata_blocks.append(
                {
                    'athlete_id': dict_data['id'],
                    'vdot': pb[0],
                    'vdot_delta': vdot_delta,
                    'predicted_marathon_time': pb[1],
                    'pb_date': pb[2],
                    'block_id': block_id
                },
                ignore_index=True)
            metadata_blocks = metadata_blocks.append(
                {
                    'athlete_id': dict_data['id'],
                    'vdot': pb[0],
                    'vdot_delta': vdot_delta,
                    'predicted_marathon_time': pb[1],
                    'pb_date': pb[2],
                    'block_id': block_id
                },
                ignore_index=True)

            weeks = get_weeks(block)
            week_count = 0

            for week in weeks:

                week_id = str(block_id) + "_" + str(week_count)

                for activity in week:

                    activity_type = activity['type']
                    hr_regressor = regressors[athlete_id]
                    features_activities = extract_activity_features(
                        features_activities, activity, zones, activity_type,
                        athlete_id, block_id, week_id, hr_regressor)

                week_count += 1
    """
    Bubble up into training week, with 
    - relative speeds
    - relative HR zones
    - average stdevs
    
    Watch out for Nans - mean_hr, athlete_count, etc can be nan
    - what is my nan strategy?
    """

    for index, block in this_athlete_metadata_blocks.iterrows():

        block_id = block['block_id']
        athlete_id = block['athlete_id']
        block_activities = features_activities.loc[
            features_activities['block_id'] == block_id]
        week_ids = list(set(block_activities['week_id'].values.tolist()))

        for week_id in week_ids:

            week_activities = features_activities.loc[
                features_activities['week_id'] == week_id]
            week_runs = week_activities.loc[week_activities['activity_type'] ==
                                            2]
            week_non_runs = week_activities.loc[
                week_activities['activity_type'] != 2]

            f_total_runs = len(week_runs)
            to_append = extract_week_features(week_runs, week_non_runs,
                                              athlete_id, block_id, week_id,
                                              f_total_runs)
            features_weeks = features_weeks.append(to_append,
                                                   ignore_index=True)
    """
    Bubble weeks up into block
    """

    for index, block in this_athlete_metadata_blocks.iterrows():

        block_id = block['block_id']
        athlete_id = block['athlete_id']
        block_activities = features_activities.loc[
            features_activities['block_id'] == block_id]
        block_weeks = features_weeks.loc[features_weeks['block_id'] ==
                                         block_id]
        athlete_weeks = all_athlete_weeks.loc[all_athlete_weeks['athlete_id']
                                              == athlete_id]

        avg_total_runs = athlete_weeks['f_total_runs'].mean()
        avg_total_distance = block_weeks['f_total_run_distance'][:-2].mean()
        if avg_total_runs == 0 or avg_total_distance == 0:
            continue
        """
        y: vdot_delta
        """
        y_vdot_delta = block['vdot_delta']
        y_vdot = block['vdot']  #CONSTANT - not for initial model prediction
        """
        Distance ramp up
        """

        from scipy.stats import linregress
        total_run_distance = list(block_weeks['f_total_run_distance'][:-2])
        xaxis = range(len(total_run_distance))
        slope, intercept, r_value, p_value, std_err = linregress(
            xaxis, total_run_distance)
        f_slope_distances_before_taper = slope
        """
        Distance tapering        
        """
        mean_run_distance = block_weeks['f_total_run_distance'][:-2].mean()
        mean_taper_distance = block_weeks['f_total_run_distance'][-2:].mean()
        f_taper_factor_distance = mean_taper_distance / mean_run_distance
        """
        Time ramp up
        """
        total_run_time = list(block_weeks['f_total_run_time'][:-2])
        xaxis = range(len(total_run_time))
        slope, intercept, r_value, p_value, std_err = linregress(
            xaxis, total_run_time)
        f_slope_time_before_taper = slope
        """
        Time tapering        
        """
        mean_run_time = block_weeks['f_total_run_time'][:-2].mean()
        mean_taper_time = block_weeks['f_total_run_time'][-2:].mean()
        f_taper_factor_time = mean_taper_time / mean_run_time
        """
        HR ramp up
        """
        mean_hr = list(block_weeks['f_mean_run_hr'][:-2])
        xaxis = range(len(mean_hr))
        slope, intercept, r_value, p_value, std_err = linregress(
            xaxis, mean_hr)
        f_slope_hr_before_taper = slope
        """
        HR ramp up        
        """
        mean_run_time = block_weeks['f_total_run_time'][:-2].mean()
        mean_taper_time = block_weeks['f_total_run_time'][-2:].mean()
        f_taper_factor_time = mean_taper_time / mean_run_time
        """
        HR tapering        
        """
        mean_run_hr = block_weeks['f_mean_run_hr'][:-2].mean()
        mean_taper_hr = block_weeks['f_mean_run_hr'][-2:].mean()
        f_taper_factor_hr = mean_taper_hr / mean_run_hr
        """
        Weekly distance, load - constant and relative (r_)
        """
        f_avg_weekly_run_distance = block_weeks['f_total_run_distance'].mean()
        avg_weekly_run_distance = athlete_weeks['f_total_run_distance'].mean()
        r_avg_weekly_run_distance = f_avg_weekly_run_distance / avg_weekly_run_distance

        f_avg_weekly_non_run_distance = block_weeks[
            'f_total_non_run_distance'].mean()
        avg_weekly_non_run_distance = athlete_weeks[
            'f_total_non_run_distance'].mean()

        f_avg_weekly_run_time = block_weeks['f_total_run_time'].mean()
        avg_weekly_run_time = athlete_weeks['f_total_run_time'].mean()

        f_avg_weekly_non_run_time = block_weeks['f_total_non_run_time'].mean()
        avg_weekly_non_run_time = athlete_weeks['f_total_non_run_time'].mean()

        if (avg_weekly_non_run_time == 0.0):
            r_avg_weekly_non_run_time = 0.0
        else:
            r_avg_weekly_non_run_time = f_avg_weekly_non_run_time / avg_weekly_non_run_time

        f_avg_total_runs = block_weeks['f_total_runs'].mean()
        avg_total_runs = athlete_weeks['f_total_runs'].mean()
        r_avg_total_runs = f_avg_total_runs / avg_total_runs

        f_avg_weekly_run_elevation = block_weeks['f_elevation'].mean()
        avg_weekly_run_elevation = athlete_weeks['f_elevation'].mean()
        r_avg_weekly_run_elevation = f_avg_weekly_run_elevation / avg_weekly_run_elevation

        f_mean_athlete_count = block_weeks['f_mean_athlete_count'].mean()
        mean_athlete_count = athlete_weeks['f_mean_athlete_count'].mean()
        r_mean_athlete_count = f_mean_athlete_count / mean_athlete_count

        f_avg_time_in_z1_runs = block_weeks['f_time_in_z1_runs'].mean()
        avg_time_in_z1_runs = np.nanmean(athlete_weeks['f_time_in_z1_runs'])
        r_avg_time_in_z1_runs = f_avg_time_in_z1_runs / avg_time_in_z1_runs

        f_avg_time_in_z2_runs = block_weeks['f_time_in_z2_runs'].mean()
        avg_time_in_z2_runs = np.nanmean(athlete_weeks['f_time_in_z2_runs'])
        r_avg_time_in_z2_runs = f_avg_time_in_z2_runs / avg_time_in_z2_runs

        f_avg_time_in_z3_runs = block_weeks['f_time_in_z3_runs'].mean()
        avg_time_in_z3_runs = np.nanmean(athlete_weeks['f_time_in_z3_runs'])
        r_avg_time_in_z3_runs = f_avg_time_in_z3_runs / avg_time_in_z3_runs

        f_avg_time_in_z4_runs = block_weeks['f_time_in_z4_runs'].mean()
        avg_time_in_z4_runs = np.nanmean(athlete_weeks['f_time_in_z4_runs'])
        r_avg_time_in_z4_runs = f_avg_time_in_z4_runs / avg_time_in_z4_runs

        f_avg_time_in_z5_runs = block_weeks['f_time_in_z5_runs'].mean()
        avg_time_in_z5_runs = np.nanmean(athlete_weeks['f_time_in_z5_runs'])
        r_avg_time_in_z5_runs = f_avg_time_in_z5_runs / avg_time_in_z5_runs
        """
        Amount of outlier activities - by ease, difficulty, intervaliness, length  - constant and relative (r_)
        """

        from running_functions import get_run_outliers
        f_num_distance_activities, f_num_intense_activities, f_num_varying_activities = get_run_outliers(
            features_activities, block_id, athlete_id)
        num_distance_activities, num_intense_activities, num_varying_activities = get_run_outliers(
            all_athlete_activities, '0', athlete_id)
        total_activities = len(all_athlete_activities)

        f_proportion_distance_activities = round(
            len(f_num_distance_activities) / len(block_activities), 2)
        proportion_distance_activities = round(
            len(num_distance_activities) / total_activities, 2)
        try:
            r_proportion_distance_activities = round(
                f_proportion_distance_activities /
                proportion_distance_activities, 2)
        except Exception:
            r_proportion_distance_activities = None

        f_proportion_intense_activities = round(
            len(f_num_intense_activities) / len(block_activities), 2)
        proportion_intense_activities = round(
            len(num_intense_activities) / total_activities, 2)
        try:
            r_proportion_intense_activities = round(
                f_proportion_intense_activities /
                proportion_intense_activities, 2)
        except Exception:
            r_proportion_intense_activities = None

        f_proportion_varying_activities = round(
            len(f_num_varying_activities) / len(block_activities), 2)
        proportion_varying_activities = round(
            len(num_varying_activities) / total_activities, 2)
        try:
            r_proportion_varying_activities = round(
                f_proportion_varying_activities /
                proportion_varying_activities, 2)
        except Exception:
            r_proportion_varying_activities = None
        """
        Proportions of main non-run types to runs  - constant and relative (r_)                
        """

        f_proportion_rides = round(
            len(block_activities[block_activities['activity_type'] == 1]) /
            len(block_activities), 2)
        proportion_rides = round(
            len(all_athlete_activities[all_athlete_activities['activity_type']
                                       == 1]) / len(all_athlete_activities), 2)
        try:
            r_proportion_rides = f_proportion_rides / proportion_rides
        except Exception:
            r_proportion_rides = None

        f_proportion_swims = round(
            len(block_activities[block_activities['activity_type'] == 3]) /
            len(block_activities), 2)
        proportion_swims = round(
            len(all_athlete_activities[all_athlete_activities['activity_type']
                                       == 3]) / len(all_athlete_activities), 2)
        try:
            r_proportion_swims = f_proportion_swims / proportion_swims
        except Exception:
            r_proportion_swims = None

        f_proportion_walks_hikes = round(
            len(block_activities[block_activities['activity_type'].isin(
                [4, 5])]) / len(block_activities), 2)
        proportion_walks_hikes = round(
            len(all_athlete_activities[
                all_athlete_activities['activity_type'].isin([4, 5])]) /
            len(all_athlete_activities), 2)
        try:
            r_proportion_walks_hikes = f_proportion_walks_hikes / proportion_walks_hikes
        except Exception:
            r_proportion_walks_hikes = None

        f_proportion_alpine_ski = round(
            len(block_activities[block_activities['activity_type'] == 6]) /
            len(block_activities), 2)
        proportion_alpine_ski = round(
            len(all_athlete_activities[all_athlete_activities['activity_type']
                                       == 6]) / len(all_athlete_activities), 2)
        try:
            r_proportion_alpine_ski = f_proportion_alpine_ski / proportion_alpine_ski
        except Exception:
            r_proportion_alpine_ski = None

        f_proportion_workout = round(
            len(block_activities[block_activities['activity_type'] == 32]) /
            len(block_activities), 2)
        proportion_workout = round(
            len(all_athlete_activities[all_athlete_activities['activity_type']
                                       == 32]) / len(all_athlete_activities),
            2)
        try:
            r_proportion_workout = f_proportion_workout / proportion_workout
        except Exception:
            r_proportion_workout = None

        f_proportion_yoga = round(
            len(block_activities[block_activities['activity_type'] == 34]) /
            len(block_activities), 2)
        proportion_yoga = round(
            len(all_athlete_activities[all_athlete_activities['activity_type']
                                       == 34]) / len(all_athlete_activities),
            2)
        try:
            r_proportion_yoga = f_proportion_yoga / proportion_yoga
        except Exception:
            r_proportion_yoga = None

        f_proportion_crossfit = round(
            len(block_activities[block_activities['activity_type'] == 10]) /
            len(block_activities), 2)
        proportion_crossfit = round(
            len(all_athlete_activities[all_athlete_activities['activity_type']
                                       == 10]) / len(all_athlete_activities),
            2)
        try:
            r_proportion_crossfit = f_proportion_crossfit / proportion_crossfit
        except Exception:
            r_proportion_crossfit = None

        f_proportion_other = round(
            len(block_activities[block_activities['activity_type'].isin([
                7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34
            ])]) / len(block_activities), 2)
        proportion_other = round(
            len(all_athlete_activities[
                all_athlete_activities['activity_type'].isin([
                    7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34
                ])]) / len(all_athlete_activities), 2)
        try:
            r_proportion_other = f_proportion_other / proportion_other
        except Exception:
            r_proportion_other = None
        """
        NOT INCLUDED YET: 
            - Time between intense runs
            - 1 week tapers
            - relative cadence in activities
            - non-run outliers
        """

        features_blocks = features_blocks.append(
            {
                'athlete_id': athlete_id,
                'block_id': block_id,
                'f_slope_distances_before_taper':
                f_slope_distances_before_taper,
                'f_taper_factor_distance': f_taper_factor_distance,
                'f_slope_time_before_taper': f_slope_time_before_taper,
                'f_taper_factor_time': f_taper_factor_time,
                'f_slope_hr_before_taper': f_slope_hr_before_taper,
                'f_taper_factor_hr': f_taper_factor_hr,
                'f_avg_weekly_run_distance': f_avg_weekly_run_distance,
                'r_avg_weekly_run_distance': r_avg_weekly_run_distance,
                #'f_avg_weekly_non_run_distance': f_avg_weekly_non_run_distance,
                #'r_avg_weekly_non_run_distance': r_avg_weekly_non_run_distance,
                #'f_avg_weekly_run_time': f_avg_weekly_run_time,
                #'r_avg_weekly_run_time': r_avg_weekly_run_time,
                'f_avg_weekly_non_run_time': f_avg_weekly_non_run_time,
                'r_avg_weekly_non_run_time': r_avg_weekly_non_run_time,
                'f_avg_total_runs': f_avg_total_runs,
                'r_avg_total_runs': r_avg_total_runs,
                'f_avg_weekly_run_elevation': f_avg_weekly_run_elevation,
                'r_avg_weekly_run_elevation': r_avg_weekly_run_elevation,
                'f_mean_athlete_count': f_mean_athlete_count,
                'r_mean_athlete_count': r_mean_athlete_count,
                'f_avg_time_in_z1_runs': f_avg_time_in_z1_runs,
                'f_avg_time_in_z2_runs': f_avg_time_in_z2_runs,
                'f_avg_time_in_z3_runs': f_avg_time_in_z3_runs,
                'f_avg_time_in_z4_runs': f_avg_time_in_z4_runs,
                'f_avg_time_in_z5_runs': f_avg_time_in_z5_runs,
                'r_avg_time_in_z1_runs': r_avg_time_in_z1_runs,
                'r_avg_time_in_z2_runs': r_avg_time_in_z2_runs,
                'r_avg_time_in_z3_runs': r_avg_time_in_z3_runs,
                'r_avg_time_in_z4_runs': r_avg_time_in_z4_runs,
                'r_avg_time_in_z5_runs': r_avg_time_in_z5_runs,
                'f_proportion_distance_activities':
                f_proportion_distance_activities,
                'f_proportion_intense_activities':
                f_proportion_intense_activities,
                'f_proportion_varying_activities':
                f_proportion_varying_activities,
                'r_proportion_distance_activities':
                r_proportion_distance_activities,
                'r_proportion_intense_activities':
                r_proportion_intense_activities,
                'r_proportion_varying_activities':
                r_proportion_varying_activities,
                'f_proportion_rides': f_proportion_rides,
                'f_proportion_swims': f_proportion_swims,
                'f_proportion_walks_hikes': f_proportion_walks_hikes,
                'f_proportion_alpine_ski': f_proportion_alpine_ski,
                'f_proportion_workout': f_proportion_workout,
                'f_proportion_yoga': f_proportion_yoga,
                'f_proportion_crossfit': f_proportion_crossfit,
                'f_proportion_other': f_proportion_other,
                'r_proportion_rides': r_proportion_rides,
                'r_proportion_swims': r_proportion_swims,
                'r_proportion_walks_hikes': r_proportion_walks_hikes,
                'r_proportion_alpine_ski': r_proportion_alpine_ski,
                'r_proportion_workout': r_proportion_workout,
                'r_proportion_yoga': r_proportion_yoga,
                'r_proportion_crossfit': r_proportion_crossfit,
                'r_proportion_other': r_proportion_other,
                'y_vdot_delta': y_vdot_delta,
                'y_vdot': y_vdot
            },
            ignore_index=True)

    processing_status = read_db('processing_status')
    processing_status_index = processing_status[
        processing_status['athlete_id'] == str(int(
            athlete_id))].index.values.astype(int)[0]
    processing_status.at[processing_status_index, 'status'] = 'processed'
    write_db_replace(processing_status, 'processing_status')

    try:

        metadata_athletes = metadata_athletes.append(
            read_db('metadata_athletes'), ignore_index=True)
        metadata_blocks = metadata_blocks.append(read_db('metadata_blocks'),
                                                 ignore_index=True)
        all_athlete_activities = all_athlete_activities.append(
            read_db('all_athlete_activities'), ignore_index=True)
        all_athlete_weeks = all_athlete_weeks.append(
            read_db('all_athlete_weeks'), ignore_index=True)
        features_activities = features_activities.append(
            read_db('features_activities'), ignore_index=True)
        features_weeks = features_weeks.append(read_db('features_weeks'),
                                               ignore_index=True)
        features_blocks = features_blocks.append(read_db('features_blocks'),
                                                 ignore_index=True)
        average_paces_and_hrs = average_paces_and_hrs.append(
            read_db('average_paces_and_hrs'), ignore_index=True)

    except Exception as e:
        print(e)

    write_db_replace(metadata_athletes.applymap(str), 'metadata_athletes')
    write_db_replace(metadata_blocks.applymap(str), 'metadata_blocks')
    write_db_replace(all_athlete_activities, 'all_athlete_activities')
    write_db_replace(all_athlete_weeks, 'all_athlete_weeks')
    write_db_replace(features_activities, 'features_activities')
    write_db_replace(features_weeks, 'features_weeks')
    write_db_replace(features_blocks, 'features_blocks')
    write_db_replace(average_paces_and_hrs, 'average_paces_and_hrs')
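
# A small sketch of the zone layout documented in feature_engineer
# ("z1 = 0 -> zones[0], ..., z5 = zones[3] -> inf"): map a heart-rate sample
# to its zone number given the four zone ceilings. Illustrative helper only.

def hr_to_zone(hr, zones):
    # zones holds the max HR of z1..z4; anything above zones[3] is z5
    for i, ceiling in enumerate(zones):
        if hr <= ceiling:
            return i + 1
    return 5

# e.g. with the 190-bpm fallback zones [114.0, 133.0, 152.0, 171.0],
# hr_to_zone(120, [114.0, 133.0, 152.0, 171.0]) == 2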
def update_data():

    #refresh_tokens()

    daily_limit = read_db('daily_limit')

    current_api_calls = int(daily_limit.iloc[0, 0])

    if (current_api_calls > 25000):
        print("API LIMIT EXCEEDED")
        return "api limit exceeded"
    
    processing_status = read_db('processing_status')
    
    for index, row in processing_status.iterrows():
       
        athlete_id = int(row['athlete_id'])
        
        if athlete_id != 0 and row['status'] == "none":

                bearer_token = row['bearer_token']            
                print ('processing athlete ' + str(athlete_id))
                headers = {"Authorization": "Bearer " + bearer_token}
            
                processing_status.at[index, 'status'] = 'processing'
                
                try:
                    
                    """
                    GET ATHLETE DATA
                    ----------
                    """
                    url = 'https://www.strava.com/api/v3/athlete'
                    data = ''
                    response = requests.get(url, data=data, headers=headers)
                    athlete_data = response.json()
                    current_api_calls += 1
                    
                    """
                    GET ATHLETE ZONES
                    -----------
                    """
                    url = 'https://www.strava.com/api/v3/athlete/zones'
                    data = ''
                    response = requests.get(url, data=data, headers=headers)
                    athlete_zones = response.json()                    
                    current_api_calls += 1
                    
                    
                    """
                    GET ATHLETE STATS
                    -----------
                    Not sure if any of this is relevant
                    """
                    url = 'https://www.strava.com/api/v3/athletes/' + str(athlete_id) + '/stats'
                    data = ''
                    response = requests.get(url, data=data, headers=headers)
                    athlete_stats = response.json()    
                    current_api_calls += 1                    
                    
                    """
                    GET ACTIVITY LIST
                    -----------------
                    """
                    url = 'https://www.strava.com/api/v3/athlete/activities?per_page=40&page=1'
                    data = ''
                    response = requests.get(url, data=data, headers=headers)
                    this_response = response.json()
                    activity_pg = this_response         
                    current_api_calls += 1

                    pg = 1
                    
                    while len(this_response) > 3:  # page until (nearly) empty
                        start = time.time()
                        
                        pg += 1
                        url = 'https://www.strava.com/api/v3/athlete/activities?per_page=40&page=' + str(pg)
                        data = ''
                        response = requests.get(url, data=data, headers=headers)
                        this_response = response.json()
                        activity_pg = activity_pg + this_response
                        current_api_calls += 1    
                        
                        #rate limiting part 2
                        end = time.time()
                        remain = start + 1.5 - end
                        if remain > 0:
                            time.sleep(remain)
                            
                    print(activity_pg)
                    if (len(activity_pg) > 20):
                            
                        """
                        GET ALL ACTIVITIES FOR ATHLETE
                        ------------------------------
                        """
                        
                        activities = []
                        
                        for x in activity_pg:
                            #rate limiting part 1 
                            
                            start = time.time()
                            
                            activity_id = x['id']
                            url = 'https://www.strava.com/api/v3/activities/' + str(activity_id)
                            data = ''
                            response = requests.get(url, data=data, headers=headers)
                            this_response = response.json()
                            activities.append(this_response)
                            current_api_calls += 1  
                            
                            #rate limiting part 2
                            end = time.time()
                            remain = start + 1.5 - end
                            if remain > 0:
                                time.sleep(remain)
                            
                            """
                            CREATE ATHLETE FILE AND WRITE
                            -----------------------------
                            """
                            
                        athlete_data["_Zones"] = athlete_zones
                        athlete_data["_Stats"] = athlete_stats
                        athlete_data["_Activities"] = activities
                           
                    else:
                        return "athlete rejected - too few activities"
                    
                except Exception as ex:                    
                    daily_limit.at[0, 'daily'] = current_api_calls
                    write_db_replace(daily_limit,'daily_limit')                                
                    processing_status.at[index, 'status'] = 'none'
                    return ('failure processing athlete ' + str(row['athlete_id']) + ': ' + str(ex))          
                                                
                feature_engineer(athlete_id, athlete_data)
                
                daily_limit.at[0, 'daily'] = current_api_calls
                write_db_replace(daily_limit,'daily_limit')       

                print ('successfully processed athlete ' + str(athlete_id))     

                
    return "done / nobody to ingest"