示例#1
0
def results_analysis(
    directory=config.MASTER_DIR,
    buckets=("Div", "Season"),
    stats=("HS", "AS", "HST", "AST", "FTHG", "FTAG"),
    filteron="HomeTeam",
    values=("Barcelona",),
    aggfunc="mean",
):
    """Customizable analysis of results.

    Loads the "results" master data, keeps the rows whose ``filteron`` field
    matches one of ``values``, groups them by ``buckets`` and aggregates the
    ``stats`` columns with ``aggfunc``.

    INPUT:
        directory: Directory containing processed results
        buckets: Field name, or list/tuple of field names, to group by
        stats: Field name, or list/tuple of metric fields, to aggregate
        filteron: Field to filter on
        values: Single value, or list/tuple of values, to filter on
        aggfunc: Aggregation method

    OUTPUT:
        selected: Analysis dataframe
    """
    logging.info("Results analysis")

    dframe = utilities.get_master("results", directory)

    pseudocode = (
        "SELECT "
        + aggfunc
        + " OF "
        + str(stats)
        + " WHERE "
        + filteron
        + " IS "
        + str(values)
        + " GROUPED BY "
        + str(buckets)
    )
    logging.info("Analysis pseudocode: %s", pseudocode)

    def _as_list(arg):
        # A bare string is one field/value, not an iterable of characters.
        return [arg] if isinstance(arg, str) else list(arg)

    # pandas treats a tuple passed to groupby()/column selection as ONE key
    # and isin() rejects a plain string, so normalise everything to lists.
    group_fields = _as_list(buckets)
    stat_fields = _as_list(stats)
    wanted_values = _as_list(values)

    filtered = dframe[dframe[filteron].isin(wanted_values)]
    selected = filtered.groupby(group_fields)[stat_fields].agg(aggfunc)

    logging.info(selected.describe(include="all"))

    return selected
#

# In[ ]:

# ## 2. Data Understanding
#
# * Collect Initial Data
# * Describe Data
# * Explore Data
# * Verify Data Quality

# In[7]:

# Pull the players-contract master table into memory.
print("Loading Transfermarkt general information...")

tmk_df = utilities.get_master("players_contract")
# tmk_df.info()

# In[8]:

# Reproducible peek at a handful of rows (seeded by RANDOM_STATE).
print("Random sample of records...")

tmk_df.sample(n=8, random_state=RANDOM_STATE)

# In[9]:

# Summary statistics across every column, numeric or not.
print("Summary of whole data source...")

tmk_df.describe(include="all")

# **ANALYSIS:** So the data is looking broadly in good shape, but there are a few missing values to consider...
示例#3
0
def _date_gap_score(calc_date, other_date):
    """Score the gap between two "YYYY-MM-DD" strings on a 0..1 scale.

    NOTE(review): the gap is the difference of the dates read as YYYYMMDD
    integers, not a true day count; a gap of 100000 (roughly ten years in
    that encoding) or more scores 0.
    """
    gap = int(calc_date.replace("-", "")) - int(other_date.replace("-", ""))
    return max(1 - (gap / 100000), 0)


def calculate_quality(directory=config.MASTER_DIR):
    """Calculate data quality.

    Every ``*.txt`` file in ``directory`` (except the quality output itself)
    is loaded through ``utilities.get_master`` and scored against a set of
    tests grouped by category (Consistency, Completeness, Timeliness,
    Uniqueness, Validity). Frames larger than 50,000 rows are sampled down
    first. All scores are clipped to [0, 1], saved as the "quality" master
    and averaged.

    INPUT:
        directory: Location of data to assess

    OUTPUT:
        overall_score: Overall quality score across all files
    """
    logging.info("Calculating data quality")

    calc_date = datetime.datetime.today().strftime("%Y-%m-%d")
    logging.info("Current date is {0}".format(calc_date))

    # File-specific consistency rules: test label + function returning a
    # boolean mask of the rows that VIOLATE the rule.
    consistency_rules = {
        "ftb_events_shot.txt": (
            "xG between 0 and 1",
            lambda d: ~d.statsbomb_xg.between(0, 1),
        ),
        "ftb_fulldata.txt": (
            "Goals <= Shots",
            lambda d: d["Goals"] > d["Shots"],
        ),
        "ftb_managers.txt": (
            "DateFrom <= DateTo",
            lambda d: d.DateFrom > d.DateTo,
        ),
        "ftb_players_contract.txt": (
            "Joined <= Contract expires",
            lambda d: d["Joined"] > d["Contract expires"],
        ),
        "ftb_players_performance.txt": (
            "Games started <= In squad",
            lambda d: d["Games started"] > d["In squad"],
        ),
        "ftb_results.txt": (
            "Home goals <= Home shots",
            lambda d: d["FTHG"] > d["HS"],
        ),
        "ftb_nations_matches.txt": (
            "Max one home team",
            lambda d: d["Home_1"] + d["Home_2"] == 2,
        ),
    }

    # Column holding the most recent event date, per file.
    date_fields = {
        "ftb_fulldata.txt": "Date",
        "ftb_results.txt": "Date",
        "ftb_nations_matches.txt": "Date",
        "ftb_events_shot.txt": "match_date",
        "ftb_managers.txt": "DateTo",
    }

    dq_data = []
    for file_name in os.listdir(directory):
        if file_name == "ftb_quality.txt" or not file_name.endswith(".txt"):
            continue
        logging.info("Assessing {0}".format(file_name))

        file_path = os.path.join(directory, file_name)

        file_date = datetime.datetime.fromtimestamp(
            os.path.getmtime(file_path)).strftime("%Y-%m-%d")
        logging.info("File modification date is {0}".format(file_date))

        file_stub = file_name.replace("ftb_", "").replace(".txt", "")
        df = utilities.get_master(file_stub, directory=directory)
        if df.shape[0] > 50000:
            df = df.sample(50000, replace=False, random_state=42)

        no_of_rows, no_of_columns = df.shape
        no_of_cells = no_of_rows * no_of_columns
        if no_of_cells == 0:
            # Every score below is a ratio over rows/cells; nothing to rate.
            logging.warning("Skipping {0}: no data".format(file_name))
            continue

        # Fields shared by every record produced for this file.
        base_record = {
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
        }

        def add_score(category, test, score):
            dq_data.append(
                dict(base_record, category=category, test=test, score=score))

        # Consistency, coherence, or clarity
        category = "Consistency"
        logging.info("Running {0} tests".format(category))

        if file_name in consistency_rules:
            test, violations = consistency_rules[file_name]
            score = 1 - (df[violations(df)].shape[0] / no_of_rows)
            add_score(category, test, score)

        # Completeness or comprehensiveness
        category = "Completeness"
        logging.info("Running {0} tests".format(category))

        # Share of cells that hold a non-null value.
        add_score(category, "Missing values", df.count().sum() / no_of_cells)

        # Timeliness or latency
        category = "Timeliness"
        logging.info("Running {0} tests".format(category))

        add_score(category, "Days since file updated",
                  _date_gap_score(calc_date, file_date))

        date_field = date_fields.get(file_name)
        if date_field:
            # Assumes the column holds "YYYY-MM-DD" strings - TODO confirm.
            add_score(category, "Days since last match date",
                      _date_gap_score(calc_date, df[date_field].max()))

        # Accuracy or correctness
        category = "Accuracy"
        logging.info("Running {0} tests".format(category))

        # Spots tests against ref data?
        # Wikipedia (Ajax, Frankfurt)
        # Don Balon (Spain 08/09)
        # SkySports Football Yearbook (England & Scotland 07/08)

        # Uniqueness
        category = "Uniqueness"
        logging.info("Running {0} tests".format(category))

        add_score(category, "Duplicated rows",
                  df.drop_duplicates().shape[0] / no_of_rows)

        # Unique columns over total columns (transpose, dedupe, transpose
        # back, then count the surviving columns).
        add_score(category, "Duplicated columns",
                  df.T.drop_duplicates().T.shape[1] / no_of_columns)

        # Validity or reasonableness
        category = "Validity"
        logging.info("Running {0} tests".format(category))

        # 0 <= Goals <= 10

        # NOTE(review): mean/std/quantile are taken over the whole frame;
        # confirm this behaves as intended for non-numeric columns on the
        # pandas version in use.
        mean = df.mean()
        spread = 3 * df.std()
        outliers = ((df < (mean - spread)) | (df > mean + spread)).sum().sum()
        add_score(category, "3 stdev from mean", 1 - (outliers / no_of_cells))

        q1 = df.quantile(0.25)
        q3 = df.quantile(0.75)
        iqr = q3 - q1
        outliers = ((df < (q1 - 1.5 * iqr)) |
                    (df > (q3 + 1.5 * iqr))).sum().sum()
        add_score(category, "1.5 IQR rule", 1 - (outliers / no_of_cells))

        # Orderliness
        # Auditability
        # Conformity
        # accessibility or availability
        # comparability
        # credibility, reliability, or reputation
        # relevance, pertinence, or usefulness

    df_dq = pd.DataFrame(dq_data)
    df_dq["score"] = df_dq["score"].clip(lower=0, upper=1)
    utilities.save_master(df_dq, "quality", directory=directory)

    overall_score = df_dq.score.mean()
    logging.info("Overall score is {0}".format(overall_score))

    return overall_score
示例#4
0
def build_fulldata(directory=config.MASTER_DIR):
    """Combine results, stadiums and managers data into full dataset for clubs.

    Each match in the "results" master is emitted twice, once from the home
    side's perspective and once from the away side's, with team-relative
    column names (``X`` for the team, ``XOpp`` for the opponent). Derived
    match metrics are added, then stadium and manager details are joined on
    for both the team and the opponent. The result is saved as the
    "fulldata" master.

    INPUT:
        directory: Directory to save output to

    OUTPUT:
        fulldata: Dataframe containing all the clubs data
    """
    logging.info(
        "Building fulldata dataframe from results, stadiums, managers ...")

    # Column mapping that makes the HOME side the subject team.
    home_renames = {
        "HomeTeam": "Team",
        "AwayTeam": "TeamOpp",
        "FTHG": "Goals",
        "FTAG": "GoalsOpp",
        "HTHG": "Goals1stHalf",
        "HTAG": "Goals1stHalfOpp",
        "HS": "Shots",
        "AS": "ShotsOpp",
        "HST": "ShotsOnTarget",
        "AST": "ShotsOnTargetOpp",
        "HHW": "ShotsHitWoodwork",
        "AHW": "ShotsHitWoodworkOpp",
        "HC": "Corners",
        "AC": "CornersOpp",
        "HF": "Fouls",
        "AF": "FoulsOpp",
        "HO": "Offsides",
        "AO": "OffsidesOpp",
        "HY": "YellowCards",
        "AY": "YellowCardsOpp",
        "HR": "RedCards",
        "AR": "RedCardsOpp",
        "HBP": "BookingPoints",
        "ABP": "BookingPointsOpp",
    }
    # Mirror image of home_renames: toggling the "Opp" suffix makes the AWAY
    # side the subject team.
    away_renames = {}
    for key, val in home_renames.items():
        if val.endswith("Opp"):
            away_renames[key] = val[:-3]
        else:
            away_renames[key] = val + "Opp"
    # Stats that get a "<stat>Diff" and a "Total<stat>" column below.
    stat_to_diff = [
        "Goals",
        "Goals1stHalf",
        "Shots",
        "ShotsOnTarget",
        "ShotsHitWoodwork",
        "Corners",
        "Fouls",
        "Offsides",
        "YellowCards",
        "RedCards",
        "BookingPoints",
    ]

    logging.info("Process results")
    results = utilities.get_master("results", directory=directory)

    # One copy of every match per perspective, tagged via "HomeAway".
    homeresults = results.rename(columns=home_renames)
    homeresults["HomeAway"] = "Home"

    awayresults = results.rename(columns=away_renames)
    awayresults["HomeAway"] = "Away"

    allresults = pd.concat([homeresults, awayresults],
                           ignore_index=True,
                           sort=False)
    # The raw result codes are dropped; a team-relative "Result" is derived
    # from the goal difference further down.
    allresults.drop(["FTR", "HTR", "Unnamed: 0"], axis=1, inplace=True)

    for stat in stat_to_diff:
        allresults[stat + "Diff"] = allresults[stat] - allresults[stat + "Opp"]
        allresults["Total" +
                   stat] = allresults[stat] + allresults[stat + "Opp"]

    # Saves = opponent's shots on target that did not end up as goals.
    allresults[
        "Saves"] = allresults["ShotsOnTargetOpp"] - allresults["GoalsOpp"]
    allresults["SavesOpp"] = allresults["ShotsOnTarget"] - allresults["Goals"]
    allresults["SavesDiff"] = allresults["Saves"] - allresults["SavesOpp"]
    # Second-half figures = full-time minus first-half.
    allresults[
        "Goals2ndHalf"] = allresults["Goals"] - allresults["Goals1stHalf"]
    allresults["Goals2ndHalfOpp"] = (allresults["GoalsOpp"] -
                                     allresults["Goals1stHalfOpp"])
    allresults["Goals2ndHalfDiff"] = (allresults["GoalsDiff"] -
                                      allresults["Goals1stHalfDiff"])
    # func_score (defined outside this block) is mapped over GoalsDiff; its
    # per-row output is unpacked into the nine columns below, in this order:
    # Result,Points,PointsOpp,Win,WinDraw,Draw,DrawLoss,Loss,WinShare
    (
        allresults["Result"],
        allresults["Points"],
        allresults["PointsOpp"],
        allresults["Win"],
        allresults["WinDraw"],
        allresults["Draw"],
        allresults["DrawLoss"],
        allresults["Loss"],
        allresults["WinShare"],
    ) = zip(*allresults["GoalsDiff"].map(func_score))
    # func_nogoal is defined outside this block; presumably it flags a goal
    # count of zero - verify against its definition.
    allresults["CleanSheet"] = allresults["Goals"].map(func_nogoal)
    allresults["CleanSheetOpp"] = allresults["GoalsOpp"].map(func_nogoal)

    # allresults['Date'] = pd.to_datetime(allresults['Date'], format="%d/%m/%y")

    # 1-based running match number per Season/Div/Team, in Date order. The
    # counts are computed on a sorted copy and assigned back by index.
    # NOTE(review): Date is not parsed here, so ordering relies on the stored
    # values sorting chronologically - confirm the stored format.
    allresults["GameWeek"] = (allresults.sort_values("Date").groupby(
        ["Season", "Div", "Team"]).cumcount() + 1)

    ## TODO - Validate derived values

    logging.info("Process stadiums")
    stadiums = utilities.get_master("stadiums", directory=directory)
    stadiums.drop(["Country", "TeamFull"], axis=1, inplace=True)

    # Attach the team's own stadium details first ...
    fulldata = pd.merge(allresults, stadiums, on="Team", how="left")
    # fulldata.drop(['Unnamed: 0'], axis=1, inplace=True)
    # ... then the opponent's: the same table is re-keyed on TeamOpp and any
    # clashing column names receive an "Opp" suffix.
    stadiums.rename(columns={"Team": "TeamOpp"}, inplace=True)
    fulldata = pd.merge(fulldata,
                        stadiums,
                        on="TeamOpp",
                        how="left",
                        suffixes=("", "Opp"))
    # Both stadium merges brought in a copy of the "Unnamed: 0" column.
    fulldata.drop(["Unnamed: 0", "Unnamed: 0Opp"], axis=1, inplace=True)

    # Straight-line distance between the two sets of coordinates, in raw
    # Latitude/Longitude units (not a geographic distance).
    fulldata["EuclideanDistance"] = (
        (fulldata.Latitude - fulldata.LatitudeOpp)**2 +
        (fulldata.Longitude - fulldata.LongitudeOpp)**2)**0.5

    logging.info("Process managers")
    managers = utilities.get_master("managers", directory=directory)
    managers.dropna(subset=["Manager"], inplace=True)

    # The merge yields one row per (match, manager record of the team); the
    # filter keeps only records whose DateFrom..DateTo range contains the
    # match date, plus matches for teams with no manager records at all.
    # NOTE(review): a match whose team has manager records but none covering
    # the match date (or with a missing DateFrom/DateTo) fails both
    # conditions and is removed entirely - confirm this is intended.
    fulldata = pd.merge(fulldata, managers, on="Team", how="left")
    fulldata = fulldata[((fulldata["Date"] >= fulldata["DateFrom"])
                         & (fulldata["Date"] <= fulldata["DateTo"]))
                        | (fulldata["Manager"].isnull())]
    fulldata.drop(
        ["Unnamed: 0", "DateFrom", "DateTo", "Duration", "YearRange"],
        axis=1,
        inplace=True,
    )
    fulldata = fulldata.drop_duplicates()

    # Same procedure for the opponent's manager. The date/helper columns were
    # dropped above, so after this merge they come back unsuffixed; only the
    # clashing "Manager" column becomes "ManagerOpp".
    managers.rename(columns={"Team": "TeamOpp"}, inplace=True)
    fulldata = pd.merge(fulldata,
                        managers,
                        on="TeamOpp",
                        how="left",
                        suffixes=("", "Opp"))
    fulldata = fulldata[((fulldata["Date"] >= fulldata["DateFrom"])
                         & (fulldata["Date"] <= fulldata["DateTo"]))
                        | (fulldata["ManagerOpp"].isnull())]
    fulldata.drop(
        ["Unnamed: 0", "DateFrom", "DateTo", "Duration", "YearRange"],
        axis=1,
        inplace=True,
    )
    fulldata = fulldata.drop_duplicates()

    # Persist the combined dataset as the "fulldata" master.
    utilities.save_master(fulldata, "fulldata", directory=directory)
    return fulldata
示例#5
0
def get_summary(
        group_key,
        df=None,
        agg_method="mean",
        base_filters=None,
        metric_mins=None,
        output_metrics=(),
):
    """Generate summarised clubs data.

    Filters the match-level data, aggregates it per ``group_key``, prefixes
    the number of matches in each group and adds derived shot/save metrics
    computed from the aggregated columns. A DataFrame passed in via ``df``
    is never modified.

    INPUT:
        group_key: Field (or list of Fields) to group data on
        df: (Optional) pass in clubs Dataframe; when omitted the "fulldata"
            master is loaded
        agg_method: Aggregation method
        base_filters: Dictionary with Field/Value(s) pairs to filter base data
        metric_mins: Dictionary with Field/minimum Value pairs to filter agg
            data
        output_metrics: Metric fields to include in output (all when empty)

    OUTPUT:
        df: Aggregated dataframe
    """
    logging.debug("Get summarised data")

    if df is None:
        # fetch from master csv
        df = utilities.get_master("fulldata")

    # filter unwanted records
    for field, vals in (base_filters or {}).items():
        df = df[df[field].isin(vals)]

    if pd.api.types.is_list_like(group_key):
        group_fields = list(group_key)
        required = group_fields
    else:
        group_fields = group_key
        required = [group_key]

    # Non-inplace so a frame supplied by the caller is left untouched.
    df = df.dropna(subset=required)

    # aggregate data
    grouped = df.groupby(group_fields)
    df_avg = grouped.agg(agg_method)
    # Name the count column explicitly instead of relying on the Series name
    # produced by value_counts(), which differs between pandas versions.
    df_cnt = grouped.size().rename("NumberOfMatches")
    df = pd.concat([df_cnt, df_avg], axis=1, sort=True)
    if "Unnamed: 0" in df.columns:
        df = df.drop(["Unnamed: 0"], axis=1)

    # add derived metrics
    df["ShotAccuracy"] = df["ShotsOnTarget"] / df["Shots"]
    df["ShotAccuracyOpp"] = df["ShotsOnTargetOpp"] / df["ShotsOpp"]
    df["ShotPercent"] = df["Goals"] / df["ShotsOnTarget"]
    df["ShotPercentOpp"] = df["GoalsOpp"] / df["ShotsOnTargetOpp"]
    df["SavePercent"] = df["Saves"] / df["ShotsOnTargetOpp"]
    df["SavePercentOpp"] = df["SavesOpp"] / df["ShotsOnTarget"]
    df["ShotConversion"] = df["Goals"] / df["Shots"]
    df["ShotConversionOpp"] = df["GoalsOpp"] / df["ShotsOpp"]
    df["TSR"] = df["Shots"] / df["TotalShots"]
    df["TSROpp"] = df["ShotsOpp"] / df["TotalShots"]
    df["ShotOnTargetRatio"] = df["ShotsOnTarget"] / df["TotalShotsOnTarget"]
    df["ShotOnTargetRatioOpp"] = (df["ShotsOnTargetOpp"] /
                                  df["TotalShotsOnTarget"])
    df["ShotDominance"] = df["Shots"] / df["ShotsOpp"]
    df["ShotPace"] = df["TotalShots"]
    df["PDO"] = 1000 * (df["ShotPercent"] + df["SavePercent"])
    df["PDOOpp"] = 1000 * (df["ShotPercentOpp"] + df["SavePercentOpp"])
    df["%TSoTt"] = df["ShotAccuracy"] + (1 - df["ShotAccuracyOpp"])
    df["%TSoTtOpp"] = df["ShotAccuracyOpp"] + (1 - df["ShotAccuracy"])
    # Each factor pulls a component towards its neutral value (0.5, 1.0,
    # 1000) by the square root of a fixed weight before multiplying.
    df["GraysonRating"] = ((0.5 + (df["TSR"] - 0.5) * 0.732**0.5) *
                           (1.0 + (df["%TSoTt"] - 1.0) * 0.166**0.5) *
                           (1000 + (df["PDO"] - 1000) * 0.176**0.5))
    df["GraysonRatingOpp"] = ((0.5 + (df["TSROpp"] - 0.5) * 0.732**0.5) *
                              (1.0 + (df["%TSoTtOpp"] - 1.0) * 0.166**0.5) *
                              (1000 + (df["PDOOpp"] - 1000) * 0.176**0.5))
    # Linear rescale that maps a rating of 363 to 0 and 695 to 10.
    df["GraysonScore"] = 10 * (df["GraysonRating"] - 363) / (695 - 363)
    df["GraysonScoreOpp"] = 10 * (df["GraysonRatingOpp"] - 363) / (695 - 363)

    # filter unwanted aggregate data
    for field, val in (metric_mins or {}).items():
        df = df[df[field] >= val]

    if pd.api.types.is_list_like(output_metrics):
        if len(output_metrics):
            # A tuple would be looked up as ONE column key; select by list.
            df = df[list(output_metrics)]
    elif output_metrics:
        df = df[output_metrics]

    logging.debug("Showing summarised dataframe...\n{0}".format(df))
    return df
import os
import numpy as np
import pickle

import matplotlib.pyplot as plt

# Choose the matplotlib style sheet before seaborn is configured below.
# NOTE(review): the style name may not be recognised by every matplotlib
# release - confirm against the installed version.
plt.style.use("seaborn-whitegrid")
import seaborn as sns

# Called with no arguments, i.e. seaborn's default settings.
sns.set()

import src.utilities as utilities

# In[2]:

# Load the international matches master table.
match = utilities.get_master("nations_matches")
# match.info()

# Keep only the columns used below. The explicit copy makes the following
# column assignments operate on an independent frame rather than on a
# selection of the loaded one (avoids pandas' SettingWithCopyWarning).
match = match[[
    'Round', 'Day', 'Date', 'Time', 'Team_1', 'Team_2', 'Year', 'Goals_1',
    'Goals_2', 'Goal_diff', 'Venue', 'Venue_country', 'Venue_city', 'Home_1',
    'Home_2'
]].copy()

match["Goal_total"] = match.Goals_1 + match.Goals_2
# Outcome from Team_1's point of view; stays None where a goal count is
# missing, because none of the comparisons below is true for NaN.
match["Result"] = None
match.loc[match.Goals_1 == match.Goals_2, "Result"] = "Draw"
match.loc[match.Goals_1 > match.Goals_2, "Result"] = "Win"
match.loc[match.Goals_1 < match.Goals_2, "Result"] = "Loss"

# Notebook display: one row of summary statistics per column.
match.describe(include="all").T
示例#7
0
"""data quality dashboard app.

Used for running data quality dashboard webapp
"""

import dash
import dash_core_components as dcc
import dash_html_components as html
import numpy as np
import pandas as pd
import plotly.graph_objects as go

import src.utilities as utilities

# Stored data quality scores, one row per file/category/test.
data = utilities.get_master("quality")
# Unweighted mean over every recorded score.
overall_score = data["score"].mean()
# range_max = data.score.max()

# Average score per file (rows) and quality category (columns).
data_pivot = data.pivot_table(
    values="score",
    index="file",
    columns="category",
    aggfunc=np.mean,
    margins=False,
)
# print(data_pivot)

fig = go.Figure(
    data=go.Heatmap(
        z=data_pivot.values,
示例#8
0
 def test_get_master(self):
     """Test return dummy dataframe from dummy csv.

     Saves the fixture frame as the "dummy" master in the test directory,
     loads it back and checks that the shape survives the round trip.
     """
     utilities.save_master(self.testFrame, "dummy", directory=self.testDir)
     loaded = utilities.get_master("dummy", directory=self.testDir)
     # A bare comparison expression is evaluated and thrown away; it has to
     # be asserted for the test to be able to fail.
     assert loaded.shape == self.testFrame.shape
# In[4]:

## visualisation
# IPython line magic: only available when run inside IPython/Jupyter, where
# get_ipython() is defined.
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
# NOTE(review): the style name may not be recognised by every matplotlib
# release - confirm against the installed version.
plt.style.use("seaborn-whitegrid")

import seaborn as sns
# sns.set()
# Configure seaborn with a default figure size of 10 x 5.
sns.set(rc={'figure.figsize': (10, 5)})

# from mpl_toolkits.basemap import Basemap

# In[5]:

# Load the club-level dataset.
df = utilities.get_master("fulldata")

# Keep the home-perspective rows that have a TotalGoals value. The explicit
# copy makes the column assignments below act on an independent frame
# instead of a filtered selection (avoids pandas' SettingWithCopyWarning).
df = df[df.HomeAway == "Home"].dropna(subset=['TotalGoals']).copy()

# Calendar features derived from the match date. The vectorised .dt accessor
# yields a missing value for a missing date, where a per-row strftime call
# would raise.
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")
df["Month"] = df["Date"].dt.strftime("%m")  # zero-padded month, "01".."12"
df["Day_Of_Week"] = df["Date"].dt.strftime("%w")  # "0" (Sunday) .. "6"

# Descriptive (non-metric) columns.
attrib_cols = [
    "Date", "Month", "Day_Of_Week", "HomeAway", "Season", "Country", "Tier",
    "Team", "TeamOpp", "Manager", "ManagerOpp", "Referee", "Stadium",
    "Latitude", "Longitude"
]
# Metric column(s).
metric_cols = ["TotalGoals"]