def results_analysis(
    directory=config.MASTER_DIR,
    buckets=("Div", "Season"),
    stats=("HS", "AS", "HST", "AST", "FTHG", "FTAG"),
    filteron="HomeTeam",
    values=("Barcelona",),
    aggfunc="mean",
):
    """Customizable analysis of results.

    Filters the "results" master data to the rows whose ``filteron`` field
    is one of ``values``, groups them by ``buckets`` and aggregates the
    ``stats`` columns with ``aggfunc``.

    INPUT:
        directory: Directory containing processed results
        buckets: List (or tuple) of fields to group by
        stats: List (or tuple) of metrics to aggregate
        filteron: Field to filter on
        values: Value, or list of values, to filter on
        aggfunc: Aggregation method

    OUTPUT:
        selected: Analysis dataframe
    """
    logging.info("Results analysis")

    dframe = utilities.get_master("results", directory)

    # A bare string is not list-like for ``Series.isin`` (the previous
    # default ``("Barcelona")`` was a plain string, not a tuple).
    if isinstance(values, str):
        values = [values]
    # pandas interprets a tuple passed to ``groupby`` or used to select
    # columns as ONE key, so multi-field tuples must become lists.
    if isinstance(buckets, tuple):
        buckets = list(buckets)
    if isinstance(stats, tuple):
        stats = list(stats)

    pseudocode = "SELECT {0} OF {1} WHERE {2} IS {3} GROUPED BY {4}".format(
        aggfunc, stats, filteron, values, buckets)
    logging.info("Analysis pseudocode: %s", pseudocode)

    matching = dframe[dframe[filteron].isin(values)]
    selected = matching.groupby(buckets)[stats].agg(aggfunc)

    logging.info(selected.describe(include="all"))

    return selected
# # In[ ]: # ## 2. Data Understanding # # * Collect Initial Data # * Describe Data # * Explore Data # * Verify Data Quality # In[7]: print("Loading Transfermarkt general information...") tmk_df = utilities.get_master("players_contract") # tmk_df.info() # In[8]: print("Random sample of records...") tmk_df.sample(8, random_state=RANDOM_STATE) # In[9]: print("Summary of whole data source...") tmk_df.describe(include="all") # **ANALYSIS:** So the data is looking broadly in good shape, but there are a few missing values to consider...
def _quality_record(base, category, test, score):
    """Return one data-quality result row: the per-file fields plus the test outcome."""
    return dict(base, category=category, test=test, score=score)


def _date_gap_score(later, earlier):
    """Score the gap between two "YYYY-MM-DD" date strings on a 0-1 scale.

    Both dates are collapsed to YYYYMMDD integers and subtracted, so the gap
    is NOT a day count (it jumps at month and year boundaries). A gap of
    100000 or more (ten years in that encoding) scores 0.
    """
    gap = int(later.replace("-", "")) - int(earlier.replace("-", ""))
    return max(1 - (gap / 100000), 0)


def calculate_quality(directory=config.MASTER_DIR):
    """Calculate data quality.

    Every ``.txt`` file in ``directory`` (except the quality output itself,
    ``ftb_quality.txt``) is loaded and scored between 0 and 1 on a set of
    consistency, completeness, timeliness, uniqueness and validity tests.
    The individual scores are saved as the "quality" master file.

    INPUT:
        directory: Location of data to assess

    OUTPUT:
        overall_score: Overall quality score across all files
    """
    logging.info("Calculating data quality")

    calc_date = datetime.datetime.today().strftime("%Y-%m-%d")
    logging.info("Current date is %s", calc_date)

    # File name -> (test description, function returning a boolean mask of
    # the rows that VIOLATE the rule).
    consistency_tests = {
        "ftb_events_shot.txt": (
            "xG between 0 and 1",
            lambda d: ~d.statsbomb_xg.between(0, 1),
        ),
        "ftb_fulldata.txt": (
            "Goals <= Shots",
            lambda d: d["Goals"] > d["Shots"],
        ),
        "ftb_managers.txt": (
            "DateFrom <= DateTo",
            lambda d: d.DateFrom > d.DateTo,
        ),
        "ftb_players_contract.txt": (
            "Joined <= Contract expires",
            lambda d: d["Joined"] > d["Contract expires"],
        ),
        "ftb_players_performance.txt": (
            "Games started <= In squad",
            lambda d: d["Games started"] > d["In squad"],
        ),
        "ftb_results.txt": (
            "Home goals <= Home shots",
            lambda d: d["FTHG"] > d["HS"],
        ),
        "ftb_nations_matches.txt": (
            "Max one home team",
            lambda d: d["Home_1"] + d["Home_2"] == 2,
        ),
    }

    # File name -> column holding the date of the most recent event.
    date_fields = {
        "ftb_fulldata.txt": "Date",
        "ftb_results.txt": "Date",
        "ftb_nations_matches.txt": "Date",
        "ftb_events_shot.txt": "match_date",
        "ftb_managers.txt": "DateTo",
    }

    dq_data = []
    for file_name in os.listdir(directory):
        if file_name == "ftb_quality.txt" or not file_name.endswith(".txt"):
            continue
        logging.info("Assessing %s", file_name)

        file_path = os.path.join(directory, file_name)
        file_date = datetime.datetime.fromtimestamp(
            os.path.getmtime(file_path)).strftime("%Y-%m-%d")
        # Previously this message printed calc_date by mistake.
        logging.info("File modification date is %s", file_date)

        file_stub = file_name.replace("ftb_", "").replace(".txt", "")
        df = utilities.get_master(file_stub, directory=directory)
        if df.shape[0] > 50000:
            # Cap the work per file; fixed seed keeps scores reproducible.
            df = df.sample(50000, replace=False, random_state=42)
        no_of_rows, no_of_columns = df.shape
        no_of_cells = no_of_rows * no_of_columns
        if no_of_cells == 0:
            # Every score below divides by the row or cell count.
            logging.warning("Skipping %s: no data to assess", file_name)
            continue

        base = {
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
        }

        # Consistency, coherence, or clarity
        category = "Consistency"
        logging.info("Running %s tests", category)
        if file_name in consistency_tests:
            test, find_violations = consistency_tests[file_name]
            score = 1 - (df[find_violations(df)].shape[0] / no_of_rows)
            dq_data.append(_quality_record(base, category, test, score))

        # Completeness or comprehensiveness
        category = "Completeness"
        logging.info("Running %s tests", category)
        score = df.count().sum() / no_of_cells
        dq_data.append(
            _quality_record(base, category, "Missing values", score))

        # Timeliness or latency
        category = "Timeliness"
        logging.info("Running %s tests", category)
        dq_data.append(
            _quality_record(base, category, "Days since file updated",
                            _date_gap_score(calc_date, file_date)))
        date_field = date_fields.get(file_name)
        if date_field:
            dq_data.append(
                _quality_record(
                    base, category, "Days since last match date",
                    _date_gap_score(calc_date, df[date_field].max())))

        # Accuracy or correctness
        category = "Accuracy"
        logging.info("Running %s tests", category)
        # Spots tests against ref data?
        # Wikipedia (Ajax, Frankfurt)
        # Don Balon (Spain 08/09)
        # SkySports Football Yearbook (England & Scotland 07/08)

        # Uniqueness
        category = "Uniqueness"
        logging.info("Running %s tests", category)
        score = df.drop_duplicates().shape[0] / no_of_rows
        dq_data.append(
            _quality_record(base, category, "Duplicated rows", score))
        # Share of distinct columns. The previous code divided the row count
        # by the row count, so this score was always 1.
        score = df.T.drop_duplicates().shape[0] / no_of_columns
        dq_data.append(
            _quality_record(base, category, "Duplicated columns", score))

        # Validity or reasonableness
        category = "Validity"
        logging.info("Running %s tests", category)
        # 0 <= Goals <= 10
        # Outlier rules only make sense for numeric columns, and recent
        # pandas raises when asked for the mean of text columns. Scores are
        # still expressed as a share of ALL cells.
        numeric = df.select_dtypes(include="number")
        if numeric.shape[1]:
            mean = numeric.mean()
            spread = 3 * numeric.std()
            stdev_outliers = ((numeric < (mean - spread)) |
                              (numeric > (mean + spread))).sum().sum()
            q1 = numeric.quantile(0.25)
            q3 = numeric.quantile(0.75)
            iqr = q3 - q1
            iqr_outliers = ((numeric < (q1 - 1.5 * iqr)) |
                            (numeric > (q3 + 1.5 * iqr))).sum().sum()
        else:
            stdev_outliers = iqr_outliers = 0
        dq_data.append(
            _quality_record(base, category, "3 stdev from mean",
                            1 - (stdev_outliers / no_of_cells)))
        dq_data.append(
            _quality_record(base, category, "1.5 IQR rule",
                            1 - (iqr_outliers / no_of_cells)))

        # Orderliness
        # Auditability
        # Conformity
        # accessibility or availability
        # comparability
        # credibility, reliability, or reputation
        # relevance, pertinence, or usefulness

    df_dq = pd.DataFrame(dq_data)
    df_dq["score"] = df_dq["score"].clip(lower=0, upper=1)
    utilities.save_master(df_dq, "quality", directory=directory)

    overall_score = df_dq.score.mean()
    logging.info("Overall score is %s", overall_score)

    return overall_score
def build_fulldata(directory=config.MASTER_DIR):
    """Combine results, stadiums and managers data into full dataset for clubs.

    Each match in the "results" master appears twice in the output: once
    from the home team's point of view and once from the away team's, with
    the opponent's figures in "...Opp" columns. Derived metrics, stadium
    details and the manager in charge on the match date are then added for
    both the team and its opponent, and the result is saved as the
    "fulldata" master.

    INPUT:
        directory: Directory the "results", "stadiums" and "managers"
            masters are read from and the "fulldata" master is saved to

    OUTPUT:
        fulldata: Dataframe containing all the clubs data
    """
    logging.info(
        "Building fulldata dataframe from results, stadiums, managers ...")

    # Column renames that express a match from the HOME team's perspective.
    home_renames = {
        "HomeTeam": "Team",
        "AwayTeam": "TeamOpp",
        "FTHG": "Goals",
        "FTAG": "GoalsOpp",
        "HTHG": "Goals1stHalf",
        "HTAG": "Goals1stHalfOpp",
        "HS": "Shots",
        "AS": "ShotsOpp",
        "HST": "ShotsOnTarget",
        "AST": "ShotsOnTargetOpp",
        "HHW": "ShotsHitWoodwork",
        "AHW": "ShotsHitWoodworkOpp",
        "HC": "Corners",
        "AC": "CornersOpp",
        "HF": "Fouls",
        "AF": "FoulsOpp",
        "HO": "Offsides",
        "AO": "OffsidesOpp",
        "HY": "YellowCards",
        "AY": "YellowCardsOpp",
        "HR": "RedCards",
        "AR": "RedCardsOpp",
        "HBP": "BookingPoints",
        "ABP": "BookingPointsOpp",
    }

    # The away perspective is the mirror image: every "X" target becomes
    # "XOpp" and every "XOpp" target becomes "X".
    away_renames = {}
    for key, val in home_renames.items():
        if val.endswith("Opp"):
            away_renames[key] = val[:-3]
        else:
            away_renames[key] = val + "Opp"

    # Stats that get a "<stat>Diff" and a "Total<stat>" column below.
    stat_to_diff = [
        "Goals",
        "Goals1stHalf",
        "Shots",
        "ShotsOnTarget",
        "ShotsHitWoodwork",
        "Corners",
        "Fouls",
        "Offsides",
        "YellowCards",
        "RedCards",
        "BookingPoints",
    ]

    logging.info("Process results")
    results = utilities.get_master("results", directory=directory)

    homeresults = results.rename(columns=home_renames)
    homeresults["HomeAway"] = "Home"

    awayresults = results.rename(columns=away_renames)
    awayresults["HomeAway"] = "Away"

    # Stack both perspectives: two rows per match.
    allresults = pd.concat([homeresults, awayresults],
                           ignore_index=True,
                           sort=False)
    allresults.drop(["FTR", "HTR", "Unnamed: 0"], axis=1, inplace=True)

    for stat in stat_to_diff:
        allresults[stat + "Diff"] = allresults[stat] - allresults[stat + "Opp"]
        allresults["Total" + stat] = allresults[stat] + allresults[stat + "Opp"]

    # Saves are derived as the opponent's shots on target that did not
    # result in a goal.
    allresults[
        "Saves"] = allresults["ShotsOnTargetOpp"] - allresults["GoalsOpp"]
    allresults["SavesOpp"] = allresults["ShotsOnTarget"] - allresults["Goals"]
    allresults["SavesDiff"] = allresults["Saves"] - allresults["SavesOpp"]

    # Second-half figures are full-time minus first-half.
    allresults[
        "Goals2ndHalf"] = allresults["Goals"] - allresults["Goals1stHalf"]
    allresults["Goals2ndHalfOpp"] = (allresults["GoalsOpp"] -
                                     allresults["Goals1stHalfOpp"])
    allresults["Goals2ndHalfDiff"] = (allresults["GoalsDiff"] -
                                      allresults["Goals1stHalfDiff"])

    # func_score (defined outside this block) is applied to each goal
    # difference; its return value is unpacked into the nine result columns
    # listed here, in this order.
    (
        allresults["Result"],
        allresults["Points"],
        allresults["PointsOpp"],
        allresults["Win"],
        allresults["WinDraw"],
        allresults["Draw"],
        allresults["DrawLoss"],
        allresults["Loss"],
        allresults["WinShare"],
    ) = zip(*allresults["GoalsDiff"].map(func_score))
    # func_nogoal (defined outside this block) is applied to the goals
    # SCORED by each side — NOTE(review): confirm which side's goals a
    # "clean sheet" flag should be based on.
    allresults["CleanSheet"] = allresults["Goals"].map(func_nogoal)
    allresults["CleanSheetOpp"] = allresults["GoalsOpp"].map(func_nogoal)
    # allresults['Date'] = pd.to_datetime(allresults['Date'], format="%d/%m/%y")

    # Running match number (1, 2, 3, ...) per team within a season and
    # division, in "Date" order. The assignment aligns on the row index, so
    # the frame itself is not re-sorted.
    # NOTE(review): "Date" is used as loaded — presumably an ISO-formatted
    # string that sorts chronologically; confirm.
    allresults["GameWeek"] = (allresults.sort_values("Date").groupby(
        ["Season", "Div", "Team"]).cumcount() + 1)

    ## TODO - Validate derived values

    logging.info("Process stadiums")
    stadiums = utilities.get_master("stadiums", directory=directory)
    stadiums.drop(["Country", "TeamFull"], axis=1, inplace=True)
    # Stadium details for the team ...
    fulldata = pd.merge(allresults, stadiums, on="Team", how="left")
    # ... and for the opponent; clashing column names get an "Opp" suffix.
    stadiums.rename(columns={"Team": "TeamOpp"}, inplace=True)
    fulldata = pd.merge(fulldata,
                        stadiums,
                        on="TeamOpp",
                        how="left",
                        suffixes=("", "Opp"))
    fulldata.drop(["Unnamed: 0", "Unnamed: 0Opp"], axis=1, inplace=True)
    # Straight-line distance between the two stadiums in raw
    # latitude/longitude coordinate space (not a geographic distance).
    fulldata["EuclideanDistance"] = (
        (fulldata.Latitude - fulldata.LatitudeOpp)**2 +
        (fulldata.Longitude - fulldata.LongitudeOpp)**2)**0.5

    logging.info("Process managers")
    managers = utilities.get_master("managers", directory=directory)
    managers.dropna(subset=["Manager"], inplace=True)
    # Join EVERY manager record of the team to each match, then keep only
    # the record whose tenure covers the match date (or the single
    # unmatched row when the team has no manager records at all).
    # NOTE(review): a match whose team has manager records, none of which
    # covers the match date (including a missing DateTo), fails both
    # conditions and is dropped from the output — confirm this is intended.
    fulldata = pd.merge(fulldata, managers, on="Team", how="left")
    fulldata = fulldata[((fulldata["Date"] >= fulldata["DateFrom"]) &
                         (fulldata["Date"] <= fulldata["DateTo"])) |
                        (fulldata["Manager"].isnull())]
    fulldata.drop(
        ["Unnamed: 0", "DateFrom", "DateTo", "Duration", "YearRange"],
        axis=1,
        inplace=True,
    )
    fulldata = fulldata.drop_duplicates()

    # Same join-and-filter for the opponent's manager; the clashing
    # "Manager" column from this merge becomes "ManagerOpp".
    managers.rename(columns={"Team": "TeamOpp"}, inplace=True)
    fulldata = pd.merge(fulldata,
                        managers,
                        on="TeamOpp",
                        how="left",
                        suffixes=("", "Opp"))
    fulldata = fulldata[((fulldata["Date"] >= fulldata["DateFrom"]) &
                         (fulldata["Date"] <= fulldata["DateTo"])) |
                        (fulldata["ManagerOpp"].isnull())]
    fulldata.drop(
        ["Unnamed: 0", "DateFrom", "DateTo", "Duration", "YearRange"],
        axis=1,
        inplace=True,
    )
    fulldata = fulldata.drop_duplicates()

    utilities.save_master(fulldata, "fulldata", directory=directory)
    return fulldata
def get_summary(
    group_key,
    df=None,
    agg_method="mean",
    base_filters=None,
    metric_mins=None,
    output_metrics=(),
):
    """Generate summarised clubs data.

    The data is filtered, grouped by ``group_key`` and aggregated; a match
    count and a set of derived shot/save ratios are added to the aggregate.
    A dataframe passed in via ``df`` is never modified.

    INPUT:
        group_key: Field (or list/tuple of Fields) to group data on
        df: (Optional) pass in clubs Dataframe; when omitted the "fulldata"
            master is loaded
        agg_method: Aggregation method
        base_filters: Dictionary with Field/Value(s) pairs to filter base data
        metric_mins: Dictionary with Field/minimum Value pairs to filter agg data
        output_metrics: Metric fields to include in output (all when empty)

    OUTPUT:
        df: Aggregated dataframe, indexed by the group key(s), with the row
            count per group in "NumberOfMatches"
    """
    logging.debug("Get summarised data")

    base_filters = {} if base_filters is None else base_filters
    metric_mins = {} if metric_mins is None else metric_mins

    if df is None:
        # fetch from master csv
        df = utilities.get_master("fulldata")

    # filter unwanted records
    for field, vals in base_filters.items():
        df = df[df[field].isin(vals)]

    multi_key = isinstance(group_key, (list, tuple))
    key_fields = list(group_key) if multi_key else [group_key]

    # Not in place: ``df`` may be the caller's own dataframe.
    df = df.dropna(subset=key_fields)

    # aggregate data
    grouped = df.groupby(key_fields if multi_key else group_key)
    try:
        df_avg = grouped.agg(agg_method)
    except TypeError:
        # Recent pandas raises instead of silently skipping columns the
        # aggregation cannot handle (e.g. the mean of text columns); fall
        # back to aggregating the numeric columns only.
        numeric_cols = [
            col for col in df.select_dtypes(include="number").columns
            if col not in key_fields
        ]
        df_avg = grouped[numeric_cols].agg(agg_method)
    # Name the count explicitly rather than relying on the name pandas
    # gives a value_counts() result, which differs between versions.
    df_cnt = grouped.size().rename("NumberOfMatches")
    df = pd.concat([df_cnt, df_avg], axis=1, sort=True)
    if "Unnamed: 0" in df.columns:
        df = df.drop(["Unnamed: 0"], axis=1)

    # add derived metrics
    df["ShotAccuracy"] = df["ShotsOnTarget"] / df["Shots"]
    df["ShotAccuracyOpp"] = df["ShotsOnTargetOpp"] / df["ShotsOpp"]
    df["ShotPercent"] = df["Goals"] / df["ShotsOnTarget"]
    df["ShotPercentOpp"] = df["GoalsOpp"] / df["ShotsOnTargetOpp"]
    df["SavePercent"] = df["Saves"] / df["ShotsOnTargetOpp"]
    df["SavePercentOpp"] = df["SavesOpp"] / df["ShotsOnTarget"]
    df["ShotConversion"] = df["Goals"] / df["Shots"]
    df["ShotConversionOpp"] = df["GoalsOpp"] / df["ShotsOpp"]
    df["TSR"] = df["Shots"] / df["TotalShots"]
    df["TSROpp"] = df["ShotsOpp"] / df["TotalShots"]
    df["ShotOnTargetRatio"] = df["ShotsOnTarget"] / df["TotalShotsOnTarget"]
    df["ShotOnTargetRatioOpp"] = (df["ShotsOnTargetOpp"] /
                                  df["TotalShotsOnTarget"])
    df["ShotDominance"] = df["Shots"] / df["ShotsOpp"]
    df["ShotPace"] = df["TotalShots"]
    df["PDO"] = 1000 * (df["ShotPercent"] + df["SavePercent"])
    df["PDOOpp"] = 1000 * (df["ShotPercentOpp"] + df["SavePercentOpp"])
    df["%TSoTt"] = df["ShotAccuracy"] + (1 - df["ShotAccuracyOpp"])
    df["%TSoTtOpp"] = df["ShotAccuracyOpp"] + (1 - df["ShotAccuracy"])
    df["GraysonRating"] = ((0.5 + (df["TSR"] - 0.5) * 0.732**0.5) *
                           (1.0 + (df["%TSoTt"] - 1.0) * 0.166**0.5) *
                           (1000 + (df["PDO"] - 1000) * 0.176**0.5))
    df["GraysonRatingOpp"] = ((0.5 + (df["TSROpp"] - 0.5) * 0.732**0.5) *
                              (1.0 + (df["%TSoTtOpp"] - 1.0) * 0.166**0.5) *
                              (1000 + (df["PDOOpp"] - 1000) * 0.176**0.5))
    # Rescale the rating so that 363 maps to 0 and 695 maps to 10.
    df["GraysonScore"] = 10 * (df["GraysonRating"] - 363) / (695 - 363)
    df["GraysonScoreOpp"] = 10 * (df["GraysonRatingOpp"] - 363) / (695 - 363)

    # filter unwanted aggregate data
    for field, val in metric_mins.items():
        df = df[df[field] >= val]

    if output_metrics:
        # A tuple used as a column selector is read by pandas as ONE key, so
        # turn any non-string collection into a list first.
        if not isinstance(output_metrics, str):
            output_metrics = list(output_metrics)
        df = df[output_metrics]

    logging.debug("Showing summarised dataframe...\n%s", df)

    return df
import os
import numpy as np
import pickle
import matplotlib.pyplot as plt

plt.style.use("seaborn-whitegrid")
import seaborn as sns

sns.set()

import src.utilities as utilities

# In[2]:

match = utilities.get_master("nations_matches")
# match.info()

# Keep only the columns used in this notebook. The explicit copy makes
# `match` an independent frame, so the column assignments below are
# guaranteed to take effect and do not trigger pandas' chained-assignment
# (SettingWithCopy) warning.
match = match[[
    'Round', 'Day', 'Date', 'Time', 'Team_1', 'Team_2', 'Year', 'Goals_1',
    'Goals_2', 'Goal_diff', 'Venue', 'Venue_country', 'Venue_city', 'Home_1',
    'Home_2'
]].copy()

match["Goal_total"] = match.Goals_1 + match.Goals_2

# Result from Team_1's point of view; rows where the goals cannot be
# compared (e.g. missing values) keep the initial None.
match["Result"] = None
match.loc[match.Goals_1 == match.Goals_2, "Result"] = "Draw"
match.loc[match.Goals_1 > match.Goals_2, "Result"] = "Win"
match.loc[match.Goals_1 < match.Goals_2, "Result"] = "Loss"

# Displayed as the cell output: one row of summary statistics per column.
match.describe(include="all").T
"""data quality dashboard app. Used for running data quality dashboard webapp """ import dash import dash_core_components as dcc import dash_html_components as html import numpy as np import pandas as pd import plotly.graph_objects as go import src.utilities as utilities data = utilities.get_master("quality") overall_score = data.score.mean() # range_max = data.score.max() data_pivot = pd.pivot_table( data, values="score", index="file", columns="category", aggfunc=np.mean, margins=False, ) # print(data_pivot) fig = go.Figure( data=go.Heatmap( z=data_pivot.values,
def test_get_master(self):
    """Test that a saved dummy dataframe can be loaded back from its csv."""
    utilities.save_master(self.testFrame, "dummy", directory=self.testDir)
    loaded = utilities.get_master("dummy", directory=self.testDir)

    # The shape comparison used to be evaluated and thrown away, so this
    # test could never fail.
    expected_rows, expected_cols = self.testFrame.shape
    assert loaded.shape[0] == expected_rows
    # NOTE(review): only a lower bound is asserted on the column count in
    # case the save/load round trip adds a serialised index column; tighten
    # to `loaded.shape == self.testFrame.shape` once that is confirmed not
    # to happen.
    assert loaded.shape[1] >= expected_cols
# In[4]:

## visualisation
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

plt.style.use("seaborn-whitegrid")
import seaborn as sns

# sns.set()
sns.set(rc={'figure.figsize': (10, 5)})
# from mpl_toolkits.basemap import Basemap

# In[5]:

df = utilities.get_master("fulldata")

# Keep one row per match (the home team's view) with a known goal total.
# Filtering, dropping and copying in one chain yields an independent frame,
# so the column assignments below neither modify a slice of the loaded data
# nor trigger pandas' chained-assignment (SettingWithCopy) warning.
df = df[df.HomeAway == "Home"].dropna(subset=['TotalGoals']).copy()

df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")
# Vectorised formatting; unlike a per-row strftime call it also copes with
# missing dates. "%m" is the zero-padded month, "%w" the weekday number
# with Sunday as 0.
df["Month"] = df["Date"].dt.strftime("%m")
df["Day_Of_Week"] = df["Date"].dt.strftime("%w")

# Descriptive (non-metric) columns.
attrib_cols = [
    "Date", "Month", "Day_Of_Week", "HomeAway", "Season", "Country", "Tier",
    "Team", "TeamOpp", "Manager", "ManagerOpp", "Referee", "Stadium",
    "Latitude", "Longitude"
]
# Metric column(s).
metric_cols = ["TotalGoals"]