def build_event_data(competition_id, event_type, directory=config.MASTER_DIR):
    """Build formatted event data from source files.

    Joins the seasons of one competition to their matches and the matches
    to their events, then saves the result as the "events_<event_type>"
    master file.

    Args:
        competition_id: Id of the competition to build events for
        event_type: String value of event type (only used to name the output)
        directory: Folder for processed data

    Returns:
        data: Dataframe holding the merged competition/match/event rows
    """
    import io

    def _info_text(frame):
        # DataFrame.info() writes to a buffer and returns None, so the text
        # has to be captured explicitly or the log line just reads "None".
        buffer = io.StringIO()
        frame.info(buf=buffer)
        return buffer.getvalue()

    logging.info("Building event data")

    comps = utilities.folder_loader("stb", "competitions")
    logging.debug(_info_text(comps))
    matches = utilities.folder_loader("stb", "matches")
    logging.debug(_info_text(matches))
    events = utilities.folder_loader("stb", "events", "match_event")
    logging.debug(_info_text(events))

    comp_cols = ["season_id", "country_name", "competition_name", "season_name"]
    match_cols = ["match_id", "match_date", "kick_off", "season"]
    event_cols = [
        "event_type",
        "period",
        "minute",
        "team",
        "player",
        "statsbomb_xg",
        "type",
        "outcome",
        "start_location_x",
        "start_location_y",
        "end_location_x",
        "end_location_y",
        "end_location_z",
        "match_id",
    ]

    seasons = comps.loc[comps.competition_id == competition_id, comp_cols]
    season_matches = matches.loc[matches.competition == competition_id, match_cols]

    data = seasons.merge(
        season_matches,
        how="inner",
        left_on="season_id",
        right_on="season",
    ).merge(
        events.loc[:, event_cols],
        how="inner",
        on="match_id",
    )
    logging.debug(_info_text(data))

    utilities.save_master(data, "events_{0}".format(event_type), directory=directory)
    return data
def get_outfile(source_name):
    """Return outfile stub for given source.

    INPUT:
        source_name: String containing name of the data source
            ("tmk_cnt" or "tmk_psm")

    OUTPUT:
        outfile_stub: Stub to use when saving output

    RAISES:
        ValueError: If source_name is not a known data source
    """
    logging.info("Mapping {0} to outfile".format(source_name))

    outfile_stubs = {
        "tmk_cnt": "players_contract",
        "tmk_psm": "players_performance",
    }
    # Fail with a clear message instead of an UnboundLocalError further down
    if source_name not in outfile_stubs:
        raise ValueError(
            "Unknown source_name {0!r}; expected one of {1}".format(
                source_name, sorted(outfile_stubs)
            )
        )

    outfile_stub = outfile_stubs[source_name]
    logging.debug(outfile_stub)
    return outfile_stub
def download_events(competition_id, event_type):
    """Download all event data.

    Reads the competitions file to find the seasons of the competition,
    reads each season's matches file, and asks ``sb.Events`` to save the
    events of every match from inside the "stb/events" source folder.

    The working directory is changed while downloading and restored before
    returning (also when a download fails).

    Args:
        competition_id: Id of competition
        event_type: String value of event type

    Returns:
        None
    """
    logging.info("Downloading events")
    # Pass a real format string: the first positional argument of
    # logging.debug is the message template, not a value to log.
    logging.debug("competition_id=%s, event_type=%s", competition_id, event_type)

    # Resolve once, up front: after os.chdir a relative SOURCE_DIR would
    # otherwise point somewhere else for the second and later seasons.
    stb_dir = os.path.abspath(os.path.join(config.SOURCE_DIR, "stb"))
    events_dir = os.path.join(stb_dir, "events")

    comps = pd.read_csv(
        os.path.join(stb_dir, "competitions", "competitions_None.csv"))
    season_ids = comps[comps.competition_id == competition_id].season_id.values

    starting_dir = os.getcwd()
    try:
        for season_id in season_ids:
            logging.debug(season_id)
            mats = pd.read_csv(
                os.path.join(
                    stb_dir,
                    "matches",
                    "matches_{0}_{1}.csv".format(competition_id, season_id),
                ))
            match_ids = mats.match_id.values

            # NOTE(review): save_data presumably writes relative to the
            # current directory, hence the chdir - confirm against sb.
            os.chdir(events_dir)
            for match_id in match_ids:
                logging.debug(match_id)
                sb.Events(event_id=str(match_id)).save_data(event_type=event_type)
    finally:
        os.chdir(starting_dir)

    return
def format_stadiums(
    dgl_file=config.STADIUMS_SCRAPE["dgl"][1],
    ops_file=config.STADIUMS_SCRAPE["ops"][1],
    directoryOut=config.MASTER_DIR,
):
    """Combine the two scraped stadium files into one master file.

    Both files are indexed by team; values from the "ops" file take
    priority and gaps are filled from the "dgl" file.

    INPUT:
        dgl_file: Path for "dgl" stadiums file
        ops_file: Path for "ops" stadiums file
        directoryOut: Directory to save formatted data to

    OUTPUT:
        None
    """
    logging.info("Formatting stadiums")

    logging.info("Parsing: {0}".format(dgl_file))
    dgl = (
        pd.read_csv(dgl_file, encoding="utf8", sep=",")
        .rename(columns={"Name": "Stadium"})
        .set_index("Team")
    )
    logging.debug("\n{0}".format(dgl))

    logging.info("Parsing: {0}".format(ops_file))
    # The "ops" file keys teams by its FDCOUK column; keep the original
    # team name alongside as TeamFull.
    ops = (
        pd.read_csv(ops_file, encoding="utf8", sep=",")
        .rename(columns={"Team": "TeamFull", "FDCOUK": "Team"})
        .set_index("Team")
    )
    logging.debug("\n{0}".format(ops))

    ## TODO - fuzzy matching teams? (name inconsistencies?)

    logging.info("Create combined stadiums data")
    combo = ops.combine_first(dgl).reset_index(level=0)
    logging.debug("\n{0}".format(combo))

    utilities.save_master(combo, "stadiums", directory=directoryOut)
    return
def format_matches(
    directoryOut=config.MASTER_DIR,
):
    """Format national team match data.

    Loads both "fbr" competition layouts (with and without xG columns),
    splits team names from their 3-character abbreviations, parses the
    goals out of the score, attaches the venue country and flags a team as
    playing at home when its name equals the venue country.

    INPUT:
        directoryOut: Directory to save formatted data to

    OUTPUT:
        match: National match data dataframe
    """
    import io

    def _info_text(frame):
        # DataFrame.info() writes to a buffer and returns None, so the text
        # has to be captured explicitly or the log line just reads "None".
        buffer = io.StringIO()
        frame.info(buf=buffer)
        return buffer.getvalue()

    logging.info("Formatting national team match data")
    comp = utilities.folder_loader(
        "fbr",
        "competition",
        source_header=[
            "Round",
            "Wk",
            "Day",
            "Date",
            "Time",
            "Team_1",
            "Score",
            "Team_2",
            "Attendance",
            "Venue",
            "Referee",
            "Match Report",
            "Notes",
        ],
    )
    comp2 = utilities.folder_loader(
        "fbr",
        "competition2",
        source_header=[
            "Round",
            "Wk",
            "Day",
            "Date",
            "Time",
            "Team_1",
            "xG_1",
            "Score",
            "xG_2",
            "Team_2",
            "Attendance",
            "Venue",
            "Referee",
            "Match Report",
            "Notes",
        ],
    )
    comp = pd.concat([comp, comp2], axis=0, sort=False, ignore_index=True)
    comp.dropna(subset=["Round"], inplace=True)
    comp.reset_index(drop=True, inplace=True)

    comp["Year"] = comp.Date.str[:4]
    # Team_1 carries its abbreviation as the last 3 characters,
    # Team_2 carries it as the first 3 characters.
    comp["Team_abbrev_1"] = comp["Team_1"].str[-3:].str.strip()
    comp["Team_1"] = comp["Team_1"].str[:-3].str.strip()
    comp["Team_abbrev_2"] = comp["Team_2"].str[:3].str.strip()
    comp["Team_2"] = comp["Team_2"].str[3:].str.strip()

    # Raw strings keep "\)" / "\(" as regex escapes instead of invalid
    # Python string escapes. The optional bracketed parts are skipped
    # (presumably shoot-out scores - TODO confirm against the source data).
    comp["Goals_1"] = comp.Score.str.extract(
        pat=r"(?:^|\) )([0-9]{1,2})[^0-9]+[0-9]{1,2}", expand=False)
    comp["Goals_2"] = comp.Score.str.extract(
        pat=r"[0-9]{1,2}[^0-9]+([0-9]{1,2})(?:$| \()", expand=False)
    for goals_col in ("Goals_1", "Goals_2"):
        comp[goals_col] = pd.to_numeric(comp[goals_col], errors="coerce")
    comp["Goal_diff"] = comp.Goals_1 - comp.Goals_2
    logging.debug("\n{0}".format(_info_text(comp)))

    venue = pd.read_csv(
        os.path.join(config.SOURCE_DIR, "wkp", "wkp_std", "wkp_std_nat.csv"),
        encoding="latin9",
        sep=",",
    )
    venue.columns = ["Venue_country", "Venue_city", "Venue", "Venue_URL"]
    logging.debug("\n{0}".format(_info_text(venue)))

    match = pd.merge(comp, venue, on="Venue", how="left")

    ## workaround for venues that aren't mapping
    match.loc[match.Venue == "Stadion Energa Gdańsk", "Venue_country"] = "Poland"
    match.loc[match.Venue == "Bakı Olimpiya Stadionu", "Venue_country"] = "Azerbaijan"
    match.loc[match.Venue == "Arena Naţională", "Venue_country"] = "Romania"

    for i in range(1, 3):
        home_col = "Home_" + str(i)
        match[home_col] = 0
        match.loc[match["Team_" + str(i)] == match.Venue_country, home_col] = 1
    logging.debug("\n{0}".format(_info_text(match)))

    utilities.save_master(match, "nations_matches", directory=directoryOut)
    return match
def get_summary(
    group_key,
    df=None,
    agg_method="mean",
    base_filters=None,
    metric_mins=None,
    output_metrics=(),
):
    """Generate summarised clubs data.

    The base data is filtered, grouped on ``group_key`` and aggregated, the
    number of rows per group is added as "NumberOfMatches", and a set of
    derived shot/save metrics is computed from the aggregated columns.
    A dataframe passed in by the caller is not modified.

    INPUT:
        group_key: Field (a single column name) to group data on
        df: (Optional) pass in clubs Dataframe; when omitted the
            "fulldata" master is loaded
        agg_method: Aggregation method
        base_filters: Dictionary with Field/Value(s) pairs to filter base data
            (rows are kept when the field value is in the given values)
        metric_mins: Dictionary with Field/Value pairs giving the minimum
            allowed value per field of the aggregated data
        output_metrics: Metric fields to include in output (all when empty)

    OUTPUT:
        df: Aggregated dataframe
    """
    logging.debug("Get summarised data")

    # None stands in for "no filters" so no dict is shared between calls
    base_filters = {} if base_filters is None else base_filters
    metric_mins = {} if metric_mins is None else metric_mins

    if df is None:
        # fetch from master csv
        df = utilities.get_master("fulldata")

    # filter unwanted records
    for field, vals in base_filters.items():
        df = df[df[field].isin(vals)]
    # Non-inplace so the caller's dataframe is left untouched
    df = df.dropna(subset=[group_key])

    # aggregate data
    df_avg = df.groupby(group_key).agg(agg_method)
    # Name the counts explicitly: the name value_counts() gives its result
    # differs between pandas versions, so renaming by group_key is fragile.
    df_cnt = df[group_key].value_counts().rename("NumberOfMatches")
    df = pd.concat([df_cnt, df_avg], axis=1, sort=True)
    if "Unnamed: 0" in df.columns:
        df = df.drop(["Unnamed: 0"], axis=1)

    # add derived metrics (simple ratios first, in output column order)
    ratio_metrics = (
        ("ShotAccuracy", "ShotsOnTarget", "Shots"),
        ("ShotAccuracyOpp", "ShotsOnTargetOpp", "ShotsOpp"),
        ("ShotPercent", "Goals", "ShotsOnTarget"),
        ("ShotPercentOpp", "GoalsOpp", "ShotsOnTargetOpp"),
        ("SavePercent", "Saves", "ShotsOnTargetOpp"),
        ("SavePercentOpp", "SavesOpp", "ShotsOnTarget"),
        ("ShotConversion", "Goals", "Shots"),
        ("ShotConversionOpp", "GoalsOpp", "ShotsOpp"),
        ("TSR", "Shots", "TotalShots"),
        ("TSROpp", "ShotsOpp", "TotalShots"),
        ("ShotOnTargetRatio", "ShotsOnTarget", "TotalShotsOnTarget"),
        ("ShotOnTargetRatioOpp", "ShotsOnTargetOpp", "TotalShotsOnTarget"),
        ("ShotDominance", "Shots", "ShotsOpp"),
    )
    for metric, numerator, denominator in ratio_metrics:
        df[metric] = df[numerator] / df[denominator]
    df["ShotPace"] = df["TotalShots"]

    df["PDO"] = 1000 * (df["ShotPercent"] + df["SavePercent"])
    df["PDOOpp"] = 1000 * (df["ShotPercentOpp"] + df["SavePercentOpp"])
    df["%TSoTt"] = df["ShotAccuracy"] + (1 - df["ShotAccuracyOpp"])
    df["%TSoTtOpp"] = df["ShotAccuracyOpp"] + (1 - df["ShotAccuracy"])

    def _grayson_rating(tsr, tsott, pdo):
        # Each component is pulled towards its neutral value (0.5, 1.0,
        # 1000) by the square root of a fixed weight before multiplying.
        return ((0.5 + (tsr - 0.5) * 0.732**0.5) *
                (1.0 + (tsott - 1.0) * 0.166**0.5) *
                (1000 + (pdo - 1000) * 0.176**0.5))

    df["GraysonRating"] = _grayson_rating(df["TSR"], df["%TSoTt"], df["PDO"])
    df["GraysonRatingOpp"] = _grayson_rating(
        df["TSROpp"], df["%TSoTtOpp"], df["PDOOpp"])
    df["GraysonScore"] = 10 * (df["GraysonRating"] - 363) / (695 - 363)
    df["GraysonScoreOpp"] = 10 * (df["GraysonRatingOpp"] - 363) / (695 - 363)

    # filter unwanted aggregate data
    for field, val in metric_mins.items():
        df = df[df[field] >= val]

    if output_metrics:
        # A tuple would be read by pandas as one (multi-level) key
        if isinstance(output_metrics, tuple):
            output_metrics = list(output_metrics)
        df = df[output_metrics]

    logging.debug("Showing summarised dataframe...\n{0}".format(df))
    return df
def format_results(
    parentDirectory=config.SOURCE_DIR,
    subDirectory=config.RESULTS_SCRAPE["ftd"][1],
    directoryOut=config.MASTER_DIR,
):
    """Format raw results and save processed output.

    Walks the input directory for ".csv" files, normalises the home/away
    team column names, concatenates everything and saves the selected
    columns as the "results" master file.

    INPUT:
        parentDirectory: Parent directory holding the raw results
        subDirectory: Sub-Directory (below the parent) that is walked
            recursively for ".csv" files
        directoryOut: Directory to save output to

    OUTPUT:
        None

    RAISES:
        RuntimeError: If a file has neither HomeTeam/AwayTeam nor HT/AT columns
    """
    import io

    def _info_text(frame):
        # DataFrame.info() writes to a buffer and returns None, so the text
        # has to be captured explicitly or the log line just reads "None".
        buffer = io.StringIO()
        frame.info(buf=buffer)
        return buffer.getvalue()

    def _read_results_csv(path):
        # Malformed lines are skipped silently. Newer pandas spells that
        # on_bad_lines="skip"; older releases only know the
        # error_bad_lines/warn_bad_lines pair and reject the new keyword.
        try:
            return pd.read_csv(path, on_bad_lines="skip", encoding="latin9")
        except TypeError:
            return pd.read_csv(
                path,
                error_bad_lines=False,
                warn_bad_lines=False,
                encoding="latin9",
            )

    directoryIn = os.path.join(parentDirectory, subDirectory)
    logging.info("Format results in {0}".format(directoryIn))

    pieces = []
    core_cols = ["Div", "Date"]
    use_cols = [
        "Season",
        "Div",
        "Country",
        "Tier",
        "Date",
        "HomeTeam",
        "AwayTeam",
        "FTHG",
        "FTAG",
        "FTR",
        "HTHG",
        "HTAG",
        "HTR",
        "Attendance",
        "Referee",
        "HS",
        "AS",
        "HST",
        "AST",
        "HHW",
        "AHW",
        "HC",
        "AC",
        "HF",
        "AF",
        "HO",
        "AO",
        "HY",
        "AY",
        "HR",
        "AR",
        "HBP",
        "ABP",
    ]

    for root, _dirs, files in os.walk(directoryIn):
        for filename in files:
            if not filename.endswith(".csv"):
                continue

            filepath = os.path.join(root, filename)
            logging.info("Filepath: {0}".format(filepath))
            df = _read_results_csv(filepath)
            logging.debug("Input columns: {0}".format(df.columns))

            # The season is the last 9 characters of the containing folder
            # path (presumably "YYYY-YYYY" - TODO confirm folder naming).
            df["Season"] = root[-9:]

            if not {"HomeTeam", "AwayTeam"}.issubset(df.columns):
                if {"HT", "AT"}.issubset(df.columns):
                    # Some files use the short column names
                    df["HomeTeam"] = df["HT"]
                    df["AwayTeam"] = df["AT"]
                else:
                    raise RuntimeError(
                        "No home/away team columns found in {0}".format(filepath)
                    )

            # drop useless rows
            df = df.dropna(subset=core_cols)
            logging.debug("Output columns: {0}".format(df.columns))
            pieces.append(df)

    logging.info("Concatenate everything into a single DataFrame")
    dframe = pd.concat(pieces, ignore_index=True, sort=False)
    dframe["Country"], dframe["Tier"] = zip(*dframe["Div"].map(func_div))
    dframe.Date = pd.to_datetime(dframe.Date, dayfirst=True)
    logging.info(_info_text(dframe[use_cols]))

    utilities.save_master(dframe[use_cols], "results", directory=directoryOut)
def _backfill_mixed_players(df, column):
    """Back-fill ``column`` for players that have both known and missing values."""
    has_value = df.Name.isin(df[df[column].notna()].Name.values)
    has_gap = df.Name.isin(df[df[column].isna()].Name.values)
    mixed = has_value & has_gap
    # NOTE(review): the back-fill runs over the Name/Season-sorted rows of
    # all selected players at once, so a gap in a player's latest season
    # looks like it can pick up the next player's value - confirm intent.
    df.loc[mixed, column] = (
        df.loc[mixed].sort_values(by=["Name", "Season"])[column].bfill()
    )


def clean_data(source_name, directory=config.MASTER_DIR):
    """Clean raw player data and save processed version.

    INPUT:
        source_name: String containing name of the data source
            ("tmk_cnt" for contract data, "tmk_psm" for performance data)
        directory: Directory to save output to

    OUTPUT:
        df: Dataframe containing the cleaned data

    RAISES:
        ValueError: If source_name is not a known data source
    """
    logging.info("Loading {0} data".format(source_name))

    if source_name == "tmk_cnt":
        source_header = [
            "Shirt number",
            "Position",
            "Name",
            "Date of birth",
            "Nationality",
            "Height",
            "Foot",
            "Joined",
            "Signed from",
            "Contract expires",
            "Market value",
        ]
        drop_cols = ["Nationality", "Signed from", "Competition"]
        notna_cols = ["Market value"]
    elif source_name == "tmk_psm":
        source_header = [
            "Shirt number",
            "Position",
            "Name",
            "Age",
            "Nationality",
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]
        drop_cols = ["Nationality"]
        notna_cols = ["In squad"]
    else:
        # Fail early and clearly instead of an UnboundLocalError below
        raise ValueError(
            "Unknown source_name {0!r}; expected 'tmk_cnt' or 'tmk_psm'".format(
                source_name
            )
        )

    df = utilities.folder_loader(
        source_name[:3], source_name, "comp_season", source_header=source_header
    )

    ## Name and Position are mis-aligned in the source files
    # Assign the result back: an inplace fill on a selected column is not
    # guaranteed to reach the dataframe itself.
    df["Name"] = df["Name"].bfill()
    df["Position"] = df.Name.shift(-1)
    df.loc[df.Position == df.Name, "Position"] = df.Name.shift(-2)

    df = df.drop(columns=drop_cols)
    df = df.dropna(subset=notna_cols)

    df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

    # Placeholder texts that stand for "no value"
    for placeholder in (
        "-",
        "Was not used during this season",
        "Not in squad during this season",
        "Not used during this season",
    ):
        df = df.replace(placeholder, np.nan)

    df["Shirt number"] = pd.to_numeric(df["Shirt number"], downcast="integer")

    # Later entries win when a position matches more than one group
    df["Position group"] = None
    position_upper = df.Position.str.upper()
    for pattern, group in (
        ("KEEPER|GOAL", "G"),
        ("BACK|DEF", "D"),
        ("MID|MIT|WING", "M"),
        ("STRIKER|FORW", "F"),
    ):
        # na=False: a missing position simply belongs to no group
        df.loc[position_upper.str.contains(pattern, na=False), "Position group"] = group

    if source_name == "tmk_cnt":
        # Age must be read before "Date of birth" is overwritten below
        df["Age"] = (
            df["Date of birth"]
            .str.extract(r".*([0-9]{2})", expand=False)
            .astype("int")
        )
        df["Date of birth"] = pd.to_datetime(
            df["Date of birth"].str.extract(r"(.*) \([0-9]{2}\)", expand=False),
            format="%b %d, %Y",
        )
        df["Joined"] = pd.to_datetime(df.Joined, format="%b %d, %Y")
        df["Contract expires"] = pd.to_datetime(
            df["Contract expires"], format="%d.%m.%Y"
        )

        df["Height"] = (
            df["Height"]
            .str.strip()
            .str.replace(" ", "")
            .str.replace(",", "")
            .str.replace("m", "")
            .replace({"-": np.nan, "": np.nan})
            .astype(float)
        )
        _backfill_mixed_players(df, "Height")
        _backfill_mixed_players(df, "Foot")

        # NOTE(review): "." is stripped together with the currency/unit
        # characters, so a decimal amount would lose its decimal point -
        # confirm against the raw "Market value" format.
        amount = (
            df["Market value"]
            .str.strip()
            .replace({"-": np.nan})
            .replace(r"[£kmTh\.]", "", regex=True)
            .astype(float)
        )
        multiplier = (
            df["Market value"]
            .str.extract(r"[\d\.]+([kmTh\.]+)", expand=False)
            .fillna(1)
            .replace(["k", "Th.", "m"], [10**3, 10**3, 10**6])
            .astype(int)
        )
        # Stored in millions
        df["Market value"] = amount * multiplier / 10**6

    elif source_name == "tmk_psm":
        df["PPG"] = (
            df["PPG"].str.strip().replace(r"[,]", ".", regex=True).astype(float)
        )
        df["Minutes played"] = (
            df["Minutes played"]
            .str.strip()
            .replace(r"[.\']", "", regex=True)
            .astype(float)
        )
        numeric_cols = [
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]
        df[numeric_cols] = df[numeric_cols].fillna(0).astype(float)

    logging.debug(df.describe(include="all"))

    logging.info("Saving processed data to {0}".format(directory))
    utilities.save_master(df, get_outfile(source_name), directory=directory)

    return df