def build_event_data(competition_id, event_type, directory=config.MASTER_DIR):
    """Build formatted event data from source files.

    Loads StatsBomb competitions, matches and events, restricts them to a
    single competition, merges them into one dataframe and saves it as
    "events_<event_type>".

    Args:
        competition_id: StatsBomb competition identifier used to filter
            the competitions and matches frames
        event_type: String value of event type (used only to name the
            output file)
        directory: Folder for processed data

    Returns:
        data: Merged competition/match/event dataframe (also saved to disk)
    """
    logging.info("Building event data")
    comps = utilities.folder_loader("stb", "competitions")
    logging.debug(comps.info())
    matches = utilities.folder_loader("stb", "matches")
    logging.debug(matches.info())
    events = utilities.folder_loader("stb", "events", "match_event")
    logging.debug(events.info())
    # Competitions -> matches joined on season; matches -> events on match_id.
    # Inner joins drop seasons/matches with no corresponding event rows.
    data = (
        comps.loc[
            comps.competition_id == competition_id,
            ["season_id", "country_name", "competition_name", "season_name"],
        ]
        .merge(
            matches.loc[
                matches.competition == competition_id,
                ["match_id", "match_date", "kick_off", "season"],
            ],
            how="inner",
            left_on="season_id",
            right_on="season",
        )
        .merge(
            events.loc[
                :,
                [
                    "event_type",
                    "period",
                    "minute",
                    "team",
                    "player",
                    "statsbomb_xg",
                    "type",
                    "outcome",
                    "start_location_x",
                    "start_location_y",
                    "end_location_x",
                    "end_location_y",
                    "end_location_z",
                    "match_id",
                ],
            ],
            how="inner",
            on="match_id",
        )
    )
    logging.debug(data.info())
    utilities.save_master(data, "events_{0}".format(event_type), directory=directory)
    return data
def format_stadiums(
    dgl_file=config.STADIUMS_SCRAPE["dgl"][1],
    ops_file=config.STADIUMS_SCRAPE["ops"][1],
    directoryOut=config.MASTER_DIR,
):
    """Format stadiums data.

    Reads the "dgl" and "ops" stadium scrapes, combines them (preferring
    "ops" values where both sources have data) and saves the result as the
    "stadiums" master file.

    INPUT:
        dgl_file: Path for "dgl" stadiums file
        ops_file: Path for "ops" stadiums file
        directoryOut: Directory to save formatted data to

    OUTPUT:
        None
    """
    logging.info("Formatting stadiums")

    logging.info("Parsing: {0}".format(dgl_file))
    dgl_frame = (
        pd.read_csv(dgl_file, encoding="utf8", sep=",")
        .rename(columns={"Name": "Stadium"})
        .set_index("Team")
    )
    logging.debug("\n{0}".format(dgl_frame))

    logging.info("Parsing: {0}".format(ops_file))
    ops_frame = (
        pd.read_csv(ops_file, encoding="utf8", sep=",")
        .rename(columns={"Team": "TeamFull", "FDCOUK": "Team"})
        .set_index("Team")
    )
    logging.debug("\n{0}".format(ops_frame))

    # TODO - fuzzy matching teams? (name inconsistencies?)
    logging.info("Create combined stadiums data")
    # combine_first keeps "ops" values and fills gaps from "dgl",
    # aligned on the shared "Team" index.
    combined = ops_frame.combine_first(dgl_frame)
    combined.reset_index(level=0, inplace=True)
    logging.debug("\n{0}".format(combined))

    utilities.save_master(combined, "stadiums", directory=directoryOut)
    return
def format_matches(
    directoryOut=config.MASTER_DIR,
):
    """Format national team match data.

    Loads two fbref competition scrapes (with and without xG columns),
    splits team-name/abbreviation fields, extracts goals from the score
    string, attaches venue countries and derives home-team flags.

    INPUT:
        directoryOut: Directory to save formatted data to

    OUTPUT:
        match: National match data dataframe (also saved as
            "nations_matches")
    """
    logging.info("Formatting national team match data")
    comp = utilities.folder_loader(
        "fbr",
        "competition",
        source_header=[
            "Round",
            "Wk",
            "Day",
            "Date",
            "Time",
            "Team_1",
            "Score",
            "Team_2",
            "Attendance",
            "Venue",
            "Referee",
            "Match Report",
            "Notes",
        ],
    )
    comp2 = utilities.folder_loader(
        "fbr",
        "competition2",
        source_header=[
            "Round",
            "Wk",
            "Day",
            "Date",
            "Time",
            "Team_1",
            "xG_1",
            "Score",
            "xG_2",
            "Team_2",
            "Attendance",
            "Venue",
            "Referee",
            "Match Report",
            "Notes",
        ],
    )
    comp = pd.concat([comp, comp2], axis=0, sort=False, ignore_index=True)
    comp.dropna(subset=["Round"], inplace=True)
    comp.reset_index(drop=True, inplace=True)

    comp["Year"] = comp.Date.str[:4]
    # Source team fields embed a 3-letter abbreviation: trailing for Team_1,
    # leading for Team_2 — split it off into its own column.
    comp["Team_abbrev_1"] = comp["Team_1"].str[-3:].str.strip()
    comp["Team_1"] = comp["Team_1"].str[:-3].str.strip()
    comp["Team_abbrev_2"] = comp["Team_2"].str[:3].str.strip()
    comp["Team_2"] = comp["Team_2"].str[3:].str.strip()
    # BUG FIX: the patterns were plain strings, so "\)" and "\(" were
    # invalid escape sequences (DeprecationWarning today, SyntaxError in
    # future Python). Raw strings keep the regexes byte-identical.
    # Optional "(p) " prefix / " (p)" suffix around the score is tolerated.
    comp["Goals_1"] = comp.Score.str.extract(
        pat=r"(?:^|\) )([0-9]{1,2})[^0-9]+[0-9]{1,2}")
    comp["Goals_2"] = comp.Score.str.extract(
        pat=r"[0-9]{1,2}[^0-9]+([0-9]{1,2})(?:$| \()")
    for i in range(1, 3):
        comp["Goals_" + str(i)] = pd.to_numeric(
            comp["Goals_" + str(i)], errors="coerce"
        )
    comp["Goal_diff"] = comp.Goals_1 - comp.Goals_2
    logging.debug("\n{0}".format(comp.info()))

    venue = pd.read_csv(
        os.path.join(config.SOURCE_DIR, "wkp", "wkp_std", "wkp_std_nat.csv"),
        encoding="latin9",
        sep=",",
    )
    venue.columns = ["Venue_country", "Venue_city", "Venue", "Venue_URL"]
    logging.debug("\n{0}".format(venue.info()))

    match = pd.merge(comp, venue, on="Venue", how="left")
    ## workaround for venues that aren't mapping
    match.loc[match.Venue == "Stadion Energa Gdańsk", "Venue_country"] = "Poland"
    match.loc[match.Venue == "Bakı Olimpiya Stadionu",
              "Venue_country"] = "Azerbaijan"
    match.loc[match.Venue == "Arena Naţională", "Venue_country"] = "Romania"
    # A team is "home" when it plays in its own country.
    for i in range(1, 3):
        match["Home_" + str(i)] = 0
        match.loc[match["Team_" + str(i)] == match.Venue_country,
                  "Home_" + str(i)] = 1
    logging.debug("\n{0}".format(match.info()))
    utilities.save_master(match, "nations_matches", directory=directoryOut)
    return match
def format_summaries(
    directoryOut=config.MASTER_DIR,
):
    """Format national statistical summary data.

    Builds yearly Elo ratings from the scraped JSON files, harmonises team
    names, then attaches GDP/population figures from the Penn World Table
    via an as-of merge on year.

    INPUT:
        directoryOut: Directory to save formatted data to

    OUTPUT:
        summary: Elo ratings merged with Penn World Table data (also saved
            as "nations_summaries")
    """
    logging.info("Formatting national statistical summary data")
    elo_list = []
    for file in os.listdir(os.path.join(config.SOURCE_DIR, "elo")):
        if not file.endswith(".json"):
            continue
        df_json = pd.read_json(os.path.join(config.SOURCE_DIR, "elo", file))
        for i in df_json.index.values:
            df_tmp = pd.DataFrame(df_json.Team[i], index=[i])
            for col in df_tmp.columns.drop("Team"):
                # Source uses the Unicode minus sign and "+" prefixes;
                # normalise before the int conversion.
                df_tmp[col] = (
                    df_tmp[col].str.replace("−", "-").str.replace("+", "").astype(int)
                )
            df_tmp["Filename"] = file
            # File names are assumed to start with the 4-digit year.
            df_tmp["Year"] = int(file[:4])
            elo_list.append(df_tmp)
    # print(len(elo_list))
    elo = pd.concat(elo_list, ignore_index=True)
    # Harmonise historical / alternate team names with the rest of the data.
    elo.loc[elo.Team == "Czechia", "Team"] = "Czech Republic"
    elo.loc[elo.Team == "Yugoslavia", "Team"] = "Serbia"
    elo.loc[elo.Team == "Ireland", "Team"] = "Republic of Ireland"
    # Home-nations teams map to the UK for country-level economic data.
    elo["Country"] = elo.Team
    elo.loc[
        elo.Team.isin(["England", "Scotland", "Wales", "Northern Ireland"]), "Country"
    ] = "United Kingdom"
    elo.loc[(elo.Team == "Russia"), "Country"] = "Russian Federation"
    # elo.describe().T
    penn = pd.read_excel(
        os.path.join(config.SOURCE_DIR, "rug", "pwt100.xlsx"), sheet_name="Data"
    )
    penn = penn[["country", "year", "rgdpe", "pop"]]
    penn.columns = ["Country", "Data Year", "GDP (PPP)", "Population"]
    penn.dropna(axis="index", inplace=True)
    penn.sort_values(by=["Data Year"], inplace=True)
    penn.loc[penn.Country == "Ireland", "Country"] = "Republic of Ireland"
    # penn.info()
    # As-of merge: for each Elo row, take the most recent Penn row strictly
    # before that Year (allow_exact_matches=False), at most 4 years older.
    # NOTE(review): merge_asof requires the left frame sorted on "Year" —
    # confirm the elo concat order satisfies this.
    summary = pd.merge_asof(
        elo,
        penn,
        left_by="Country",
        right_by="Country",
        left_on="Year",
        right_on="Data Year",
        tolerance=4,
        allow_exact_matches=False,
    )
    # summary.info()
    utilities.save_master(summary, "nations_summaries", directory=directoryOut)
    return summary
def calculate_quality(directory=config.MASTER_DIR):
    """Calculate data quality.

    Runs consistency, completeness, timeliness, uniqueness and validity
    checks over every "ftb_*.txt" master file, saves the per-test scores
    as the "quality" master file and returns the mean score.

    INPUT:
        directory: Location of data to assess

    OUTPUT:
        overall_score: Overall quality score across all files (0..1)
    """
    logging.info("Calculating data quality")
    calc_date = datetime.datetime.today().strftime("%Y-%m-%d")
    logging.info("Current date is {0}".format(calc_date))
    dq_data = []
    for file in os.listdir(directory):
        # Skip the quality output itself and anything that isn't a master file.
        if file == "ftb_quality.txt":
            continue
        if not file.endswith(".txt"):
            continue
        logging.info("Assessing {0}".format(file))
        file_path = os.path.join(directory, file)
        file_date = datetime.datetime.fromtimestamp(
            os.path.getmtime(file_path)).strftime("%Y-%m-%d")
        # BUG FIX: previously logged calc_date here instead of file_date.
        logging.info("File modification date is {0}".format(file_date))
        file_stub = file.replace("ftb_", "").replace(".txt", "")
        df = utilities.get_master(file_stub, directory=directory)
        # Sample very large files so the checks stay fast; fixed seed keeps
        # scores reproducible between runs.
        if df.shape[0] > 50000:
            df = df.sample(50000, replace=False, random_state=42)
        no_of_rows, no_of_columns = df.shape
        no_of_cells = no_of_rows * no_of_columns

        # Consistency, coherence, or clarity — per-file sanity rules.
        category = "Consistency"
        logging.info("Running {0} tests".format(category))
        test = None
        if file in ["ftb_events_shot.txt"]:
            test = "xG between 0 and 1"
            score = 1 - (df[~df.statsbomb_xg.between(0, 1)].shape[0]
                         / df.shape[0])
        elif file in ["ftb_fulldata.txt"]:
            test = "Goals <= Shots"
            score = 1 - (df[df["Goals"] > df["Shots"]].shape[0] / df.shape[0])
        elif file in ["ftb_managers.txt"]:
            test = "DateFrom <= DateTo"
            score = 1 - (df[df.DateFrom > df.DateTo].shape[0] / df.shape[0])
        elif file in ["ftb_players_contract.txt"]:
            test = "Joined <= Contract expires"
            score = 1 - (df[df["Joined"] > df["Contract expires"]].shape[0]
                         / df.shape[0])
        elif file in ["ftb_players_performance.txt"]:
            test = "Games started <= In squad"
            score = 1 - (df[df["Games started"] > df["In squad"]].shape[0]
                         / df.shape[0])
        elif file in ["ftb_results.txt"]:
            test = "Home goals <= Home shots"
            score = 1 - (df[df["FTHG"] > df["HS"]].shape[0] / df.shape[0])
        elif file in ["ftb_nations_matches.txt"]:
            test = "Max one home team"
            score = 1 - (df[df["Home_1"] + df["Home_2"] == 2].shape[0]
                         / df.shape[0])
        if test:
            dq_data.append({
                "file": file_stub,
                "file_date": file_date,
                "calc_date": calc_date,
                "category": category,
                "test": test,
                "score": score,
            })

        # Completeness or comprehensiveness — share of non-null cells.
        category = "Completeness"
        logging.info("Running {0} tests".format(category))
        test = "Missing values"
        score = df.count().sum() / no_of_cells
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })

        # Timeliness or latency — crude date arithmetic on YYYYMMDD ints.
        category = "Timeliness"
        logging.info("Running {0} tests".format(category))
        test = "Days since file updated"
        score = max(
            1 - ((int(calc_date.replace("-", "")) -
                  int(file_date.replace("-", ""))) / 100000),
            0,
        )
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })
        date_field = None
        if file in [
                "ftb_fulldata.txt", "ftb_results.txt", "ftb_nations_matches.txt"
        ]:
            date_field = "Date"
        elif file in ["ftb_events_shot.txt"]:
            date_field = "match_date"
        elif file in ["ftb_managers.txt"]:
            date_field = "DateTo"
        if date_field:
            test = "Days since last match date"
            score = max(
                1 - ((int(calc_date.replace("-", "")) -
                      int(df[date_field].max().replace("-", ""))) / 100000),
                0,
            )
            dq_data.append({
                "file": file_stub,
                "file_date": file_date,
                "calc_date": calc_date,
                "category": category,
                "test": test,
                "score": score,
            })

        # Accuracy or correctness
        category = "Accuracy"
        logging.info("Running {0} tests".format(category))
        # Spots tests against ref data?
        # Wikipedia (Ajax, Frankfurt)
        # Don Balon (Spain 08/09)
        # SkySports Football Yearbook (England & Scotland 07/08)

        # Uniqueness
        category = "Uniqueness"
        logging.info("Running {0} tests".format(category))
        test = "Duplicated rows"
        score = df.drop_duplicates().shape[0] / no_of_rows
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })
        test = "Duplicated columns"
        # BUG FIX: was df.T.drop_duplicates().T.shape[0] / no_of_rows, which
        # always equals 1 (transposing back restores the row count). Count
        # surviving columns against the column total instead.
        score = df.T.drop_duplicates().shape[0] / no_of_columns
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })

        # Validity or reasonableness — outlier share by two common rules.
        category = "Validity"
        logging.info("Running {0} tests".format(category))
        # 0 <= Goals <= 10
        test = "3 stdev from mean"
        score = 1 - (
            ((df < (df.mean() - 3 * df.std())) |
             (df > df.mean() + 3 * df.std())).sum().sum() / no_of_cells)
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })
        test = "1.5 IQR rule"
        Q1 = df.quantile(0.25)
        Q3 = df.quantile(0.75)
        IQR = Q3 - Q1
        score = 1 - (((df < (Q1 - 1.5 * IQR)) |
                      (df > (Q3 + 1.5 * IQR))).sum().sum() / no_of_cells)
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })

        # Orderliness
        # Auditability
        # Conformity
        # accessibility or availability
        # comparability
        # credibility, reliability, or reputation
        # relevance, pertinence, or usefulness

    df_dq = pd.DataFrame(dq_data)
    # Clamp any out-of-range scores produced by the heuristics above.
    df_dq["score"] = df_dq["score"].clip(lower=0, upper=1)
    utilities.save_master(df_dq, "quality", directory=directory)
    overall_score = df_dq.score.mean()
    logging.info("Overall score is {0}".format(overall_score))
    return overall_score
def build_fulldata(directory=config.MASTER_DIR):
    """Combine results, stadiums and managers data into full dataset for clubs.

    Each fixture is duplicated into a home-perspective and an away-perspective
    row ("Team" vs "TeamOpp"), derived statistics are added, then stadium and
    manager attributes are merged on for both sides.

    INPUT:
        directory: Directory to save output to

    OUTPUT:
        fulldata: Dataframe containing all the clubs data (also saved as
            "fulldata")
    """
    logging.info(
        "Building fulldata dataframe from results, stadiums, managers ...")
    # Column renames for the home-perspective rows; the away-perspective
    # renames are derived below by swapping the "Opp" suffix.
    home_renames = {
        "HomeTeam": "Team",
        "AwayTeam": "TeamOpp",
        "FTHG": "Goals",
        "FTAG": "GoalsOpp",
        "HTHG": "Goals1stHalf",
        "HTAG": "Goals1stHalfOpp",
        "HS": "Shots",
        "AS": "ShotsOpp",
        "HST": "ShotsOnTarget",
        "AST": "ShotsOnTargetOpp",
        "HHW": "ShotsHitWoodwork",
        "AHW": "ShotsHitWoodworkOpp",
        "HC": "Corners",
        "AC": "CornersOpp",
        "HF": "Fouls",
        "AF": "FoulsOpp",
        "HO": "Offsides",
        "AO": "OffsidesOpp",
        "HY": "YellowCards",
        "AY": "YellowCardsOpp",
        "HR": "RedCards",
        "AR": "RedCardsOpp",
        "HBP": "BookingPoints",
        "ABP": "BookingPointsOpp",
    }
    away_renames = {}
    for key, val in home_renames.items():
        if val.endswith("Opp"):
            away_renames[key] = val[:-3]
        else:
            away_renames[key] = val + "Opp"
    # Statistics that get "<stat>Diff" and "Total<stat>" derived columns.
    stat_to_diff = [
        "Goals",
        "Goals1stHalf",
        "Shots",
        "ShotsOnTarget",
        "ShotsHitWoodwork",
        "Corners",
        "Fouls",
        "Offsides",
        "YellowCards",
        "RedCards",
        "BookingPoints",
    ]

    logging.info("Process results")
    results = utilities.get_master("results", directory=directory)
    # One row per fixture per perspective: home rows then away rows.
    homeresults = results.rename(columns=home_renames)
    homeresults["HomeAway"] = "Home"
    awayresults = results.rename(columns=away_renames)
    awayresults["HomeAway"] = "Away"
    allresults = pd.concat([homeresults, awayresults],
                           ignore_index=True,
                           sort=False)
    allresults.drop(["FTR", "HTR", "Unnamed: 0"], axis=1, inplace=True)
    for stat in stat_to_diff:
        allresults[stat + "Diff"] = allresults[stat] - allresults[stat + "Opp"]
        allresults["Total" + stat] = allresults[stat] + allresults[stat + "Opp"]
    # Saves = opponent's shots on target that didn't become goals.
    allresults[
        "Saves"] = allresults["ShotsOnTargetOpp"] - allresults["GoalsOpp"]
    allresults["SavesOpp"] = allresults["ShotsOnTarget"] - allresults["Goals"]
    allresults["SavesDiff"] = allresults["Saves"] - allresults["SavesOpp"]
    allresults[
        "Goals2ndHalf"] = allresults["Goals"] - allresults["Goals1stHalf"]
    allresults["Goals2ndHalfOpp"] = (allresults["GoalsOpp"] -
                                     allresults["Goals1stHalfOpp"])
    allresults["Goals2ndHalfDiff"] = (allresults["GoalsDiff"] -
                                      allresults["Goals1stHalfDiff"])
    # Result,Points,PointsOpp,Win,WinDraw,Draw,DrawLoss,Loss,CleanSheet,CleanSheetOpp
    # func_score (defined elsewhere) maps a goal difference to this 9-tuple.
    (
        allresults["Result"],
        allresults["Points"],
        allresults["PointsOpp"],
        allresults["Win"],
        allresults["WinDraw"],
        allresults["Draw"],
        allresults["DrawLoss"],
        allresults["Loss"],
        allresults["WinShare"],
    ) = zip(*allresults["GoalsDiff"].map(func_score))
    allresults["CleanSheet"] = allresults["Goals"].map(func_nogoal)
    allresults["CleanSheetOpp"] = allresults["GoalsOpp"].map(func_nogoal)
    # allresults['Date'] = pd.to_datetime(allresults['Date'], format="%d/%m/%y")
    # GameWeek = 1-based position of the match within each team's season.
    allresults["GameWeek"] = (allresults.sort_values("Date").groupby(
        ["Season", "Div", "Team"]).cumcount() + 1)
    ## TODO - Validate derived values

    logging.info("Process stadiums")
    stadiums = utilities.get_master("stadiums", directory=directory)
    stadiums.drop(["Country", "TeamFull"], axis=1, inplace=True)
    # Merge stadium attributes for the team, then (renamed) for the opponent.
    fulldata = pd.merge(allresults, stadiums, on="Team", how="left")
    stadiums.rename(columns={"Team": "TeamOpp"}, inplace=True)
    fulldata = pd.merge(fulldata,
                        stadiums,
                        on="TeamOpp",
                        how="left",
                        suffixes=("", "Opp"))
    fulldata.drop(["Unnamed: 0", "Unnamed: 0Opp"], axis=1, inplace=True)
    # Straight-line distance between the two grounds in lat/long degrees.
    fulldata["EuclideanDistance"] = (
        (fulldata.Latitude - fulldata.LatitudeOpp)**2 +
        (fulldata.Longitude - fulldata.LongitudeOpp)**2)**0.5

    logging.info("Process managers")
    managers = utilities.get_master("managers", directory=directory)
    managers.dropna(subset=["Manager"], inplace=True)
    # Merge managers then keep only the manager whose tenure covers the match
    # date; rows with no manager data at all are kept (left-join semantics).
    fulldata = pd.merge(fulldata, managers, on="Team", how="left")
    fulldata = fulldata[((fulldata["Date"] >= fulldata["DateFrom"]) &
                         (fulldata["Date"] <= fulldata["DateTo"])) |
                        (fulldata["Manager"].isnull())]
    fulldata.drop(
        ["Unnamed: 0", "DateFrom", "DateTo", "Duration", "YearRange"],
        axis=1,
        inplace=True,
    )
    fulldata = fulldata.drop_duplicates()
    # Same again for the opposing team's manager.
    managers.rename(columns={"Team": "TeamOpp"}, inplace=True)
    fulldata = pd.merge(fulldata,
                        managers,
                        on="TeamOpp",
                        how="left",
                        suffixes=("", "Opp"))
    fulldata = fulldata[((fulldata["Date"] >= fulldata["DateFrom"]) &
                         (fulldata["Date"] <= fulldata["DateTo"])) |
                        (fulldata["ManagerOpp"].isnull())]
    fulldata.drop(
        ["Unnamed: 0", "DateFrom", "DateTo", "Duration", "YearRange"],
        axis=1,
        inplace=True,
    )
    fulldata = fulldata.drop_duplicates()

    utilities.save_master(fulldata, "fulldata", directory=directory)
    return fulldata
def format_results(
    parentDirectory=config.SOURCE_DIR,
    subDirectory=config.RESULTS_SCRAPE["ftd"][1],
    directoryOut=config.MASTER_DIR,
):
    """Format raw results and save processed output.

    Walks the results scrape folder, loads every CSV, normalises the team
    columns, derives Country/Tier from the division code and saves the
    combined frame as the "results" master file.

    INPUT:
        parentDirectory: Parent directory to traverse looking for files to zip/clear
        subDirectory: Sub-Directory to traverse looking for files to zip/clear
        directoryOut: Directory to save output to

    OUTPUT:
        None
    """
    directoryIn = os.path.join(parentDirectory, subDirectory)
    logging.info("Format results in {0}".format(directoryIn))
    pieces = []
    # Rows missing these columns are useless and get dropped.
    core_cols = ["Div", "Date"]  # ,'HomeTeam','AwayTeam','FTHG','FTAG','FTR']
    use_cols = [
        "Season",
        "Div",
        "Country",
        "Tier",
        "Date",
        "HomeTeam",
        "AwayTeam",
        "FTHG",
        "FTAG",
        "FTR",
        "HTHG",
        "HTAG",
        "HTR",
        "Attendance",
        "Referee",
        "HS",
        "AS",
        "HST",
        "AST",
        "HHW",
        "AHW",
        "HC",
        "AC",
        "HF",
        "AF",
        "HO",
        "AO",
        "HY",
        "AY",
        "HR",
        "AR",
        "HBP",
        "ABP",
    ]
    for root, _dirs, files in os.walk(directoryIn):
        for file in files:
            if file.endswith(".csv"):
                filepath = os.path.join(root, file)
                logging.info("Filepath: {0}".format(filepath))
                # NOTE(review): error_bad_lines/warn_bad_lines were removed
                # in pandas 2.0 (replaced by on_bad_lines="skip"); kept here
                # for the pandas version this project runs against — confirm.
                df = pd.read_csv(
                    filepath,
                    error_bad_lines=False,
                    warn_bad_lines=False,
                    encoding="latin9",
                )  # , parse_dates=['Date'])
                logging.debug("Input columns: {0}".format(df.columns))
                # Season folders are assumed to end in "YYYY-YYYY".
                df["Season"] = root[-9:]
                if set(["HomeTeam", "AwayTeam"]).issubset(df.columns):
                    try:
                        df["HomeTeam"] = df[
                            "HomeTeam"
                        ]  # .apply(lambda x: x.decode('latin9').encode('utf-8'))
                        df["AwayTeam"] = df[
                            "AwayTeam"
                        ]  # .apply(lambda x: x.decode('latin9').encode('utf-8'))
                    except BaseException:
                        df["HomeTeam"] = np.nan
                        df["AwayTeam"] = np.nan
                elif set(["HT", "AT"]).issubset(df.columns):
                    # Older files abbreviate the team columns as HT/AT.
                    try:
                        df["HomeTeam"] = df[
                            "HT"
                        ]  # .apply(lambda x: x.decode('latin9').encode('utf-8'))
                        df["AwayTeam"] = df[
                            "AT"
                        ]  # .apply(lambda x: x.decode('latin9').encode('utf-8'))
                    except BaseException:
                        df["HomeTeam"] = np.nan
                        df["AwayTeam"] = np.nan
                else:
                    # BUG FIX: a bare `raise` here has no active exception and
                    # produces "RuntimeError: No active exception to re-raise";
                    # raise a descriptive error instead.
                    raise ValueError(
                        "No team columns found in {0}".format(filepath))
                # drop useless rows
                df = df.dropna(subset=core_cols)
                logging.debug("Output columns: {0}".format(df.columns))
                pieces.append(df)
    logging.info("Concatenate everything into a single DataFrame")
    dframe = pd.concat(pieces, ignore_index=True, sort=False)
    # func_div (defined elsewhere) maps a division code to (Country, Tier).
    dframe["Country"], dframe["Tier"] = zip(*dframe["Div"].map(func_div))
    dframe.Date = pd.to_datetime(dframe.Date, dayfirst=True)
    logging.info(dframe[use_cols].info())
    utilities.save_master(
        dframe[use_cols], "results", directory=directoryOut
    )  # , enc="ascii")
def test_save_master(self):
    """Test save dummy dataframe to dummy csv."""
    saved = utilities.save_master(
        self.testFrame, "dummy", directory=self.testDir
    )
    assert saved == self.testMaster
def test_get_master(self):
    """Test return dummy dataframe from dummy csv."""
    utilities.save_master(self.testFrame, "dummy", directory=self.testDir)
    # BUG FIX: the comparison result was previously discarded (no assert),
    # so this test could never fail.
    assert (utilities.get_master(
        "dummy", directory=self.testDir).shape == self.testFrame.shape)
def clean_data(source_name, directory=config.MASTER_DIR):
    """Clean raw player data and save processed version.

    Supports two Transfermarkt scrapes: "tmk_cnt" (contract/valuation data)
    and "tmk_psm" (per-season performance data). Re-aligns the Name/Position
    columns, normalises placeholder strings, derives a position group and
    applies source-specific type conversions.

    INPUT:
        source_name: String containing name of the data source
            ("tmk_cnt" or "tmk_psm")
        directory: Directory to save output to

    OUTPUT:
        df: Dataframe containing the cleaned data
    """
    logging.info("Loading {0} data".format(source_name))
    if source_name == "tmk_cnt":
        source_header = [
            "Shirt number",
            "Position",
            "Name",
            "Date of birth",
            "Nationality",
            "Height",
            "Foot",
            "Joined",
            "Signed from",
            "Contract expires",
            "Market value",
        ]
        drop_cols = ["Nationality", "Signed from", "Competition"]
        notna_cols = ["Market value"]
    elif source_name == "tmk_psm":
        source_header = [
            "Shirt number",
            "Position",
            "Name",
            "Age",
            "Nationality",
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]
        drop_cols = ["Nationality"]
        notna_cols = ["In squad"]
    df = utilities.folder_loader(source_name[:3],
                                 source_name,
                                 "comp_season",
                                 source_header=source_header)
    ## Name and Position are mis-aligned in the source files:
    ## back-fill Name, then take Position from the following row (or the one
    ## after, where the shift lands on the same name).
    df["Name"].fillna(method="bfill", inplace=True)
    df["Position"] = df.Name.shift(-1)
    df.loc[df.Position == df.Name, "Position"] = df.Name.shift(-2)
    df.drop(axis=1, columns=drop_cols, inplace=True)
    df.dropna(subset=notna_cols, inplace=True)
    df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    # Normalise the source's various "no data" placeholder strings to NaN.
    df = df.replace("-", np.nan)
    df = df.replace("Was not used during this season", np.nan)
    df = df.replace("Not in squad during this season", np.nan)
    df = df.replace("Not used during this season", np.nan)
    df["Shirt number"] = pd.to_numeric(df["Shirt number"], downcast="integer")
    # Coarse position grouping: G/D/M/F by keyword in the position text.
    df["Position group"] = None
    df.loc[(df.Position.str.upper().str.contains("KEEPER")) |
           (df.Position.str.upper().str.contains("GOAL")),
           "Position group",
           ] = "G"
    df.loc[(df.Position.str.upper().str.contains("BACK")) |
           (df.Position.str.upper().str.contains("DEF")),
           "Position group",
           ] = "D"
    df.loc[(df.Position.str.upper().str.contains("MID")) |
           (df.Position.str.upper().str.contains("MIT")) |
           (df.Position.str.upper().str.contains("WING")),
           "Position group",
           ] = "M"
    df.loc[(df.Position.str.upper().str.contains("STRIKER")) |
           (df.Position.str.upper().str.contains("FORW")),
           "Position group",
           ] = "F"
    if source_name == "tmk_cnt":
        # "Date of birth" arrives as e.g. "Jan 1, 1990 (30)" — split out the
        # bracketed age, then parse the date part.
        df["Age"] = (df["Date of birth"].str.extract(
            r".*([0-9]{2})", expand=False).astype("int"))
        df["Date of birth"] = pd.to_datetime(
            df["Date of birth"].str.extract(r"(.*) \([0-9]{2}\)",
                                            expand=False),
            format="%b %d, %Y",
        )
        df["Joined"] = pd.to_datetime(df.Joined, format="%b %d, %Y")
        df["Contract expires"] = pd.to_datetime(df["Contract expires"],
                                                format="%d.%m.%Y")
        # Height like "1,85 m" -> 185.0 (separators and unit stripped).
        df["Height"] = (df["Height"].str.strip().str.replace(
            " ", "").str.replace(",", "").str.replace("m", "").replace({
                "-": np.nan,
                "": np.nan
            }).astype(float))
        # Where a player has Height in some seasons but not others,
        # back-fill it from their other rows (ordered by Name, Season).
        df.loc[df.Name.isin(df[df.Height.notna()].Name.values)
               & df.Name.isin(df[df.Height.isna()].Name.values),
               "Height",
               ] = (
                   df.loc[df.Name.isin(df[df.Height.notna()].Name.values)
                          & df.Name.isin(df[df.Height.isna()].Name.values)].
                   sort_values(by=["Name", "Season"]).Height.fillna(
                       method="bfill"))
        # Same cross-season fill for the preferred Foot.
        df.loc[df.Name.isin(df[df.Foot.notna()].Name.values)
               & df.Name.isin(df[df.Foot.isna()].Name.values),
               "Foot",
               ] = (df.loc[
                   df.Name.isin(df[df.Foot.notna()].Name.values)
                   & df.Name.isin(df[df.Foot.isna()].Name.values)].sort_values(
                       by=["Name", "Season"]).Foot.fillna(method="bfill"))
        # Market value: strip currency/unit characters, then scale by the
        # detected unit suffix (k/Th. = thousands, m = millions) into £m.
        df["Market value"] = (
            df["Market value"].str.strip().replace({
                "-": np.nan
            }).replace(r"[£kmTh\.]", "", regex=True).astype(float) *
            df["Market value"].str.extract(
                r"[\d\.]+([kmTh\.]+)", expand=False).fillna(1).replace(
                    ["k", "Th.", "m"], [10**3, 10**3, 10**6]).astype(int) /
            10**6)
    elif source_name == "tmk_psm":
        # Continental decimal comma -> point; minutes like "1.234'" -> 1234.
        df["PPG"] = df["PPG"].str.strip().replace(r"[,]", ".",
                                                  regex=True).astype(float)
        df["Minutes played"] = (df["Minutes played"].str.strip().replace(
            r"[.\']", "", regex=True).astype(float))
        df[[
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]] = df[[
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]].fillna(0)
        df[[
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]] = df[[
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]].astype(float)
    logging.debug(df.describe(include="all"))
    logging.info("Saving processed data to ")
    utilities.save_master(df, get_outfile(source_name), directory=directory)
    return df