def cleanse_tracking_data(game):
    """
    Function to clean the Metrica tracking data. Besides the obvious steps, the ball data is also
    cleaned in such a way that it moves more smoothly and has fewer hiccups.
    :param game: (int) GameId
    :return: None
    """
    logging.info(f"Cleansing metrica tracking data for game {game}")
    league = str(game)

    # raw tracking data, one table per team
    raw_home = io.read_data("home_team_tracking", league=league, data_folder="raw_data_metrica")
    raw_away = io.read_data("away_team_tracking", league=league, data_folder="raw_data_metrica")

    # the ball data is taken from the home team table and cleansed on its own
    df_ball = cleanse_ball_tracking_data(extract_ball_data(raw_home), game)

    # long format is easier to work with; home rows are stacked before away rows
    df_players = pd.concat([
        convert_to_long_data_frame(raw_home, "Home"),
        convert_to_long_data_frame(raw_away, "Away"),
    ])

    # convert position into meters and keep a copy of the resulting coordinates
    df_players = convert_positions_to_meters(df_players)
    for axis in ("x", "y"):
        df_players[f"{axis}PosMetrica"] = df_players[f"{axis}Pos"].copy()

    # combine player data with ball data
    df_all = pd.concat([df_players, df_ball])
    df_all = df_all.sort_values(["frame", "playerId"])

    # replace the per-row flags by the flags of the ball row of the same frame
    df_all = df_all.drop(columns=["outOfBounds", "ballOut"])
    ball_flags = df_ball[["frame", "outOfBounds", "ballOut"]]
    df_all = pd.merge(df_all, ball_flags, on="frame", how="left")

    # consider whether the ball is in play rather than whether it is out
    df_all = df_all.rename(columns={"ballOut": "ballInPlay"})
    df_all["ballInPlay"] = 1 - df_all["ballInPlay"]
    df_all = df_all.drop(columns=["outOfBounds", "outPeriod"])

    # save to parquet file
    io.write_data(df_all, "tracking_data", league=league, data_folder="metrica_data")
def __init__(self, debug_mode=True):
    """
    Set up the expected goals logistic regression model wrapper.
    :param debug_mode: (bool) Forwarded to the parent initializer. If ``self.debug_mode`` is set
                       afterwards, all event data is read and the event before each event is
                       computed up front
    """
    super().__init__(debug_mode)

    # event data is only loaded in debug mode
    if not self.debug_mode:
        self.df_events = None
    else:
        all_events = io.read_event_data("all", notebook="expected_goal_model")
        self.df_events = self._compute_event_before(all_events)

    # trained model will be saved here
    self.model = None

    # model name for reading
    self.model_name = "expected_goals_logreg"

    # features used in the model are those flagged with used == 1
    feature_table = io.read_data("features_expected_goals_logreg", data_folder="model", sep=";")
    self.features = feature_table.loc[feature_table["used"] == 1, "feature"].tolist()

    # mean value and standard deviation per feature (for all listed features, not only used ones)
    self.feature_measures = {
        row["feature"]: {"mean": row["mean"], "std": row["std"]}
        for _, row in feature_table.iterrows()
    }
def cleanse_metrica_event_data(game, reverse):
    """
    Function to clean the Metrica event data. Notice that quite a lot of the code is needed to make
    the Metrica data compatible with the Wyscout format
    :param game: (int) GameId
    :param reverse: (bool) If True, the away team is playing left to right in the first half
    :return: None
    """
    logging.info(f"Cleansing metrica event data for game {game}")

    df_events = io.read_data("event_data", league=str(game), sep=",", data_folder="raw_data_metrica")

    # rename columns to camelStyle
    df_events.columns = [
        "team",
        "type",
        "subtype",
        "period",
        "startFrame",
        "startTime",
        "endFrame",
        "endTime",
        "from",
        "to",
        "xPosStart",
        "yPosStart",
        "xPosEnd",
        "yPosEnd",
    ]

    # make sure that the position is in meters and events are always from the perspective of the
    # team having the event
    df_events = set_positions(df_events, reverse)

    # make sure that the end frame is always at least the start frame
    df_events["endFrame"] = df_events[["startFrame", "endFrame"]].max(axis=1)
    df_events["endTime"] = df_events[["startTime", "endTime"]].max(axis=1)

    # Assign the filled column back: an in-place fillna on a column selection is a chained
    # assignment that does not reliably update the data frame (and not at all with pandas
    # Copy-on-Write), which would leave NaN values in "subtype".
    df_events["subtype"] = df_events["subtype"].fillna(" ")

    # identify goals and own goals; vectorized masks also work on an empty data frame
    has_goal_tag = df_events["subtype"].str.contains("-GOAL", regex=False, na=False)
    df_events["goal"] = 1 * ((df_events["type"] == "SHOT") & has_goal_tag)
    df_events["ownGoal"] = 1 * ((df_events["type"] == "BALL OUT") & has_goal_tag)

    df_events = compute_wyscout_columns(df_events, game)

    df_events = df_events.sort_values(["startFrame", "endFrame"])

    io.write_data(df_events, "event_data", league=str(game), data_folder="metrica_data")
def add_player_positions(df):
    """
    Helper function to attach the player position (e.g. GK, MD, ...) to every event
    :param df: (pd.DataFrame) Data frame with event data; needs the columns *matchId* and *playerId*
    :return: pd.DataFrame containing event data and the player position. The merge uses the pandas
             default (inner join), so rows without a matching position entry are not returned
    """
    # lookup table with the position of each player per match
    position_lookup = io.read_data("player_positions", sep=";", data_folder="raw_data_metrica")

    return pd.merge(df, position_lookup, on=["matchId", "playerId"])
def cleanse_wyscout_match_data(country):
    """
    Function to cleanse the wyscout match data and save it in the data folder. Besides the match
    data, the formation data of all matches is written as well
    :param country: (str) Country for which the match data should be cleansed
    :return: None
    """
    logging.info(f"Cleansing wyscout match data for {country}")

    # read the JSON file with matches
    matches = io.read_data("match_data", league=country, data_folder="raw_data_wyscout")

    # one row per match and team: stack the view of both teams
    team_views = [get_team_view(matches, team_index) for team_index in (0, 1)]
    df_matches = pd.concat(team_views, axis=0)

    # attach the points per team: 3 for a win, 1 for a draw, 0 otherwise
    won = df_matches["score"] > df_matches["oppScore"]
    drew = df_matches["score"] == df_matches["oppScore"]
    df_matches["points"] = np.select([won, drew], [3, 1], default=0)

    df_matches["dateutc"] = pd.to_datetime(df_matches["dateutc"])
    df_matches["scoreDiff"] = df_matches["score"] - df_matches["oppScore"]

    df_matches = df_matches.sort_values(["matchId", "side"], ascending=[True, False])

    league = country.lower()
    io.write_data(df_matches, "match_data", league=league)

    # formations are derived from the raw matches and saved separately
    df_formations = get_all_formations(matches)
    io.write_data(df_formations, "formation_data", league=league)
def cleanse_wyscout_event_data(country):
    """
    Function to cleanse the wyscout event data and save it in the data folder. The raw events are
    flattened, the tags are turned into one 0/1 column per tag description, positions are converted
    into meters and player as well as home/away team information is attached
    :param country: (str) Country for which the event data should be cleansed
    :return: None
    """
    logging.info(f"Cleansing wyscout event data for {country}")

    # read event data
    #################
    events = io.read_data("event_data", league=country, data_folder="raw_data_wyscout")

    # normalize to get a pandas data frame
    df_events = pd.json_normalize(events)

    # save positions in different columns; the second position is optional
    df_events["posBeforeX"] = df_events["positions"].map(lambda x: x[0]["x"])
    df_events["posBeforeY"] = df_events["positions"].map(lambda x: x[0]["y"])
    df_events["posAfterX"] = df_events["positions"].map(
        lambda x: x[1]["x"] if len(x) > 1 else np.nan)
    df_events["posAfterY"] = df_events["positions"].map(
        lambda x: x[1]["y"] if len(x) > 1 else np.nan)

    # save tags in different columns
    ################
    # read the tags that contain a description for each event code
    tags = io.read_data("tags", sep=";", data_folder="raw_data_wyscout")
    dict_tags = dict(zip(tags["Tag"], tags["Description"]))

    # reduce each tag entry to its id and create one 0/1 column per tag description
    df_events["tags"] = df_events["tags"].map(lambda x: [tag["id"] for tag in x])
    for tag_id, description in dict_tags.items():
        df_events[description] = 1 * df_events["tags"].map(
            lambda x, tag_id=tag_id: tag_id in x)

    # drop columns that are not needed
    df_events = df_events.drop(columns=["positions", "tags"])

    num_cols = ["subEventId"]
    for col in num_cols:
        df_events[col] = pd.to_numeric(df_events[col], errors="coerce")

    # make sure that the event "Offside" also leads to a subevent "Offside"
    df_events["subEventName"] = np.where(
        df_events["eventName"] == "Offside", "Offside", df_events["subEventName"])

    # make sure the goal kick is always taken at the own goal
    is_goal_kick = df_events["subEventName"] == "Goal kick"
    df_events["posBeforeX"] = np.where(is_goal_kick, 5, df_events["posBeforeX"])
    df_events["posBeforeY"] = np.where(is_goal_kick, 50, df_events["posBeforeY"])

    # make sure the save attempt always happens at the own goal (currently at (0,0) or (100,100))
    is_save = df_events["subEventName"].isin(["Save attempt", "Reflexes"])
    df_events["posBeforeX"] = np.where(is_save, 0, df_events["posBeforeX"])
    df_events["posBeforeY"] = np.where(is_save, 50, df_events["posBeforeY"])

    # change position of the event into meters
    ##############
    # read the field length and the field width
    with open(io._get_config_file(), "r", encoding="utf-8") as f:
        config = ruamel.yaml.YAML().load(f)

    field_length = config["general"]["field_length"]
    field_width = config["general"]["field_width"]

    # compute the position in meters
    df_events = add_position_in_meters(
        df_events,
        cols_length=["posBeforeX", "posAfterX"],
        cols_width=["posBeforeY", "posAfterY"],
        field_length=field_length,
        field_width=field_width,
    )

    # Prepare the output table
    ##########################
    # drop columns that are not needed any more
    pos_cols = [col for col in df_events.columns if col.startswith("Position:")]
    cols_drop = [
        "eventId",
        "subEventId",
        "posBeforeX",
        "posAfterX",
        "posBeforeY",
        "posAfterY",
        "Free space right",
        "Free space left",
        "Missed ball",
        "Take on left",
        "Take on right",
        "Sliding tackle",
        "Through",
        "Fairplay",
        "Lost",
        "Neutral",
        "Won",
        "Red card",
        "Yellow card",
        "Second yellow card",
        "Anticipated",
        "Anticipation",
        "High",
        "Low",
        "Interception",
        "Clearance",
        "Opportunity",
        "Feint",
        "Blocked",
    ] + pos_cols
    # only drop what is actually there, not every tag exists in every data set
    cols_drop = [col for col in cols_drop if col in df_events.columns]
    df_events = df_events.drop(columns=cols_drop)

    # add some player information
    ########################
    df_players = io.read_data("player_data")
    df_players = df_players[[
        "playerId", "playerName", "playerStrongFoot", "playerPosition"
    ]].copy()
    df_events = pd.merge(df_events, df_players, on="playerId", how="left")

    # add home and away team
    ########################
    df_matches = io.read_data("match_data", league=country.lower())
    for side in ["home", "away"]:
        # select and rename in one go so that no slice of df_matches is modified in place
        df_side = df_matches.loc[df_matches["side"] == side, ["matchId", "teamId"]].rename(
            columns={"teamId": f"{side}TeamId"})
        df_events = pd.merge(df_events, df_side, on="matchId", how="left")

    # compute the team that is currently in possession of the ball
    df_events["teamPossession"] = df_events.apply(compute_possession, axis=1)

    # change column names to camelCase: lower the first character, then fix multi-word names
    df_events.columns = [col[0].lower() + col[1:] for col in df_events.columns]
    col_changes = {
        "own goal": "ownGoal",
        "key pass": "keyPass",
        "counter attack": "counterAttack",
        "left foot": "leftFoot",
        "right foot": "rightFoot",
        "dangerous ball lost": "dangerousBallLost",
        "not accurate": "notAccurate",
    }
    df_events = df_events.rename(columns=col_changes)

    # bring columns into correct order; all remaining columns are appended at the end
    col_order = [
        "id",
        "matchId",
        "matchPeriod",
        "eventSec",
        "eventName",
        "subEventName",
        "teamId",
        "posBeforeXMeters",
        "posBeforeYMeters",
        "posAfterXMeters",
        "posAfterYMeters",
        "playerId",
        "playerName",
        "playerPosition",
        "playerStrongFoot",
        "teamPossession",
        "homeTeamId",
        "awayTeamId",
        "accurate",
        "notAccurate",
    ]
    other_cols = [col for col in df_events.columns if col not in col_order]
    df_events = df_events[col_order + other_cols].copy()

    io.write_data(df_events, "event_data", league=country.lower())
def cleanse_wyscout_player_data():
    """
    Function to cleanse the wyscout player data and save the data in the data folder. Notice that
    only columns whose name starts with "player" are written to the output
    :return: None
    """
    logging.info("Cleansing wyscout player data")

    # read the JSON file and normalize it to get a pandas data frame
    raw_players = io.read_data("player_data", data_folder="raw_data_wyscout")
    df_players = pd.json_normalize(raw_players)

    def _decode(value):
        return codecs.unicode_escape_decode(value)[0]

    # make sure the encoding is done correctly
    for col in df_players.select_dtypes("object").columns:
        try:
            df_players[col] = df_players[col].map(_decode)
        except TypeError:
            # best effort: a column with values that can not be decoded is left as it is
            pass

    # rename to playerId so that it can be easily merged with other tables
    df_players = df_players.rename(columns={"wyId": "playerId"})

    df_players["birthDate"] = pd.to_datetime(df_players["birthDate"])

    # non-positive weights and heights are treated as missing
    for col in ("weight", "height"):
        df_players[col] = np.where(df_players[col] > 0, df_players[col], np.nan)

    foot_unknown = df_players["foot"].isin(["null", ""])
    df_players["foot"] = np.where(foot_unknown, "unknown", df_players["foot"])

    for col in ("currentTeamId", "currentNationalTeamId"):
        df_players[col] = pd.to_numeric(df_players[col], errors="coerce")

    # drop duplicated columns that are not needed
    df_players = df_players.drop(columns=[
        "birthArea.alpha3code",
        "birthArea.alpha2code",
        "role.code3",
        "role.name",
        "passportArea.alpha3code",
        "passportArea.alpha2code",
        "middleName",
        "birthArea.id",
        "passportArea.id",
    ])

    df_players = df_players.rename(columns={
        "role.code2": "playerPosition",
        "foot": "playerStrongFoot",
        "shortName": "playerName",
    })

    # only the columns starting with "player" are kept
    player_cols = [name for name in df_players.columns if name.startswith("player")]
    df_players = df_players[player_cols].copy()

    io.write_data(df_players, data_type="player_data")
def cleanse_wyscout_team_data(country):
    """
    Function to cleanse the wyscout team data and save the data in the data folder. The league
    table computed from the already cleansed match data is attached to the teams
    :param country: (str) Country for which the team data should be cleansed
    :return: None
    :raises KeyError: If *country* is not one of the supported countries
    """
    valid_countries = ["Germany", "England", "Spain", "Italy", "France"]
    if country not in valid_countries:
        raise KeyError(
            f"Country '{country}' not supported. Choose one out of: {', '.join(valid_countries)}"
        )

    logging.info(f"Cleansing wyscout team data for {country}")

    # read the JSON file and normalize it to get a pandas data frame
    raw_teams = io.read_data("team_data", data_folder="raw_data_wyscout")
    df_teams = pd.json_normalize(raw_teams)

    def _decode(value):
        return codecs.unicode_escape_decode(value)[0]

    # make sure the encoding is done correctly
    for col in df_teams.select_dtypes("object").columns:
        try:
            df_teams[col] = df_teams[col].map(_decode)
        except TypeError:
            # best effort: a column with values that can not be decoded is left as it is
            pass

    df_teams = df_teams.rename(columns={
        "wyId": "teamId",
        "name": "teamName",
        "area.name": "country",
    })

    # only keep club teams from the specified country
    is_club = df_teams["type"] == "club"
    in_country = df_teams["country"] == country
    df_teams = df_teams.loc[is_club & in_country, ["teamId", "teamName"]].copy()

    # attach the table to the teams to get a good feeling on how good each team is
    df_matches = io.read_data("match_data", league=country.lower())
    df_table = gen_helper.get_table(df_matches)
    df_table.drop(columns="week", inplace=True)

    df_teams = pd.merge(df_teams, df_table, on="teamId", how="left")
    df_teams = df_teams.sort_values("position")

    output_cols = [
        "position",
        "teamId",
        "teamName",
        "matches",
        "goals",
        "concededGoals",
        "goalsDiff",
        "points",
    ]
    df_teams = df_teams[output_cols].copy()

    io.write_data(df_teams, data_type="team_data", league=country.lower())