Пример #1
0
def build_event_data(competition_id, event_type, directory=config.MASTER_DIR):
    """Build formatted event data from source files.

    Joins StatsBomb competitions, matches and events for a single
    competition into one flat dataframe and saves it to the master folder.

    Args:
        competition_id: Identifier of the competition to extract
        event_type: String value of event type (used only in the output
            file name, e.g. "events_shot")
        directory: Folder for processed data

    Returns:
        data: Merged dataframe of competition/match/event details
    """
    logging.info("Building event data")

    # Load the three raw StatsBomb extracts from their source folders.
    comps = utilities.folder_loader("stb", "competitions")
    logging.debug(comps.info())
    matches = utilities.folder_loader("stb", "matches")
    logging.debug(matches.info())
    events = utilities.folder_loader("stb", "events", "match_event")
    logging.debug(events.info())

    # competitions -> matches joined on season, then matches -> events
    # joined on match_id; inner joins keep only fully-linked rows.
    data = (
        comps.loc[
            comps.competition_id == competition_id,
            ["season_id", "country_name", "competition_name", "season_name"], ]
        .merge(
            matches.loc[matches.competition == competition_id,
                        ["match_id", "match_date", "kick_off", "season"], ],
            how="inner",
            left_on="season_id",
            right_on="season",
        ).merge(
            events.loc[:, [
                "event_type",
                "period",
                "minute",
                "team",
                "player",
                "statsbomb_xg",
                "type",
                "outcome",
                "start_location_x",
                "start_location_y",
                "end_location_x",
                "end_location_y",
                "end_location_z",
                "match_id",
            ], ],
            how="inner",
            on="match_id",
        ))
    logging.debug(data.info())

    utilities.save_master(data,
                          "events_{0}".format(event_type),
                          directory=directory)

    return data
Пример #2
0
def format_stadiums(
    dgl_file=config.STADIUMS_SCRAPE["dgl"][1],
    ops_file=config.STADIUMS_SCRAPE["ops"][1],
    directoryOut=config.MASTER_DIR,
):
    """Format stadiums data.

    Loads the two scraped stadium files, aligns their column names,
    combines them on the Team index and saves the merged master file.

    INPUT:
        dgl_file: Path for "dgl" stadiums file
        ops_file: Path for "ops" stadiums file
        directoryOut: Directory to save formatted data to

    OUTPUT:
        None
    """
    logging.info("Formatting stadiums")

    logging.info("Parsing: {0}".format(dgl_file))
    dgl = (
        pd.read_csv(dgl_file, encoding="utf8", sep=",")
        .rename(columns={"Name": "Stadium"})
        .set_index("Team")
    )
    logging.debug("\n{0}".format(dgl))

    logging.info("Parsing: {0}".format(ops_file))
    ops = (
        pd.read_csv(ops_file, encoding="utf8", sep=",")
        .rename(columns={"Team": "TeamFull", "FDCOUK": "Team"})
        .set_index("Team")
    )
    logging.debug("\n{0}".format(ops))

    ## TODO - fuzzy matching teams? (name inconsistencies?)

    logging.info("Create combined stadiums data")
    # ops values take precedence; dgl fills the gaps.
    combo = ops.combine_first(dgl)
    combo.reset_index(level=0, inplace=True)
    logging.debug("\n{0}".format(combo))

    utilities.save_master(combo, "stadiums", directory=directoryOut)

    return
Пример #3
0
def format_matches(
    directoryOut=config.MASTER_DIR,
):
    """Format national team match data.

    Combines the two scraped competition layouts, derives goals/score
    columns, attaches venue countries and flags home teams.

    INPUT:
        directoryOut: Directory to save formatted data to

    OUTPUT:
        match: National match data dataframe
    """
    logging.info("Formatting national team match data")

    comp = utilities.folder_loader(
        "fbr",
        "competition",
        source_header=[
            "Round",
            "Wk",
            "Day",
            "Date",
            "Time",
            "Team_1",
            "Score",
            "Team_2",
            "Attendance",
            "Venue",
            "Referee",
            "Match Report",
            "Notes",
        ],
    )
    comp2 = utilities.folder_loader(
        "fbr",
        "competition2",
        source_header=[
            "Round",
            "Wk",
            "Day",
            "Date",
            "Time",
            "Team_1",
            "xG_1",
            "Score",
            "xG_2",
            "Team_2",
            "Attendance",
            "Venue",
            "Referee",
            "Match Report",
            "Notes",
        ],
    )
    comp = pd.concat([comp, comp2], axis=0, sort=False, ignore_index=True)
    comp.dropna(subset=["Round"], inplace=True)
    comp.reset_index(drop=True, inplace=True)
    comp["Year"] = comp.Date.str[:4]
    # Source glues a 3-letter abbreviation onto team names: trailing for
    # Team_1, leading for Team_2.
    comp["Team_abbrev_1"] = comp["Team_1"].str[-3:].str.strip()
    comp["Team_1"] = comp["Team_1"].str[:-3].str.strip()
    comp["Team_abbrev_2"] = comp["Team_2"].str[:3].str.strip()
    comp["Team_2"] = comp["Team_2"].str[3:].str.strip()
    # Raw strings: the original non-raw patterns contained the invalid
    # escape sequences "\)" and "\(" (SyntaxWarning, future SyntaxError).
    # The regexes themselves are unchanged; "(N)" prefixes/suffixes are
    # penalty shoot-out scores that must be skipped.
    comp["Goals_1"] = comp.Score.str.extract(pat=r"(?:^|\) )([0-9]{1,2})[^0-9]+[0-9]{1,2}")
    comp["Goals_2"] = comp.Score.str.extract(pat=r"[0-9]{1,2}[^0-9]+([0-9]{1,2})(?:$| \()")
    for i in range(1, 3):
        comp["Goals_" + str(i)] = pd.to_numeric(
            comp["Goals_" + str(i)], errors="coerce"
        )
    comp["Goal_diff"] = comp.Goals_1 - comp.Goals_2
    logging.debug("\n{0}".format(comp.info()))

    venue = pd.read_csv(
        os.path.join(config.SOURCE_DIR, "wkp", "wkp_std", "wkp_std_nat.csv"),
        encoding="latin9",
        sep=",",
    )
    venue.columns = ["Venue_country", "Venue_city", "Venue", "Venue_URL"]
    logging.debug("\n{0}".format(venue.info()))

    match = pd.merge(comp, venue, on="Venue", how="left")

    ## workaround for venues that aren't mapping
    match.loc[match.Venue == "Stadion Energa Gdańsk", "Venue_country"] = "Poland"
    match.loc[match.Venue == "Bakı Olimpiya Stadionu", "Venue_country"] = "Azerbaijan"
    match.loc[match.Venue == "Arena Naţională", "Venue_country"] = "Romania"

    # Home_i = 1 when team i is playing in its own country.
    for i in range(1, 3):
        match["Home_" + str(i)] = 0
        match.loc[match["Team_" + str(i)] == match.Venue_country, "Home_" + str(i)] = 1

    logging.debug("\n{0}".format(match.info()))

    utilities.save_master(match, "nations_matches", directory=directoryOut)

    return match
Пример #4
0
def format_summaries(
    directoryOut=config.MASTER_DIR,
):
    """Format national statistical summary data.

    Combines scraped Elo ratings with Penn World Table GDP/population
    figures via an as-of merge on year.

    INPUT:
        directoryOut: Directory to save formatted data to

    OUTPUT:
        summary: Elo ratings joined with economic data per team/year
    """
    logging.info("Formatting national statistical summary data")

    elo_list = []
    for file in os.listdir(os.path.join(config.SOURCE_DIR, "elo")):
        if not file.endswith(".json"):
            continue

        df_json = pd.read_json(os.path.join(config.SOURCE_DIR, "elo", file))
        for i in df_json.index.values:
            df_tmp = pd.DataFrame(df_json.Team[i], index=[i])
            for col in df_tmp.columns.drop("Team"):
                # Literal replacements: normalise the Unicode minus and
                # strip "+" signs. regex=False is required — under the
                # old regex=True default a bare "+" is an invalid regex.
                df_tmp[col] = (
                    df_tmp[col].str.replace("−", "-", regex=False)
                    .str.replace("+", "", regex=False).astype(int)
                )
            df_tmp["Filename"] = file
            df_tmp["Year"] = int(file[:4])  # files are named YYYY*.json
            elo_list.append(df_tmp)

    elo = pd.concat(elo_list, ignore_index=True)

    # Harmonise team naming with the Penn World Table country names.
    elo.loc[elo.Team == "Czechia", "Team"] = "Czech Republic"
    elo.loc[elo.Team == "Yugoslavia", "Team"] = "Serbia"
    elo.loc[elo.Team == "Ireland", "Team"] = "Republic of Ireland"
    elo["Country"] = elo.Team
    elo.loc[
        elo.Team.isin(["England", "Scotland", "Wales", "Northern Ireland"]), "Country"
    ] = "United Kingdom"
    elo.loc[(elo.Team == "Russia"), "Country"] = "Russian Federation"

    penn = pd.read_excel(
        os.path.join(config.SOURCE_DIR, "rug", "pwt100.xlsx"), sheet_name="Data"
    )
    penn = penn[["country", "year", "rgdpe", "pop"]]
    penn.columns = ["Country", "Data Year", "GDP (PPP)", "Population"]
    penn.dropna(axis="index", inplace=True)
    penn.sort_values(by=["Data Year"], inplace=True)
    penn.loc[penn.Country == "Ireland", "Country"] = "Republic of Ireland"

    # For each Elo row take the most recent strictly-earlier economic
    # data point, at most 4 years old.
    summary = pd.merge_asof(
        elo,
        penn,
        left_by="Country",
        right_by="Country",
        left_on="Year",
        right_on="Data Year",
        tolerance=4,
        allow_exact_matches=False,
    )

    utilities.save_master(summary, "nations_summaries", directory=directoryOut)

    return summary
Пример #5
0
def calculate_quality(directory=config.MASTER_DIR):
    """Calculate data quality.

    Runs a battery of per-file data-quality tests (consistency,
    completeness, timeliness, uniqueness, validity), saves the scores
    as the "quality" master file and returns the mean score.

    INPUT:
        directory: Location of data to assess

    OUTPUT:
        overall_score: Overall quality score across all files
    """
    logging.info("Calculating data quality")

    calc_date = datetime.datetime.today().strftime("%Y-%m-%d")
    logging.info("Current date is {0}".format(calc_date))

    dq_data = []
    for file in os.listdir(directory):
        # Skip the quality report itself and anything that isn't a master file.
        if file == "ftb_quality.txt":
            continue
        if not file.endswith(".txt"):
            continue
        logging.info("Assessing {0}".format(file))

        file_path = os.path.join(directory, file)

        file_date = datetime.datetime.fromtimestamp(
            os.path.getmtime(file_path)).strftime("%Y-%m-%d")
        # Fixed: previously logged calc_date here instead of file_date.
        logging.info("File modification date is {0}".format(file_date))

        file_stub = file.replace("ftb_", "").replace(".txt", "")
        df = utilities.get_master(file_stub, directory=directory)
        # Cap the sample size to keep the assessment fast on large files.
        if df.shape[0] > 50000:
            df = df.sample(50000, replace=False, random_state=42)

        no_of_rows, no_of_columns = df.shape
        no_of_cells = no_of_rows * no_of_columns

        # Consistency, coherence, or clarity
        category = "Consistency"
        logging.info("Running {0} tests".format(category))

        # One file-specific sanity rule per known file; score is the
        # fraction of rows satisfying it.
        test = None
        if file in ["ftb_events_shot.txt"]:
            test = "xG between 0 and 1"
            score = 1 - (df[~df.statsbomb_xg.between(0, 1)].shape[0] /
                         df.shape[0])
        elif file in ["ftb_fulldata.txt"]:
            test = "Goals <= Shots"
            score = 1 - (df[df["Goals"] > df["Shots"]].shape[0] / df.shape[0])
        elif file in ["ftb_managers.txt"]:
            test = "DateFrom <= DateTo"
            score = 1 - (df[df.DateFrom > df.DateTo].shape[0] / df.shape[0])
        elif file in ["ftb_players_contract.txt"]:
            test = "Joined <= Contract expires"
            score = 1 - (df[df["Joined"] > df["Contract expires"]].shape[0] /
                         df.shape[0])
        elif file in ["ftb_players_performance.txt"]:
            test = "Games started <= In squad"
            score = 1 - (df[df["Games started"] > df["In squad"]].shape[0] /
                         df.shape[0])
        elif file in ["ftb_results.txt"]:
            test = "Home goals <= Home shots"
            score = 1 - (df[df["FTHG"] > df["HS"]].shape[0] / df.shape[0])
        elif file in ["ftb_nations_matches.txt"]:
            test = "Max one home team"
            score = 1 - (df[df["Home_1"] + df["Home_2"] == 2].shape[0] /
                         df.shape[0])

        if test:
            dq_data.append({
                "file": file_stub,
                "file_date": file_date,
                "calc_date": calc_date,
                "category": category,
                "test": test,
                "score": score,
            })

        # Completeness or comprehensiveness
        category = "Completeness"
        logging.info("Running {0} tests".format(category))

        test = "Missing values"
        score = df.count().sum() / no_of_cells
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })

        # Timeliness or latency
        category = "Timeliness"
        logging.info("Running {0} tests".format(category))

        test = "Days since file updated"
        # Crude recency proxy: compare dates as YYYYMMDD integers; clipped
        # to [0, 1] here and again globally below.
        score = max(
            1 - ((int(calc_date.replace("-", "")) -
                  int(file_date.replace("-", ""))) / 100000),
            0,
        )
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })

        date_field = None
        if file in [
                "ftb_fulldata.txt", "ftb_results.txt",
                "ftb_nations_matches.txt"
        ]:
            date_field = "Date"
        elif file in ["ftb_events_shot.txt"]:
            date_field = "match_date"
        elif file in ["ftb_managers.txt"]:
            date_field = "DateTo"

        if date_field:
            test = "Days since last match date"
            score = max(
                1 - ((int(calc_date.replace("-", "")) -
                      int(df[date_field].max().replace("-", ""))) / 100000),
                0,
            )
            dq_data.append({
                "file": file_stub,
                "file_date": file_date,
                "calc_date": calc_date,
                "category": category,
                "test": test,
                "score": score,
            })

        # Accuracy or correctness
        category = "Accuracy"
        logging.info("Running {0} tests".format(category))

        # Spots tests against ref data?
        # Wikipedia (Ajax, Frankfurt)
        # Don Balon (Spain 08/09)
        # SkySports Football Yearbook (England & Scotland 07/08)

        # Uniqueness
        category = "Uniqueness"
        logging.info("Running {0} tests".format(category))

        test = "Duplicated rows"
        score = df.drop_duplicates().shape[0] / no_of_rows
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })

        test = "Duplicated columns"
        # Fixed: the previous expression (df.T.drop_duplicates().T.shape[0]
        # / no_of_rows) always evaluated to 1.0 because dropping duplicate
        # columns does not change the row count. Count unique columns instead.
        score = df.T.drop_duplicates().shape[0] / no_of_columns
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })

        # Validity or reasonableness
        category = "Validity"
        logging.info("Running {0} tests".format(category))

        # 0 <= Goals <= 10

        test = "3 stdev from mean"
        score = 1 - (
            ((df < (df.mean() - 3 * df.std())) |
             (df > df.mean() + 3 * df.std())).sum().sum() / no_of_cells)
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })

        test = "1.5 IQR rule"
        Q1 = df.quantile(0.25)
        Q3 = df.quantile(0.75)
        IQR = Q3 - Q1
        score = 1 - (((df < (Q1 - 1.5 * IQR)) |
                      (df > (Q3 + 1.5 * IQR))).sum().sum() / no_of_cells)
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })

        # Orderliness
        # Auditability
        # Conformity
        # accessibility or availability
        # comparability
        # credibility, reliability, or reputation
        # relevance, pertinence, or usefulness

    df_dq = pd.DataFrame(dq_data)
    df_dq["score"] = df_dq["score"].clip(lower=0, upper=1)
    utilities.save_master(df_dq, "quality", directory=directory)

    overall_score = df_dq.score.mean()
    logging.info("Overall score is {0}".format(overall_score))

    return overall_score
Пример #6
0
def build_fulldata(directory=config.MASTER_DIR):
    """Combine results, stadiums and managers data into full dataset for clubs.

    Each raw result row becomes two team-centric rows (home and away
    perspective), which are then enriched with derived statistics,
    stadium locations and the managers in charge on the match date.

    INPUT:
        directory: Directory to save output to

    OUTPUT:
        fulldata: Dataframe containing all the clubs data
    """
    logging.info(
        "Building fulldata dataframe from results, stadiums, managers ...")

    # Rename raw home/away column codes to team-centric names; the "Opp"
    # suffix marks the opponent's value (home perspective here).
    home_renames = {
        "HomeTeam": "Team",
        "AwayTeam": "TeamOpp",
        "FTHG": "Goals",
        "FTAG": "GoalsOpp",
        "HTHG": "Goals1stHalf",
        "HTAG": "Goals1stHalfOpp",
        "HS": "Shots",
        "AS": "ShotsOpp",
        "HST": "ShotsOnTarget",
        "AST": "ShotsOnTargetOpp",
        "HHW": "ShotsHitWoodwork",
        "AHW": "ShotsHitWoodworkOpp",
        "HC": "Corners",
        "AC": "CornersOpp",
        "HF": "Fouls",
        "AF": "FoulsOpp",
        "HO": "Offsides",
        "AO": "OffsidesOpp",
        "HY": "YellowCards",
        "AY": "YellowCardsOpp",
        "HR": "RedCards",
        "AR": "RedCardsOpp",
        "HBP": "BookingPoints",
        "ABP": "BookingPointsOpp",
    }
    # Away perspective: same mapping with the "Opp" suffix flipped.
    away_renames = {}
    for key, val in home_renames.items():
        if val.endswith("Opp"):
            away_renames[key] = val[:-3]
        else:
            away_renames[key] = val + "Opp"
    # Stats for which team-vs-opponent difference and total columns are derived.
    stat_to_diff = [
        "Goals",
        "Goals1stHalf",
        "Shots",
        "ShotsOnTarget",
        "ShotsHitWoodwork",
        "Corners",
        "Fouls",
        "Offsides",
        "YellowCards",
        "RedCards",
        "BookingPoints",
    ]

    logging.info("Process results")
    results = utilities.get_master("results", directory=directory)

    homeresults = results.rename(columns=home_renames)
    homeresults["HomeAway"] = "Home"

    awayresults = results.rename(columns=away_renames)
    awayresults["HomeAway"] = "Away"

    # Stack both perspectives: every match appears twice, once per team.
    allresults = pd.concat([homeresults, awayresults],
                           ignore_index=True,
                           sort=False)
    allresults.drop(["FTR", "HTR", "Unnamed: 0"], axis=1, inplace=True)

    # Derived per-stat difference and total columns.
    for stat in stat_to_diff:
        allresults[stat + "Diff"] = allresults[stat] - allresults[stat + "Opp"]
        allresults["Total" +
                   stat] = allresults[stat] + allresults[stat + "Opp"]

    # Saves = opponent's on-target shots that did not become goals.
    allresults[
        "Saves"] = allresults["ShotsOnTargetOpp"] - allresults["GoalsOpp"]
    allresults["SavesOpp"] = allresults["ShotsOnTarget"] - allresults["Goals"]
    allresults["SavesDiff"] = allresults["Saves"] - allresults["SavesOpp"]
    allresults[
        "Goals2ndHalf"] = allresults["Goals"] - allresults["Goals1stHalf"]
    allresults["Goals2ndHalfOpp"] = (allresults["GoalsOpp"] -
                                     allresults["Goals1stHalfOpp"])
    allresults["Goals2ndHalfDiff"] = (allresults["GoalsDiff"] -
                                      allresults["Goals1stHalfDiff"])
    # Result,Points,PointsOpp,Win,WinDraw,Draw,DrawLoss,Loss,CleanSheet,CleanSheetOpp
    # func_score maps goal difference to a 9-tuple of outcome columns.
    (
        allresults["Result"],
        allresults["Points"],
        allresults["PointsOpp"],
        allresults["Win"],
        allresults["WinDraw"],
        allresults["Draw"],
        allresults["DrawLoss"],
        allresults["Loss"],
        allresults["WinShare"],
    ) = zip(*allresults["GoalsDiff"].map(func_score))
    allresults["CleanSheet"] = allresults["Goals"].map(func_nogoal)
    allresults["CleanSheetOpp"] = allresults["GoalsOpp"].map(func_nogoal)

    # Match index within each team's season, in date order.
    allresults["GameWeek"] = (allresults.sort_values("Date").groupby(
        ["Season", "Div", "Team"]).cumcount() + 1)

    ## TODO - Validate derived values

    logging.info("Process stadiums")
    stadiums = utilities.get_master("stadiums", directory=directory)
    stadiums.drop(["Country", "TeamFull"], axis=1, inplace=True)

    # Attach each side's stadium, then the straight-line distance between them.
    fulldata = pd.merge(allresults, stadiums, on="Team", how="left")
    stadiums.rename(columns={"Team": "TeamOpp"}, inplace=True)
    fulldata = pd.merge(fulldata,
                        stadiums,
                        on="TeamOpp",
                        how="left",
                        suffixes=("", "Opp"))
    fulldata.drop(["Unnamed: 0", "Unnamed: 0Opp"], axis=1, inplace=True)

    # NOTE(review): Euclidean distance on raw lat/long degrees - a rough
    # proxy, not a true geographic distance.
    fulldata["EuclideanDistance"] = (
        (fulldata.Latitude - fulldata.LatitudeOpp)**2 +
        (fulldata.Longitude - fulldata.LongitudeOpp)**2)**0.5

    logging.info("Process managers")
    managers = utilities.get_master("managers", directory=directory)
    managers.dropna(subset=["Manager"], inplace=True)

    # Join managers, then keep only the manager whose tenure covers the
    # match date (or rows with no manager at all).
    fulldata = pd.merge(fulldata, managers, on="Team", how="left")
    fulldata = fulldata[((fulldata["Date"] >= fulldata["DateFrom"])
                         & (fulldata["Date"] <= fulldata["DateTo"]))
                        | (fulldata["Manager"].isnull())]
    fulldata.drop(
        ["Unnamed: 0", "DateFrom", "DateTo", "Duration", "YearRange"],
        axis=1,
        inplace=True,
    )
    fulldata = fulldata.drop_duplicates()

    # Repeat the tenure-filtered join for the opposing team's manager.
    managers.rename(columns={"Team": "TeamOpp"}, inplace=True)
    fulldata = pd.merge(fulldata,
                        managers,
                        on="TeamOpp",
                        how="left",
                        suffixes=("", "Opp"))
    fulldata = fulldata[((fulldata["Date"] >= fulldata["DateFrom"])
                         & (fulldata["Date"] <= fulldata["DateTo"]))
                        | (fulldata["ManagerOpp"].isnull())]
    fulldata.drop(
        ["Unnamed: 0", "DateFrom", "DateTo", "Duration", "YearRange"],
        axis=1,
        inplace=True,
    )
    fulldata = fulldata.drop_duplicates()

    utilities.save_master(fulldata, "fulldata", directory=directory)
    return fulldata
Пример #7
0
def format_results(
    parentDirectory=config.SOURCE_DIR,
    subDirectory=config.RESULTS_SCRAPE["ftd"][1],
    directoryOut=config.MASTER_DIR,
):
    """Format raw results and save processed output.

    Walks the scrape directory for season CSVs, normalises team columns,
    derives Country/Tier from the division code and saves one master file.

    INPUT:
        parentDirectory: Parent directory  to traverse looking for files to zip/clear
        subDirectory: Sub-Directory to traverse looking for files to zip/clear
        directoryOut: Directory to save output to

    OUTPUT:
        None

    RAISES:
        ValueError: If a CSV has neither HomeTeam/AwayTeam nor HT/AT columns
    """
    directoryIn = os.path.join(parentDirectory, subDirectory)
    logging.info("Format results in {0}".format(directoryIn))
    pieces = []
    # Rows missing any of these are unusable and get dropped.
    core_cols = ["Div", "Date"]  # ,'HomeTeam','AwayTeam','FTHG','FTAG','FTR']
    use_cols = [
        "Season",
        "Div",
        "Country",
        "Tier",
        "Date",
        "HomeTeam",
        "AwayTeam",
        "FTHG",
        "FTAG",
        "FTR",
        "HTHG",
        "HTAG",
        "HTR",
        "Attendance",
        "Referee",
        "HS",
        "AS",
        "HST",
        "AST",
        "HHW",
        "AHW",
        "HC",
        "AC",
        "HF",
        "AF",
        "HO",
        "AO",
        "HY",
        "AY",
        "HR",
        "AR",
        "HBP",
        "ABP",
    ]

    for root, _dirs, files in os.walk(directoryIn):
        for file in files:
            if file.endswith(".csv"):
                filepath = os.path.join(root, file)
                logging.info("Filepath: {0}".format(filepath))
                # NOTE(review): error_bad_lines/warn_bad_lines were removed
                # in pandas 2.0 (replaced by on_bad_lines) — this call
                # assumes a pre-2.0 pandas.
                df = pd.read_csv(
                    filepath,
                    error_bad_lines=False,
                    warn_bad_lines=False,
                    encoding="latin9",
                )  # , parse_dates=['Date'])
                logging.debug("Input columns: {0}".format(df.columns))
                # Season folder name is the last 9 chars of the path,
                # e.g. "2006-2007".
                df["Season"] = root[-9:]

                # Two column layouts exist in the source data; normalise
                # both to HomeTeam/AwayTeam.
                if set(["HomeTeam", "AwayTeam"]).issubset(df.columns):
                    try:
                        df["HomeTeam"] = df[
                            "HomeTeam"
                        ]  # .apply(lambda x: x.decode('latin9').encode('utf-8'))
                        df["AwayTeam"] = df[
                            "AwayTeam"
                        ]  # .apply(lambda x: x.decode('latin9').encode('utf-8'))
                    except Exception:
                        df["HomeTeam"] = np.nan
                        df["AwayTeam"] = np.nan

                elif set(["HT", "AT"]).issubset(df.columns):
                    try:
                        df["HomeTeam"] = df[
                            "HT"
                        ]  # .apply(lambda x: x.decode('latin9').encode('utf-8'))
                        df["AwayTeam"] = df[
                            "AT"
                        ]  # .apply(lambda x: x.decode('latin9').encode('utf-8'))
                    except Exception:
                        df["HomeTeam"] = np.nan
                        df["AwayTeam"] = np.nan
                else:
                    # Fixed: a bare `raise` here has no active exception and
                    # would itself fail with "No active exception to re-raise".
                    raise ValueError(
                        "No team columns found in {0}".format(filepath))

                # drop useless rows
                df = df.dropna(subset=core_cols)
                logging.debug("Output columns: {0}".format(df.columns))

                pieces.append(df)

    logging.info("Concatenate everything into a single DataFrame")
    dframe = pd.concat(pieces, ignore_index=True, sort=False)

    # func_div maps a division code (e.g. "E0") to (country, tier).
    dframe["Country"], dframe["Tier"] = zip(*dframe["Div"].map(func_div))

    dframe.Date = pd.to_datetime(dframe.Date, dayfirst=True)
    logging.info(dframe[use_cols].info())

    utilities.save_master(
        dframe[use_cols], "results", directory=directoryOut
    )  # , enc="ascii")
Пример #8
0
 def test_save_master(self):
     """Test save dummy dataframe to dummy csv."""
     saved_path = utilities.save_master(self.testFrame,
                                        "dummy",
                                        directory=self.testDir)
     assert saved_path == self.testMaster
Пример #9
0
 def test_get_master(self):
     """Test return dummy dataframe from dummy csv."""
     utilities.save_master(self.testFrame, "dummy", directory=self.testDir)
     # Fixed: the shape comparison was previously a bare expression whose
     # result was discarded, so the test could never fail.
     assert (utilities.get_master(
         "dummy", directory=self.testDir).shape == self.testFrame.shape)
Пример #10
0
def clean_data(source_name, directory=config.MASTER_DIR):
    """Clean raw player data and save processed version.

    INPUT:
        source_name: String containing name of the data source
            ("tmk_cnt" for contracts, "tmk_psm" for performance)
        directory: Directory to save output to

    OUTPUT:
        df: Dataframe containing the cleaned data

    RAISES:
        ValueError: If source_name is not a recognised data source
    """
    logging.info("Loading {0} data".format(source_name))

    if source_name == "tmk_cnt":
        source_header = [
            "Shirt number",
            "Position",
            "Name",
            "Date of birth",
            "Nationality",
            "Height",
            "Foot",
            "Joined",
            "Signed from",
            "Contract expires",
            "Market value",
        ]
        drop_cols = ["Nationality", "Signed from", "Competition"]
        notna_cols = ["Market value"]

    elif source_name == "tmk_psm":
        source_header = [
            "Shirt number",
            "Position",
            "Name",
            "Age",
            "Nationality",
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]
        drop_cols = ["Nationality"]
        notna_cols = ["In squad"]

    else:
        # Fixed: an unknown source previously fell through to an opaque
        # NameError on source_header below.
        raise ValueError("Unknown source_name: {0}".format(source_name))

    df = utilities.folder_loader(source_name[:3],
                                 source_name,
                                 "comp_season",
                                 source_header=source_header)

    ## Name and Position are mis-aligned in the source files

    df["Name"].fillna(method="bfill", inplace=True)

    # Position is offset by one (or two when it equals Name) rows.
    df["Position"] = df.Name.shift(-1)
    df.loc[df.Position == df.Name, "Position"] = df.Name.shift(-2)

    df.drop(axis=1, columns=drop_cols, inplace=True)

    df.dropna(subset=notna_cols, inplace=True)

    # Normalise whitespace and the various "no data" placeholder strings.
    df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    df = df.replace("-", np.nan)
    df = df.replace("Was not used during this season", np.nan)
    df = df.replace("Not in squad during this season", np.nan)
    df = df.replace("Not used during this season", np.nan)

    df["Shirt number"] = pd.to_numeric(df["Shirt number"], downcast="integer")

    # Bucket free-text positions into G/D/M/F groups by keyword.
    df["Position group"] = None
    df.loc[(df.Position.str.upper().str.contains("KEEPER"))
           | (df.Position.str.upper().str.contains("GOAL")),
           "Position group", ] = "G"
    df.loc[(df.Position.str.upper().str.contains("BACK"))
           | (df.Position.str.upper().str.contains("DEF")),
           "Position group", ] = "D"
    df.loc[(df.Position.str.upper().str.contains("MID"))
           | (df.Position.str.upper().str.contains("MIT"))
           | (df.Position.str.upper().str.contains("WING")),
           "Position group", ] = "M"
    df.loc[(df.Position.str.upper().str.contains("STRIKER"))
           | (df.Position.str.upper().str.contains("FORW")),
           "Position group", ] = "F"

    if source_name == "tmk_cnt":
        # "Date of birth" arrives as e.g. "Jan 1, 1990 (30)" - split out
        # the age suffix and parse the date.
        df["Age"] = (df["Date of birth"].str.extract(
            r".*([0-9]{2})", expand=False).astype("int"))

        df["Date of birth"] = pd.to_datetime(
            df["Date of birth"].str.extract(r"(.*) \([0-9]{2}\)",
                                            expand=False),
            format="%b %d, %Y",
        )

        df["Joined"] = pd.to_datetime(df.Joined, format="%b %d, %Y")

        df["Contract expires"] = pd.to_datetime(df["Contract expires"],
                                                format="%d.%m.%Y")

        df["Height"] = (df["Height"].str.strip().str.replace(
            " ", "").str.replace(",", "").str.replace("m", "").replace({
                "-":
                np.nan,
                "":
                np.nan
            }).astype(float))
        # Backfill Height/Foot across seasons for players that have the
        # value in at least one season but not all.
        df.loc[df.Name.isin(df[df.Height.notna()].Name.values)
               & df.Name.isin(df[df.Height.isna()].Name.values),
               "Height", ] = (
                   df.loc[df.Name.isin(df[df.Height.notna()].Name.values)
                          & df.Name.isin(df[df.Height.isna()].Name.values)].
                   sort_values(by=["Name", "Season"]).Height.fillna(
                       method="bfill"))

        df.loc[df.Name.isin(df[df.Foot.notna()].Name.values)
               & df.Name.isin(df[df.Foot.isna()].Name.values),
               "Foot", ] = (df.loc[
                   df.Name.isin(df[df.Foot.notna()].Name.values)
                   & df.Name.isin(df[df.Foot.isna()].Name.values)].sort_values(
                       by=["Name", "Season"]).Foot.fillna(method="bfill"))

        # Market value like "£1.5m"/"£500k" -> float in millions.
        df["Market value"] = (
            df["Market value"].str.strip().replace({
                "-": np.nan
            }).replace(r"[£kmTh\.]", "", regex=True).astype(float) *
            df["Market value"].str.extract(
                r"[\d\.]+([kmTh\.]+)", expand=False).fillna(1).replace(
                    ["k", "Th.", "m"], [10**3, 10**3, 10**6]).astype(int) /
            10**6)

    elif source_name == "tmk_psm":
        # PPG uses a decimal comma; minutes use dots/apostrophes as
        # thousands separators.
        df["PPG"] = df["PPG"].str.strip().replace(r"[,]", ".",
                                                  regex=True).astype(float)
        df["Minutes played"] = (df["Minutes played"].str.strip().replace(
            r"[.\']", "", regex=True).astype(float))

        # Missing performance stats mean "did not feature": zero-fill
        # then cast the whole stat block to float.
        stat_cols = [
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]
        df[stat_cols] = df[stat_cols].fillna(0)
        df[stat_cols] = df[stat_cols].astype(float)
    logging.debug(df.describe(include="all"))

    logging.info("Saving processed data to ")
    utilities.save_master(df, get_outfile(source_name), directory=directory)

    return df