Example #1
0
 def __init__(self, params):
     """Keep the supplied hyperparameters and prepare the model output dir."""
     self.params = params
     # These slots are filled in later, during data prep / training.
     for slot in ("model", "features", "target"):
         setattr(self, slot, None)
     # All exported artifacts are written below this directory.
     self.model_output_dir = "./data/model_outputs/"
     check_create_dir(self.model_output_dir)
Example #2
0
    def train(self, xy_train, xy_test):
        """Fit a fastai tabular model and export it for later scoring.

        :param xy_train: training dataframe with features and target columns
        :param xy_test: held-out dataframe used for validation
        :return: tuple ``(train_loss, valid_loss)`` taken from the learner's
            recorder (per-batch training losses, per-epoch validation losses)
        """
        # Classification metric for the binary "star" target, MAE otherwise.
        metric_fn = accuracy if self.target == "star_target" else mae
        fast_data = self.prepare_data(xy_train, xy_test)
        learn = tabular_learner(fast_data,
                                layers=[256, 128],
                                emb_drop=0.2,
                                metrics=metric_fn)
        # 4 epochs of one-cycle training at a conservative max learning rate.
        learn.fit_one_cycle(4, 1e-4)
        self.model = learn

        train_loss, valid_loss = learn.recorder.losses, learn.recorder.val_losses

        # Export the learner so scoring can reload it without retraining.
        save_dir = os.path.join(self.model_output_dir,
                                "fastai_{}_model".format(self.target))
        check_create_dir(save_dir)
        save_path = os.path.join(save_dir, "export.pkl")
        learn.export(file=save_path)
        return train_loss, valid_loss
Example #3
0
    def __init__(self, cat_features, num_features, target):
        """Record the feature columns and target, and prepare the export dir."""
        self.model = None  # trained model is attached later
        self.target = target
        self.num_features = num_features
        self.cat_features = cat_features

        # All model artifacts are written below this directory.
        self.model_output_dir = "./data/model_outputs/"
        check_create_dir(self.model_output_dir)
Example #4
0
    def __init__(self, config):
        """Resolve the season's data directories, create them, and set up a scraper."""
        self.config = config

        # <source_dir>/<season> is the root for this season's files.
        season_root = os.path.join(config["source_dir"], config["season"])
        self.data_dir = season_root
        self.data_dir_raw = os.path.join(season_root, 'raw')
        self.data_dir_clean = os.path.join(season_root, 'clean')

        for directory in (self.data_dir_raw, self.data_dir_clean):
            check_create_dir(directory)

        self.data_scraper = DataScraper(config)
Example #5
0
    def __init__(self, config):
        """
        Initialize the FPL data scraper: resolve directories, build the API
        URLs, read credentials from the environment, and open an authenticated
        requests session.

        :param config: config dict specifying ``source_dir`` and ``season``
        :type config: dict
        """
        self.name = "FPL-Scraper"
        self.config = config
        self.data = None
        self.data_dir = os.path.join(config["source_dir"], config["season"])
        check_create_dir(self.data_dir)
        # set fpl urls
        self.fpl_url = "https://fantasy.premierleague.com/api/"
        self.login_url = "https://users.premierleague.com/accounts/login/"
        self.manager_url = "https://fantasy.premierleague.com/api/entry/"
        self.classic_league_suburl = "leagues-classic/"
        self.team_entry_suburl = "entry/"
        self.bootstrap_suburl = "bootstrap-static/"
        self.player_suburl = "element-summary/"
        self.fixtures_suburl = "fixtures/"

        self.league_standing_url = self.fpl_url + self.classic_league_suburl

        # FIX: the original bare `except:` swallowed every exception and left
        # self.username / self.password undefined when the env vars were
        # missing, so building `payload` below raised AttributeError. Catch
        # only the expected KeyError and fall back to None so an
        # (unauthenticated) session can still be constructed.
        try:
            self.username = os.environ["fpl_email"]
            self.password = os.environ["fpl_pwd"]
        except KeyError:
            self.username = None
            self.password = None
            print("Error: Set FPL Email and Password in your OS environment")

        payload = {
            'login': self.username,
            'password': self.password,
            'redirect_uri': "https://fantasy.premierleague.com/",
            'app': 'plfpl-web'
        }

        self.session = requests.session()
        self.session.post(self.login_url, data=payload)
Example #6
0
def make_XY_data(scoring_gw=None, dataset_dir="./data/model_data/xy_data/"):
    """Build train/test/scoring feature matrices across three seasons.

    Runs feature engineering for the 2018/19, 2019/20 and 2020/21 seasons,
    stitches them together under a global gameweek index, derives the model
    targets, and writes train/test/scoring CSVs plus the feature dictionary
    into ``dataset_dir``.

    :param scoring_gw: gameweek to score; when falsy, the next gameweek is
        looked up from the live FPL API
    :param dataset_dir: output directory for the XY csv/pickle artifacts
    """
    check_create_dir(dataset_dir)
    scraper_config = {"season": "2020_21", "source_dir": "./data/raw/"}
    data_scraper = DataScraper(scraper_config)

    # FIX: collapsed the original `if scoring_gw: pass / else:` dead branch
    # into a direct guard (same truthiness semantics).
    if not scoring_gw:
        print("getting latest scoring gameweek ...")
        scoring_gw = data_scraper.get_next_gameweek_id()

    def _season_config(season_dir, gw):
        # All seasons share one file layout; only the dir and scoring gw vary.
        return {
            "data_dir": season_dir,
            "file_fixture": "fixtures.csv",
            "file_team": "teams.csv",
            "file_gw": "merged_gw.csv",
            "file_player": "players_raw.csv",
            "file_understat_team": "understat_team_data.pkl",
            "scoring_gw": gw
        }

    fe_2020 = FeatureEngineering()
    config_2020 = _season_config("./data/model_data/2020_21/", scoring_gw)
    df_2020 = fe_2020.execute_fe(config_2020)

    # Opponent team-strength features for the next two gameweeks; these are
    # merged back into the scoring frame further below.
    data_maker_2020 = ModelDataMaker(config_2020)
    tbf_feats = [
        "strength", "strength_overall_home", "strength_overall_away",
        "strength_attack_home", "strength_attack_away",
        "strength_defence_home", "strength_defence_away"
    ]
    tbf_feats = ["opp_" + feat for feat in tbf_feats]
    tbf_feats_next_1_map = {feat: feat + "_next_1" for feat in tbf_feats}
    tbf_feats_next_2_map = {feat: feat + "_next_2" for feat in tbf_feats}

    df_next_1_gw = data_maker_2020.make_nth_gw_scoring_base(scoring_gw + 1)
    df_next_2_gw = data_maker_2020.make_nth_gw_scoring_base(scoring_gw + 2)

    df_next_1_gw = df_next_1_gw.rename(columns=tbf_feats_next_1_map)
    df_next_2_gw = df_next_2_gw.rename(columns=tbf_feats_next_2_map)
    df_next_1_gw = df_next_1_gw.drop(columns=["opp_id", "opp_name"])
    df_next_2_gw = df_next_2_gw.drop(columns=["opp_id", "opp_name"])

    fe_2019 = FeatureEngineering()
    df_2019 = fe_2019.execute_fe(
        _season_config("./data/model_data/2019_20/", "NA"))

    fe_2018 = FeatureEngineering()
    df_2018 = fe_2018.execute_fe(
        _season_config("./data/model_data/2018_19/", "NA"))

    df_2018["season_id"] = 0
    df_2019["season_id"] = 1
    df_2020["season_id"] = 2

    df_XY = pd.concat([df_2018, df_2019, df_2020])
    # FIX: vectorized — the original row-wise apply made a Python call per row
    # for a simple linear combination (and used deprecated positional x[0]).
    df_XY["global_gw_id"] = df_XY["season_id"] * 100 + df_XY["gw_id"]

    # was_home lag flags arrive as 0/1 — coerce to proper booleans
    for lag in (1, 2, 3):
        col = "was_home_lag_{}".format(lag)
        df_XY[col] = df_XY[col].astype(bool)

    # categorical features still carrying object dtype get integer codes
    features_dict = fe_2020.feature_dict
    cat_features = features_dict["cat_features"]
    object_cols = [col for col, dtype in df_XY.dtypes.items()
                   if str(dtype) == 'object']
    for feat in cat_features:
        if feat in object_cols:
            df_XY[feat] = df_XY[feat].astype('category').cat.codes

    # clip targets to tame the long right tail of points/potential
    pts_clip = 10
    star_clip = 5
    pot_clip = 24
    df_XY["reg_target"] = df_XY["total_points"].clip(upper=pts_clip)
    df_XY["star_target"] = df_XY["total_points"].apply(
        lambda x: 1 if x >= star_clip else 0)
    df_XY["pot_target"] = df_XY["potential"].clip(upper=pot_clip)

    # split by global gameweek: everything before the test gw trains,
    # the test gw validates, and the latest gw is scored
    df_XY["global_gw_id"] = df_XY["global_gw_id"].fillna(-1)
    df_XY["global_gw_id"] = df_XY["global_gw_id"].astype(int)
    global_scoring_gw = df_XY["global_gw_id"].max()
    global_test_gw = global_scoring_gw - 1
    df_XY_train = df_XY[df_XY["global_gw_id"] < global_test_gw].copy()
    df_XY_test = df_XY[df_XY["global_gw_id"] == global_test_gw].copy()
    df_XY_scoring = df_XY[df_XY["global_gw_id"] == global_scoring_gw].copy()

    # re-impute the next-gw opponent features in the scoring df by merging
    # the freshly built next-1 / next-2 bases back in
    impute_feats = (list(tbf_feats_next_1_map.values())
                    + list(tbf_feats_next_2_map.values()))
    df_XY_scoring = df_XY_scoring.drop(columns=impute_feats)
    df_next_1_gw["gw_id"] = scoring_gw
    df_next_2_gw["gw_id"] = scoring_gw
    df_XY_scoring = pd.merge(df_XY_scoring,
                             df_next_1_gw,
                             how='left',
                             on=["player_id", "gw_id"])
    df_XY_scoring = pd.merge(df_XY_scoring,
                             df_next_2_gw,
                             how='left',
                             on=["player_id", "gw_id"])

    # save XY data
    df_XY_train.to_csv(os.path.join(dataset_dir,
                                    "xy_train_gw_{}.csv".format(scoring_gw)),
                       index=False)
    df_XY_test.to_csv(os.path.join(dataset_dir,
                                   "xy_test_gw_{}.csv".format(scoring_gw)),
                      index=False)
    df_XY_scoring.to_csv(os.path.join(
        dataset_dir, "xy_scoring_gw_{}.csv".format(scoring_gw)),
                         index=False)

    with open(os.path.join(dataset_dir, "features_after_fe.pkl"), 'wb') as f:
        pickle.dump(features_dict, f)