Example #1
    def _get_unique_teams_from_datasets(self,
                                        df_train: pd.DataFrame,
                                        df_test: pd.DataFrame,
                                        df_predict: pd.DataFrame) -> None:
        """
        Gets unique team names from the dataframes for each dataset type.

        :param df_train: Train dataset.
        :param df_test: Test dataset.
        :param df_predict: Predict dataset.
        """
        self.train_teams = get_unique_teams(df_train)
        self.test_teams = get_unique_teams(df_test)
        self.predict_teams = get_unique_teams(df_predict)
        self.train_teams_exclusively = sorted(set(self.train_teams) - set(self.test_teams))
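The `get_unique_teams` helper used above is not part of this listing. A minimal sketch of a compatible implementation, assuming fixtures dataframes with `home` and `away` string columns (both the body and the column names are assumptions, not the actual code):

    import pandas as pd

    def get_unique_teams(df: pd.DataFrame) -> list:
        # Collect team names from both the home and away columns, deduplicate,
        # and return them in a deterministic (sorted) order.
        return sorted(set(df["home"]).union(df["away"]))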
Example #2
    def run(self) -> None:
        """
        Runs training of the model for the given number of epochs.

        """
        st = time.time()
        df_train, df_test, df_predict = self._preload()

        # Just load models and make predictions
        if self._predict:
            teams = spc.get_unique_teams(df_train)
            self._load_models(teams, include_optimizer=False)

            self.predict(df_test,
                         Dataset.Test,
                         revert_to_best_params=True,
                         restore_states_after_training=True)
            self.predict(df_predict, Dataset.Predict)
            self.save(predict=True)
        else:
            self.train(df_train, df_test)
            self.save(models=True, train=True, test=True)

            # Predict both test and predict datasets with the best params after training
            self.predict(df_test,
                         Dataset.Test,
                         revert_to_best_params=True,
                         restore_states_after_training=True)
            self.predict(df_predict, Dataset.Predict)
            self.save(predict=True)

        print(f"Run time: {((time.time()-st)/60):.2f} mins.")
Example #3
    def load_and_process_fixtures_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Loads fixtures data ordered by date.

        The split guarantees at least N samples in the predict dataset, where N is given by
        NPREDICT. There will also be approximately the same number of test samples as given
        by the ntest argument.

        The exact number of test/predict samples cannot be guaranteed: the counts of
        predict/test/discarded samples differ per team because matches are not always played
        back to back, so some teams play more often than others within a given period.

        Thus, to avoid overlapping datasets, it is generally impossible to ensure exact
        sample counts when backtesting.

        :return: Train, test, and predict fixtures datasets.
        """
        df = self._dbmanager.query_fixtures_data(self._seasons)
        if df.empty:
            raise ValueError("Empty fixtures dataframe.")

        df = self._drop_last_season_championship_matches(df)

        self.teams = get_unique_teams(df)
        self.last_season_teams = get_last_season_unique_teams(df)
        # Get fixtures ids for each team
        teams_fixtures_ids = {t: df[(df["home"] == t) | (df["away"] == t)].loc[:, "id"].tolist() for t in self.teams}

        self._check_missing_columns(df)
        df = self._check_nan_values(df, teams_fixtures_ids)
        teams_fixtures_ids = self._discard_matches(df, teams_fixtures_ids)

        if not self._resume:
            # Use last n ids for predictions and last m ids for testing
            for t in self.last_season_teams:
                self.predict_fixtures_ids[t] = teams_fixtures_ids[t][-NPREDICT:]
                teams_fixtures_ids[t] = teams_fixtures_ids[t][:-NPREDICT]

                self.test_fixtures_ids[t] = teams_fixtures_ids[t][-self._ntest:]
                teams_fixtures_ids[t] = teams_fixtures_ids[t][:-self._ntest]

            # Rest of ids is counted as train set
            self.train_fixtures_ids = teams_fixtures_ids
        else:
            if self.teams_names_bitlen != self._model_settings["teams_names_bitlen"]:
                raise ValueError("Bit length required to encode all team names differs from the previous run.")

            # Check that the teams have not changed
            if self.teams != self._model_settings["teams"]:
                raise ValueError("Teams differ from previous run. \n"
                                 f"New: {self.teams} \n"
                                 f"Old: {self._model_settings['teams']}")

            if self.last_season_teams != self._model_settings["last_season_teams"]:
                raise ValueError("Last season teams differ from previous run. \n"
                                 f"New: {self.last_season_teams} \n"
                                 f"Old: {self._model_settings['last_season_teams']}")

            # Check whether fixtures ids match from previous run
            for t in self.last_season_teams:
                predict_fixtures_ids = teams_fixtures_ids[t][-NPREDICT:]
                teams_fixtures_ids[t] = teams_fixtures_ids[t][:-NPREDICT]

                test_fixtures_ids = teams_fixtures_ids[t][-self._ntest:]
                teams_fixtures_ids[t] = teams_fixtures_ids[t][:-self._ntest]

                if predict_fixtures_ids != self._model_settings["predict_fixtures_ids"][t]:
                    raise ValueError(f"{t} predict fixtures ids differ from previous run. \n"
                                     f"New: {predict_fixtures_ids} \n"
                                     f"Old: {self._model_settings['predict_fixtures_ids'][t]}")
                if test_fixtures_ids != self._model_settings['test_fixtures_ids'][t]:
                    raise ValueError(f"{t} test fixtures ids differ from previous run. \n"
                                     f"New: {test_fixtures_ids} \n"
                                     f"Old: {self._model_settings['test_fixtures_ids'][t]}")
                if teams_fixtures_ids[t] != self._model_settings['train_fixtures_ids'][t]:
                    raise ValueError(f"{t} train fixtures ids differ from previous run. \n"
                                     f"New: {teams_fixtures_ids[t]} \n"
                                     f"Old: {self._model_settings['train_fixtures_ids'][t]}")

            # Checks passed, load previously saved data
            self.teams = self._model_settings["teams"]
            self.last_season_teams = self._model_settings["last_season_teams"]
            self.train_fixtures_ids = self._model_settings["train_fixtures_ids"]
            self.test_fixtures_ids = self._model_settings["test_fixtures_ids"]
            self.predict_fixtures_ids = self._model_settings["predict_fixtures_ids"]

        self._check_season_gaps_in_teams_matches(df)

        # Split original dataset into train, test, and predict datasets
        df_train, df_test, df_predict = self._mask_out_dataset(df)
        self._get_unique_teams_from_datasets(df_train, df_test, df_predict)

        self._check_changes_in_teams()
        self._count_samples(df_train, df_test, df_predict)

        return df_train, df_test, df_predict
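The heart of the split is the per-team tail slicing: the last NPREDICT fixture ids form the predict set, the next `ntest` ids from the end form the test set, and everything before that is the train set. A standalone sketch of that logic with illustrative constants and ids:

    NPREDICT = 3  # illustrative value; the real constant is defined elsewhere
    ntest = 4     # illustrative value

    fixture_ids = [101, 102, 103, 104, 105, 106, 107, 108, 109, 110]

    predict_ids = fixture_ids[-NPREDICT:]  # [108, 109, 110]
    remaining = fixture_ids[:-NPREDICT]    # ids before the predict tail
    test_ids = remaining[-ntest:]          # [104, 105, 106, 107]
    train_ids = remaining[:-ntest]         # [101, 102, 103]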
Example #4
    def predict(self,
                df: pd.DataFrame,
                predict_dataset: Dataset,
                revert_to_best_params: bool = False,
                restore_states_after_training: bool = False,
                verbose: bool = False) -> None:
        """
        Performs a single iteration of predict_on_batch for every sample in the given dataset.
        The logic of setting weights is the same as for training.

        :param df: Portion of data used for prediction.
        :param predict_dataset: Which type of dataset is used for prediction.
        :param revert_to_best_params: Whether to revert to the best weights.
        :param restore_states_after_training: Whether to restore states to the moment right after training.
        :param verbose: Whether to print matches while predicting.
        """
        print(f"Predicting dataset: {predict_dataset.value}...")

        teams = spc.get_unique_teams(df)
        predict_metrics = defaultdict(lambda: defaultdict(list))

        for t in teams:
            self.models[t].matches_data[predict_dataset]["idx"] = 0
            # Use only best params for prediction
            if revert_to_best_params:
                self.models[t].snapshot.revert_to_best_params()
                self.models[t].revert_to_best_params(include_optimizer=False)
            if restore_states_after_training:
                self.models[t].snapshot.restore_states_after_training()
                self.models[t].restore_states_after_training()

        # Loop over matches
        for i, r in df.iterrows():
            if verbose and self._verbose > 0:
                print(
                    f"{i:04d}: {r['id']:04d} {r['date']} {r['season']:02d} {r['league']} {r['home']} {r['away']}"
                )
            team1 = r["home"]
            team2 = r["away"]
            team1_preds = None
            team2_preds = None

            self.models[team1].set_network_head2_params(team2)
            self.models[team2].set_network_head2_params(team1)

            team1_xinput, team1_yinput = self.models[team1].form_input(
                predict_dataset, self.models[team2])
            if (predict_dataset == Dataset.Predict
                    and team1_xinput) or (team1_xinput and team1_yinput):
                team1_preds = self.models[team1].network.predict_on_batch(
                    team1_xinput)
                self.models[team1].store_network_head2_states(team2)

            team2_xinput, team2_yinput = self.models[team2].form_input(
                predict_dataset, self.models[team1])
            if (predict_dataset == Dataset.Predict
                    and team2_xinput) or (team2_xinput and team2_yinput):
                team2_preds = self.models[team2].network.predict_on_batch(
                    team2_xinput)
                self.models[team2].store_network_head2_states(team1)

            emsg = "There is probably some missing data in the dataset."
            if team1_preds is None:
                raise ValueError(f"Predictions for model1 are None. \n{emsg}")
            elif team2_preds is None:
                raise ValueError(f"Predictions for model2 are None. \n{emsg}")

            # Log new metrics
            predict_metrics = self._log_predict_metrics(
                predict_metrics,
                r,
                teams=(team1, team2),
                x_inputs=(team1_xinput, team2_xinput),
                y_inputs=(team1_yinput, team2_yinput),
                preds=(team1_preds, team2_preds))

            self.models[team1].matches_data[predict_dataset]["idx"] += 1
            self.models[team2].matches_data[predict_dataset]["idx"] += 1

        # Get max number of indices depending on length of datasets
        if predict_dataset == Dataset.Test:
            max_range = self.data_loader.max_ntest_len
        else:
            max_range = self.data_loader.max_npredict_len
        # Create stats file for prediction
        metrics = list(predict_metrics[teams[0]].keys())
        multiindex = pd.MultiIndex.from_product([teams, metrics],
                                                names=["team", "metric"])
        self.predictions[predict_dataset] = pd.DataFrame(
            [], index=range(max_range), columns=multiindex)

        # Save stats
        for t in teams:
            for m in metrics:
                self.predictions[predict_dataset].loc[
                    0:len(predict_metrics[t][m]),
                    (t, m)] = pd.Series(predict_metrics[t][m])
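The predictions container is a dataframe with a two-level column index keyed by `(team, metric)` pairs. A minimal standalone sketch of building and filling such a frame, with illustrative team and metric names:

    import pandas as pd

    teams = ["TeamA", "TeamB"]    # illustrative names
    metrics = ["loss", "acc"]     # illustrative metric names
    multiindex = pd.MultiIndex.from_product([teams, metrics],
                                            names=["team", "metric"])

    predictions = pd.DataFrame(index=range(5), columns=multiindex)
    # Assigning a shorter Series fills the leading rows and leaves NaNs below.
    predictions[("TeamA", "loss")] = pd.Series([0.9, 0.7, 0.6])
    # Columns are read back with the same (team, metric) tuple.
    print(predictions[("TeamA", "loss")])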
Example #5
    def train(self, df_train: pd.DataFrame, df_test: pd.DataFrame) -> None:
        """
        Loops over the train dataset for the given number of epochs. The model's performance
        is evaluated against the test dataset after each epoch.

        During each loop over matches within an epoch:
            1) Params of head2 for both current models are set - this needs to be done at the
               start to prevent using already updated weights when training the second model.
            2) If the input is fetched correctly, the model of team1 is trained on it and the
               updated states of head2 are stored into the snapshot. The same applies to the
               second model.
            3) The index into the data is incremented for both models.
            4) Advance to the next match and repeat.

        :param df_train: Portion of data used for training.
        :param df_test: Portion of data used for testing.
        """
        teams = spc.get_unique_teams(df_train)
        dataset = Dataset.Train
        # Load previously saved models if we are resuming training
        # The optimizer is necessary for training
        if self._resume:
            self._load_models(teams, include_optimizer=True)

        for epoch in range(self._previous_epochs, self._total_epochs):
            st = time.time()
            print("---")
            print(f"Epoch: {epoch+1} of {self._total_epochs}")
            print("Training model...")
            # Verbose print only for first epoch
            verbose = (epoch == self._previous_epochs)
            train_metrics = defaultdict(lambda: defaultdict(list))
            # Reset states of RNNs at the beginning of the epoch, so they can be saved at the end
            # Also reset data index position
            for t in teams:
                self.models[t].snapshot.reset_states()
                self.models[t].network.reset_states()
                self.models[t].matches_data[dataset]["idx"] = 0

            # Loop over matches
            for i, r in df_train.iterrows():
                if verbose and self._verbose > 0:
                    print(
                        f"{i:04d}: {r['id']:04d} {r['date']} {r['season']:02d} {r['league']} {r['home']} {r['away']}"
                    )
                team1 = r["home"]
                team2 = r["away"]

                # Set team2 weights for both models to avoid using the home team's freshly
                # updated weights for the away team and vice versa
                self.models[team1].set_network_head2_params(team2)
                self.models[team2].set_network_head2_params(team1)

                # Train home model
                x_input, y_input = self.models[team1].form_input(
                    dataset, self.models[team2])
                if x_input and y_input:
                    loss, acc = self.models[team1].train_on_batch(
                        x_input, y_input)
                    self.models[team1].store_network_head2_states(team2)

                    train_metrics[team1]["loss"].append(loss)
                    train_metrics[team1]["acc"].append(acc)

                # Train away model
                x_input, y_input = self.models[team2].form_input(
                    dataset, self.models[team1])
                if x_input and y_input:
                    loss, acc = self.models[team2].train_on_batch(
                        x_input, y_input)
                    self.models[team2].store_network_head2_states(team1)

                    train_metrics[team2]["loss"].append(loss)
                    train_metrics[team2]["acc"].append(acc)

                # Increment index into the data
                self.models[team1].matches_data[dataset]["idx"] += 1
                self.models[team2].matches_data[dataset]["idx"] += 1

            # Track epochs passed
            self._total_epochs_passed += 1

            # Append metrics per current epoch
            for t in teams:
                self.models[t].snapshot.save_states_after_training()
                self.models[t].save_states_after_training()

                self.train_stats.loc[epoch, (t, "loss")] = np.mean(
                    train_metrics[t]["loss"])
                self.train_stats.loc[epoch, (t, "acc")] = np.mean(
                    train_metrics[t]["acc"])

            # Test models after every epoch
            self.test(df_test, epoch, verbose)

            # Call on epoch end processing
            self._on_epoch_end(epoch)

            # Measure training time and approx remaining time to finish
            et = time.time() - st
            self._runtimes_per_epoch.append(et)
            estimate = et * (self._total_epochs - self._total_epochs_passed)
            runtime = f"{estimate/60:.2f} mins" if epoch else "<inaccurate at first epoch>"
            print(
                f"Epoch took: {et:.2f} secs. Estimated time to finish: {runtime}"
            )
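The per-epoch aggregation at the end of the loop averages each team's batch metrics and writes them into `train_stats`, which (like the predictions frame above) appears to use `(team, metric)` MultiIndex columns. A standalone sketch of that bookkeeping under those assumptions:

    import numpy as np
    import pandas as pd
    from collections import defaultdict

    # Accumulate per-batch metrics during an epoch, as `train_metrics` does above.
    train_metrics = defaultdict(lambda: defaultdict(list))
    train_metrics["TeamA"]["loss"] += [0.9, 0.8]
    train_metrics["TeamA"]["acc"] += [0.50, 0.55]

    columns = pd.MultiIndex.from_product([["TeamA"], ["loss", "acc"]],
                                         names=["team", "metric"])
    train_stats = pd.DataFrame(index=range(3), columns=columns)

    # At epoch end, store the epoch averages in the stats frame.
    epoch = 0
    train_stats.loc[epoch, ("TeamA", "loss")] = np.mean(train_metrics["TeamA"]["loss"])
    train_stats.loc[epoch, ("TeamA", "acc")] = np.mean(train_metrics["TeamA"]["acc"])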
Example #6
    def _preload(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Loads train, test, and predict team data for each model, and builds the models.

        :return: Train, test, and predict datasets.
        """
        print("Loading data...")

        # Load fixtures for all three datasets
        df_train, df_test, df_predict = self.data_loader.load_and_process_fixtures_data()

        # Fit scalers on train dataset only
        self.data_loader.fit_scalers(df_train)

        # Build models for all teams
        all_teams = spc.get_unique_teams(
            pd.concat([df_train, df_test, df_predict]))
        for t in all_teams:
            self.models[t] = SPModel(t, self.data_loader.test_teams,
                                     self.data_loader.teams_names_bitlen,
                                     f"{self._folder_prefix}")

        # Get the fixtures ids in which each team played (separately for each dataset) and store
        # them. Ids for the test and predict datasets are aligned to fit full match sequences.
        for t in self.data_loader.train_teams:
            fixtures_ids = spc.get_fixtures_ids_from_df(df_train, t)
            team_matches_data = self.data_loader.load_and_process_team_data(
                Dataset.Train, self._teams_tuples[t], fixtures_ids)
            self.models[t].prepare_matches_data(Dataset.Train,
                                                team_matches_data)

            # Compute class weights for the train dataset (drop each team's last fixture id,
            # which is not used for training, to properly offset the test dataset)
            self.models[t].compute_class_weights(team_matches_data,
                                                 fixtures_ids[:-1],
                                                 verbose=False)

        for t in self.data_loader.test_teams:
            fixtures_ids = spc.get_fixtures_ids_from_df(df_test, t)
            aligned_fixtures_ids = spc.align_fixtures_ids(
                df_train, t, fixtures_ids, self._timesteps)
            team_matches_data = self.data_loader.load_and_process_team_data(
                Dataset.Test, self._teams_tuples[t], aligned_fixtures_ids)
            self.models[t].prepare_matches_data(Dataset.Test,
                                                team_matches_data)

        for t in self.data_loader.predict_teams:
            combined_df_train = pd.concat((df_train, df_test),
                                          ignore_index=True)
            fixtures_ids = spc.get_fixtures_ids_from_df(df_predict, t)
            # Use the combined train+test dataset in case there are fewer test samples than
            # timesteps, so the rest of the sequence can be filled from the train dataset
            aligned_fixtures_ids = spc.align_fixtures_ids(
                combined_df_train, t, fixtures_ids, self._timesteps)
            team_matches_data = self.data_loader.load_and_process_team_data(
                Dataset.Predict, self._teams_tuples[t], aligned_fixtures_ids)
            self.models[t].prepare_matches_data(Dataset.Predict,
                                                team_matches_data)

        # Assemble network for each model
        print(f"Assembling {len(self.models)} models...")
        t0 = all_teams[0]
        self.models[t0].build_model()

        for t in all_teams[1:]:
            if RANDOM_WEIGHTS and not self._resume:
                self.models[t].build_model()
            else:
                self.models[t].build_model_from(self.models[t0])

        if not self._resume:
            self._set_default_snapshots()
            self._create_stats_files()

        # Reset indices of dfs
        df_train.reset_index(inplace=True, drop=True)
        df_test.reset_index(inplace=True, drop=True)
        df_predict.reset_index(inplace=True, drop=True)

        return df_train, df_test, df_predict
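`spc.align_fixtures_ids` is not shown; the comments above imply it prepends enough of a team's earlier fixture ids so that the first test/predict match still has a full input sequence of `timesteps` matches. A hypothetical sketch of such behavior (signature and body are assumptions inferred from the comments, not the actual implementation):

    import pandas as pd

    def align_fixtures_ids(df_history: pd.DataFrame, team: str,
                           fixtures_ids: list, timesteps: int) -> list:
        # Hypothetical: take the team's last (timesteps - 1) historical fixture
        # ids and prepend them, so the first target match has a full sequence.
        mask = (df_history["home"] == team) | (df_history["away"] == team)
        history_ids = df_history.loc[mask, "id"].tolist()
        prefix = history_ids[-(timesteps - 1):] if timesteps > 1 else []
        return prefix + fixtures_ids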