Example #1
    def interactions_data_frame(self) -> pd.DataFrame:
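        # Lazily build and cache the combined train/validation interactions frame.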
        if not hasattr(self, "_interactions_data_frame"):
            data = pd.concat(
                [
                    pd.read_csv(self.train_data_frame_path),
                    pd.read_csv(self.val_data_frame_path),
                ],
                ignore_index=True,
            )
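            # Optionally keep only the most recent sample_size interactions.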
            if self.sample_size > 0:
                data = data[-self.sample_size:]

            self._interactions_data_frame = preprocess_interactions_data_frame(
                data,
                self.project_config,
            )
            self._interactions_data_frame = (
                self._interactions_data_frame.sort_values(
                    self.project_config.timestamp_column_name
                ).reset_index(drop=True)
            )

        # Needed in case index_mapping was invoked before
        if not hasattr(self, "_creating_index_mapping") and not hasattr(
                self, "_interactions_data_frame_indexed"):
            transform_with_indexing(self._interactions_data_frame,
                                    self.index_mapping, self.project_config)
            self._interactions_data_frame_indexed = True
        return self._interactions_data_frame
Example #2
    def _direct_estimator_predict(self, df):
        _df = preprocess_interactions_data_frame(
            df.copy(), self.direct_estimator.project_config)
        transform_with_indexing(_df, self.direct_estimator.index_mapping,
                                self.direct_estimator.project_config)

        dataset = InteractionsDataset(
            data_frame=_df,
            embeddings_for_metadata=self.direct_estimator.embeddings_for_metadata,
            project_config=self.direct_estimator.project_config,
            index_mapping=self.direct_estimator.index_mapping,
        )
        batch_sampler = FasterBatchSampler(dataset,
                                           self.direct_estimator.batch_size,
                                           shuffle=False)
        data_loader = NoAutoCollationDataLoader(dataset,
                                                batch_sampler=batch_sampler)

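        # Dummy zero-valued criterion: Trial requires a loss, but only inference is run here.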
        trial = (
            Trial(
                self.direct_estimator.get_trained_module(),
                criterion=lambda *args: torch.zeros(
                    1, device=self.direct_estimator.torch_device, requires_grad=True
                ),
            )
            .with_generators(val_generator=data_loader)
            .to(self.direct_estimator.torch_device)
            .eval()
        )

        with torch.no_grad():
            rewards_tensor: torch.Tensor = trial.predict(
                verbose=0, data_key=torchbearer.VALIDATION_DATA)
        rewards: np.ndarray = rewards_tensor[:, 0].cpu().numpy()

        return rewards
Example #3
    def run(self):
        os.makedirs(self.output().path)
        df: pd.DataFrame = pd.read_csv(self.input().path)

        # Index test dataset
        df['Index'] = df['SessionID']
        print(df.head())

        df = preprocess_interactions_data_frame(
            df, self.model_training.project_config)
        transform_with_indexing(df, self.model_training.index_mapping,
                                self.model_training.project_config)
        df = df.sort_values("Index")

        print(df.head())
        print(df.shape)

        generator = self.get_test_generator(df)

        # Get the trained model
        model = self.model_training.get_trained_module()
        model.to(self.torch_device)
        model.eval()

        rank_list = []

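        # Map the reserved internal index 1 back to raw item id 0 (presumably the unknown/padding item).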
        reverse_index_mapping = self.model_training.reverse_index_mapping[
            'ItemID']
        reverse_index_mapping[1] = 0

        # Inference
        with torch.no_grad():
            for i, (x, _) in tqdm(enumerate(generator), total=len(generator)):

                input_params = x if isinstance(x, (list, tuple)) else [x]
                input_params = [t.to(self.torch_device) for t in input_params]

                scores_tensor: torch.Tensor = model(*input_params)
                scores_batch = scores_tensor.detach().cpu().numpy()

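                # Keep the top-10 item indices per session, highest score first,
                # and translate them back to raw item ids.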
                for score in tqdm(scores_batch, total=len(scores_batch)):
                    item_idx = np.argsort(score)[::-1][:10]
                    item_id = [
                        int(reverse_index_mapping[item]) for item in item_idx
                    ]
                    rank_list.append(item_id)

                gc.collect()
        np.savetxt(self.output().path +
                   '/submission_{}.csv'.format(self.task_name),
                   np.array(rank_list).astype(int),
                   fmt='%i',
                   delimiter=',')
Example #4
    def index_mapping(self) -> Dict[str, Dict[Any, int]]:
        if not hasattr(self, "_index_mapping"):
            print("index_mapping...")

            self._creating_index_mapping = True
            df = preprocess_interactions_data_frame(
                self.get_data_frame_for_indexing(), self.project_config)

            if os.path.exists(self.index_mapping_path):
                with open(self.index_mapping_path, "rb") as f:
                    self._index_mapping = pickle.load(f)
            else:
                self._index_mapping = {}

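            # Only index columns that are missing from any previously loaded mapping.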
            keys_in_map = list(self._index_mapping.keys())
            project_all_columns = [
                c for c in self.project_config.all_columns
                if c.name not in keys_in_map
            ]

            print("indexing project_all_columns...")
            for column in project_all_columns:
                if column.type == IOType.INDEXABLE and not column.same_index_as:
                    self._index_mapping[column.name] = create_index_mapping(
                        df[column.name].values)

            print("indexing create_index_mapping_from_arrays...")
            self._index_mapping.update({
                column.name:
                create_index_mapping_from_arrays(df[column.name].values)
                for column in project_all_columns
                if column.type == IOType.INDEXABLE_ARRAY
                and not column.same_index_as
            })

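            # Columns flagged with same_index_as reuse the mapping of the referenced column.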
            print("indexing same_index_as...")
            for column in project_all_columns:
                if column.same_index_as:
                    self._index_mapping[column.name] = self._index_mapping[
                        column.same_index_as]

            del self._creating_index_mapping
            del df

            with open(get_index_mapping_path(self.output().path), "wb") as f:
                pickle.dump(self._index_mapping, f)

        return self._index_mapping
Example #5
    def run(self):
        os.makedirs(self.output().path)

        df: pd.DataFrame = pd.read_parquet(self.input()[1].path)

        df = preprocess_interactions_data_frame(
            df, self.model_training.project_config)

        data = SessionInteractionDataFrame(
            item_column="",
            normalize_dense_features=self.normalize_dense_features,
            normalize_file_path=self.normalize_file_path)

        data.transform_data_frame(df, "TEST_GENERATOR")

        # Index test dataset
        df['Index'] = df['SessionID'].astype(int)
        df = df.sort_values("Index")

        df.to_csv(self.output().path + "/dataset.csv")

        transform_with_indexing(df, self.model_training.index_mapping,
                                self.model_training.project_config)
        df.to_csv(self.output().path + "/dataset_indexed.csv")
        generator = self.get_test_generator(df)

        print(df.head())
        print(df.shape)

        reverse_index_mapping = self.model_training.reverse_index_mapping[
            'ItemID']
        reverse_index_mapping[1] = 0

        if self.model_eval == "model":
            rank_list = self.model_rank_list(generator, reverse_index_mapping)
        elif self.model_eval == "most_popular":
            rank_list = self.most_popular_rank_list(generator,
                                                    reverse_index_mapping)
        elif self.model_eval == "coocorrence":
            rank_list = self.coocorrence_rank_list(generator,
                                                   reverse_index_mapping)
        else:
            # Guard against rank_list being unbound further down.
            raise ValueError("Unknown model_eval: {}".format(self.model_eval))

        df_moda = self.pos_process(rank_list)
        rank_list = df_moda['reclist_2'].values
        rank_list = np.array([np.array(r).astype(int) for r in rank_list])

        df_moda.to_csv(self.output().path + '/df_submission.csv', index=False)
        np.savetxt(self.output().path +
                   '/submission_{}.csv'.format(self.task_name),
                   rank_list,
                   fmt='%i',
                   delimiter=',')
Example #6
    def test_data_frame(self) -> pd.DataFrame:
        if not hasattr(self, "_test_data_frame"):
            print("test_data_frame:")
            self._test_data_frame = preprocess_interactions_data_frame(
                pd.read_csv(self.test_data_frame_path,
                            usecols=self.dataset_read_columns),
                self.project_config)

            transform_with_indexing(self._test_data_frame, self.index_mapping,
                                    self.project_config)

        return self._test_data_frame
Example #7
    def val_data_frame(self) -> pd.DataFrame:
        if not hasattr(self, "_val_data_frame"):
            print("val_data_frame:")
            self._val_data_frame = preprocess_interactions_data_frame(
                pd.read_csv(
                    self.val_data_frame_path,
                    usecols=[c.name for c in self.project_config.all_columns]),
                self.project_config)

            transform_with_indexing(self._val_data_frame, self.index_mapping,
                                    self.project_config)

        return self._val_data_frame
Example #8
    def test_data_frame(self) -> pd.DataFrame:
        if not hasattr(self, "_test_data_frame"):
            self._test_data_frame = preprocess_interactions_data_frame(
                pd.read_csv(self.test_data_frame_path), self.project_config
            )
        # Needed in case index_mapping was invoked before
        if not hasattr(self, "_creating_index_mapping") and not hasattr(
            self, "_test_data_frame_indexed"
        ):
            transform_with_indexing(
                self._test_data_frame, self.index_mapping, self.project_config
            )
            self._test_data_frame_indexed = True
        return self._test_data_frame
Example #9
    def fill_ps(self, df: pd.DataFrame, pool: Pool):
        policy_estimator_df = preprocess_interactions_data_frame(df.copy(), self.policy_estimator.project_config)
        transform_with_indexing(
            policy_estimator_df,
            self.policy_estimator.index_mapping,
            self.policy_estimator.project_config,
        )

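        # Re-map the available arms to the policy estimator's internal item indices.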
        if self.available_arms_column:
            policy_estimator_df[self.available_arms_column] = policy_estimator_df[
                self.available_arms_column
            ].map(
                functools.partial(
                    map_array,
                    mapping=self.policy_estimator.index_mapping[
                        self.policy_estimator.project_config.item_column.name
                    ],
                )
            )

        dataset = InteractionsDataset(
            data_frame=policy_estimator_df,
            embeddings_for_metadata=self.policy_estimator.embeddings_for_metadata,
            project_config=self.policy_estimator.project_config,
            index_mapping=self.policy_estimator.index_mapping
        )
        batch_sampler = FasterBatchSampler(
            dataset, self.policy_estimator.batch_size, shuffle=False
        )
        data_loader = NoAutoCollationDataLoader(dataset, batch_sampler=batch_sampler)
        trial = (
            Trial(
                self.policy_estimator.get_trained_module(),
                criterion=lambda *args: torch.zeros(
                    1, device=self.policy_estimator.torch_device, requires_grad=True
                ),
            )
            .with_generators(val_generator=data_loader)
            .to(self.policy_estimator.torch_device)
            .eval()
        )

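        # The policy network outputs log-probabilities; exponentiate to recover propensities.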
        with torch.no_grad():
            log_probas: torch.Tensor = trial.predict(
                verbose=0, data_key=torchbearer.VALIDATION_DATA
            )
        probas: np.ndarray = torch.exp(log_probas).cpu().numpy()

        item_indices = policy_estimator_df[self.item_column]

        params = (
            zip(item_indices, probas, policy_estimator_df[self.available_arms_column])
            if self.available_arms_column
            else zip(item_indices, probas)
        )

        df[self.propensity_score_column] = list(
            tqdm(pool.starmap(_get_ps_from_probas, params), total=len(df))
        )