def interactions_data_frame(self) -> pd.DataFrame:
    if not hasattr(self, "_interactions_data_frame"):
        data = pd.concat(
            [
                pd.read_csv(self.train_data_frame_path),
                pd.read_csv(self.val_data_frame_path),
            ],
            ignore_index=True,
        )
        if self.sample_size > 0:
            # Keep only the most recent rows
            data = data[-self.sample_size:]

        self._interactions_data_frame = preprocess_interactions_data_frame(
            data, self.project_config,
        )
        # sort_values returns a new frame, so the result must be assigned back
        self._interactions_data_frame = (
            self._interactions_data_frame
            .sort_values(self.project_config.timestamp_column_name)
            .reset_index(drop=True)
        )

    # Needed in case index_mapping was invoked before
    if not hasattr(self, "_creating_index_mapping") and not hasattr(
        self, "_interactions_data_frame_indexed"
    ):
        transform_with_indexing(
            self._interactions_data_frame, self.index_mapping, self.project_config
        )
        self._interactions_data_frame_indexed = True

    return self._interactions_data_frame

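# The property above combines two hasattr sentinels: "_interactions_data_frame"
# memoizes the expensive build, while "_interactions_data_frame_indexed" makes
# sure transform_with_indexing mutates the frame exactly once, even when
# index_mapping already forced the frame to be built. A minimal, self-contained
# sketch of that pattern (names are illustrative, not part of this project):
class _LazyIndexedFrameSketch:
    def frame(self):
        if not hasattr(self, "_frame"):
            self._frame = self._build()      # expensive: runs once
        if not hasattr(self, "_frame_indexed"):
            self._index(self._frame)         # in-place mutation: runs once
            self._frame_indexed = True
        return self._frame

    def _build(self):
        return {"rows": [1, 2, 3]}

    def _index(self, frame):
        frame["indexed"] = True
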
def _direct_estimator_predict(self, df):
    _df = preprocess_interactions_data_frame(
        df.copy(), self.direct_estimator.project_config
    )
    transform_with_indexing(
        _df,
        self.direct_estimator.index_mapping,
        self.direct_estimator.project_config,
    )

    dataset = InteractionsDataset(
        data_frame=_df,
        embeddings_for_metadata=self.direct_estimator.embeddings_for_metadata,
        project_config=self.direct_estimator.project_config,
        index_mapping=self.direct_estimator.index_mapping,
    )
    batch_sampler = FasterBatchSampler(
        dataset, self.direct_estimator.batch_size, shuffle=False
    )
    data_loader = NoAutoCollationDataLoader(dataset, batch_sampler=batch_sampler)

    # The dummy zero criterion satisfies Trial's constructor; nothing is
    # trained here, the Trial is only used for batched inference in eval mode.
    trial = (
        Trial(
            self.direct_estimator.get_trained_module(),
            criterion=lambda *args: torch.zeros(
                1, device=self.direct_estimator.torch_device, requires_grad=True
            ),
        )
        .with_generators(val_generator=data_loader)
        .to(self.direct_estimator.torch_device)
        .eval()
    )

    with torch.no_grad():
        rewards_tensor: torch.Tensor = trial.predict(
            verbose=0, data_key=torchbearer.VALIDATION_DATA
        )
        rewards: np.ndarray = rewards_tensor[:, 0].cpu().numpy()

    return rewards

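# For reference, the Trial/dummy-criterion construction above is only used to
# get batched predictions out of torchbearer. Under the assumption that the
# loader yields (inputs, target) pairs, as the generators in this file do, an
# equivalent plain-PyTorch loop would look like this sketch:
def _plain_predict_sketch(module, data_loader, device):
    module.to(device).eval()
    outputs = []
    with torch.no_grad():
        for x, _ in data_loader:
            inputs = x if isinstance(x, (list, tuple)) else [x]
            inputs = [t.to(device) for t in inputs]
            # Keep only the first output column, matching rewards_tensor[:, 0]
            outputs.append(module(*inputs)[:, 0].cpu())
    return torch.cat(outputs).numpy()
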
def run(self):
    os.makedirs(self.output().path)

    df: pd.DataFrame = pd.read_csv(self.input().path)

    # Index test dataset
    df['Index'] = df['SessionID']
    print(df.head())

    df = preprocess_interactions_data_frame(
        df, self.model_training.project_config
    )
    transform_with_indexing(
        df, self.model_training.index_mapping, self.model_training.project_config
    )

    df = df.sort_values("Index")
    print(df.head())
    print(df.shape)

    generator = self.get_test_generator(df)

    # Load trained model
    model = self.model_training.get_trained_module()
    model.to(self.torch_device)
    model.eval()

    rank_list = []

    reverse_index_mapping = self.model_training.reverse_index_mapping['ItemID']
    reverse_index_mapping[1] = 0

    # Inference
    with torch.no_grad():
        for x, _ in tqdm(generator, total=len(generator)):
            input_params = x if isinstance(x, (list, tuple)) else [x]
            input_params = [t.to(self.torch_device) for t in input_params]

            scores_tensor: torch.Tensor = model(*input_params)
            scores_batch = scores_tensor.detach().cpu().numpy()

            for score in tqdm(scores_batch, total=len(scores_batch)):
                # Top-10 item indices by descending score, mapped back to raw IDs
                item_idx = np.argsort(score)[::-1][:10]
                item_id = [
                    int(reverse_index_mapping[item]) for item in item_idx
                ]
                rank_list.append(item_id)

            gc.collect()

    np.savetxt(
        self.output().path + '/submission_{}.csv'.format(self.task_name),
        np.array(rank_list).astype(int),
        fmt='%i',
        delimiter=',',
    )

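# The ranking step above relies on np.argsort(score)[::-1][:10] to pick the
# ten highest-scoring item indices. A toy illustration of the idiom:
def _top_k_indices_example():
    import numpy as np  # already a module dependency; repeated so the sketch stands alone
    score = np.array([0.1, 0.9, 0.4, 0.7])
    return np.argsort(score)[::-1][:2]  # array([1, 3]): indices of the top-2
# For large catalogs, np.argpartition(score, -k)[-k:] would find the same
# top-k set in O(n), at the cost of not returning it sorted by score.
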
def index_mapping(self) -> Dict[str, Dict[Any, int]]:
    if not hasattr(self, "_index_mapping"):
        print("index_mapping...")
        self._creating_index_mapping = True

        df = preprocess_interactions_data_frame(
            self.get_data_frame_for_indexing(), self.project_config
        )

        # Resume from a previously saved mapping when one exists
        if os.path.exists(self.index_mapping_path):
            with open(self.index_mapping_path, "rb") as f:
                self._index_mapping = pickle.load(f)
        else:
            self._index_mapping = {}

        # Only index columns that are not already in the loaded mapping
        keys_in_map = list(self._index_mapping.keys())
        project_all_columns = [
            c for c in self.project_config.all_columns if c.name not in keys_in_map
        ]

        print("indexing project_all_columns...")
        for column in project_all_columns:
            if column.type == IOType.INDEXABLE and not column.same_index_as:
                self._index_mapping[column.name] = create_index_mapping(
                    df[column.name].values
                )

        print("indexing create_index_mapping_from_arrays...")
        self._index_mapping.update(
            {
                column.name: create_index_mapping_from_arrays(df[column.name].values)
                for column in project_all_columns
                if column.type == IOType.INDEXABLE_ARRAY
                and not column.same_index_as
            }
        )

        print("indexing same_index_as...")
        for column in project_all_columns:
            if column.same_index_as:
                self._index_mapping[column.name] = self._index_mapping[
                    column.same_index_as
                ]

        del self._creating_index_mapping
        del df

        with open(get_index_mapping_path(self.output().path), "wb") as f:
            pickle.dump(self._index_mapping, f)

    return self._index_mapping

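# create_index_mapping / create_index_mapping_from_arrays are imported from
# elsewhere; this file only shows that they turn a column of raw values into a
# dict of contiguous integer indices that pickles cleanly. A hypothetical
# minimal version (the reserved low indices are an assumption, suggested by
# reverse_index_mapping[1] = 0 in the tasks above, not confirmed here):
def _create_index_mapping_sketch(values, offset=2):
    # dict.fromkeys keeps first-seen order while dropping duplicates;
    # offset leaves room for reserved indices (e.g. padding/unknown)
    return {v: i + offset for i, v in enumerate(dict.fromkeys(values))}
# _create_index_mapping_sketch(["a", "b", "a"]) -> {"a": 2, "b": 3}
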
def run(self):
    os.makedirs(self.output().path)

    df: pd.DataFrame = pd.read_parquet(self.input()[1].path)
    df = preprocess_interactions_data_frame(
        df, self.model_training.project_config
    )

    data = SessionInteractionDataFrame(
        item_column="",
        normalize_dense_features=self.normalize_dense_features,
        normalize_file_path=self.normalize_file_path,
    )
    data.transform_data_frame(df, "TEST_GENERATOR")

    # Index test dataset
    df['Index'] = df['SessionID'].astype(int)
    df = df.sort_values("Index")
    df.to_csv(self.output().path + "/dataset.csv")

    transform_with_indexing(
        df, self.model_training.index_mapping, self.model_training.project_config
    )

    generator = self.get_test_generator(df)
    print(df.head())
    print(df.shape)

    reverse_index_mapping = self.model_training.reverse_index_mapping['ItemID']
    reverse_index_mapping[1] = 0

    # Build the rank list with the selected evaluation strategy
    if self.model_eval == "model":
        rank_list = self.model_rank_list(generator, reverse_index_mapping)
    elif self.model_eval == "most_popular":
        rank_list = self.most_popular_rank_list(generator, reverse_index_mapping)
    elif self.model_eval == "coocorrence":
        rank_list = self.coocorrence_rank_list(generator, reverse_index_mapping)

    df_moda = self.pos_process(rank_list)

    rank_list = df_moda['reclist_2'].values
    rank_list = np.array([np.array(r).astype(int) for r in rank_list])

    np.savetxt(
        self.output().path + '/submission_{}.csv'.format(self.task_name),
        rank_list,
        fmt='%i',
        delimiter=',',
    )

def test_data_frame(self) -> pd.DataFrame:
    if not hasattr(self, "_test_data_frame"):
        print("test_data_frame:")
        self._test_data_frame = preprocess_interactions_data_frame(
            pd.read_csv(
                self.test_data_frame_path, usecols=self.dataset_read_columns
            ),
            self.project_config,
        )
        transform_with_indexing(
            self._test_data_frame, self.index_mapping, self.project_config
        )
    return self._test_data_frame

def val_data_frame(self) -> pd.DataFrame:
    if not hasattr(self, "_val_data_frame"):
        print("val_data_frame:")
        self._val_data_frame = preprocess_interactions_data_frame(
            pd.read_csv(
                self.val_data_frame_path,
                usecols=[c.name for c in self.project_config.all_columns],
            ),
            self.project_config,
        )
        transform_with_indexing(
            self._val_data_frame, self.index_mapping, self.project_config
        )
    return self._val_data_frame

def test_data_frame(self) -> pd.DataFrame:
    if not hasattr(self, "_test_data_frame"):
        self._test_data_frame = preprocess_interactions_data_frame(
            pd.read_csv(self.test_data_frame_path), self.project_config
        )

    # Needed in case index_mapping was invoked before
    if not hasattr(self, "_creating_index_mapping") and not hasattr(
        self, "_test_data_frame_indexed"
    ):
        transform_with_indexing(
            self._test_data_frame, self.index_mapping, self.project_config
        )
        self._test_data_frame_indexed = True

    return self._test_data_frame

def fill_ps(self, df: pd.DataFrame, pool: Pool):
    policy_estimator_df = preprocess_interactions_data_frame(
        df.copy(), self.policy_estimator.project_config
    )
    transform_with_indexing(
        policy_estimator_df,
        self.policy_estimator.index_mapping,
        self.policy_estimator.project_config,
    )

    # Re-index the available arms with the same item mapping as the frame
    if self.available_arms_column:
        policy_estimator_df[self.available_arms_column] = policy_estimator_df[
            self.available_arms_column
        ].map(
            functools.partial(
                map_array,
                mapping=self.policy_estimator.index_mapping[
                    self.policy_estimator.project_config.item_column.name
                ],
            )
        )

    dataset = InteractionsDataset(
        data_frame=policy_estimator_df,
        embeddings_for_metadata=self.policy_estimator.embeddings_for_metadata,
        project_config=self.policy_estimator.project_config,
        index_mapping=self.policy_estimator.index_mapping,
    )
    batch_sampler = FasterBatchSampler(
        dataset, self.policy_estimator.batch_size, shuffle=False
    )
    data_loader = NoAutoCollationDataLoader(dataset, batch_sampler=batch_sampler)

    # The dummy zero criterion satisfies Trial's constructor; the Trial is
    # only used for batched inference in eval mode.
    trial = (
        Trial(
            self.policy_estimator.get_trained_module(),
            criterion=lambda *args: torch.zeros(
                1, device=self.policy_estimator.torch_device, requires_grad=True
            ),
        )
        .with_generators(val_generator=data_loader)
        .to(self.policy_estimator.torch_device)
        .eval()
    )

    with torch.no_grad():
        log_probas: torch.Tensor = trial.predict(
            verbose=0, data_key=torchbearer.VALIDATION_DATA
        )
        probas: np.ndarray = torch.exp(log_probas).cpu().numpy()

    item_indices = policy_estimator_df[self.item_column]
    params = (
        zip(item_indices, probas, policy_estimator_df[self.available_arms_column])
        if self.available_arms_column
        else zip(item_indices, probas)
    )
    df[self.propensity_score_column] = list(
        tqdm(pool.starmap(_get_ps_from_probas, params), total=len(df))
    )

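# _get_ps_from_probas is defined elsewhere in the module; given how params is
# built above (item index, per-item probability vector, optionally the list of
# available arms), a plausible reading is that it returns the probability of
# the logged item, renormalized over the available arms when those are given.
# Hypothetical sketch, not the project's actual implementation:
def _get_ps_from_probas_sketch(item_idx, probas, available_arms=None):
    if available_arms is not None:
        denom = sum(probas[arm] for arm in available_arms)
        return float(probas[item_idx]) / denom if denom > 0 else 0.0
    return float(probas[item_idx])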