def inner(*args, **kwargs):
    nonlocal self, path_func
    read_from_cache = kwargs.pop('read_from_cache', False)
    save_to_cache = kwargs.pop('save_to_cache', True)
    cache_valid_days = kwargs.pop('cache_valid_days', None)
    if not read_from_cache and not save_to_cache:
        # short circuit everything if cache not requested
        return func(*args, **kwargs)
    path_func = path_func or self.cache_filepath
    cache_path = path_func(*args, **kwargs)
    cache_valid = self.is_cache_valid(cache_path, valid_days=cache_valid_days)
    read_cache_attempt = read_from_cache and cache_valid
    # using pickle here because pickling stores the dataframe more reliably
    # (data types and other information may be changed or lost during write/read of CSV)
    if read_cache_attempt:
        # df = pd.read_csv(cache_path, keep_default_na=False, na_values=NA_VALUES)
        df = pd.read_pickle(cache_path)
        logger.info(f'Read cache file from {cache_path}')
    else:
        if read_from_cache:
            logger.warning(f'Cache file not found/valid, attempting to create ({cache_path})')
        df = func(*args, **kwargs)
    if save_to_cache and cache_path and not read_cache_attempt:
        # df.to_csv(cache_path, index=None)
        df.to_pickle(cache_path)
    return df

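# A minimal, self-contained sketch of the decorator pattern that produces a
# wrapper like `inner` above. The enclosing factory is not shown in this
# excerpt, so `cached_df` and `make_df` below are illustrative names, not the
# library's actual API; the cache-invalidation logic is also simplified.
import functools
import pandas as pd

def cached_df(path_func):
    """Hypothetical decorator factory: pickle-caches a DataFrame-returning function."""
    def decorator(func):
        @functools.wraps(func)
        def inner(*args, **kwargs):
            read = kwargs.pop('read_from_cache', False)
            save = kwargs.pop('save_to_cache', True)
            path = path_func(*args, **kwargs)
            if read:
                try:
                    return pd.read_pickle(path)  # pickle preserves dtypes, unlike CSV
                except (FileNotFoundError, EOFError):
                    pass  # cache missing or corrupt: fall through and recompute
            df = func(*args, **kwargs)
            if save:
                df.to_pickle(path)
            return df
        return inner
    return decorator

@cached_df(path_func=lambda n: f'/tmp/demo_{n}.pkl')
def make_df(n):
    return pd.DataFrame({'x': range(n)})

df = make_df(5, read_from_cache=True)  # first call computes and caches, later calls read back
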
def remote_to_local(self, remote_path, local_path, overwrite=True):
    if not os.path.exists(local_path) or overwrite:
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        logger.info('S3: copying from %s to %s' % (remote_path, local_path))
        with open(local_path, 'wb') as local:
            local.write(self.read(remote_path))

def test_c_als_recommender(self):
    from ml_recsys_tools.recommenders.implib_recommenders import ALSRecommender
    als_rec = ALSRecommender()
    als_rec.fit(self.state.train_obs)
    als_rep = als_rec.eval_on_test_by_ranking(self.state.test_obs, prefix='als ')
    logger.info(als_rep)
    self._test_recommender(als_rec)

def test_d_comb_rank_ens(self):
    from ml_recsys_tools.recommenders.combination_ensembles import CombinedRankEnsemble
    comb_ranks_rec = CombinedRankEnsemble(
        recommenders=[self.state.lfm_rec, self.state.item_cooc_rec])
    comb_rank_rep = comb_ranks_rec.eval_on_test_by_ranking(
        self.state.test_obs, prefix='combined ranks ')
    logger.info(comb_rank_rep)
    self._test_recommender(comb_ranks_rec)

def test_d_comb_simil_ens(self):
    from ml_recsys_tools.recommenders.combination_ensembles import CombinedSimilRecoEns
    comb_simil_rec = CombinedSimilRecoEns(
        recommenders=[self.state.lfm_rec, self.state.item_cooc_rec])
    comb_simil_rec.fit(self.state.train_obs)
    comb_simil_rep = comb_simil_rec.eval_on_test_by_ranking(
        self.state.test_obs, prefix='combined simils ')
    logger.info(comb_simil_rep)
    self._test_recommender(comb_simil_rec)

def _block_until_first_load_loop(self):
    wait_sec = 0
    while self.keep_reloading and (self.model is None):
        if wait_sec == 0 or (wait_sec % 10) == 0:
            logger.info(
                'Blocking until first model is loaded (%d seconds already).' % wait_sec)
        time.sleep(1)
        wait_sec += 1

def _filter_array(array, encoder, message_prefix='', message_suffix=''):
    array = np.array(array).astype(str)
    new_labels_mask = encoder.find_new_labels(array)
    n_discard = np.sum(new_labels_mask)
    if n_discard > 0:
        logger.info(
            '%s Discarding %d (out of %d) %s' %
            (message_prefix, int(n_discard), len(array), message_suffix))
    return array[~new_labels_mask]

def test_c_cooc_recommender(self):
    from ml_recsys_tools.recommenders.similarity_recommenders import ItemCoocRecommender
    item_cooc_rec = ItemCoocRecommender()
    item_cooc_rec.fit(self.state.train_obs)
    item_cooc_rep = item_cooc_rec.eval_on_test_by_ranking(
        self.state.test_obs, prefix='item cooccurrence ')
    logger.info(item_cooc_rep)
    self._test_recommender(item_cooc_rec)
    self.state.item_cooc_rec = item_cooc_rec

def _download_through_disk(self, remote_path, local_fileobj):
    with tempfile.NamedTemporaryFile(delete=True) as temp:
        self._stream_obj_to_file(remote_path=remote_path, fileobj=temp)
        try:
            with gzip.open(temp) as gzipfile:
                with io.BufferedReader(gzipfile) as gzipbuffered:
                    return self._stream_to_file(gzipbuffered, local_fileobj)
        except Exception:
            logger.info('_download_through_disk: failed gzip read, assuming regular binary')
            temp.seek(0)
            return self._stream_to_file(temp, local_fileobj)

def test_c_spotlight_implicit_recommender(self):
    from ml_recsys_tools.recommenders.spotlight_recommenders import EmbeddingFactorsRecommender
    rec = EmbeddingFactorsRecommender()
    # trying to balance flakiness and speed
    rec.set_params(embedding_dim=32, batch_size=1 << 10,
                   num_negative_samples=10, n_iter=5)
    rec.fit(self.state.train_obs)
    report = rec.eval_on_test_by_ranking(self.state.test_obs, prefix='spot ')
    logger.info(report)
    self._test_recommender(rec)

def test_c_features_simil_recommender(self):
    from ml_recsys_tools.recommenders.similarity_recommenders import FeaturesSimilRecommender
    cos_rec = FeaturesSimilRecommender()
    cos_rec.fit(self.state.train_obs)
    cos_rep = cos_rec.eval_on_test_by_ranking(self.state.test_obs, prefix='cosine ')
    logger.info(cos_rep)
    # not using _test_recommender because this recommender
    # will fail on fake_data_test and exclusion_test
    self._test_get_recommendations(cos_rec)
    self._test_get_similar_items(cos_rec)
    self._test_predict_for_user(cos_rec)

def _test_predictions_on_fake_data(self, rec):
    # check that missing "interactions" are recommended
    for user in self.TESTING_USER_IDS:
        recos = rec.get_recommendations(user_ids=[user], n_rec=10).iloc[0][rec._item_col]
        logger.info(f'{user} {recos}')
        self.assertTrue(user.replace('user', 'item') in recos)
    # check that test items are similar to each other
    for item in self.TESTING_ITEM_IDS:
        simils = rec.get_similar_items(item_ids=[item], n_simil=10).iloc[0][rec._item_col]
        logger.info(f'{item} {simils}')
        self.assertTrue(len(set(simils).intersection(set(self.TESTING_ITEM_IDS))) >= 3)

def early_stopping_runner(
        score_func, check_point_func,
        epochs_start=0, epochs_max=200, epochs_step=10,
        stop_patience=10, decline_threshold=0.05, plot_graph=True):
    res_list = []
    max_score = 0
    decline_counter = 0
    cur_epoch = 0
    epochs_list = []
    max_epoch = 0
    # find optimal number of epochs on validation data
    while cur_epoch <= epochs_max:
        cur_step = epochs_start + epochs_step if cur_epoch == 0 else epochs_step
        simple_logger.info('Training epochs %d - %d.' % (cur_epoch, cur_epoch + cur_step))
        cur_epoch += cur_step
        epochs_list.append(cur_epoch)
        cur_score = score_func(cur_epoch, cur_step)
        res_list.append(cur_score)
        # early stopping logic
        if max_score * (1 - decline_threshold) > cur_score:
            decline_counter += cur_step
            if decline_counter >= stop_patience:
                break
        else:
            decline_counter = 0
        if cur_score > max_score:
            max_score = cur_score
            max_epoch = cur_epoch
            check_point_func()
    # print logging info
    scores_str = ','.join(
        ['%d%%(%d)' % (int(100 * s / max_score), e)
         for s, e in zip(res_list, epochs_list)])
    simple_logger.info(
        'Early stopping: stopped fit after %d epochs '
        '(max validation score: %f (@%d), all scores: %s)' %
        (cur_epoch, max_score, max_epoch, scores_str))
    if plot_graph:
        pyplot.figure()
        pyplot.plot(epochs_list, res_list)
    return max_epoch

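# A toy usage sketch of early_stopping_runner above. The score curve and
# checkpoint are stand-ins (real use passes a model-fitting score_func, as in
# fit_with_early_stop further down): the toy curve peaks at epoch 50, so the
# runner should stop shortly after and return an epoch near that peak.
toy_scores = {e: 1 - abs(e - 50) / 100 for e in range(0, 301, 10)}

def toy_score_func(cur_epoch, step):
    # in real use: fit `step` more epochs, then evaluate on validation data
    return toy_scores.get(cur_epoch, 0)

def toy_checkpoint():
    pass  # in real use: deepcopy the current best model

best_epoch = early_stopping_runner(
    score_func=toy_score_func, check_point_func=toy_checkpoint,
    epochs_max=300, epochs_step=10, stop_patience=30, plot_graph=False)
# best_epoch == 50 for this toy curve
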
def remove_unseen_labels(self, df):
    # new_u = ~df[self.uid_source_col].isin(self.uid_encoder.classes_)
    new_u = self.uid_encoder.find_new_labels(df[self.uid_source_col])
    # new_i = ~df[self.iid_source_col].isin(self.iid_encoder.classes_)
    new_i = self.iid_encoder.find_new_labels(df[self.iid_source_col])
    percent_new_u = np.mean(new_u)
    percent_new_i = np.mean(new_i)
    if percent_new_u > 0.0 or percent_new_i > 0.0:
        logger.info(
            'Discarding %.1f%% samples with unseen '
            'users(%d) / unseen items(%d) from DF(len: %s).' %
            (100 * np.mean(new_u | new_i), np.sum(new_u), np.sum(new_i), len(df)))
        return df[~new_u & ~new_i].copy()
    else:
        return df

def _filter_relevant_obs_and_items(self, stage=''):
    items_ids = self.df_items[self.item_id_col].unique().astype(str)
    obs_ids = self.df_obs[self.iid_col].unique().astype(str)
    obs_filt = self.df_obs[self.iid_col].astype(str).isin(items_ids)
    item_filt = self.df_items[self.item_id_col].astype(str).isin(obs_ids)
    self.df_obs = self.df_obs[obs_filt].copy()
    self.df_items = self.df_items[item_filt].copy()
    n_dropped_obs = (~obs_filt).sum()
    n_dropped_items = (~item_filt).sum()
    if n_dropped_obs + n_dropped_items:
        logger.info('ObsWithFeatures:_filter_relevant_obs_and_items:%s '
                    'dropped %d observations, %d items' %
                    (stage, n_dropped_obs, n_dropped_items))

def _test_predict_for_user(self, rec):
    user = rec.all_users[0]
    items = rec.all_items[:50]
    ts = time.time()
    preds_1 = rec.predict_for_user(user_id=user, item_ids=items)
    elapsed = time.time() - ts
    scores = preds_1[rec._prediction_col].tolist()
    # test format: columns
    self.assertListEqual(preds_1.columns.tolist(),
                         [rec._user_col, rec._item_col, rec._prediction_col])
    # test format: length
    self.assertEqual(len(preds_1), len(items))
    # test sorted descending
    self.assertTrue(scores[::-1] == sorted(scores))
    # test combine with original order makes first item in original order higher in results
    preds_2 = rec.predict_for_user(user_id=user, item_ids=items, combine_original_order=True)
    ind_item = lambda item, preds: np.argmax(preds[rec._item_col].values == item)
    ind_diffs = np.array([ind_item(item, preds_1) - ind_item(item, preds_2) for item in items])
    self.assertEqual(ind_diffs.sum(), 0)
    self.assertGreater(ind_diffs[:(len(ind_diffs) // 2)].sum(), 0)  # first items rank higher
    # test training items' predictions are last
    train_item = rec.item_ids([rec.train_mat[rec.user_inds([user])[0], :].indices[0]])
    preds_3 = rec.predict_for_user(user_id=user, item_ids=np.concatenate([items, train_item]))
    train_preds = preds_3[preds_3[rec._item_col] == train_item[0]][rec._prediction_col]
    self.assertTrue(all(train_preds == preds_3[rec._prediction_col].min()))
    # test unknown items are last
    new_item = 'new_item'
    preds_4 = rec.predict_for_user(user_id=user, item_ids=np.concatenate([items, [new_item]]))
    new_preds = preds_4[preds_4[rec._item_col] == new_item][rec._prediction_col]
    self.assertTrue(all(new_preds == preds_4[rec._prediction_col].min()))
    # test for unknown user all predictions are the same
    preds_5 = rec.predict_for_user(user_id='new_user', item_ids=items)
    self.assertEqual(preds_5[rec._prediction_col].min(), preds_5[rec._prediction_col].max())
    # test latency: prediction shouldn't take more than 0.06 seconds
    logger.info(f'predict_for_user for {rec} took {elapsed:.3f} seconds.')
    self.assertGreater(0.06 * (1 + 2 * int(DEBUG_ON)), elapsed)  # allow more time if debugging

def _rank_items_for_user(cls, model: BaseDFSparseRecommender, user_id, item_ids, mode,
                         rank_training_last=True, min_score=None):
    ts = time.time()
    n_unknowns = 0
    if mode == cls.mode_disabled:
        scores = [None] * len(item_ids)
    else:
        pred_df = model.predict_for_user(
            user_id=user_id,
            item_ids=item_ids,
            rank_training_last=rank_training_last,
            sort=True,
            combine_original_order=cls._combine_original_order(mode),
        )
        item_ids = pred_df[model._item_col].tolist()
        scores = pred_df[model._prediction_col].values
        if min_score is not None:
            unknowns_mask = scores < min_score
            n_unknowns = unknowns_mask.sum()
            scores[unknowns_mask] = min_score  # scores is a numpy array here
        scores = scores.tolist()
    result = {
        'user_id': user_id,
        'ranked_items': item_ids,
        'scores': scores,
    }
    logger.info(
        'Ran ranking for user %s (%d items, %d unknown) in %.3f seconds for mode %s.' %
        (str(user_id), len(scores), n_unknowns, time.time() - ts, str(mode)))
    return result

def _model_reloading_loop(self):
    time.sleep(self._time_jitter())
    while self.keep_reloading:
        try:
            new_model_s3_path = self._latest_s3_model_path()
            if new_model_s3_path == self._current_model_path:
                logger.info('Model path unchanged, not reloading. %s' % new_model_s3_path)
            else:
                updated_model = S3FileIO(self._s3_bucket).unpickle(new_model_s3_path)
                self._test_loaded_model(updated_model)
                self.model = updated_model
                self._current_model_path = new_model_s3_path
                logger.info('Loaded updated model from S3. %s' % new_model_s3_path)
        except Exception as e:
            logger.error('Failed model update. %s' % str(e))
            logger.exception(e)
            if self.model is None:
                raise EnvironmentError('Could not load model on startup.')
        time.sleep(self._update_interval_seconds + self._time_jitter())

def fit_with_early_stop(self, train_obs, valid_ratio=0.04, refit_on_all=False, metric='AUC',
                        epochs_start=0, epochs_max=200, epochs_step=10, stop_patience=10,
                        plot_convergence=True, decline_threshold=0.05, k=10,
                        valid_split_time_col=None):
    # split validation data
    train_obs_internal, valid_obs = train_obs.split_train_test(
        ratio=valid_ratio ** 0.5 if valid_split_time_col is None else valid_ratio,
        users_ratio=valid_ratio ** 0.5 if valid_split_time_col is None else 1,
        time_split_column=valid_split_time_col,
        random_state=RANDOM_STATE)
    self.model = None
    self.model_checkpoint = None
    all_metrics = pd.DataFrame()

    def update_full_metrics_df(cur_epoch, report_df):
        nonlocal all_metrics
        # accumulate per-epoch reports (pd.concat; DataFrame.append is removed in pandas 2.0)
        all_metrics = pd.concat(
            [all_metrics, report_df.rename(index={'test': cur_epoch})], sort=False)

    def check_point_func():
        if not refit_on_all:
            self.model_checkpoint = deepcopy(self.model)

    def score_func(cur_epoch, step):
        self.fit_partial(train_obs_internal, epochs=step)
        lfm_report = self.eval_on_test_by_ranking(
            valid_obs.df_obs, include_train=False, prefix='', k=k)
        cur_score = float(lfm_report.loc['test', metric])
        update_full_metrics_df(cur_epoch, lfm_report)
        return cur_score

    best_epoch = early_stopping_runner(
        score_func=score_func,
        check_point_func=check_point_func,
        epochs_start=epochs_start,
        epochs_max=epochs_max,
        epochs_step=epochs_step,
        stop_patience=stop_patience,
        decline_threshold=decline_threshold,
        plot_graph=plot_convergence,
    )
    simple_logger.info('Early stop, all_metrics:\n' + str(all_metrics))
    if plot_convergence:
        all_metrics = all_metrics.divide(all_metrics.max())
        all_metrics.plot()
    self.early_stop_metrics_df = all_metrics
    self._set_epochs(epochs=best_epoch)
    if refit_on_all:
        simple_logger.info('Refitting on whole train data for %d epochs' % best_epoch)
        self.fit(train_obs)
    else:
        simple_logger.info('Loading best model from checkpoint at %d epochs' % best_epoch)
        self.model, self.model_checkpoint = self.model_checkpoint, None
    return self

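# Hedged usage sketch for fit_with_early_stop: the import path and class name
# below follow this library's layout but should be treated as assumptions,
# and `train_obs` / `test_obs` are observation handlers as in the tests above;
# all parameter values are illustrative.
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

lfm_rec = LightFMRecommender()
lfm_rec.fit_with_early_stop(
    train_obs, valid_ratio=0.04, metric='AUC',
    epochs_max=100, epochs_step=5, stop_patience=20,
    refit_on_all=True, plot_convergence=False)
report = lfm_rec.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm early stop ')
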
def __call__(self, result):
    best_result = result.fun
    cur_result = result.func_vals[-1]
    if best_result >= cur_result:
        # self.prev_result = cur_result
        simple_logger.info('best params, iteration %d' % len(result.func_vals))
        simple_logger.info('params for loss=%f:' % cur_result)
        values = result.x_iters[-1]
        params = self.search_inst.values_to_dict(values)
        simple_logger.info(params)

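# Hedged sketch: the __call__ signature above matches a scikit-optimize
# callback, which is invoked with the running OptimizeResult (carrying .fun,
# .func_vals and .x_iters) after every iteration. Toy wiring below, with
# `print` standing in for an instance of the class above (whose construction
# needs a `search_inst`); the objective and search space are placeholders.
from skopt import gp_minimize
from skopt.space import Real

result = gp_minimize(
    func=lambda x: (x[0] - 0.3) ** 2,  # toy 1-D objective
    dimensions=[Real(0.0, 1.0)],
    n_calls=15,
    callback=[print])
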
def test_b_2_lfm_rec_evaluation(self):
    k = self.k
    rep_exact = self.state.lfm_rec.eval_on_test_by_ranking_exact(
        self.state.test_obs.df_obs, prefix='lfm regular exact ', k=k)
    logger.info(rep_exact)
    rep_reg = self.state.lfm_rec.eval_on_test_by_ranking(
        self.state.test_obs.df_obs, prefix='lfm regular ', n_rec=200, k=k)
    logger.info(rep_reg)
    self.assertListEqual(list(rep_reg.columns), list(rep_exact.columns))
    # test that those fields are almost equal for the two test methods
    logger.info('deviations from exact evaluation')
    for col in rep_reg.columns:
        deviations = abs(1 - (rep_exact[col].values / rep_reg[col].values))
        logger.info(f'{col}: {deviations}')
        if 'AUC' in col:
            self.assertTrue(all(deviations < 0.1))
        elif 'coverage' in col:
            self.assertTrue(all(deviations < 0.03))
        else:
            self.assertTrue(all(deviations < 0.01))

def compare_similarity_results(self, item_id, items_lists, scores_lists=None,
                               names=None, print_data=True):
    df_item_source = self.items_filtered_by_ids([item_id])
    items_dfs = [self.items_filtered_by_ids(l) for l in items_lists]
    # add variant and scores field
    if names is None:
        names = [str(i) for i in range(len(items_lists))]
    if scores_lists is None:
        scores_lists = [[5] * len(df) for df in items_dfs]
    all_lists = [
        df.assign(variant=name, score=scores[:len(df)])
        for df, name, scores in zip(items_dfs + [df_item_source],
                                    names + ['source'],
                                    scores_lists + [[0]])
    ]
    all_data = pd.concat(all_lists, sort=False)
    # add counts
    all_data = all_data.join(
        all_data[self.item_id_col].value_counts().to_frame('count'),
        on=self.item_id_col)
    all_data = all_data.set_index(self.item_id_col)
    if print_data:
        logger.info('\n' + str(all_data))
    mapper = self.mapper_class(all_data)  # for view init
    if names is None:
        names = [''] * len(items_dfs)
    colors = self.mapper_class.get_n_spaced_colors(len(items_dfs))
    # poor man's legend
    logger.info("Poor man's legend: " + str(list(zip(colors, names))))
    for i in range(len(items_lists)):
        if scores_lists[i] is None:
            size = 5
        else:
            # filter simil scores only for those items that we have data for
            listings_with_data = items_dfs[i][self.item_id_col].values
            simil_scores = [
                score for listing_id, score in zip(items_lists[i], scores_lists[i])
                if listing_id in listings_with_data
            ]
            size = 3 + (10 * np.array(simil_scores) ** 3).astype(np.int32)
            size = [int(el) for el in size]
        mapper.add_markers(items_dfs[i], size=size, color=colors[i])
        mapper.add_heatmap(items_dfs[i], sensitivity=1, opacity=0.4, spread=50, color=colors[i])
    mapper.add_markers(df_item_source, color='black', size=7)
    return mapper

def pickle(self, obj, remote_path, compress=True):
    logger.info('S3: pickling to %s' % remote_path)
    return self.write_binary(pickle.dumps(obj), remote_path, compress=compress)

def unpickle(self, remote_path):
    logger.info('S3: unpickling from %s' % remote_path)
    return pickle.loads(self.read(remote_path))

def local_to_remote(self, local_path, remote_path, compress=True):
    logger.info('S3: copying from %s to %s' % (local_path, remote_path))
    with open(local_path, 'rb') as local:
        self.write_binary(local.read(), remote_path, compress=compress)
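
# Hedged usage sketch for the S3FileIO helpers above (the S3FileIO(bucket)
# construction mirrors its use in _model_reloading_loop; bucket and key names
# are placeholders, and running this requires AWS credentials for the bucket):
s3 = S3FileIO('my-example-bucket')            # hypothetical bucket name
s3.pickle({'a': 1}, 'models/latest.pkl')      # serialize an object straight to S3
obj = s3.unpickle('models/latest.pkl')        # round-trip it back
s3.local_to_remote('/tmp/model.pkl', 'models/backup.pkl')
s3.remote_to_local('models/latest.pkl', '/tmp/latest_copy.pkl')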