def test_save_load(mode):
    assert mode in ['sparse', 'dense']

    dataset = ToyRandomDataset()
    dataset._generate_many(100)

    all_ratings1 = AllRatingsWithCommon(
        experts=dataset.users,
        objects=dataset.objects,
        output_features=dataset.fields,
        name="tst",
        var_init_cls=VariableIndexLayer if mode == 'dense' else SparseVariableIndexLayer,
    )

    # creating models
    models1 = [
        FeaturelessPreferenceLearningModel(expert=user, all_ratings=all_ratings1)
        for user in dataset.users
    ]

    def load_data_to(models):
        for r in dataset.ratings:
            u_idx = dataset.users.index(r["user"])
            ratings_as_vector = np.array(
                [r["ratings"][k] for k in dataset.fields]) / 100.0
            models[u_idx].register_preference(
                o1=r["o1"],
                o2=r["o2"],
                p1_vs_p2=ratings_as_vector,
                weights=np.ones(len(ratings_as_vector)),
            )

    load_data_to(models1)
    call_on_dataset_end(models1)

    aggregator1 = FeaturelessMedianPreferenceAverageRegularizationAggregator(
        hypers={"lambda_": 1.0, "mu": 1.0, "C": 1.0, "default_score_value": 1.0},
        models=models1,
        loss_fcn=loss_fcn_dense if mode == 'dense' else loss_fcn_sparse,
    )
    aggregator1.fit(epochs=100)

    all_ratings2 = AllRatingsWithCommon(
        experts=dataset.users,
        objects=dataset.objects,
        output_features=dataset.fields,
        name="tst",
        var_init_cls=VariableIndexLayer if mode == 'dense' else SparseVariableIndexLayer,
    )

    # creating models
    models2 = [
        FeaturelessPreferenceLearningModel(expert=user, all_ratings=all_ratings2)
        for user in dataset.users
    ]
    load_data_to(models2)
    call_on_dataset_end(models2)

    aggregator2 = FeaturelessMedianPreferenceAverageRegularizationAggregator(
        hypers={"lambda_": 1.0, "mu": 1.0, "C": 1.0, "default_score_value": 1.0},
        loss_fcn=loss_fcn_dense if mode == 'dense' else loss_fcn_sparse,
        models=models2,
    )

    def is_close():
        out1 = aggregator1(dataset.objects)
        out2 = aggregator2(dataset.objects)
        assert isinstance(out1, np.ndarray), type(out1)
        assert isinstance(out2, np.ndarray), type(out2)
        assert out1.shape == out2.shape, (out1.shape, out2.shape)

        # replace None entries (objects without ratings) before casting to float
        out1[out1 == None] = np.nan  # noqa: E711
        out2[out2 == None] = np.nan  # noqa: E711
        out1 = np.array(out1, dtype=np.float32)
        out2 = np.array(out2, dtype=np.float32)

        assert out1.dtype == out2.dtype, (out1.dtype, out2.dtype)
        return np.allclose(out1, out2)

    assert not is_close(), "Outputs already the same"

    save_dir = "./test-" + str(uuid1()) + "/"
    os.mkdir(save_dir)
    aggregator1.save(save_dir)
    aggregator2.load(save_dir)

    assert is_close(), "Outputs differ"
    shutil.rmtree(save_dir)
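# `call_on_dataset_end` is a helper assumed to be defined elsewhere in this
# test module. A minimal sketch consistent with its usage here, given that
# each model exposes on_dataset_end() (see DatabasePreferenceLearnerFeatureless
# below); kept commented out so it does not shadow the real helper:
#
# def call_on_dataset_end(models):
#     """Signal to every model that data loading has finished."""
#     for model in models:
#         model.on_dataset_end()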
def test_hardcoded_dataset(mode):
    assert mode in ['sparse', 'dense']

    dataset = ToyHardcodedDataset()
    dataset._generate_many(100)

    all_ratings = AllRatingsWithCommon(
        experts=dataset.users,
        objects=dataset.objects,
        output_features=dataset.fields,
        name="tst",
        var_init_cls=VariableIndexLayer if mode == 'dense' else SparseVariableIndexLayer,
    )

    # creating models
    models = [
        FeaturelessPreferenceLearningModel(expert=user, all_ratings=all_ratings)
        for user in dataset.users
    ]

    for r in dataset.ratings:
        u_idx = dataset.users.index(r["user"])
        ratings_as_vector = np.array([r["ratings"][k] for k in dataset.fields]) / 100.0
        models[u_idx].register_preference(
            o1=r["o1"],
            o2=r["o2"],
            p1_vs_p2=ratings_as_vector,
            weights=np.ones(len(ratings_as_vector)),
        )
    call_on_dataset_end(models)

    # aggregating models
    aggregator = FeaturelessMedianPreferenceAverageRegularizationAggregator(
        models=models,
        loss_fcn=loss_fcn_dense if mode == 'dense' else loss_fcn_sparse,
        hypers={
            "C": 1.0,
            "mu": 1.0,
            "lambda_": 1.0,
            "default_score_value": 1.0,
            "sample_every": 100,
        },
        batch_params=dict(
            sample_experts=5000,
            sample_ratings_per_expert=5000,
            sample_objects_per_expert=5000,
        ),
    )
    aggregator.fit(epochs=1000)

    result = aggregator.models[0](["trump_video"])[0]
    assert isinstance(result, np.ndarray), "Wrong output"

    result = aggregator(["trump_video"])[0]
    assert isinstance(result, np.ndarray), "Wrong output"

    aggregator.plot_loss()
    plt.savefig("_test_plot.png")

    def validate_order(dataset, aggregator):
        """Test that downvoted videos have smaller ratings."""
        for user_id, user in enumerate(dataset.users):
            got_scores = aggregator.models[user_id](dataset.objects)
            expect_scores = dataset.scores_dict[user]
            errors = 0
            for i, feature in enumerate(dataset.fields):
                for i1, o1 in enumerate(dataset.objects):
                    for i2, o2 in enumerate(dataset.objects):
                        if o1 == o2:
                            continue
                        delta1 = got_scores[i2][i] - got_scores[i1][i]
                        if (o1, o2) in expect_scores[feature]:
                            delta2 = expect_scores[feature][(o1, o2)]
                        else:
                            delta2 = 100 - expect_scores[feature][(o2, o1)]
                        # map the 0..100 rating to a signed preference in [-1, 1]
                        delta2 = (delta2 - 50) / 50.0
                        if delta1 * delta2 <= 0:
                            print(
                                f"Invalid result: {user} {feature} {o1} {o2} got"
                                f" {got_scores[i1][i]} {got_scores[i2][i]} rating {delta2}"
                            )
                            errors += 1
                        else:
                            print("Valid result")
            assert not errors, "There were %s errors" % errors

    validate_order(dataset, aggregator)
def test_loss_computation():
    """Re-implement the loss in numpy and check that the tf version
    computes the same thing."""
    users = range(np.random.randint(1, 100))
    objects = range(np.random.randint(1, 1000))
    fields = range(np.random.randint(1, 100))

    # creating the table
    all_ratings = AllRatingsWithCommon(
        experts=users,
        objects=objects,
        output_features=fields,
        name="tst",
        var_init_cls=VariableIndexLayer,
    )

    # setting a fixed value as the current model parameters
    ratings_val = np.random.randn(1 + len(users), len(objects), len(fields))
    all_ratings.layer.v.assign(ratings_val)

    # creating models
    models = [
        FeaturelessPreferenceLearningModel(expert=user, all_ratings=all_ratings)
        for user in users
    ]

    # random hyperparameters
    hypers = {
        "C": np.random.rand(),
        "mu": np.random.rand(),
        "lambda_": np.random.rand(),
        "default_score_value": 1.0,
    }

    # aggregating models
    aggregator = FeaturelessMedianPreferenceAverageRegularizationAggregator(
        models=models, hypers=hypers, loss_fcn=loss_fcn_dense)

    # inputs to the loss function
    experts_rating, objects_rating_v1, objects_rating_v2, cmp, weights = (
        [], [], [], [], [],
    )
    experts_all, objects_all, num_ratings_all = [], [], []
    objects_common_to_1 = []

    # generating mock data
    n_ratings = np.random.randint(1, 500)
    n_all = np.random.randint(1, 500)
    for r in range(n_ratings):
        experts_rating.append(np.random.choice(users))
        objects_rating_v1.append(np.random.choice(objects))
        objects_rating_v2.append(np.random.choice(objects))
        cmp.append(np.random.randn(len(fields)))
        weights.append(np.random.rand(len(fields)))
    for v in range(n_all):
        experts_all.append(np.random.choice(users))
        objects_all.append(np.random.choice(objects))
        num_ratings_all.append(np.random.randint(1, 50))
    for v in range(n_all):
        objects_common_to_1.append(np.random.choice(objects))

    def np_loss_fcn(
        experts_rating,
        objects_rating_v1,
        objects_rating_v2,
        cmp,
        weights,
        experts_all,
        objects_all,
        num_ratings_all,
        objects_common_to_1,
    ):
        """Compute the loss using numpy, same as aggregator.loss_fcn."""
        result = {}

        # FIT LOSS CALCULATION
        loss_fit = 0.0
        loss_fit_cnt = 0
        for exp, v1, v2, c, wei in zip(experts_rating, objects_rating_v1,
                                       objects_rating_v2, cmp, weights):
            for f in range(len(fields)):
                thetav = ratings_val[exp, v1, f]
                thetaw = ratings_val[exp, v2, f]
                y = c[f]
                w = wei[f]
                elem = np.log(1 + np.exp(y * (thetav - thetaw))) * w
                loss_fit += elem
                loss_fit_cnt += 1
        result["loss_fit"] = loss_fit

        # LOSS M to COMMON computation
        loss_reg_common = 0.0
        loss_reg_common_cnt = 0
        for exp, v, n in zip(experts_all, objects_all, num_ratings_all):
            for f in range(len(fields)):
                theta = ratings_val[exp, v, f]
                s = ratings_val[-1, v, f]
                elem = n / (hypers["C"] + n) * np.abs(theta - s)
                loss_reg_common += elem
                loss_reg_common_cnt += 1
        result["loss_m_to_common"] = loss_reg_common * hypers["lambda_"]

        # LOSS COMMON to 1 COMPUTATION
        loss_reg_c1 = 0.0
        loss_reg_c1_cnt = 0
        for v in objects_common_to_1:
            for f in range(len(fields)):
                s = ratings_val[-1, v, f]
                elem = np.square(s - 1)
                loss_reg_c1 += elem
                loss_reg_c1_cnt += 1
        result["loss_common_to_1"] = loss_reg_c1 * hypers["mu"]

        # TOTAL LOSS COMPUTATION
        result["loss"] = (result["loss_fit"] + result["loss_m_to_common"] +
                          result["loss_common_to_1"])

        return result

    # computing the loss
    args = [
        experts_rating,
        objects_rating_v1,
        objects_rating_v2,
        cmp,
        weights,
        experts_all,
        objects_all,
        num_ratings_all,
        objects_common_to_1,
    ]
    args_names = [
        "experts_rating",
        "objects_rating_v1",
        "objects_rating_v2",
        "cmp",
        "weights",
        "experts_all",
        "objects_all",
        "num_ratings_all",
        "objects_common_to_1",
    ]
    args = [np.array(x) for x in args]
    args = [
        tf.constant(x, dtype=tf.float32) if x.dtype == np.float64 else tf.constant(x)
        for x in args
    ]
    ans_tf = aggregator.loss_fcn(**dict(zip(args_names, args)))
    ans_tf = {k: v.numpy() for k, v in ans_tf.items()}

    # computing the numpy version
    ans_np = np_loss_fcn(
        experts_rating,
        objects_rating_v1,
        objects_rating_v2,
        cmp,
        weights,
        experts_all,
        objects_all,
        num_ratings_all,
        objects_common_to_1,
    )

    # verifying that the results are the same
    assert ans_tf.keys() == ans_np.keys()
    for key in ans_tf.keys():
        assert np.allclose(ans_tf[key], ans_np[key]), f"Wrong value for loss {key}"
        print(f"Correct value for loss {key}")
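# For reference, the quantities that np_loss_fcn above transcribes (this is a
# restatement of the numpy code, not an independent spec of loss_fcn_dense):
#
#   L_fit = \sum_r \sum_f w_{r,f} \log(1 + \exp(y_{r,f} (\theta_{e_r,v1_r,f} - \theta_{e_r,v2_r,f})))
#   L_m2c = \lambda \sum_{(e,v)} \sum_f \frac{n_{e,v}}{C + n_{e,v}} |\theta_{e,v,f} - s_{v,f}|
#   L_c21 = \mu \sum_v \sum_f (s_{v,f} - 1)^2
#   loss  = L_fit + L_m2c + L_c21
#
# where \theta_{e,v,f} is expert e's score for object v on feature f, s_{v,f}
# is the virtual 'common' expert's score (the last row of ratings_val),
# n_{e,v} is the count from num_ratings_all, and the target 1 in L_c21 comes
# from default_score_value = 1.0.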
def create_aggregator(dataset, mode=None, with_weights=True, with_cert=True):
    assert mode in ["sparse", "dense"]

    var_init_cls = (VariableIndexLayer
                    if mode == "dense" else SparseVariableIndexLayer)
    loss_fcn = loss_fcn_dense if mode == "dense" else loss_fcn_sparse

    all_ratings = AllRatingsWithCommon(
        experts=dataset.users,
        objects=dataset.objects,
        output_features=dataset.fields,
        name="tst",
        var_init_cls=var_init_cls,
    )

    # creating models
    models = [
        FeaturelessPreferenceLearningModel(expert=user, all_ratings=all_ratings)
        for user in dataset.users
    ]

    for r in dataset.ratings:
        u_idx = dataset.users.index(r["user"])
        ratings_as_vector = (
            np.array([r["ratings"][k] for k in dataset.fields]) / 100.0)
        if with_weights:
            weights_as_vector = np.array(
                [r["weights"][k] for k in dataset.fields])
        else:
            weights_as_vector = np.ones(len(dataset.fields))
        models[u_idx].register_preference(
            o1=r["o1"],
            o2=r["o2"],
            p1_vs_p2=ratings_as_vector,
            weights=weights_as_vector,
        )
    call_on_dataset_end(models)

    # aggregating models
    aggregator = FeaturelessMedianPreferenceAverageRegularizationAggregator(
        models=models,
        loss_fcn=loss_fcn,
        optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
        hypers={
            "C": 1.0,
            "mu": 1.0,
            "lambda_": 1.0,
            "default_score_value": 1.0,
        },
        batch_params=dict(
            sample_experts=5000,
            sample_ratings_per_expert=5000,
            sample_objects_per_expert=5000,
        ),
    )

    # start from all-zero scores so results are reproducible
    params = aggregator.all_ratings.layer.v
    params.assign(tf.zeros_like(params))

    if with_cert:
        aggregator.certification_status = [
            np.random.rand() > 0.5 for _ in range(len(dataset.users))
        ]

    return aggregator
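# A minimal usage sketch for create_aggregator, mirroring the tests above.
# The dataset class and epoch count are illustrative choices, not fixed by
# this helper; with_weights=False avoids requiring 'weights' keys in the
# dataset's ratings. Commented out so the module stays import-safe:
#
# dataset = ToyHardcodedDataset()
# dataset._generate_many(100)
# aggregator = create_aggregator(dataset, mode="dense", with_weights=False)
# aggregator.fit(epochs=10)
# scores = aggregator(dataset.objects)  # aggregated per-feature scores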
class DatabasePreferenceLearnerFeatureless(DatabasePreferenceLearner):
    """Learn models from the database, save/restore."""

    def create_models(self):
        """Create learning models and the aggregator."""
        self.all_ratings = AllRatingsWithCommon(
            experts=self.users,
            objects=self.videos,
            output_features=self.features,
            name="prod",
        )
        print_memory(stage="DPLF:ratings_nodata_created")

        # creating models
        self.user_to_model = {
            user: FeaturelessPreferenceLearningModel(
                expert=user, all_ratings=self.all_ratings
            )
            for user in self.users
        }
        print_memory(stage="DPLF:models_created")

        # before creating the aggregator, filling models with data
        self.user_to_size = {
            user: self.fill_model_data(self.user_to_model[user], user)
            for user in tqdmem(self.users, desc="fill_data")
        }

        # virtual 'common' data
        fplm_common = FeaturelessPreferenceLearningModel(
            expert=AllRatingsWithCommon.COMMON_EXPERT, all_ratings=self.all_ratings
        )
        fplm_common.on_dataset_end()
        print_memory(stage="DPLF:data_filled")

        # resetting the model given the data
        self.all_ratings.reset_model()
        print_memory(stage="DPLF:model_reset_ok")

        # aggregating models
        self.aggregator = FeaturelessMedianPreferenceAverageRegularizationAggregator(
            models=[self.user_to_model[u] for u in self.users]
        )
        self.aggregator.certification_status = self.user_certified
        print_memory(stage="DPLF:aggregator_created")

    def visualize(self):
        """Plot model predictions and losses."""
        self.aggregator.plot_loss()
        self.save_figure()

    def predict_user(self, user, videos):
        # @todo: use vectorized operations
        assert isinstance(user, UserPreferences)
        model = self.user_to_model[user.id]
        result = list(model([v.video_id for v in videos]))
        for i, video in enumerate(videos):
            if not model.ratings_with_object(video.video_id):
                result[i] = None
        return result

    def predict_aggregated(self, videos):
        # @todo: use vectorized operations
        return self.aggregator([v.video_id for v in videos])

    def fit(self, **kwargs):
        """Fit on latest database records."""
        self.stats["dataset_size"] = self.user_to_size
        super(DatabasePreferenceLearnerFeatureless, self).fit(**kwargs)

    def fill_model_data(self, model, user):
        """Populate model data from db."""
        n = 0
        for dct in self.get_dataset(user=user):
            v1, v2, res, w = [
                dct[key] for key in ["video_1", "video_2", "cmp", "weights"]
            ]
            model.register_preference(v1, v2, res, w)
            n += 1
        model.on_dataset_end()
        return n