def test_get_irrelevant_items(self):
    """Items a user never rated are irrelevant; unknown users raise KeyError."""
    evaluator = TopNEvaluator(ratings, None)
    evaluator.initialize()

    expected_u1 = ["I1%d" % digit for digit in range(7)]  # I10 .. I16
    self.assertItemsEqual(expected_u1, evaluator.get_irrelevant_items("U1"))

    expected_u5 = ["I%d" % index for index in range(1, 16)]  # I1 .. I15
    self.assertItemsEqual(expected_u5, evaluator.get_irrelevant_items("U5"))

    # Asking for a user that is not in the ratings must raise.
    self.assertRaises(KeyError, evaluator.get_irrelevant_items, "U6")
def get_records_to_predict_topn(self):
    # Build the top-N evaluation set: the evaluator selects the "important"
    # test records and generates the candidate records to score, optionally
    # filtered to context reviews and capped by sampling.
    print('get_records_to_predict_topn: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    self.top_n_evaluator = TopNEvaluator(self.records, self.test_records,
                                         Constants.ITEM_TYPE, 10,
                                         Constants.TOPN_NUM_ITEMS)
    self.top_n_evaluator.initialize()
    self.important_records = self.top_n_evaluator.important_records
    if Constants.TEST_CONTEXT_REVIEWS_ONLY:
        # Evaluate only on reviews flagged as having context.
        self.important_records = ETLUtils.filter_records(
            self.important_records, Constants.HAS_CONTEXT_FIELD, [True])
    self.records_to_predict =\
        self.top_n_evaluator.get_records_to_predict()
    if Constants.MAX_SAMPLE_TEST_SET is not None:
        print('important_records %d' % len(self.important_records))
        if len(self.important_records) > Constants.MAX_SAMPLE_TEST_SET:
            # Cap evaluation cost by sampling the important records.
            self.important_records = random.sample(
                self.important_records, Constants.MAX_SAMPLE_TEST_SET)
        else:
            message = 'WARNING max_sample_test_set is greater than the ' \
                      'number of important records'
            print(message)
        # Re-sync the evaluator with the (possibly sampled) records and
        # regenerate the prediction set.
        # NOTE(review): nesting reconstructed from collapsed source — confirm
        # these two statements belong inside this if-block.
        self.top_n_evaluator.important_records = self.important_records
        self.records_to_predict = self.top_n_evaluator.get_records_to_predict()
    # Raw test records are no longer needed; free the memory.
    self.test_records = None
    gc.collect()
def get_records_to_predict_topn(self):
    """Load the pickled user-item map and build the top-N prediction set."""
    print('get_records_to_predict_topn: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    with open(Constants.USER_ITEM_MAP_FILE, 'rb') as read_file:
        user_item_map = pickle.load(read_file)
    evaluator = TopNEvaluator(
        self.records, self.test_records, Constants.ITEM_TYPE, 10,
        Constants.TOPN_NUM_ITEMS)
    evaluator.initialize(user_item_map)
    self.top_n_evaluator = evaluator
    self.records_to_predict = evaluator.get_records_to_predict()
    self.important_records = evaluator.important_records
    # The raw test records are no longer needed; release them.
    self.test_records = None
    gc.collect()
def export(self):
    """Build the top-N evaluator from the cached user-item map and keep
    the 5-star reviews from the test set as the important ones."""
    print('export: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    with open(Constants.USER_ITEM_MAP_FILE, 'rb') as read_file:
        user_item_map = pickle.load(read_file)
    evaluator = TopNEvaluator(
        self.records, self.test_records, Constants.ITEM_TYPE, 10,
        Constants.TOPN_NUM_ITEMS)
    evaluator.initialize(user_item_map)
    self.top_n_evaluator = evaluator
    self.records_to_predict = evaluator.get_records_to_predict()
    self.important_records = evaluator.important_records
    self.important_reviews = []
    for review in self.test_reviews:
        if review.rating == 5:
            self.important_reviews.append(review)
def test_calculate_important_items(self):
    """find_important_records keeps only the 5-star ratings."""
    five_star_pairs = [('U1', 'I1'), ('U1', 'I6'), ('U2', 'I11'),
                       ('U2', 'I12'), ('U3', 'I14'), ('U4', 'I15')]
    expected = [
        {'user_id': user, 'business_id': business, 'stars': 5.0}
        for user, business in five_star_pairs
    ]
    actual = TopNEvaluator.find_important_records(ratings)
    self.assertItemsEqual(expected, actual)
def main_evaluate():
    """Run the top-N evaluation against previously exported predictions
    and return the achieved recall."""
    records = ETLUtils.load_json_file(RECORDS_FILE)
    test_records = ETLUtils.load_json_file(RECORDS_FILE + '_test')

    evaluator = TopNEvaluator(records, test_records, DATASET, 10, my_i)
    evaluator.find_important_records()
    evaluator.load_records_to_predict(RECORDS_TO_PREDICT_FILE)

    predictions = rmse_calculator.read_targets_from_txt(
        GENERATED_FOLDER + 'predictions_' + DATASET + '.txt')
    evaluator.evaluate(predictions)

    print('recall', evaluator.recall)
    return evaluator.recall
def test_get_items_to_predict(self):
    """Smoke-test get_records_to_predict/evaluate with all-zero predictions."""
    evaluator = TopNEvaluator(ratings, test_set)
    evaluator.I = 4
    evaluator.N = 2
    evaluator.initialize()

    items_to_predict = evaluator.get_records_to_predict()
    # One prediction per test record plus its I irrelevant candidates.
    dummy_predictions = [0] * (len(test_set) * (evaluator.I + 1))
    evaluator.evaluate(dummy_predictions)

    print(items_to_predict)
    for item in items_to_predict:
        print(item)
def test_create_top_n_lists(self):
    """create_top_n_list returns the n highest-rated items, best first."""
    rating_list = {
        'I1': 5.0, 'I2': 3.0, 'I3': 1.0, 'I4': 2.0, 'I5': 4.5, 'I6': 3.7
    }
    full_ranking = ['I1', 'I5', 'I6', 'I2', 'I4', 'I3']
    for n in range(1, 8):
        # Requesting more items than exist yields the full ranking.
        self.assertSequenceEqual(
            TopNEvaluator.create_top_n_list(rating_list, n),
            full_ranking[:n])
def test_calculate_recall(self):
    """Two hits out of three recorded lookups gives a recall of 2/3."""
    evaluator = TopNEvaluator([], [], 5)
    lookups = [
        (['I1', 'I2', 'I3', 'I4', 'I5'], 'I3'),  # hit
        (['I1', 'I2', 'I3', 'I4', 'I5'], 'I6'),  # miss
        (['I1', 'I6', 'I3', 'I4', 'I5'], 'I4'),  # hit
    ]
    for top_n_list, item in lookups:
        evaluator.update_num_hits(top_n_list, item)
    self.assertEqual(2.0 / 3, evaluator.calculate_recall())
def main_export():
    """Initialize a TopNEvaluator from the cached user-item map and export
    the records that need predictions."""
    records = ETLUtils.load_json_file(RECORDS_FILE)
    print('num_records', len(records))
    test_records = ETLUtils.load_json_file(TEST_RECORDS_FILE)

    with open(USER_ITEM_MAP_FILE, 'rb') as read_file:
        user_item_map = pickle.load(read_file)

    evaluator = TopNEvaluator(records, test_records, DATASET, 10, my_i)
    evaluator.initialize(user_item_map)
    evaluator.export_records_to_predict(RECORDS_TO_PREDICT_FILE)
def test_calculate_important_items(self):
    """calculate_important_items keeps only the 5-star ratings."""
    pairs = [("U1", "I1"), ("U1", "I6"), ("U2", "I11"),
             ("U2", "I12"), ("U3", "I14"), ("U4", "I15")]
    expected = [
        {"user_id": user, "business_id": business, "stars": 5.0}
        for user, business in pairs
    ]
    self.assertItemsEqual(
        expected, TopNEvaluator.calculate_important_items(ratings))
def test_calculate_recall(self):
    """Recall is hits / lookups: two hits in three updates -> 2/3."""
    evaluator = TopNEvaluator([], [], 5)
    evaluator.update_num_hits(["I1", "I2", "I3", "I4", "I5"], "I3")  # hit
    evaluator.update_num_hits(["I1", "I2", "I3", "I4", "I5"], "I6")  # miss
    evaluator.update_num_hits(["I1", "I6", "I3", "I4", "I5"], "I4")  # hit
    self.assertEqual(2.0 / 3, evaluator.calculate_recall())
def get_records_to_predict_topn(self):
    """Build the evaluator from the pickled user-item map and derive the
    top-N records that need predictions."""
    print('get_records_to_predict_topn: %s'
          % time.strftime("%Y/%m/%d-%H:%M:%S"))
    with open(Constants.USER_ITEM_MAP_FILE, 'rb') as map_file:
        user_item_map = pickle.load(map_file)
    self.top_n_evaluator = TopNEvaluator(
        self.records, self.test_records, Constants.ITEM_TYPE, 10,
        Constants.TOPN_NUM_ITEMS)
    self.top_n_evaluator.initialize(user_item_map)
    self.important_records = self.top_n_evaluator.important_records
    self.records_to_predict = self.top_n_evaluator.get_records_to_predict()
    # Drop the raw test records to reclaim memory.
    self.test_records = None
    gc.collect()
def test_create_top_n_lists(self):
    """create_top_n_list truncates the descending-rating ranking at n."""
    ratings_by_item = {
        "I1": 5.0, "I2": 3.0, "I3": 1.0, "I4": 2.0, "I5": 4.5, "I6": 3.7
    }
    ranking = ["I1", "I5", "I6", "I2", "I4", "I3"]
    for n in range(1, 8):
        # n beyond the number of items just yields the whole ranking.
        self.assertSequenceEqual(
            TopNEvaluator.create_top_n_list(ratings_by_item, n),
            ranking[:n])
def export(self):
    """Filter records by the configured review type (if any), initialize
    the top-N evaluator from the cached user-item map, and collect the
    records that need predictions.
    """
    # BUG FIX: timestamp format was "%Y/%d/%m" (year/day/month), which
    # prints a misleading date and disagrees with the "%Y/%m/%d" format
    # used everywhere else in this codebase.
    print('export: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    I = my_i
    if REVIEW_TYPE:
        # Restrict both record sets to the configured review type.
        self.records = ETLUtils.filter_records(
            self.records, constants.PREDICTED_CLASS_FIELD, [REVIEW_TYPE])
        self.test_records = ETLUtils.filter_records(
            self.test_records, constants.PREDICTED_CLASS_FIELD,
            [REVIEW_TYPE])
    with open(USER_ITEM_MAP_FILE, 'rb') as read_file:
        user_item_map = pickle.load(read_file)
    self.top_n_evaluator = TopNEvaluator(
        self.records, self.test_records, DATASET, 10, I)
    self.top_n_evaluator.initialize(user_item_map)
    self.records_to_predict = self.top_n_evaluator.get_records_to_predict()
    self.important_records = self.top_n_evaluator.important_records
def test_update_num_hits(self):
    """The generic hit/miss counters advance only for their own outcome."""
    evaluator = TopNEvaluator([], [])
    self.assertEqual(0, evaluator.num_generic_hits)
    self.assertEqual(0, evaluator.num_generic_misses)

    # (top-n list, queried item, expected hits so far, expected misses so far)
    scenarios = [
        (['I1', 'I2', 'I3', 'I4', 'I5'], 'I3', 1, 0),
        (['I1', 'I2', 'I3', 'I4', 'I5'], 'I6', 1, 1),
        (['I1', 'I6', 'I3', 'I4', 'I5'], 'I4', 2, 1),
    ]
    for top_n_list, item, hits, misses in scenarios:
        evaluator.update_num_hits(top_n_list, item)
        self.assertEqual(hits, evaluator.num_generic_hits)
        self.assertEqual(misses, evaluator.num_generic_misses)
def test_update_num_hits(self):
    """n_hit / n_miss track whether the item appears in the top-n list."""
    evaluator = TopNEvaluator([], [])
    self.assertEqual(0, evaluator.n_hit)
    self.assertEqual(0, evaluator.n_miss)

    steps = [
        (["I1", "I2", "I3", "I4", "I5"], "I3", 1, 0),
        (["I1", "I2", "I3", "I4", "I5"], "I6", 1, 1),
        (["I1", "I6", "I3", "I4", "I5"], "I4", 2, 1),
    ]
    for top_n_list, item, expected_hits, expected_misses in steps:
        evaluator.update_num_hits(top_n_list, item)
        self.assertEqual(expected_hits, evaluator.n_hit)
        self.assertEqual(expected_misses, evaluator.n_miss)
def test_get_irrelevant_items(self):
    """Unrated items are irrelevant; an unknown user raises KeyError."""
    evaluator = TopNEvaluator(ratings, None)
    evaluator.initialize()

    expected_u1 = ['I%d' % i for i in range(10, 17)]
    self.assertItemsEqual(expected_u1,
                          evaluator.get_irrelevant_items('U1'))

    expected_u5 = ['I%d' % i for i in range(1, 16)]
    self.assertItemsEqual(expected_u5,
                          evaluator.get_irrelevant_items('U5'))

    self.assertRaises(KeyError, evaluator.get_irrelevant_items, 'U6')
class ContextTopNRunner(object):
    """Orchestrates the context-aware recommendation experiments.

    Runs the full cross-validation procedure: loading records, splitting
    folds, training the LDA topic model, exporting LibFM/fastFM inputs,
    predicting, evaluating (top-N recall or RMSE) and persisting results.
    """

    def __init__(self):
        self.records = None
        self.original_records = None
        self.train_records = None
        self.test_records = None
        self.records_to_predict = None
        self.predictions = None
        self.top_n_evaluator = None
        self.headers = None
        self.important_records = None
        self.context_rich_topics = []
        self.context_topics_map = None
        # Temporary per-fold file names (set by create_tmp_file_names).
        self.csv_train_file = None
        self.csv_test_file = None
        self.context_predictions_file = None
        self.context_train_file = None
        self.context_test_file = None
        self.context_log_file = None

    def clear(self):
        """Release per-fold state and delete the LibFM temporary files."""
        print('clear: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        # self.records = None
        self.train_records = None
        self.test_records = None
        self.records_to_predict = None
        self.top_n_evaluator = None
        self.headers = None
        self.important_records = None
        self.context_rich_topics = []
        self.context_topics_map = None
        if Constants.SOLVER == Constants.LIBFM:
            # Only the LibFM solver writes intermediate files.
            os.remove(self.csv_train_file)
            os.remove(self.csv_test_file)
            os.remove(self.context_predictions_file)
            os.remove(self.context_train_file)
            os.remove(self.context_test_file)
            os.remove(self.context_log_file)
        self.csv_train_file = None
        self.csv_test_file = None
        self.context_predictions_file = None
        self.context_train_file = None
        self.context_test_file = None
        self.context_log_file = None
        gc.collect()

    def create_tmp_file_names(self):
        """Generate unique temporary file names for one fold's artifacts."""
        unique_id = uuid.uuid4().hex
        prefix = Constants.GENERATED_FOLDER + unique_id + '_' + \
            Constants.ITEM_TYPE
        print('unique id: %s' % unique_id)
        self.csv_train_file = prefix + '_train.csv'
        self.csv_test_file = prefix + '_test.csv'
        self.context_predictions_file = prefix + '_predictions.txt'
        self.context_train_file = self.csv_train_file + '.libfm'
        self.context_test_file = self.csv_test_file + '.libfm'
        self.context_log_file = prefix + '.log'

    @staticmethod
    def plant_seeds():
        """Seed the random and numpy RNGs when configured (reproducibility)."""
        if Constants.RANDOM_SEED is not None:
            print('random seed: %d' % Constants.RANDOM_SEED)
            random.seed(Constants.RANDOM_SEED)
        if Constants.NUMPY_RANDOM_SEED is not None:
            print('numpy random seed: %d' % Constants.NUMPY_RANDOM_SEED)
            numpy.random.seed(Constants.NUMPY_RANDOM_SEED)

    def load(self):
        """Load the processed records and cache the user-item map on disk."""
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records =\
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        print('num_records: %d' % len(self.original_records))
        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file,
                            pickle.HIGHEST_PROTOCOL)

    def shuffle(self):
        print('shuffle: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        random.shuffle(self.original_records)

    def get_records_to_predict_topn(self):
        """Build the top-N evaluation set from the cached user-item map."""
        print('get_records_to_predict_topn: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))
        with open(Constants.USER_ITEM_MAP_FILE, 'rb') as read_file:
            user_item_map = pickle.load(read_file)
        self.top_n_evaluator = TopNEvaluator(
            self.records, self.test_records, Constants.ITEM_TYPE, 10,
            Constants.TOPN_NUM_ITEMS)
        self.top_n_evaluator.initialize(user_item_map)
        self.records_to_predict = \
            self.top_n_evaluator.get_records_to_predict()
        self.important_records = self.top_n_evaluator.important_records
        self.test_records = None
        gc.collect()

    def get_records_to_predict_rmse(self):
        """For RMSE the prediction set is simply the whole test set."""
        print('get_records_to_predict_rmse: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.important_records = self.test_records
        self.records_to_predict = self.test_records
        self.test_records = None
        gc.collect()

    def get_records_to_predict(self):
        """Dispatch on the configured evaluation metric."""
        if Constants.EVALUATION_METRIC == 'topn_recall':
            self.get_records_to_predict_topn()
        elif Constants.EVALUATION_METRIC == 'rmse':
            self.get_records_to_predict_rmse()
        else:
            raise ValueError('Unrecognized evaluation metric')

    def train_topic_model(self, cycle_index, fold_index):
        """Load a cached topic model or train a new one on the train set."""
        if Constants.CACHE_TOPIC_MODEL:
            print('loading topic model')
            lda_based_context = topic_model_creator.load_topic_model(
                cycle_index, fold_index)
        else:
            print('train topic model: %s' %
                  time.strftime("%Y/%m/%d-%H:%M:%S"))
            lda_based_context = LdaBasedContext(self.train_records)
            lda_based_context.generate_review_corpus()
            lda_based_context.build_topic_model()
            lda_based_context.update_reviews_with_topics()
            lda_based_context.get_context_rich_topics()
        self.context_rich_topics = lda_based_context.context_rich_topics
        print('Trained LDA Model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        return lda_based_context

    def find_reviews_topics(self, lda_based_context):
        """Attach the context-topic distribution to each important record
        and index it by review id in context_topics_map."""
        print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        lda_based_context.find_contextual_topics(self.train_records)
        lda_based_context.find_contextual_topics(
            self.important_records, Constants.TEXT_SAMPLING_PROPORTION)
        self.context_topics_map = {}
        for record in self.important_records:
            topic_distribution = record[Constants.TOPICS_FIELD]
            context_topics = {}
            for i in self.context_rich_topics:
                # i[0] is the topic index within the full distribution.
                topic_id = 'topic' + str(i[0])
                context_topics[topic_id] = topic_distribution[i[0]]
            record[Constants.CONTEXT_TOPICS_FIELD] = context_topics
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] =\
                context_topics
        self.important_records = None
        gc.collect()

    def prepare_records_for_libfm(self):
        """Export train/test records as CSV and convert them to LibFM format."""
        print('prepare_records_for_libfm: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.headers = build_headers(self.context_rich_topics)
        if Constants.USE_CONTEXT is True:
            if Constants.REVIEW_TYPE == Constants.SPECIFIC or \
                    Constants.REVIEW_TYPE == Constants.GENERIC:
                self.train_records = ETLUtils.filter_records(
                    self.train_records, Constants.PREDICTED_CLASS_FIELD,
                    [Constants.REVIEW_TYPE])
        with open(self.csv_train_file, 'w') as out_file:
            writer = csv.writer(out_file)
            # Write header
            writer.writerow(self.headers)
            for record in self.train_records:
                row = []
                for header in basic_headers:
                    row.append(record[header])
                if Constants.USE_CONTEXT is True:
                    for topic in self.context_rich_topics:
                        context_topics = \
                            record[Constants.CONTEXT_TOPICS_FIELD]
                        row.append(context_topics['topic' + str(topic[0])])
                writer.writerow(row)
        self.train_records = None
        gc.collect()
        with open(self.csv_test_file, 'w') as out_file:
            writer = csv.writer(out_file)
            # Write header
            writer.writerow(self.headers)
            for record in self.records_to_predict:
                row = []
                for header in basic_headers:
                    row.append(record[header])
                if Constants.USE_CONTEXT is True:
                    for topic in self.context_rich_topics:
                        # Test rows take their context from the important
                        # record that shares the same review id.
                        important_record = record[Constants.REVIEW_ID_FIELD]
                        context_topics =\
                            self.context_topics_map[important_record]
                        row.append(context_topics['topic' + str(topic[0])])
                writer.writerow(row)
        self.context_topics_map = None
        self.context_rich_topics = None
        gc.collect()
        print('Exported CSV and JSON files: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))
        csv_files = [self.csv_train_file, self.csv_test_file]
        print('num_cols', len(self.headers))
        # Column 0 is the target; columns [1, 2] presumably user/item ids —
        # confirm against csv_to_libfm's signature.
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ',', has_header=True, suffix='.libfm')
        print('Exported LibFM files: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    def predict_fastfm(self):
        """Predict ratings with fastFM (mcmc/als/sgd per configuration)."""
        if Constants.USE_CONTEXT:
            for record in self.records_to_predict:
                important_record = record[Constants.REVIEW_ID_FIELD]
                record[Constants.CONTEXT_TOPICS_FIELD] = \
                    self.context_topics_map[important_record]
        all_records = self.train_records + self.records_to_predict
        x_matrix, y_vector = fastfm_recommender.records_to_matrix(
            all_records, self.context_rich_topics)
        encoder = OneHotEncoder(categorical_features=[0, 1], sparse=True)
        encoder.fit(x_matrix)
        # Train and test rows were stacked; split them back after encoding
        # so both share the same one-hot vocabulary.
        x_train = encoder.transform(x_matrix[:len(self.train_records)])
        y_train = y_vector[:len(self.train_records)]
        x_test = encoder.transform(x_matrix[len(self.train_records):])
        if Constants.FASTFM_METHOD == 'mcmc':
            solver = mcmc.FMRegression(rank=Constants.FM_NUM_FACTORS)
            self.predictions = solver.fit_predict(x_train, y_train, x_test)
        elif Constants.FASTFM_METHOD == 'als':
            solver = als.FMRegression(rank=Constants.FM_NUM_FACTORS)
            solver.fit(x_train, y_train)
            self.predictions = solver.predict(x_test)
        elif Constants.FASTFM_METHOD == 'sgd':
            solver = sgd.FMRegression(rank=Constants.FM_NUM_FACTORS)
            solver.fit(x_train, y_train)
            self.predictions = solver.predict(x_test)

    def predict_libfm(self):
        """Run the external LibFM binary and read back its predictions."""
        print('predict: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        run_libfm(
            self.context_train_file, self.context_test_file,
            self.context_predictions_file, self.context_log_file)
        self.predictions = rmse_calculator.read_targets_from_txt(
            self.context_predictions_file)

    def predict(self):
        if Constants.SOLVER == Constants.LIBFM:
            self.prepare_records_for_libfm()
            self.predict_libfm()
        elif Constants.SOLVER == Constants.FASTFM:
            self.predict_fastfm()

    def evaluate_topn(self):
        """Evaluate the predictions with the top-N evaluator; return recall."""
        print('evaluate_topn: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.top_n_evaluator.evaluate(self.predictions)
        recall = self.top_n_evaluator.recall
        print('Recall: %f' % recall)
        print('Specific recall: %f' % self.top_n_evaluator.specific_recall)
        print('Generic recall: %f' % self.top_n_evaluator.generic_recall)
        return recall

    def evaluate_rmse(self):
        """Compute the RMSE of the predictions against the true ratings."""
        print('evaluate_topn: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        true_values = [
            record[Constants.RATING_FIELD]
            for record in self.records_to_predict
        ]
        rmse = rmse_calculator.calculate_rmse(true_values, self.predictions)
        print('RMSE: %f' % rmse)
        return rmse

    def evaluate(self):
        if Constants.EVALUATION_METRIC == 'topn_recall':
            return self.evaluate_topn()
        elif Constants.EVALUATION_METRIC == 'rmse':
            return self.evaluate_rmse()
        else:
            raise ValueError('Unrecognized evaluation metric')

    def perform_cross_validation(self):
        """Run num_cycles x num_folds cross-validation and persist the
        averaged metric to CSV/JSON."""
        print(Constants._properties)
        self.plant_seeds()
        total_metric = 0.0
        total_cycle_time = 0.0
        num_cycles = Constants.NUM_CYCLES
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        total_iterations = num_cycles * num_folds
        split = 1 - (1 / float(num_folds))
        self.load()
        for i in range(num_cycles):
            print('\n\nCycle: %d/%d' % ((i + 1), num_cycles))
            if Constants.SHUFFLE_DATA:
                self.shuffle()
            self.records = copy.deepcopy(self.original_records)
            for j in range(num_folds):
                fold_start = time.time()
                cv_start = float(j) / num_folds
                print('\nFold: %d/%d' % ((j + 1), num_folds))
                self.create_tmp_file_names()
                self.train_records, self.test_records =\
                    ETLUtils.split_train_test_copy(
                        self.records, split=split, start=cv_start)
                self.get_records_to_predict()
                if Constants.USE_CONTEXT:
                    lda_based_context = self.train_topic_model(i, j)
                    self.find_reviews_topics(lda_based_context)
                self.predict()
                metric = self.evaluate()
                total_metric += metric
                fold_end = time.time()
                fold_time = fold_end - fold_start
                total_cycle_time += fold_time
                self.clear()
                print("Total fold %d time = %f seconds" %
                      ((j + 1), fold_time))
        metric_average = total_metric / total_iterations
        average_cycle_time = total_cycle_time / total_iterations
        print('average rmse: %f' % metric_average)
        print('average cycle time: %f' % average_cycle_time)
        print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        # BUG FIX: this assignment was commented out, leaving `results`
        # undefined and crashing with NameError on the lines below.
        results = copy.deepcopy(Constants._properties)
        results[Constants.EVALUATION_METRIC] = metric_average
        results['cycle_time'] = average_cycle_time
        results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")
        write_results_to_csv(results)
        write_results_to_json(results)
class ContextTopNRunner(object):
    """Runs the context vs. no-context LibFM comparison experiment.

    Trains an LDA-based context model, exports context and no-context
    LibFM inputs, predicts with both, and compares top-N recall.
    """

    def __init__(self):
        self.records = None
        self.train_records = None
        self.test_records = None
        self.records_to_predict = None
        self.top_n_evaluator = None
        self.headers = None
        self.important_records = None
        self.context_rich_topics = None
        # Temporary file names (set by create_tmp_file_names).
        self.csv_train_file = None
        self.csv_test_file = None
        self.context_predictions_file = None
        self.context_train_file = None
        self.context_test_file = None
        self.context_log_file = None
        self.no_context_predictions_file = None
        self.no_context_train_file = None
        self.no_context_test_file = None
        self.no_context_log_file = None

    def clear(self):
        """Release all state and delete every temporary file of the cycle."""
        # BUG FIX (here and below): timestamps used "%Y/%d/%m"
        # (year/day/month), a misleading format inconsistent with the
        # conventional "%Y/%m/%d" used elsewhere in the codebase.
        print('clear: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.records = None
        self.train_records = None
        self.test_records = None
        self.records_to_predict = None
        self.top_n_evaluator = None
        self.headers = None
        self.important_records = None
        self.context_rich_topics = None
        os.remove(self.csv_train_file)
        os.remove(self.csv_test_file)
        os.remove(self.context_predictions_file)
        os.remove(self.context_train_file)
        os.remove(self.context_test_file)
        os.remove(self.context_log_file)
        os.remove(self.no_context_predictions_file)
        os.remove(self.no_context_train_file)
        os.remove(self.no_context_test_file)
        os.remove(self.no_context_log_file)
        self.csv_train_file = None
        self.csv_test_file = None
        self.context_predictions_file = None
        self.context_train_file = None
        self.context_test_file = None
        self.context_log_file = None
        self.no_context_predictions_file = None
        self.no_context_train_file = None
        self.no_context_test_file = None
        self.no_context_log_file = None

    def create_tmp_file_names(self):
        """Generate unique temporary file names for one cycle's artifacts."""
        unique_id = uuid.uuid4().hex
        prefix = GENERATED_FOLDER + unique_id + '_' + DATASET
        self.csv_train_file = prefix + '_context_train.csv'
        self.csv_test_file = prefix + '_context_test.csv'
        self.context_predictions_file = prefix + '_context_predictions.txt'
        self.context_train_file = self.csv_train_file + '.context.libfm'
        self.context_test_file = self.csv_test_file + '.context.libfm'
        self.context_log_file = prefix + '_context.log'
        self.no_context_predictions_file =\
            prefix + '_no_context_predictions.txt'
        self.no_context_train_file = self.csv_train_file + '.no_context.libfm'
        self.no_context_test_file = self.csv_test_file + '.no_context.libfm'
        self.no_context_log_file = prefix + '_no_context.log'

    def load(self):
        """Load the full record set from disk."""
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.records = ETLUtils.load_json_file(RECORDS_FILE)
        print('num_records', len(self.records))

    def shuffle(self):
        print('shuffle: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        random.shuffle(self.records)

    def split(self):
        """Split records into train/test by SPLIT_PERCENTAGE."""
        print('split: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        num_records = len(self.records)
        num_split_records = int(float(SPLIT_PERCENTAGE)/100*num_records)
        self.train_records = self.records[:num_split_records]
        self.test_records = self.records[num_split_records:]

    def export(self):
        """Filter by review type (if configured) and build the top-N
        evaluator plus the records to predict."""
        print('export: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        I = my_i
        if REVIEW_TYPE:
            self.records = ETLUtils.filter_records(
                self.records, constants.PREDICTED_CLASS_FIELD,
                [REVIEW_TYPE])
            self.test_records = ETLUtils.filter_records(
                self.test_records, constants.PREDICTED_CLASS_FIELD,
                [REVIEW_TYPE])
        with open(USER_ITEM_MAP_FILE, 'rb') as read_file:
            user_item_map = pickle.load(read_file)
        self.top_n_evaluator = TopNEvaluator(
            self.records, self.test_records, DATASET, 10, I)
        self.top_n_evaluator.initialize(user_item_map)
        self.records_to_predict = \
            self.top_n_evaluator.get_records_to_predict()
        self.important_records = self.top_n_evaluator.important_records

    def train_topic_model(self):
        """Train the LDA-based context model on the train records."""
        print('train topic model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        lda_based_context = LdaBasedContext(self.train_records)
        lda_based_context.get_context_rich_topics()
        # (A duplicated assignment of context_rich_topics was removed.)
        self.context_rich_topics = lda_based_context.context_rich_topics
        print('Trained LDA Model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        return lda_based_context

    def find_reviews_topics(self, lda_based_context):
        """Annotate train and prediction records with their topics and
        build the CSV headers."""
        print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        lda_based_context.find_contextual_topics(self.train_records)
        lda_based_context.find_contextual_topics(self.records_to_predict)
        print('contextual test set size: %d' % len(self.records_to_predict))
        self.headers = build_headers(self.context_rich_topics)
        print('Exported contextual topics: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))
        return self.train_records, self.records_to_predict

    def prepare(self):
        """Export CSVs and convert them to context / no-context LibFM files."""
        print('prepare: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        contextual_train_set =\
            ETLUtils.select_fields(self.headers, self.train_records)
        contextual_test_set =\
            ETLUtils.select_fields(self.headers, self.records_to_predict)
        ETLUtils.save_csv_file(
            self.csv_train_file, contextual_train_set, self.headers)
        ETLUtils.save_csv_file(
            self.csv_test_file, contextual_test_set, self.headers)
        print('Exported CSV and JSON files: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))
        csv_files = [
            self.csv_train_file,
            self.csv_test_file
        ]
        num_cols = len(self.headers)
        context_cols = num_cols
        print('num_cols', num_cols)
        # No-context variant drops the topic columns (3..context_cols).
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], range(3, context_cols), ',',
            has_header=True, suffix='.no_context.libfm')
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ',',
            has_header=True, suffix='.context.libfm')
        print('Exported LibFM files: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    def predict(self):
        """Run LibFM for both the no-context and the context variants."""
        print('predict: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        run_libfm(
            self.no_context_train_file, self.no_context_test_file,
            self.no_context_predictions_file, self.no_context_log_file)
        run_libfm(
            self.context_train_file, self.context_test_file,
            self.context_predictions_file, self.context_log_file)

    def evaluate(self):
        """Compute top-N recall for both variants; return (context, no-context)."""
        print('evaluate: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        predictions = rmse_calculator.read_targets_from_txt(
            self.no_context_predictions_file)
        self.top_n_evaluator.evaluate(predictions)
        no_context_recall = self.top_n_evaluator.recall
        predictions = rmse_calculator.read_targets_from_txt(
            self.context_predictions_file)
        self.top_n_evaluator.evaluate(predictions)
        context_recall = self.top_n_evaluator.recall
        print('No context recall: %f' % no_context_recall)
        print('Context recall: %f' % context_recall)
        return context_recall, no_context_recall

    def super_main_lda(self):
        """Run the whole comparison for several cycles and report averages."""
        total_context_recall = 0.0
        total_no_context_recall = 0.0
        total_cycle_time = 0.0
        num_iterations = 10
        self.load()
        self.split()
        self.export()
        for i in range(num_iterations):
            cycle_start = time.time()
            print('\nCycle: %d' % i)
            lda_based_context = self.train_topic_model()
            self.find_reviews_topics(lda_based_context)
            self.prepare()
            self.predict()
            context_recall, no_context_recall = self.evaluate()
            total_context_recall += context_recall
            total_no_context_recall += no_context_recall
            cycle_end = time.time()
            cycle_time = cycle_end - cycle_start
            total_cycle_time += cycle_time
            print("Total cycle %d time = %f seconds" % (i, cycle_time))
        average_context_recall = total_context_recall / num_iterations
        average_no_context_recall = total_no_context_recall / num_iterations
        average_cycle_time = total_cycle_time / num_iterations
        improvement =\
            (average_context_recall / average_no_context_recall - 1) * 100
        print('average no context recall', average_no_context_recall)
        print('average context recall', average_context_recall)
        # BUG FIX: the format spec was '%f2.3%%', which prints the full
        # float followed by a literal "2.3"; '%2.3f' was intended.
        print('average improvement: %2.3f%%' % improvement)
        print('average cycle time', average_cycle_time)
        print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
class ContextTopNRunner(object):
    """Runs context-aware top-N / rating-prediction experiments with LibFM.

    Orchestrates a full cross-validation pipeline: load processed records,
    split train/test, optionally train a topic model to extract contextual
    topics, export the data to LibFM format, run the solver and evaluate
    with either top-N recall or RMSE/MAE.  All configuration comes from the
    global ``Constants`` object; intermediate CSV/LibFM files are written
    under ``Constants.GENERATED_FOLDER`` and deleted by :meth:`clear`.
    """

    def __init__(self):
        # Data containers (populated by load/split steps, reset by clear()).
        self.records = None
        self.original_records = None
        self.train_records = None
        self.test_records = None
        self.records_to_predict = None
        self.predictions = None
        self.top_n_evaluator = None
        self.headers = None
        self.important_records = None
        self.context_rich_topics = None
        self.context_topics_map = None
        # Per-fold temporary file names (set by create_tmp_file_names()).
        self.csv_train_file = None
        self.csv_test_file = None
        self.context_predictions_file = None
        self.context_train_file = None
        self.context_test_file = None
        self.context_log_file = None
        self.libfm_model_file = None
        self.num_variables_in_model = None

    def clear(self):
        """Drop per-fold state and delete the temporary LibFM files.

        Called at the end of every fold so consecutive folds start clean.
        """
        print('clear: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # self.records = None
        self.train_records = None
        self.test_records = None
        self.records_to_predict = None
        self.top_n_evaluator = None
        self.headers = None
        self.important_records = None
        self.context_rich_topics = None
        self.context_topics_map = None

        # Temporary files are only written when the LibFM solver is used.
        if Constants.SOLVER == Constants.LIBFM:
            os.remove(self.csv_train_file)
            os.remove(self.csv_test_file)
            os.remove(self.context_predictions_file)
            os.remove(self.context_train_file)
            os.remove(self.context_test_file)
            os.remove(self.context_log_file)
            os.remove(self.libfm_model_file)

        # NOTE(review): original indentation was lost; the name resets are
        # assumed to run unconditionally (harmless either way) — confirm.
        self.csv_train_file = None
        self.csv_test_file = None
        self.context_predictions_file = None
        self.context_train_file = None
        self.context_test_file = None
        self.context_log_file = None
        self.libfm_model_file = None
        gc.collect()

    def create_tmp_file_names(self, cycle_index, fold_index):
        """Generate unique per-fold file names for the LibFM round trip.

        A random UUID prefix keeps concurrent runs from clobbering each
        other.  ``cycle_index``/``fold_index`` are only used by the
        commented-out deterministic naming scheme below.
        """
        unique_id = uuid.uuid4().hex
        prefix = Constants.GENERATED_FOLDER + unique_id + '_' + \
            Constants.ITEM_TYPE
        print('unique id: %s' % unique_id)

        self.csv_train_file = prefix + '_train.csv'
        self.csv_test_file = prefix + '_test.csv'
        self.context_predictions_file = prefix + '_predictions.txt'
        self.context_train_file = self.csv_train_file + '.libfm'
        self.context_test_file = self.csv_test_file + '.libfm'
        self.context_log_file = prefix + '.log'
        self.libfm_model_file = prefix + '_trained_model.libfm'

        # self.csv_train_file = Constants.generate_file_name(
        #     'libfm_train', 'csv', Constants.GENERATED_FOLDER, cycle_index,
        #     fold_index, Constants.USE_CONTEXT)
        # self.csv_test_file = Constants.generate_file_name(
        #     'libfm_test', 'csv', Constants.GENERATED_FOLDER, cycle_index,
        #     fold_index, Constants.USE_CONTEXT)
        # self.context_predictions_file = Constants.generate_file_name(
        #     'libfm_predictions', 'txt', Constants.GENERATED_FOLDER,
        #     cycle_index, fold_index, Constants.USE_CONTEXT)
        # self.context_train_file = self.csv_train_file + '.libfm'
        # self.context_test_file = self.csv_test_file + '.libfm'
        # self.context_log_file = Constants.generate_file_name(
        #     'libfm_log', 'log', Constants.GENERATED_FOLDER, cycle_index,
        #     fold_index, Constants.USE_CONTEXT)
        # self.libfm_model_file = Constants.generate_file_name(
        #     'libfm_model', 'csv', Constants.GENERATED_FOLDER, cycle_index,
        #     fold_index, Constants.USE_CONTEXT)

    def load(self):
        """Load the processed review records into ``original_records``.

        Chooses between the recsys-specific and the general processed file
        depending on ``Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS``, and
        prints basic corpus statistics.
        """
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.original_records =\
                ETLUtils.load_json_file(
                    Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
        else:
            self.original_records =\
                ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        print('num_records: %d' % len(self.original_records))

        user_ids = extractor.get_groupby_list(self.original_records,
                                              Constants.USER_ID_FIELD)
        item_ids = extractor.get_groupby_list(self.original_records,
                                              Constants.ITEM_ID_FIELD)
        print('total users', len(user_ids))
        print('total items', len(item_ids))

    def shuffle(self, records):
        """Shuffle ``records`` in place (order randomization per cycle)."""
        print('shuffle: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        random.shuffle(records)

    def get_records_to_predict_topn(self):
        """Build the test set for top-N recall evaluation.

        Initializes a ``TopNEvaluator`` over the full and test records,
        optionally restricts the "important" (positive) records to those
        with context, optionally subsamples them down to
        ``Constants.MAX_SAMPLE_TEST_SET``, and stores the resulting
        records to predict.  Frees ``test_records`` afterwards.
        """
        print('get_records_to_predict_topn: %s'
              % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.top_n_evaluator = TopNEvaluator(self.records, self.test_records,
                                             Constants.ITEM_TYPE, 10,
                                             Constants.TOPN_NUM_ITEMS)
        self.top_n_evaluator.initialize()
        self.important_records = self.top_n_evaluator.important_records

        if Constants.TEST_CONTEXT_REVIEWS_ONLY:
            self.important_records = ETLUtils.filter_records(
                self.important_records, Constants.HAS_CONTEXT_FIELD, [True])

        self.records_to_predict =\
            self.top_n_evaluator.get_records_to_predict()

        if Constants.MAX_SAMPLE_TEST_SET is not None:
            print('important_records %d' % len(self.important_records))
            if len(self.important_records) > Constants.MAX_SAMPLE_TEST_SET:
                self.important_records = random.sample(
                    self.important_records, Constants.MAX_SAMPLE_TEST_SET)
            else:
                message = 'WARNING max_sample_test_set is greater than the ' \
                          'number of important records'
                print(message)
            # NOTE(review): records_to_predict is recomputed here, making
            # the earlier call above redundant work on this path.
            self.top_n_evaluator.important_records = self.important_records
            self.records_to_predict = \
                self.top_n_evaluator.get_records_to_predict()

        self.test_records = None
        gc.collect()

    def get_records_to_predict_rmse(self):
        """Build the test set for RMSE/MAE evaluation.

        The records to predict are simply the test records (optionally
        filtered to context-bearing reviews).  Frees ``test_records``.
        """
        print('get_records_to_predict_rmse: %s'
              % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.important_records = self.test_records

        if Constants.TEST_CONTEXT_REVIEWS_ONLY:
            self.important_records = ETLUtils.filter_records(
                self.important_records, Constants.HAS_CONTEXT_FIELD, [True])

        self.records_to_predict = self.important_records
        self.test_records = None
        gc.collect()

    def get_records_to_predict(self, use_random_seeds):
        """Dispatch test-set construction on ``Constants.EVALUATION_METRIC``.

        :param use_random_seeds: when True, re-plant the global RNG seeds
            first so the sampling below is reproducible.
        :raises ValueError: for an unrecognized evaluation metric.
        """
        if use_random_seeds:
            utilities.plant_seeds()

        if Constants.EVALUATION_METRIC == 'topn_recall':
            self.get_records_to_predict_topn()
        elif Constants.EVALUATION_METRIC in ['rmse', 'mae']:
            self.get_records_to_predict_rmse()
        else:
            raise ValueError('Unrecognized evaluation metric')

    def train_topic_model(self, cycle_index, fold_index):
        """Train the topic model on the training records for this fold.

        Caches the context-rich topics to a JSON file and returns the
        trained context extractor.
        """
        context_extractor = topic_model_creator.create_topic_model(
            self.train_records, cycle_index, fold_index)
        self.context_rich_topics = context_extractor.context_rich_topics

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)
        ETLUtils.save_json_file(topics_file_path,
                                [dict(self.context_rich_topics)])
        print('Trained Context Extractor: %s'
              % time.strftime("%Y/%m/%d-%H:%M:%S"))

        return context_extractor

    def load_context_reviews(self, cycle_index, fold_index):
        """Load cached contextual train/important records for this fold.

        Also rebuilds ``context_topics_map`` (review id -> context topics)
        and then releases ``important_records``.
        """
        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)
        important_records_file_path = Constants.generate_file_name(
            'context_important_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)

        self.train_records = ETLUtils.load_json_file(train_records_file_path)
        self.important_records = \
            ETLUtils.load_json_file(important_records_file_path)
        self.load_cache_context_topics(cycle_index, fold_index)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        # self.train_records = self.filter_context_words(self.train_records)
        # self.print_context_topics(self.important_records)

        self.important_records = None
        gc.collect()

    def load_cache_context_topics(self, cycle_index, fold_index):
        """Load cached context-rich topics, sorted by weight (descending).

        Requires ``important_records`` to already be set: the topics map is
        rebuilt from them.
        """
        print('load cache context topics: %s'
              % time.strftime("%Y/%m/%d-%H:%M:%S"))

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)

        self.context_rich_topics = sorted(
            ETLUtils.load_json_file(topics_file_path)[0].items(),
            key=operator.itemgetter(1), reverse=True)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

    def find_reviews_topics(self, context_extractor, cycle_index, fold_index):
        """Annotate train and important records with contextual topics.

        Train-record topics are cached to disk; important records get a
        sampled-text topic pass.  Builds ``context_topics_map`` and then
        releases ``important_records``.
        """
        print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, Constants.USE_CONTEXT)

        if os.path.exists(train_records_file_path):
            self.train_records = \
                ETLUtils.load_json_file(train_records_file_path)
        else:
            context_extractor.find_contextual_topics(self.train_records)
            ETLUtils.save_json_file(train_records_file_path,
                                    self.train_records)
        context_extractor.find_contextual_topics(
            self.important_records, Constants.TEXT_SAMPLING_PROPORTION)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        self.important_records = None
        gc.collect()

    @staticmethod
    def print_context_topics(records):
        """Debug helper: dump each record's words, text and context topics.

        Loads the gensim dictionary to translate bag-of-words ids back to
        words, and reports the union of context topics actually used.
        """
        dictionary = corpora.Dictionary.load(Constants.DICTIONARY_FILE)

        all_context_topics = set()

        for record in records:
            words = []
            corpus = record[Constants.CORPUS_FIELD]
            for element in corpus:
                word_id = element[0]
                word = dictionary[word_id]
                words.append(word + ' (' + str(word_id) + ')')

            context_topics = record[Constants.CONTEXT_TOPICS_FIELD]
            # Keep only topics with non-zero weight for this record.
            used_context_topics =\
                dict((k, v) for k, v in context_topics.items() if v > 0.0)
            all_context_topics |= set(used_context_topics.keys())

            print('words: %s' % ', '.join(words))
            print('text: %s' % record[Constants.TEXT_FIELD])
            print('bow', record[Constants.BOW_FIELD])
            # print('pos tags', record[Constants.POS_TAGS_FIELD])
            print(record[Constants.TOPICS_FIELD])
            # # print(record[Constants.CONTEXT_TOPICS_FIELD])
            print(used_context_topics)
            # print('')

        # print('important records: %d' % len(records))
        # print('context records: %d' % len(context_records))
        # print('no context records: %d' % len(no_context_records))
        # print('all used context words', all_context_words)
        print('all used context topics', all_context_topics)
        # print('all used context words count: %d' % len(all_context_words))
        print('all used context topics: %d' % len(all_context_topics))

    def prepare_records_for_libfm(self):
        """Export train and test records to CSV, then convert to LibFM.

        Each CSV row holds the basic rating fields plus, when
        ``Constants.USE_CONTEXT`` is on, one column per context-rich topic
        (and optionally the no-context topic sum).  Frees the large record
        containers as soon as each file is written.
        """
        print('prepare_records_for_libfm: %s'
              % time.strftime("%Y/%m/%d-%H:%M:%S"))

        self.headers = build_headers(self.context_rich_topics)

        if Constants.FM_REVIEW_TYPE == Constants.SPECIFIC or \
                Constants.FM_REVIEW_TYPE == Constants.GENERIC:
            self.train_records = ETLUtils.filter_records(
                self.train_records, Constants.PREDICTED_CLASS_FIELD,
                [Constants.FM_REVIEW_TYPE])

        with open(self.csv_train_file, 'w') as out_file:
            writer = csv.writer(out_file)

            # Write header
            writer.writerow(self.headers)

            for record in self.train_records:
                row = []
                for header in basic_headers:
                    row.append(record[header])
                if Constants.USE_CONTEXT is True:
                    context_topics = record[Constants.CONTEXT_TOPICS_FIELD]
                    for topic in self.context_rich_topics:
                        # print('context_topics', context_topics)
                        row.append(context_topics['topic' + str(topic[0])])
                    if Constants.USE_NO_CONTEXT_TOPICS_SUM:
                        row.append(context_topics['nocontexttopics'])
                writer.writerow(row)

        self.train_records = None
        gc.collect()

        with open(self.csv_test_file, 'w') as out_file:
            writer = csv.writer(out_file)

            # Write header
            writer.writerow(self.headers)

            for record in self.records_to_predict:
                row = []
                for header in basic_headers:
                    row.append(record[header])
                if Constants.USE_CONTEXT is True:
                    # Test-set topics come from the precomputed map keyed
                    # by review id, not from the record itself.
                    important_record = record[Constants.REVIEW_ID_FIELD]
                    context_topics = \
                        self.context_topics_map[important_record]
                    for topic in self.context_rich_topics:
                        row.append(context_topics['topic' + str(topic[0])])
                    if Constants.USE_NO_CONTEXT_TOPICS_SUM:
                        row.append(context_topics['nocontexttopics'])
                writer.writerow(row)

        # self.records_to_predict = None
        self.context_topics_map = None
        self.context_rich_topics = None
        gc.collect()

        print('Exported CSV and JSON files: %s'
              % time.strftime("%Y/%m/%d-%H:%M:%S"))

        csv_files = [self.csv_train_file, self.csv_test_file]

        print('num_cols', len(self.headers))

        self.num_variables_in_model = libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ',', has_header=True, suffix='.libfm')

        print('Exported LibFM files: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # def predict_fastfm(self):
    #
    #     if Constants.USE_CONTEXT:
    #         for record in self.records_to_predict:
    #             important_record = record[Constants.REVIEW_ID_FIELD]
    #             record[Constants.CONTEXT_TOPICS_FIELD] = \
    #                 self.context_topics_map[important_record]
    #
    #     all_records = self.train_records + self.records_to_predict
    #     x_matrix, y_vector = fastfm_recommender.records_to_matrix(
    #         all_records, self.context_rich_topics)
    #
    #     encoder = OneHotEncoder(categorical_features=[0, 1], sparse=True)
    #     encoder.fit(x_matrix)
    #
    #     x_train = encoder.transform(x_matrix[:len(self.train_records)])
    #     y_train = y_vector[:len(self.train_records)]
    #     x_test = encoder.transform(x_matrix[len(self.train_records):])
    #
    #     if Constants.FASTFM_METHOD == 'mcmc':
    #         # solver = mcmc.FMRegression(n_iter=num_iters, rank=num_factors)
    #         solver = mcmc.FMRegression(rank=Constants.FM_NUM_FACTORS)
    #         self.predictions = solver.fit_predict(x_train, y_train, x_test)
    #     elif Constants.FASTFM_METHOD == 'als':
    #         solver = als.FMRegression(rank=Constants.FM_NUM_FACTORS)
    #         solver.fit(x_train, y_train)
    #         self.predictions = solver.predict(x_test)
    #     elif Constants.FASTFM_METHOD == 'sgd':
    #         solver = sgd.FMRegression(rank=Constants.FM_NUM_FACTORS)
    #         solver.fit(x_train, y_train)
    #         self.predictions = solver.predict(x_test)

    def predict_libfm(self):
        """Run the external LibFM binary and read its predictions back."""
        print('predict: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        run_libfm(self.context_train_file, self.context_test_file,
                  self.context_predictions_file, self.context_log_file,
                  self.libfm_model_file)
        self.predictions = rmse_calculator.read_targets_from_txt(
            self.context_predictions_file)

    def predict(self):
        """Export data and predict with the configured solver."""
        if Constants.SOLVER == Constants.LIBFM:
            self.prepare_records_for_libfm()
            self.predict_libfm()
        # elif Constants.SOLVER == Constants.FASTFM:
        #     self.predict_fastfm()

    def evaluate_topn(self):
        """Score predictions with the top-N evaluator.

        :return: dict of overall / specific / generic / has-context /
            has-no-context recall values keyed by Constants metric names.
        """
        print('evaluate_topn: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        self.top_n_evaluator.evaluate(self.predictions)
        recall = self.top_n_evaluator.recall

        print('Recall: %f' % recall)
        print('Specific recall: %f' % self.top_n_evaluator.specific_recall)
        print('Generic recall: %f' % self.top_n_evaluator.generic_recall)

        results = {
            Constants.TOPN_RECALL: self.top_n_evaluator.recall,
            Constants.SPECIFIC + '_' + Constants.TOPN_RECALL:
                self.top_n_evaluator.specific_recall,
            Constants.GENERIC + '_' + Constants.TOPN_RECALL:
                self.top_n_evaluator.generic_recall,
            Constants.HAS_CONTEXT + '_' + Constants.TOPN_RECALL:
                self.top_n_evaluator.has_context_recall,
            Constants.HAS_NO_CONTEXT + '_' + Constants.TOPN_RECALL:
                self.top_n_evaluator.has_no_context_recall,
        }

        return results

    def evaluate_rmse(self):
        """Score predictions with RMSE or MAE, overall and per subgroup.

        Splits true values / predictions by predicted class
        (specific/generic) and by context presence, then computes the
        configured error metric for each split.

        :return: dict of overall and per-subgroup error values.
        """
        print('evaluate_rmse: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        true_values = [
            record[Constants.RATING_FIELD]
            for record in self.records_to_predict
        ]

        specific_true_values = []
        specific_predictions = []
        generic_true_values = []
        generic_predictions = []
        has_context_true_values = []
        has_context_predictions = []
        has_no_context_true_values = []
        has_no_context_predictions = []

        # NOTE(review): `index` is incremented but never read — dead code.
        index = 0
        for record, prediction in zip(
                self.records_to_predict, self.predictions):
            if record[Constants.PREDICTED_CLASS_FIELD] == 'specific':
                specific_true_values.append(record[Constants.RATING_FIELD])
                specific_predictions.append(prediction)
            elif record[Constants.PREDICTED_CLASS_FIELD] == 'generic':
                generic_true_values.append(record[Constants.RATING_FIELD])
                generic_predictions.append(prediction)
            if record[Constants.HAS_CONTEXT_FIELD]:
                has_context_true_values.append(record[Constants.RATING_FIELD])
                has_context_predictions.append(prediction)
            else:
                has_no_context_true_values.append(
                    record[Constants.RATING_FIELD])
                has_no_context_predictions.append(prediction)
            index += 1

        metric = Constants.EVALUATION_METRIC
        overall_result = None
        specific_result = None
        generic_result = None
        has_context_result = None
        has_no_context_result = None
        if metric == 'rmse':
            overall_result = \
                rmse_calculator.calculate_rmse(true_values, self.predictions)
            specific_result = rmse_calculator.calculate_rmse(
                specific_true_values, specific_predictions)
            generic_result = rmse_calculator.calculate_rmse(
                generic_true_values, generic_predictions)
            has_context_result = rmse_calculator.calculate_rmse(
                has_context_true_values, has_context_predictions)
            has_no_context_result = rmse_calculator.calculate_rmse(
                has_no_context_true_values, has_no_context_predictions)
        elif metric == 'mae':
            overall_result = \
                rmse_calculator.calculate_mae(true_values, self.predictions)
            specific_result = rmse_calculator.calculate_mae(
                specific_true_values, specific_predictions)
            generic_result = rmse_calculator.calculate_mae(
                generic_true_values, generic_predictions)
            has_context_result = rmse_calculator.calculate_mae(
                has_context_true_values, has_context_predictions)
            has_no_context_result = rmse_calculator.calculate_mae(
                has_no_context_true_values, has_no_context_predictions)

        print(metric + ': %f' % overall_result)
        print('Specific ' + metric + ': %f' % specific_result)
        print('Generic ' + metric + ': %f' % generic_result)
        print('Has context ' + metric + ': %f' % has_context_result)
        print('Has no ' + metric + ': %f' % has_no_context_result)

        results = {
            metric: overall_result,
            Constants.SPECIFIC + '_' + metric: specific_result,
            Constants.GENERIC + '_' + metric: generic_result,
            Constants.HAS_CONTEXT + '_' + metric: has_context_result,
            Constants.HAS_NO_CONTEXT + '_' + metric: has_no_context_result,
        }

        return results

    def evaluate(self):
        """Dispatch evaluation on ``Constants.EVALUATION_METRIC``."""
        if Constants.EVALUATION_METRIC == 'topn_recall':
            return self.evaluate_topn()
        elif Constants.EVALUATION_METRIC in ['rmse', 'mae']:
            return self.evaluate_rmse()
        else:
            raise ValueError('Unrecognized evaluation metric')

    def perform_cross_validation(self, records):
        """Run the full NUM_CYCLES x NUM_FOLDS cross-validation loop.

        Each fold: split, build test set, (optionally) train/load the topic
        model, predict, evaluate, accumulate metrics, and clean up.

        :param records: the full record list to cross-validate over.
        :return: summarized results dict (also written to CSV/JSON).
        """
        Constants.print_properties()

        # self.plant_seeds()
        metrics_list = []
        total_cycle_time = 0.0
        num_cycles = Constants.NUM_CYCLES
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        total_iterations = num_cycles * num_folds
        split = 1 - (1 / float(num_folds))
        metric_name = Constants.EVALUATION_METRIC

        # self.load()

        for i in range(num_cycles):

            print('\n\nCycle: %d/%d' % ((i + 1), num_cycles))

            # Deep copy so destructive per-fold steps never touch the
            # caller's records across cycles.
            self.records = copy.deepcopy(records)
            if Constants.SHUFFLE_DATA:
                self.shuffle(self.records)

            for j in range(num_folds):

                fold_start = time.time()
                cv_start = float(j) / num_folds
                print('\nFold: %d/%d' % ((j + 1), num_folds))

                self.create_tmp_file_names(i, j)
                self.train_records, self.test_records = \
                    ETLUtils.split_train_test_copy(
                        self.records, split=split, start=cv_start)
                # subsample_size = int(len(self.train_records)*0.5)
                # self.train_records = self.train_records[:subsample_size]
                self.get_records_to_predict(True)
                if Constants.USE_CONTEXT:
                    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
                        self.load_cache_context_topics(None, None)
                    else:
                        context_extractor = self.train_topic_model(i, j)
                        self.find_reviews_topics(context_extractor, i, j)
                else:
                    self.context_rich_topics = []
                self.predict()
                metrics = self.evaluate()
                metrics_list.append(metrics)
                print('Accumulated %s: %f' % (
                    metric_name,
                    numpy.mean([k[metric_name] for k in metrics_list])))

                fold_end = time.time()
                fold_time = fold_end - fold_start
                total_cycle_time += fold_time
                self.clear()
                print("Total fold %d time = %f seconds"
                      % ((j + 1), fold_time))

        results = self.summarize_results(metrics_list)

        average_cycle_time = total_cycle_time / total_iterations
        results['cycle_time'] = average_cycle_time
        print('average cycle time: %f' % average_cycle_time)

        # NOTE(review): summarize_results() already wrote the results once;
        # this second write (now including cycle_time) duplicates the row.
        write_results_to_csv(results)
        write_results_to_json(results)

        return results

    @staticmethod
    def summarize_results(metrics_list):
        """Average per-fold metric dicts into a single results dict.

        Computes mean and standard deviation of the main metric plus the
        per-subgroup means, prints them, merges them into a copy of the
        experiment properties, and persists the row to CSV/JSON.
        """
        metric_name = Constants.EVALUATION_METRIC
        specific_metric_name = Constants.SPECIFIC + '_' + metric_name
        generic_metric_name = Constants.GENERIC + '_' + metric_name
        has_context_metric_name = Constants.HAS_CONTEXT + '_' + metric_name
        has_no_context_metric_name = \
            Constants.HAS_NO_CONTEXT + '_' + metric_name

        # NOTE(review): the nested numpy.mean is redundant — the inner call
        # already reduces the list to a scalar.
        metric_average = \
            numpy.mean(numpy.mean([k[metric_name] for k in metrics_list]))
        metric_stdev = numpy.std([k[metric_name] for k in metrics_list])
        average_specific_metric = numpy.mean(
            [k[specific_metric_name] for k in metrics_list])
        average_generic_metric = numpy.mean(
            [k[generic_metric_name] for k in metrics_list])
        average_has_context_metric = numpy.mean(
            [k[has_context_metric_name] for k in metrics_list])
        average_has_no_context_metric = numpy.mean(
            [k[has_no_context_metric_name] for k in metrics_list])

        print('average %s:\t\t\t%f' % (metric_name, metric_average))
        print('average specific %s:\t%f'
              % (metric_name, average_specific_metric))
        print('average generic %s:\t%f'
              % (metric_name, average_generic_metric))
        print('average has context %s:\t%f'
              % (metric_name, average_has_context_metric))
        print('average has no context %s:\t%f'
              % (metric_name, average_has_no_context_metric))
        print('standard deviation %s:\t%f (%f%%)'
              % (metric_name, metric_stdev,
                 (metric_stdev / metric_average * 100)))
        print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # Start from a copy of the experiment properties so the results row
        # records the configuration that produced it.
        results = Constants.get_properties_copy()
        results[metric_name] = metric_average
        results[specific_metric_name] = average_specific_metric
        results[generic_metric_name] = average_generic_metric
        results[has_context_metric_name] = average_has_context_metric
        results[has_no_context_metric_name] = average_has_no_context_metric
        results[metric_name + '_stdev'] = metric_stdev
        results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

        write_results_to_csv(results)
        write_results_to_json(results)

        return results

    def run_single_fold(self, parameters):
        """Run exactly one fold, configured by ``parameters``.

        ``parameters['fold']`` selects the fold; the remaining entries are
        pushed into the global ``Constants`` before running.  Mirrors one
        iteration of :meth:`perform_cross_validation` (cycle index 0).

        :return: the metrics dict produced by :meth:`evaluate`.
        """
        fold = parameters['fold']

        Constants.update_properties(parameters)
        Constants.print_properties()

        utilities.plant_seeds()
        self.load()

        records = self.original_records

        # self.plant_seeds()
        total_cycle_time = 0.0
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        split = 1 - (1 / float(num_folds))
        self.records = copy.deepcopy(records)
        if Constants.SHUFFLE_DATA:
            self.shuffle(self.records)

        fold_start = time.time()
        cv_start = float(fold) / num_folds
        print('\nFold: %d/%d' % ((fold + 1), num_folds))

        self.create_tmp_file_names(0, fold)
        self.train_records, self.test_records = \
            ETLUtils.split_train_test_copy(
                self.records, split=split, start=cv_start)
        # subsample_size = int(len(self.train_records)*0.5)
        # self.train_records = self.train_records[:subsample_size]
        self.get_records_to_predict(True)
        if Constants.USE_CONTEXT:
            if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
                self.load_cache_context_topics(None, None)
            else:
                context_extractor = self.train_topic_model(0, fold)
                self.find_reviews_topics(context_extractor, 0, fold)
        else:
            self.context_rich_topics = []
        self.predict()
        metrics = self.evaluate()

        fold_end = time.time()
        fold_time = fold_end - fold_start
        total_cycle_time += fold_time
        self.clear()
        print("Total fold %d time = %f seconds" % ((fold + 1), fold_time))

        return metrics

    def run(self):
        """Entry point: run cross-validation per the configured strategy.

        ``nested_validate`` first carves out the nested-CV training portion
        of the data; ``nested_test`` cross-validates over everything.

        :raises ValueError: for an unknown cross-validation strategy.
        """
        utilities.plant_seeds()
        self.load()
        records = self.original_records

        if Constants.CROSS_VALIDATION_STRATEGY == 'nested_validate':
            num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
            cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
            split = 1 - (1 / float(num_folds))
            cv_start = float(cycle) / num_folds
            print('cv_start', cv_start)
            records, _ = ETLUtils.split_train_test(self.original_records,
                                                   split, cv_start)
            return self.perform_cross_validation(records)
        elif Constants.CROSS_VALIDATION_STRATEGY == 'nested_test':
            return self.perform_cross_validation(records)
        else:
            raise ValueError('Unknown cross-validation strategy')
class WordContextTopNRunner(object):
    """Top-N experiment runner using word-sense groups as context.

    Variant of the topic-model runner: context features come from a
    ``WordBasedContext`` trained on review objects (loaded from a pickle)
    rather than from LDA topics.  Evaluation is top-N recall via LibFM.

    NOTE(review): this class appears to be Python-2 only — see
    :meth:`shuffle` (``random.shuffle`` on a ``range``) and the ``'wb'``
    csv file mode in :meth:`perform_cross_validation`.
    """

    def __init__(self):
        # Parallel record/review containers: records are dicts, reviews are
        # pickled objects; they are kept index-aligned throughout.
        self.records = None
        self.reviews = None
        self.original_records = None
        self.original_reviews = None
        self.train_records = None
        self.train_reviews = None
        self.test_records = None
        self.test_reviews = None
        self.records_to_predict = None
        self.top_n_evaluator = None
        self.headers = None
        self.important_records = None
        self.important_reviews = None
        self.context_rich_topics = []
        self.sense_groups = []
        # Per-fold temporary file names (set by create_tmp_file_names()).
        self.csv_train_file = None
        self.csv_test_file = None
        self.context_predictions_file = None
        self.context_train_file = None
        self.context_test_file = None
        self.context_log_file = None

    def clear(self):
        """Drop per-fold state and delete the temporary LibFM files."""
        print('clear: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # self.records = None
        self.train_records = None
        self.train_reviews = None
        self.test_records = None
        self.test_reviews = None
        self.records_to_predict = None
        self.top_n_evaluator = None
        self.headers = None
        self.important_records = None
        self.important_reviews = None
        self.context_rich_topics = []
        self.sense_groups = []

        os.remove(self.csv_train_file)
        os.remove(self.csv_test_file)
        os.remove(self.context_predictions_file)
        os.remove(self.context_train_file)
        os.remove(self.context_test_file)
        os.remove(self.context_log_file)

        self.csv_train_file = None
        self.csv_test_file = None
        self.context_predictions_file = None
        self.context_train_file = None
        self.context_test_file = None
        self.context_log_file = None

    def create_tmp_file_names(self):
        """Generate unique per-fold file names for the LibFM round trip."""
        unique_id = uuid.uuid4().hex
        prefix = Constants.GENERATED_FOLDER + unique_id + '_' + \
            Constants.ITEM_TYPE
        # prefix = constants.GENERATED_FOLDER + constants.ITEM_TYPE
        print('unique id: %s' % unique_id)

        self.csv_train_file = prefix + '_train.csv'
        self.csv_test_file = prefix + '_test.csv'
        self.context_predictions_file = prefix + '_predictions.txt'
        self.context_train_file = self.csv_train_file + '.libfm'
        self.context_test_file = self.csv_test_file + '.libfm'
        self.context_log_file = prefix + '.log'

    def load(self):
        """Load records (JSON) and review objects (pickle), and align them.

        Copies the review id and rating from each record onto the matching
        review object.  Also builds and caches the user->items map on first
        run so :meth:`export` can load it later.
        """
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
        with open(Constants.REVIEWS_FILE, 'rb') as read_file:
            self.original_reviews = pickle.load(read_file)
        print('num_records: %d' % len(self.original_records))

        for record, review in zip(self.original_records,
                                  self.original_reviews):
            review.id = record[Constants.REVIEW_ID_FIELD]
            review.rating = record[Constants.RATING_FIELD]

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file,
                            pickle.HIGHEST_PROTOCOL)

    def shuffle(self):
        """Shuffle records and reviews with the same random permutation.

        Shuffling a shared index list keeps the two parallel lists aligned.
        NOTE(review): ``random.shuffle(range(...))`` only works on
        Python 2, where ``range`` returns a list.
        """
        print('shuffle: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        # random.shuffle(self.records)
        shuffled_records = []
        shuffled_reviews = []
        index_shuffle = range(len(self.original_records))
        random.shuffle(index_shuffle)
        for i in index_shuffle:
            shuffled_records.append(self.original_records[i])
            shuffled_reviews.append(self.original_reviews[i])
        self.original_records = shuffled_records
        self.original_reviews = shuffled_reviews

    def export(self):
        """Initialize the top-N evaluator and derive the test structures.

        Loads the cached user->items map, builds the records to predict and
        the important records, and selects the 5-star test reviews as the
        "important" review objects (parallel to the important records).
        """
        print('export: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        with open(Constants.USER_ITEM_MAP_FILE, 'rb') as read_file:
            user_item_map = pickle.load(read_file)

        self.top_n_evaluator = TopNEvaluator(
            self.records, self.test_records, Constants.ITEM_TYPE, 10,
            Constants.TOPN_NUM_ITEMS)
        self.top_n_evaluator.initialize(user_item_map)
        self.records_to_predict = self.top_n_evaluator.get_records_to_predict()
        self.important_records = self.top_n_evaluator.important_records
        # NOTE(review): assumes important_records are exactly the 5-star
        # test records, in the same order as test_reviews — confirm against
        # TopNEvaluator.
        self.important_reviews = [
            review for review in self.test_reviews if review.rating == 5
        ]

    def train_word_model(self):
        """Train the word-sense context model on the training reviews.

        :return: the trained ``WordBasedContext`` instance; its sense
            groups are kept for header construction in :meth:`prepare`.
        """
        print('train topic model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        # lda_based_context = LdaBasedContext(self.train_records)
        # lda_based_context.get_context_rich_topics()
        # self.context_rich_topics = lda_based_context.context_rich_topics
        word_based_context = WordBasedContext(self.train_reviews)
        word_based_context.calculate_sense_group_ratios()
        self.sense_groups = word_based_context.sense_groups
        print('Trained LDA Model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        return word_based_context

    def find_reviews_topics(self, word_based_context):
        """Attach word-context features to train and prediction records.

        Computes the word context for each train/important review, then
        copies the important records' context onto the records to predict
        via a review-id lookup map.
        """
        print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # lda_based_context.find_contextual_topics(self.train_records)
        for record, review in zip(self.train_records, self.train_reviews):
            record[Constants.CONTEXT_WORDS_FIELD] =\
                word_based_context.calculate_word_context(review)

        for record, review in zip(self.important_records,
                                  self.important_reviews):
            record[Constants.CONTEXT_WORDS_FIELD] =\
                word_based_context.calculate_word_context(review)

        topics_map = {}
        for record in self.important_records:
            topics_map[record[Constants.REVIEW_ID_FIELD]] =\
                record[Constants.CONTEXT_WORDS_FIELD]

        for record in self.records_to_predict:
            word_distribution = topics_map[record[Constants.REVIEW_ID_FIELD]]
            record[Constants.CONTEXT_WORDS_FIELD] = word_distribution

        print('contextual test set size: %d' % len(self.records_to_predict))
        print('Exported contextual topics: %s'
              % time.strftime("%Y/%m/%d-%H:%M:%S"))

    def prepare(self):
        """Flatten records to the CSV headers and convert to LibFM format.

        When context is enabled, the per-record context-word features are
        merged into the record dicts first; records are then pruned to the
        header columns and exported.
        """
        print('prepare: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        self.headers = build_headers(len(self.sense_groups))

        if Constants.USE_CONTEXT is True:
            for record in self.train_records:
                record.update(record[Constants.CONTEXT_WORDS_FIELD])

            for record in self.records_to_predict:
                record.update(record[Constants.CONTEXT_WORDS_FIELD])

            if Constants.FM_REVIEW_TYPE:
                self.train_records = ETLUtils.filter_records(
                    self.train_records, Constants.PREDICTED_CLASS_FIELD,
                    [Constants.FM_REVIEW_TYPE])

            # ETLUtils.drop_fields([Constants.TOPICS_FIELD],
            #                      self.train_records)

        ETLUtils.keep_fields(self.headers, self.train_records)
        ETLUtils.keep_fields(self.headers, self.records_to_predict)

        ETLUtils.save_csv_file(
            self.csv_train_file, self.train_records, self.headers)
        ETLUtils.save_csv_file(
            self.csv_test_file, self.records_to_predict, self.headers)

        print('Exported CSV and JSON files: %s'
              % time.strftime("%Y/%m/%d-%H:%M:%S"))

        csv_files = [
            self.csv_train_file,
            self.csv_test_file
        ]

        print('num_cols', len(self.headers))

        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ',', has_header=True, suffix='.libfm')

        print('Exported LibFM files: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    def predict(self):
        """Run the external LibFM binary on the exported files."""
        print('predict: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        run_libfm(
            self.context_train_file, self.context_test_file,
            self.context_predictions_file, self.context_log_file)

    def evaluate(self):
        """Read LibFM predictions and compute top-N recall.

        :return: the overall recall (specific/generic recalls remain
            available on ``self.top_n_evaluator``).
        """
        print('evaluate: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        predictions = rmse_calculator.read_targets_from_txt(
            self.context_predictions_file)
        self.top_n_evaluator.evaluate(predictions)
        recall = self.top_n_evaluator.recall

        print('Recall: %f' % recall)
        print('Specific recall: %f' % self.top_n_evaluator.specific_recall)
        print('Generic recall: %f' % self.top_n_evaluator.generic_recall)

        return recall

    def perform_cross_validation(self):
        """Run the full NUM_CYCLES x NUM_FOLDS cross-validation loop.

        Accumulates overall/specific/generic recall across all folds,
        prints the averages, and appends a results row (experiment
        properties + metrics) to ``Constants.CSV_RESULTS_FILE``.

        NOTE(review): the ``'wb'`` open mode for csv is Python-2 style.
        """
        Constants.print_properties()

        utilities.plant_seeds()

        total_recall = 0.0
        total_specific_recall = 0.0
        total_generic_recall = 0.0
        total_cycle_time = 0.0
        num_cycles = Constants.NUM_CYCLES
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        total_iterations = num_cycles * num_folds
        split = 1 - (1/float(num_folds))

        self.load()

        for i in range(num_cycles):

            print('\n\nCycle: %d/%d' % ((i+1), num_cycles))

            if Constants.SHUFFLE_DATA:
                self.shuffle()
            # Deep copies so destructive per-fold steps (keep_fields etc.)
            # never touch the originals across cycles.
            self.records = copy.deepcopy(self.original_records)
            self.reviews = copy.deepcopy(self.original_reviews)

            for j in range(num_folds):

                fold_start = time.time()
                cv_start = float(j) / num_folds
                print('\nFold: %d/%d' % ((j+1), num_folds))

                self.create_tmp_file_names()
                # Records and reviews are split with identical parameters
                # to keep the two parallel lists aligned.
                self.train_records, self.test_records = \
                    ETLUtils.split_train_test_copy(
                        self.records, split=split, start=cv_start)
                self.train_reviews, self.test_reviews = \
                    ETLUtils.split_train_test_copy(
                        self.reviews, split=split, start=cv_start)
                self.export()
                if Constants.USE_CONTEXT:
                    lda_based_context = self.train_word_model()
                    self.find_reviews_topics(lda_based_context)
                self.prepare()
                self.predict()
                self.evaluate()
                recall = self.top_n_evaluator.recall
                specific_recall = self.top_n_evaluator.specific_recall
                generic_recall = self.top_n_evaluator.generic_recall
                total_recall += recall
                total_specific_recall += specific_recall
                total_generic_recall += generic_recall

                fold_end = time.time()
                fold_time = fold_end - fold_start
                total_cycle_time += fold_time
                self.clear()
                print("Total fold %d time = %f seconds" % ((j+1), fold_time))

        average_recall = total_recall / total_iterations
        average_specific_recall = total_specific_recall / total_iterations
        average_generic_recall = total_generic_recall / total_iterations
        average_cycle_time = total_cycle_time / total_iterations
        print('average recall: %f' % average_recall)
        print('average specific recall: %f' % average_specific_recall)
        print('average generic recall: %f' % average_generic_recall)
        print('average cycle time: %f' % average_cycle_time)
        print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # Record the configuration alongside the metrics.
        results = Constants.get_properties_copy()
        results['recall'] = average_recall
        results['specific_recall'] = average_specific_recall
        results['generic_recall'] = average_generic_recall
        results['cycle_time'] = average_cycle_time
        results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

        # Write the header only when creating the file; append otherwise.
        if not os.path.exists(Constants.CSV_RESULTS_FILE):
            with open(Constants.CSV_RESULTS_FILE, 'wb') as f:
                w = csv.DictWriter(f, sorted(results.keys()))
                w.writeheader()
                w.writerow(results)
        else:
            with open(Constants.CSV_RESULTS_FILE, 'a') as f:
                w = csv.DictWriter(f, sorted(results.keys()))
                w.writerow(results)
class ContextTopNRunner(object):
    """Runs the context-aware recommendation pipeline end to end.

    The pipeline: load records -> split train/test -> build the set of
    records to predict (top-N or RMSE mode) -> optionally train an LDA
    topic model and attach context topics -> solve with LibFM or fastFM ->
    evaluate (recall or RMSE) -> aggregate over cross-validation folds and
    persist the results.

    NOTE(review): this chunk arrived whitespace-mangled; it has been
    re-flowed with conventional indentation.
    """

    def __init__(self):
        # Data sets at different pipeline stages.
        self.records = None
        self.original_records = None
        self.train_records = None
        self.test_records = None
        self.records_to_predict = None
        self.predictions = None
        self.top_n_evaluator = None
        self.headers = None
        self.important_records = None
        # Topics judged context-rich by the LDA model; list of tuples whose
        # first element is the topic index.
        self.context_rich_topics = []
        # review_id -> {topic id -> weight} for the records to predict.
        self.context_topics_map = None
        # Per-fold temporary file names (see create_tmp_file_names).
        self.csv_train_file = None
        self.csv_test_file = None
        self.context_predictions_file = None
        self.context_train_file = None
        self.context_test_file = None
        self.context_log_file = None

    def clear(self):
        """Drop per-fold state and delete the fold's temporary files."""
        print('clear: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # self.records = None
        self.train_records = None
        self.test_records = None
        self.records_to_predict = None
        self.top_n_evaluator = None
        self.headers = None
        self.important_records = None
        self.context_rich_topics = []
        self.context_topics_map = None

        # Only the LibFM solver writes intermediate files to disk.
        if Constants.SOLVER == Constants.LIBFM:
            os.remove(self.csv_train_file)
            os.remove(self.csv_test_file)
            os.remove(self.context_predictions_file)
            os.remove(self.context_train_file)
            os.remove(self.context_test_file)
            os.remove(self.context_log_file)

        self.csv_train_file = None
        self.csv_test_file = None
        self.context_predictions_file = None
        self.context_train_file = None
        self.context_test_file = None
        self.context_log_file = None
        gc.collect()

    def create_tmp_file_names(self):
        """Generate unique per-fold file names so parallel runs don't clash."""
        unique_id = uuid.uuid4().hex
        prefix = Constants.GENERATED_FOLDER + unique_id + '_' + \
            Constants.ITEM_TYPE
        # prefix = constants.GENERATED_FOLDER + constants.ITEM_TYPE
        print('unique id: %s' % unique_id)

        self.csv_train_file = prefix + '_train.csv'
        self.csv_test_file = prefix + '_test.csv'
        self.context_predictions_file = prefix + '_predictions.txt'
        self.context_train_file = self.csv_train_file + '.libfm'
        self.context_test_file = self.csv_test_file + '.libfm'
        self.context_log_file = prefix + '.log'

    @staticmethod
    def plant_seeds():
        """Seed the random and numpy RNGs for reproducible runs."""
        if Constants.RANDOM_SEED is not None:
            print('random seed: %d' % Constants.RANDOM_SEED)
            random.seed(Constants.RANDOM_SEED)
        if Constants.NUMPY_RANDOM_SEED is not None:
            print('numpy random seed: %d' % Constants.NUMPY_RANDOM_SEED)
            numpy.random.seed(Constants.NUMPY_RANDOM_SEED)

    def load(self):
        """Load the processed records and cache the user->item map on disk."""
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records =\
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        # ETLUtils.drop_fields(['tagged_words'], self.original_records)
        print('num_records: %d' % len(self.original_records))

        # Build the user-item map once and pickle it for later folds/runs.
        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)

    def shuffle(self):
        """Shuffle the full record set in place (uses the seeded RNG)."""
        print('shuffle: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        random.shuffle(self.original_records)

    def get_records_to_predict_topn(self):
        """Build the top-N evaluation sample from the test records."""
        print(
            'get_records_to_predict_topn: %s'
            % time.strftime("%Y/%m/%d-%H:%M:%S")
        )

        with open(Constants.USER_ITEM_MAP_FILE, 'rb') as read_file:
            user_item_map = pickle.load(read_file)

        self.top_n_evaluator = TopNEvaluator(
            self.records, self.test_records, Constants.ITEM_TYPE, 10,
            Constants.TOPN_NUM_ITEMS)
        self.top_n_evaluator.initialize(user_item_map)
        self.records_to_predict = self.top_n_evaluator.get_records_to_predict()
        self.important_records = self.top_n_evaluator.important_records
        # Release the raw test set; the evaluator keeps what it needs.
        self.test_records = None
        gc.collect()

    def get_records_to_predict_rmse(self):
        """For RMSE evaluation, predict every test record directly."""
        print(
            'get_records_to_predict_rmse: %s'
            % time.strftime("%Y/%m/%d-%H:%M:%S")
        )
        self.important_records = self.test_records
        self.records_to_predict = self.test_records
        self.test_records = None
        gc.collect()

    def get_records_to_predict(self):
        """Dispatch on the configured evaluation metric.

        Raises:
            ValueError: if ``Constants.EVALUATION_METRIC`` is unknown.
        """
        if Constants.EVALUATION_METRIC == 'topn_recall':
            self.get_records_to_predict_topn()
        elif Constants.EVALUATION_METRIC == 'rmse':
            self.get_records_to_predict_rmse()
        else:
            raise ValueError('Unrecognized evaluation metric')

    def train_topic_model(self, cycle_index, fold_index):
        """Load a cached topic model or train a fresh one on the train set.

        Side effect: sets ``self.context_rich_topics``.  Returns the
        LdaBasedContext instance.
        """
        if Constants.CACHE_TOPIC_MODEL:
            print('loading topic model')
            lda_based_context = topic_model_creator.load_topic_model(
                cycle_index, fold_index)
        else:
            print('train topic model: %s'
                  % time.strftime("%Y/%m/%d-%H:%M:%S"))
            lda_based_context = LdaBasedContext(self.train_records)
            lda_based_context.generate_review_corpus()
            lda_based_context.build_topic_model()
            lda_based_context.update_reviews_with_topics()

        lda_based_context.get_context_rich_topics()
        self.context_rich_topics = lda_based_context.context_rich_topics
        print('Trained LDA Model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        return lda_based_context

    def find_reviews_topics(self, lda_based_context):
        """Attach context-topic distributions to train and important records.

        Builds ``self.context_topics_map`` (review_id -> topic weights) and
        then releases ``self.important_records``.
        """
        print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        lda_based_context.find_contextual_topics(self.train_records)
        lda_based_context.find_contextual_topics(
            self.important_records, Constants.TEXT_SAMPLING_PROPORTION)

        self.context_topics_map = {}
        for record in self.important_records:
            topic_distribution = record[Constants.TOPICS_FIELD]
            context_topics = {}
            # i is a tuple whose first element is the topic index.
            for i in self.context_rich_topics:
                topic_id = 'topic' + str(i[0])
                context_topics[topic_id] = topic_distribution[i[0]]

            record[Constants.CONTEXT_TOPICS_FIELD] = context_topics
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] =\
                context_topics

        self.important_records = None
        gc.collect()

    def prepare_records_for_libfm(self):
        """Export train/test records to CSV and convert them to LibFM files."""
        print('prepare_records_for_libfm: %s'
              % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.headers = build_headers(self.context_rich_topics)

        # IDIOM FIX: membership test instead of an `or` chain.
        if Constants.REVIEW_TYPE in (Constants.SPECIFIC, Constants.GENERIC):
            self.train_records = ETLUtils.filter_records(
                self.train_records, Constants.PREDICTED_CLASS_FIELD,
                [Constants.REVIEW_TYPE])

        with open(self.csv_train_file, 'w') as out_file:
            writer = csv.writer(out_file)
            # Write header
            writer.writerow(self.headers)
            for record in self.train_records:
                row = []
                for header in basic_headers:
                    row.append(record[header])
                if Constants.USE_CONTEXT is True:
                    # PERF FIX: the field lookup is loop-invariant; hoisted
                    # out of the per-topic loop.
                    context_topics = record[Constants.CONTEXT_TOPICS_FIELD]
                    for topic in self.context_rich_topics:
                        row.append(context_topics['topic' + str(topic[0])])
                writer.writerow(row)

        self.train_records = None
        gc.collect()

        with open(self.csv_test_file, 'w') as out_file:
            writer = csv.writer(out_file)
            # Write header
            writer.writerow(self.headers)
            for record in self.records_to_predict:
                row = []
                for header in basic_headers:
                    row.append(record[header])
                if Constants.USE_CONTEXT is True:
                    important_record = record[Constants.REVIEW_ID_FIELD]
                    context_topics =\
                        self.context_topics_map[important_record]
                    for topic in self.context_rich_topics:
                        row.append(context_topics['topic' + str(topic[0])])
                writer.writerow(row)

        # self.records_to_predict = None
        # NOTE(review): set to None here but clear() resets it to []; the
        # inconsistency is harmless since clear() runs before reuse.
        self.context_topics_map = None
        self.context_rich_topics = None
        gc.collect()

        print('Exported CSV and JSON files: %s'
              % time.strftime("%Y/%m/%d-%H:%M:%S"))

        csv_files = [
            self.csv_train_file,
            self.csv_test_file
        ]

        print('num_cols', len(self.headers))

        # Column 0 is the target; columns 1-2 (user/item) are categorical.
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ',', has_header=True, suffix='.libfm')

        print('Exported LibFM files: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    def predict_fastfm(self):
        """Solve with fastFM (mcmc / als / sgd) and fill self.predictions."""
        if Constants.USE_CONTEXT:
            for record in self.records_to_predict:
                important_record = record[Constants.REVIEW_ID_FIELD]
                record[Constants.CONTEXT_TOPICS_FIELD] = \
                    self.context_topics_map[important_record]

        # Encode train and test together so both see the same categories.
        all_records = self.train_records + self.records_to_predict
        x_matrix, y_vector = fastfm_recommender.records_to_matrix(
            all_records, self.context_rich_topics)
        encoder = OneHotEncoder(categorical_features=[0, 1], sparse=True)
        encoder.fit(x_matrix)

        x_train = encoder.transform(x_matrix[:len(self.train_records)])
        y_train = y_vector[:len(self.train_records)]
        x_test = encoder.transform(x_matrix[len(self.train_records):])

        if Constants.FASTFM_METHOD == 'mcmc':
            # solver = mcmc.FMRegression(n_iter=num_iters, rank=num_factors)
            solver = mcmc.FMRegression(rank=Constants.FM_NUM_FACTORS)
            self.predictions = solver.fit_predict(x_train, y_train, x_test)
        elif Constants.FASTFM_METHOD == 'als':
            solver = als.FMRegression(rank=Constants.FM_NUM_FACTORS)
            solver.fit(x_train, y_train)
            self.predictions = solver.predict(x_test)
        elif Constants.FASTFM_METHOD == 'sgd':
            solver = sgd.FMRegression(rank=Constants.FM_NUM_FACTORS)
            solver.fit(x_train, y_train)
            self.predictions = solver.predict(x_test)

    def predict_libfm(self):
        """Run the external LibFM binary and read back its predictions."""
        print('predict: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        run_libfm(
            self.context_train_file, self.context_test_file,
            self.context_predictions_file, self.context_log_file)
        self.predictions = rmse_calculator.read_targets_from_txt(
            self.context_predictions_file)

    def predict(self):
        """Dispatch prediction to the configured solver."""
        if Constants.SOLVER == Constants.LIBFM:
            self.prepare_records_for_libfm()
            self.predict_libfm()
        elif Constants.SOLVER == Constants.FASTFM:
            self.predict_fastfm()

    def evaluate_topn(self):
        """Compute top-N recall; returns (recall, specific, generic)."""
        print('evaluate_topn: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        self.top_n_evaluator.evaluate(self.predictions)
        recall = self.top_n_evaluator.recall

        print('Recall: %f' % recall)
        print('Specific recall: %f' % self.top_n_evaluator.specific_recall)
        print('Generic recall: %f' % self.top_n_evaluator.generic_recall)

        return recall, self.top_n_evaluator.specific_recall,\
            self.top_n_evaluator.generic_recall

    def evaluate_rmse(self):
        """Compute RMSE overall and per review class.

        Returns (rmse, specific_rmse, generic_rmse).
        """
        # FIX: log label said 'evaluate_topn' (copy-paste from the method
        # above); corrected to name this method.
        print('evaluate_rmse: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        true_values = [
            record[Constants.RATING_FIELD]
            for record in self.records_to_predict
        ]

        # Partition truths/predictions by predicted review class.
        specific_true_values = []
        specific_predictions = []
        generic_true_values = []
        generic_predictions = []

        # FIX: removed a dead `index` counter that was incremented but
        # never read.
        for record, prediction in zip(
                self.records_to_predict, self.predictions):
            if record[Constants.PREDICTED_CLASS_FIELD] == 'specific':
                specific_true_values.append(record[Constants.RATING_FIELD])
                specific_predictions.append(prediction)
            elif record[Constants.PREDICTED_CLASS_FIELD] == 'generic':
                generic_true_values.append(record[Constants.RATING_FIELD])
                generic_predictions.append(prediction)

        rmse = rmse_calculator.calculate_rmse(true_values, self.predictions)
        specific_rmse = rmse_calculator.calculate_rmse(
            specific_true_values, specific_predictions)
        generic_rmse = rmse_calculator.calculate_rmse(
            generic_true_values, generic_predictions)

        print('RMSE: %f' % rmse)
        print('Specific RMSE: %f' % specific_rmse)
        print('Generic RMSE: %f' % generic_rmse)

        return rmse, specific_rmse, generic_rmse

    def evaluate(self):
        """Dispatch evaluation on the configured metric.

        Returns an (overall, specific, generic) metric triple.

        Raises:
            ValueError: if ``Constants.EVALUATION_METRIC`` is unknown.
        """
        if Constants.EVALUATION_METRIC == 'topn_recall':
            return self.evaluate_topn()
        elif Constants.EVALUATION_METRIC == 'rmse':
            return self.evaluate_rmse()
        else:
            raise ValueError('Unrecognized evaluation metric')

    def perform_cross_validation(self):
        """Run repeated k-fold cross-validation and persist averaged results.

        Aggregates the configured metric (overall / specific / generic) and
        per-fold wall time over NUM_CYCLES x CROSS_VALIDATION_NUM_FOLDS
        iterations, then writes one results row via write_results_to_csv /
        write_results_to_json.
        """
        # NOTE(review): direct access to the private Constants._properties;
        # consider Constants.print_properties() for consistency.
        print(Constants._properties)
        self.plant_seeds()

        total_metric = 0.0
        total_specific_metric = 0.0
        total_generic_metric = 0.0
        total_cycle_time = 0.0
        num_cycles = Constants.NUM_CYCLES
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        total_iterations = num_cycles * num_folds
        # Train fraction per fold, e.g. 5 folds -> 0.8 train / 0.2 test.
        split = 1 - (1/float(num_folds))

        self.load()

        for i in range(num_cycles):
            print('\n\nCycle: %d/%d' % ((i+1), num_cycles))

            if Constants.SHUFFLE_DATA:
                self.shuffle()
            self.records = copy.deepcopy(self.original_records)

            for j in range(num_folds):
                fold_start = time.time()
                cv_start = float(j) / num_folds
                print('\nFold: %d/%d' % ((j+1), num_folds))

                self.create_tmp_file_names()
                self.train_records, self.test_records =\
                    ETLUtils.split_train_test_copy(
                        self.records, split=split, start=cv_start)
                self.get_records_to_predict()
                if Constants.USE_CONTEXT:
                    lda_based_context = self.train_topic_model(i, j)
                    self.find_reviews_topics(lda_based_context)
                self.predict()
                metric, specific_metric, generic_metric = self.evaluate()
                total_metric += metric
                total_specific_metric += specific_metric
                total_generic_metric += generic_metric

                fold_end = time.time()
                fold_time = fold_end - fold_start
                total_cycle_time += fold_time
                self.clear()
                print("Total fold %d time = %f seconds" % ((j+1), fold_time))

        metric_name = Constants.EVALUATION_METRIC
        metric_average = total_metric / total_iterations
        average_specific_metric = total_specific_metric / total_iterations
        average_generic_metric = total_generic_metric / total_iterations
        average_cycle_time = total_cycle_time / total_iterations
        print('average %s: %f' % (metric_name, metric_average))
        print(
            'average specific %s: %f' % (metric_name, average_specific_metric))
        print('average generic %s: %f' % (metric_name, average_generic_metric))
        print('average cycle time: %f' % average_cycle_time)
        print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # BUG FIX: `results` was never initialized (the initializing line was
        # commented out), so the assignments below raised NameError.
        # Initialized the same way as the sibling implementation of this
        # pipeline, via Constants.get_properties_copy().
        results = Constants.get_properties_copy()
        results[Constants.EVALUATION_METRIC] = metric_average
        results['specific_' + metric_name] = average_specific_metric
        results['generic_' + metric_name] = average_generic_metric
        results['cycle_time'] = average_cycle_time
        results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

        write_results_to_csv(results)
        write_results_to_json(results)