def run(self, dataset, output_folder, train_records, test_records,
        train_reviews=None, test_reviews=None):
    """Build contextual train/test sets and export them as CSV and LibFM files.

    Runs the full context-extraction cycle over the given records, saves
    both splits to CSV, then converts each CSV into two LibFM files: a
    ratings-only variant and a context-aware variant.
    """
    contextual_train_set, contextual_test_set = self.full_cycle(
        train_records, test_records, train_reviews, test_reviews)
    print("Prepared data: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

    base_name = output_folder + "yelp_" + dataset + "_context_shuffled_"
    csv_train_file = base_name + "train5.csv"
    csv_test_file = base_name + "test5.csv"

    ETLUtils.save_csv_file(csv_train_file, contextual_train_set, self.headers)
    ETLUtils.save_csv_file(csv_test_file, contextual_test_set, self.headers)
    print("Exported CSV and JSON files: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

    csv_files = [csv_train_file, csv_test_file]
    num_cols = len(self.headers)
    context_cols = num_cols
    print("num_cols", num_cols)

    # No-context variant: columns 3..end (the context features) are dropped.
    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], range(3, context_cols), ",",
        has_header=True, suffix=".no_context.libfm")
    # Context variant: every column becomes a libFM feature.
    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], [], ",", has_header=True,
        suffix=".context.libfm")
    print("Exported LibFM files: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))
def export_without_context(self):
    """Export ratings in CARSKit binary format with a single 'context:na' column.

    Every record is written with its user/item/rating plus a constant
    'context:na' = 1 marker; skips the export when the target CSV exists.
    """
    print('%s: exporting to CARSKit binary ratings format without context'
          % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if os.path.exists(CSV_FILE):
        print('Binary ratings file already exists')
        copy_to_workspace(CSV_FILE)
        return

    # Seed kept for reproducibility parity with the sibling exporters,
    # even though this path draws no random numbers.
    numpy.random.seed(0)

    new_records = [
        {
            Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
            Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
            Constants.RATING_FIELD: record[Constants.RATING_FIELD],
            'context:na': 1,
        }
        for record in self.records
    ]

    headers = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD,
        'context:na',
    ]
    ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
    copy_to_workspace(CSV_FILE)
def prepare(self):
    """Project train/test records onto the header fields and export CSV + LibFM files."""
    print('prepare: %s' % time.strftime("%Y/%d/%m-%H:%M:%S"))

    contextual_train_set = ETLUtils.select_fields(
        self.headers, self.train_records)
    contextual_test_set = ETLUtils.select_fields(
        self.headers, self.records_to_predict)

    ETLUtils.save_csv_file(
        self.csv_train_file, contextual_train_set, self.headers)
    ETLUtils.save_csv_file(
        self.csv_test_file, contextual_test_set, self.headers)
    print('Exported CSV and JSON files: %s'
          % time.strftime("%Y/%d/%m-%H:%M:%S"))

    csv_files = [self.csv_train_file, self.csv_test_file]
    num_cols = len(self.headers)
    context_cols = num_cols
    print('num_cols', num_cols)

    # Ratings-only LibFM file: strip the context columns (3..end).
    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], range(3, context_cols), ',',
        has_header=True, suffix='.no_context.libfm')
    # Context-aware LibFM file: keep every column.
    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], [], ',', has_header=True,
        suffix='.context.libfm')
    print('Exported LibFM files: %s' % time.strftime("%Y/%d/%m-%H:%M:%S"))
def run_top_n_test(records_file, recommenders, binary_reviews_file,
                   reviews_type=None):
    """Sequentially evaluate top-N recall for every recommender and log a CSV report.

    Raises ValueError when the records and the cached binary reviews
    differ in length (they must be aligned one-to-one).
    """
    records = load_records(records_file)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)
    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    # Fixed evaluation hyper-parameters.
    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_list = []
    results_log_list = []
    print('Total recommenders: %d' % (len(recommenders)))
    for count, recommender in enumerate(recommenders):
        print('\n**************\nProgress: %d/%d\n**************' %
              (count, len(recommenders)))
        print(get_knn_recommender_info(recommender))
        results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)
        results_list.append(results)
        # Extrapolate the remaining wall-clock time from the last run.
        remaining_time = results['Execution time'] * (len(recommenders) - count)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            process_topn_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results' + timestamp
    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')
def parallel_run_topn_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):
    """Evaluate top-N recall for every recommender using a process pool.

    Returns the list of per-recommender result dictionaries, after also
    writing a tab-separated CSV report.
    """
    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)
    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    # product() with singleton lists yields one argument tuple per
    # recommender, with every other parameter held constant.
    args = itertools.product(
        [records], recommenders, [top_n], [num_folds], [split],
        [min_like_score], [binary_reviews], [reviews_type])

    print('Total recommenders: %d' % (len(recommenders)))
    pool = Pool()
    print('Total CPUs: %d' % pool._processes)
    results_list = pool.map(run_topn_test_wrapper, args)
    pool.close()
    pool.join()

    # Once every worker has finished, aggregate the results.
    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_log_list = [
        context_recommender_tests.process_topn_results(
            recommender, results, dataset_info_map)
        for recommender, results in zip(recommenders, results_list)
    ]

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp
    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')
    return results_list
def main():
    """Grid-search resampler x classifier x document-level combinations.

    Trains/evaluates every combination on the sentence-classification task
    and appends all result rows to a single CSV file.
    """
    topic_model_creator.plant_seeds()

    my_resamplers = [
        None,
        'random_over_sampler',
        'smote_regular',
        'smote_bl1',
        'smote_bl2',
        'smote_tomek',
        'smoteenn',
    ]

    my_classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100),
    ]

    document_levels = ['review', 'sentence', 1]

    total_cycles = (
        len(my_resamplers) * len(my_classifiers) * len(document_levels))
    cycle = 1
    results_list = []
    for document_level in document_levels:
        # The document level is a global switch read by load_records/transform.
        Constants.DOCUMENT_LEVEL = document_level
        my_records = load_records()
        preprocess_records(my_records)
        x_matrix, y_vector = transform(my_records)
        count_specific_generic(my_records)

        for resampler, classifier in itertools.product(
                my_resamplers, my_classifiers):
            print('Cycle %d/%d' % (cycle, total_cycles))
            results_list.append(
                test_classifier(x_matrix, y_vector, resampler, classifier))
            cycle += 1

    for results in results_list:
        print(results)

    csv_file = (Constants.DATASET_FOLDER + Constants.ITEM_TYPE +
                '_sentence_classifier_results.csv')
    ETLUtils.save_csv_file(csv_file, results_list, results_list[0].keys())
def parallel_run_topn_test(records_file, recommenders, binary_reviews_file,
                           reviews_type=None):
    """Fan the top-N evaluation of each recommender out to a worker pool.

    Returns the raw per-recommender result list after writing the
    tab-separated CSV report.
    """
    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)
    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    # One argument tuple per recommender; all other fields are fixed
    # singletons, so product() preserves the recommender order.
    args = itertools.product([records], recommenders, [top_n], [num_folds],
                             [split], [min_like_score], [binary_reviews],
                             [reviews_type])

    print('Total recommenders: %d' % (len(recommenders)))
    pool = Pool()
    # NOTE(review): _processes is a private Pool attribute.
    print('Total CPUs: %d' % pool._processes)
    results_list = pool.map(run_topn_test_wrapper, args)
    pool.close()
    pool.join()

    # Aggregate once all workers are done.
    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_log_list = []
    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            context_recommender_tests.process_topn_results(
                recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp
    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')
    return results_list
def main():
    """Grid-search resampler x classifier x max-sentences combinations.

    Runs the sentence classifier for every configuration and saves all
    result rows to one CSV file.
    """
    topic_model_creator.plant_seeds()

    my_resamplers = [
        None,
        'random_over_sampler',
        'smote_regular',
        'smote_bl1',
        'smote_bl2',
        'smote_tomek',
        'smoteenn',
    ]

    my_classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100),
    ]

    max_sentences_list = [None, 1]

    total_cycles = (
        len(my_resamplers) * len(my_classifiers) * len(max_sentences_list))
    cycle = 1
    results_list = []
    for max_sentences in max_sentences_list:
        # Global switch consumed by the record-loading pipeline.
        Constants.MAX_SENTENCES = max_sentences
        my_records = load_records()
        preprocess_records(my_records)
        x_matrix, y_vector = transform(my_records)
        count_specific_generic(my_records)

        for resampler, classifier in itertools.product(
                my_resamplers, my_classifiers):
            print('Cycle %d/%d' % (cycle, total_cycles))
            results_list.append(
                test_classifier(x_matrix, y_vector, resampler, classifier))
            cycle += 1

    for results in results_list:
        print(results)

    csv_file = (Constants.DATASET_FOLDER + Constants.ITEM_TYPE +
                '_sentence_classifier_results.csv')
    ETLUtils.save_csv_file(csv_file, results_list, results_list[0].keys())
def run_top_n_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):
    """Evaluate top-N recall for each recommender in sequence and log to CSV.

    Raises ValueError when the records and cached binary reviews are not
    aligned one-to-one.
    """
    records = load_records(records_file)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)
    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_list = []
    results_log_list = []
    print('Total recommenders: %d' % (len(recommenders)))
    for count, recommender in enumerate(recommenders):
        print('\n**************\nProgress: %d/%d\n**************' %
              (count, len(recommenders)))
        print(get_knn_recommender_info(recommender))
        results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)
        results_list.append(results)
        # Rough ETA based on the last evaluation's wall-clock time.
        remaining_time = results['Execution time'] * (len(recommenders) - count)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            process_topn_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results' + timestamp
    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')
def export_as_predefined_context(self):
    """Export CARSKit binary ratings with context columns from predefined category words.

    Each predefined category becomes a 'context:<category>' column set to 1
    when the review's bag of words matches the category; 'context:na' is 1
    only when no category matched.
    """
    print('%s: exporting to CARSKit ratings binary format with context as '
          'predefined context' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if os.path.exists(CSV_FILE):
        print('Binary ratings file already exists')
        copy_to_workspace(CSV_FILE)
        return

    context_categories = utilities.context_words[Constants.ITEM_TYPE].keys()
    context_headers = [
        'context:%s' % category for category in context_categories]

    new_records = []
    for record in self.records:
        new_record = {
            Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
            Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
            Constants.RATING_FIELD: record[Constants.RATING_FIELD],
        }
        review_categories = find_predefined_context(
            record[Constants.BOW_FIELD])

        context_found = False
        for category in context_categories:
            matched = category in review_categories
            new_record['context:' + category] = 1 if matched else 0
            context_found = context_found or matched

        # 'context:na' flags records where no predefined category matched.
        new_record['context:na'] = 0 if context_found else 1
        new_records.append(new_record)

    headers = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD,
        'context:na',
    ]
    headers.extend(context_headers)
    ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
    copy_to_workspace(CSV_FILE)
def run_rmse_test(records_file, recommenders, binary_reviews_file,
                  reviews_type=None):
    """Cross-validate each recommender for rating-prediction error and log to CSV."""
    records = load_records(records_file)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)
    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
    }

    results_list = []
    print('Total recommenders: %d' % (len(recommenders)))
    for count, recommender in enumerate(recommenders):
        print('\n**************\n%d/%d\n**************' %
              (count, len(recommenders)))
        results = recommender_evaluator.perform_cross_validation(
            records, recommender, num_folds, binary_reviews, reviews_type)
        results_list.append(results)
        # ETA extrapolated from the most recent run.
        remaining_time = results['Execution time'] * (len(recommenders) - count)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)

    results_log_list = [
        process_rmse_results(recommender, results, dataset_info_map)
        for recommender, results in zip(recommenders, results_list)
    ]

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-rmse-results' + timestamp
    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, RMSE_HEADERS, '\t')
def export_as_top_word(self):
    """Export CARSKit binary ratings using each topic's top word as a context column.

    For every record, each topic with a positive weight switches its
    'context:<top word>' column to 1 (0 otherwise). The 'context:na'
    column is 1 only when none of the record's topics fired.
    """
    print('%s: exporting to CARSKit ratings binary format with context as '
          'top words' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if os.path.exists(CSV_FILE):
        print('Binary ratings file already exists')
        copy_to_workspace(CSV_FILE)
        return

    topic_model_string = self.topic_extractor.print_topic_model()
    top_terms = [get_topic_terms(topic) for topic in topic_model_string]
    # One context column per topic, named after the topic's top term.
    context_headers = ['context:%s' % term[0] for term in top_terms]

    new_records = []
    for record in self.records:
        new_record = {
            Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
            Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
            Constants.RATING_FIELD: record[Constants.RATING_FIELD],
        }

        context_found = False
        for topic in record[self.topics_field]:
            topic_index = topic[0]
            topic_weight = topic[1]
            context_key = context_headers[topic_index]
            if topic_weight > 0.0:
                new_record[context_key] = 1
                # BUG FIX: context_found was initialized but never set, so
                # 'context:na' was always 1 even when topics were present.
                # Siblings (predefined-context exporters) set it on match.
                context_found = True
            else:
                new_record[context_key] = 0

        new_record['context:na'] = 0 if context_found else 1
        new_records.append(new_record)

    headers = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD,
        'context:na',
    ]
    headers.extend(context_headers)
    ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
    copy_to_workspace(CSV_FILE)
def run_rmse_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):
    """Run RMSE/MAE cross-validation for every recommender and save a CSV report."""
    records = load_records(records_file)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)
    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
    }

    results_list = []
    results_log_list = []
    print('Total recommenders: %d' % (len(recommenders)))
    for count, recommender in enumerate(recommenders):
        print('\n**************\n%d/%d\n**************' %
              (count, len(recommenders)))
        results = recommender_evaluator.perform_cross_validation(
            records, recommender, num_folds, binary_reviews, reviews_type)
        results_list.append(results)
        remaining_time = results['Execution time'] * (len(recommenders) - count)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            process_rmse_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-rmse-results' + timestamp
    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, RMSE_HEADERS, '\t')
def export_as_all_words(self):
    """Export CARSKit binary ratings with one binary context column per corpus term.

    Every distinct term seen in any record's bag of words becomes a
    'context:<term>' column that is 1 when the record's review contains
    the (accent-stripped) term and 0 otherwise.
    """
    print('%s: exporting to CARSKit ratings binary format with context as '
          'all words' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if os.path.exists(CSV_FILE):
        print('Binary ratings file already exists')
        copy_to_workspace(CSV_FILE)
        return

    all_terms = set()
    for record in self.records:
        all_terms |= set(record[Constants.BOW_FIELD])
    all_terms = [remove_accents(term) for term in all_terms]
    context_headers = ['context:%s' % term for term in all_terms]

    new_records = []
    for record in self.records:
        new_record = {
            Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
            Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
            Constants.RATING_FIELD: record[Constants.RATING_FIELD],
        }
        # Set gives O(1) membership instead of scanning a list per term.
        bag_of_words = set(record[Constants.BOW_FIELD])
        for term, context_header in zip(all_terms, context_headers):
            # BUG FIX: the original tested 'term in bag_of_words > 0.0',
            # a chained comparison that evaluates as
            # (term in bag_of_words) and (bag_of_words > 0.0) — a
            # TypeError on Python 3 and accidental at best on Python 2.
            new_record[context_header] = 1 if term in bag_of_words else 0
        new_records.append(new_record)

    headers = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD,
    ]
    headers.extend(context_headers)
    print(len(headers))
    print(headers)
    ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
    copy_to_workspace(CSV_FILE)
def add_extra_column_to_csv():
    """Insert an FM num-factors column (fixed value 10) into a results CSV.

    Reads the results file, places the new header right after
    'Evaluation_Set', stamps every record with the value 10, and writes
    the augmented table to /tmp/my_csv_file.csv.
    """
    csv_file_name = '/tmp/results/rival_yelp_restaurant_results_folds_4.csv'
    records = ETLUtils.load_csv_file(csv_file_name)

    # Re-read just the header row to locate the insertion point.
    with open(csv_file_name, 'r') as csvinput:
        headers = next(csv.reader(csvinput))

    index = headers.index('Evaluation_Set') + 1
    headers.insert(index, Constants.FM_NUM_FACTORS_FIELD)
    print(headers)

    for record in records:
        record[Constants.FM_NUM_FACTORS_FIELD] = 10

    ETLUtils.save_csv_file('/tmp/my_csv_file.csv', records, headers)
def test():
    """Sweep the NMF topic count and record the model divergence for each setting."""
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()
    results = []

    for num_topics in range(2, 61):
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()

        divergence = calculate_divergence(
            document_term_matrix,
            topic_model.document_topic_matrix,
            topic_model.topic_term_matrix)

        results.append({
            'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE,
        })
        print('Num topics: %d, Divergence: %f' %
              (Constants.TOPIC_MODEL_NUM_TOPICS, divergence))

    for result in results:
        print('%d %f' % (result['num_topics'], result['divergence']))

    prefix = (Constants.RESULTS_FOLDER + Constants.ITEM_TYPE +
              '_topic_model_divergence')
    headers = sorted(results[0].keys())
    ETLUtils.save_csv_file(prefix + '.csv', results, headers)
    ETLUtils.save_json_file(prefix + '.json', results)
def prepare(self):
    """Flatten context words into records, optionally filter by review type, and export LibFM inputs."""
    print('prepare: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    self.headers = build_headers(len(self.sense_groups))

    if Constants.USE_CONTEXT is True:
        # Promote each record's context-word map to top-level fields so
        # they survive the keep_fields projection below.
        for record in self.train_records:
            record.update(record[Constants.CONTEXT_WORDS_FIELD])
        for record in self.records_to_predict:
            record.update(record[Constants.CONTEXT_WORDS_FIELD])

        if Constants.FM_REVIEW_TYPE:
            self.train_records = ETLUtils.filter_records(
                self.train_records, Constants.PREDICTED_CLASS_FIELD,
                [Constants.FM_REVIEW_TYPE])

    ETLUtils.keep_fields(self.headers, self.train_records)
    ETLUtils.keep_fields(self.headers, self.records_to_predict)

    ETLUtils.save_csv_file(
        self.csv_train_file, self.train_records, self.headers)
    ETLUtils.save_csv_file(
        self.csv_test_file, self.records_to_predict, self.headers)
    print('Exported CSV and JSON files: %s'
          % time.strftime("%Y/%m/%d-%H:%M:%S"))

    csv_files = [self.csv_train_file, self.csv_test_file]
    print('num_cols', len(self.headers))
    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], [], ',', has_header=True, suffix='.libfm')
    print('Exported LibFM files: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
def test():
    """Evaluate topic-model divergence across a range of NMF topic counts."""
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()

    topic_counts = range(2, 61)
    results = []
    for count in topic_counts:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: count})

        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()
        document_topic_matrix = topic_model.document_topic_matrix
        topic_term_matrix = topic_model.topic_term_matrix

        divergence = calculate_divergence(
            document_term_matrix, document_topic_matrix, topic_term_matrix)

        result = {
            'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE,
        }
        results.append(result)
        print('Num topics: %d, Divergence: %f' %
              (Constants.TOPIC_MODEL_NUM_TOPICS, divergence))

    for result in results:
        print('%d %f' % (result['num_topics'], result['divergence']))

    prefix = (Constants.RESULTS_FOLDER + Constants.ITEM_TYPE +
              '_topic_model_divergence')
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    headers = sorted(results[0].keys())
    ETLUtils.save_csv_file(csv_file_path, results, headers)
    ETLUtils.save_json_file(json_file_path, results)
def main_converter():
    """Export train/predict records to CSV and convert both files to LibFM format."""
    csv_train_file = (GENERATED_FOLDER + 'yelp_training_set_review_' +
                      DATASET + 's_shuffled_train.csv')
    csv_test_file = GENERATED_FOLDER + 'records_to_predict_' + DATASET + '.csv'

    headers = ['stars', 'user_id', 'business_id']

    train_records = ETLUtils.load_json_file(TRAIN_RECORDS_FILE)
    records_to_predict = ETLUtils.load_json_file(RECORDS_TO_PREDICT_FILE)
    train_records = ETLUtils.select_fields(headers, train_records)
    records_to_predict = ETLUtils.select_fields(headers, records_to_predict)

    ETLUtils.save_csv_file(csv_train_file, train_records, headers)
    ETLUtils.save_csv_file(csv_test_file, records_to_predict, headers)

    csv_to_libfm([csv_train_file, csv_test_file], 0, [1, 2], [], ',',
                 has_header=True)
def evaluate_recommender_similarity_metrics(reviews, recommender):
    """Cross-validate the recommender over each similarity configuration and save a CSV report.

    Mutates the recommender's private similarity settings in place for
    each combination, then records MAE/RMSE/coverage results.
    """
    headers = [
        'Algorithm', 'Multi-cluster', 'Similarity algorithm',
        'Similarity metric', 'Num neighbors', 'Dataset', 'MAE', 'RMSE',
        'Top N', 'Coverage', 'Execution time', 'Cross validation', 'Machine',
    ]

    # Single active configuration; the sweep space was larger historically.
    similarity_metrics = ['euclidean']
    similarity_algorithms = [SingleSimilarityMatrixBuilder('euclidean')]
    ranges = [None]
    num_neighbors_list = [None]
    num_folds = 5

    results = []
    for builder in similarity_algorithms:
        for neighbors in num_neighbors_list:
            for metric in similarity_metrics:
                for criteria_range in ranges:
                    recommender._similarity_matrix_builder = builder
                    recommender._similarity_matrix_builder._similarity_metric = metric
                    recommender._significant_criteria_ranges = criteria_range
                    recommender._num_neighbors = neighbors

                    print(
                        recommender.name,
                        recommender._significant_criteria_ranges,
                        recommender._similarity_matrix_builder._name,
                        recommender._similarity_matrix_builder._similarity_metric,
                        recommender._num_neighbors
                    )

                    result = perform_cross_validation(
                        reviews, recommender, num_folds)
                    result['Algorithm'] = recommender.name
                    result['Multi-cluster'] = recommender._significant_criteria_ranges
                    result['Similarity algorithm'] = recommender._similarity_matrix_builder._name
                    result['Similarity metric'] = recommender._similarity_matrix_builder._similarity_metric
                    result['Cross validation'] = 'Folds=' + str(num_folds) + ', Iterations = ' + str(num_folds)
                    result['Num neighbors'] = recommender._num_neighbors
                    # NOTE(review): preserved from the original —
                    # 'Specific/Generic' stores the neighbor count, which
                    # looks like a copy-paste; confirm intent before changing.
                    result['Specific/Generic'] = recommender._num_neighbors
                    result['Dataset'] = 'Four City'
                    result['Machine'] = 'Mac'
                    results.append(result)

    file_name = '/Users/fpena/tmp/rs-test/test-delete-' + recommender.name + '.csv'
    ETLUtils.save_csv_file(file_name, results, headers)
def export_as_topic_predefined_context(self):
    """Export CARSKit binary ratings mapping context topics onto predefined categories.

    Each predefined category becomes a 'context:<category>' column; it is
    set to 1 when any of the record's context topics with positive weight
    maps to that category. 'context:na' is 1 only when no category fired.
    """
    print('%s: exporting to CARSKit ratings binary format with context as '
          'predefined context' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    if os.path.exists(CSV_FILE):
        # Already exported: just refresh the workspace copy.
        print('Binary ratings file already exists')
        copy_to_workspace(CSV_FILE)
        return
    new_records = []
    context_categories = utilities.context_words[Constants.ITEM_TYPE].keys()
    context_headers = [
        'context:%s' % category for category in context_categories]
    # Topic ids are parsed from the first record's context-topic keys;
    # keys that do not encode a topic id yield None and are dropped.
    context_topic_ids = [
        extract_topic_id(topic_name)
        for topic_name
        in self.records[0][Constants.CONTEXT_TOPICS_FIELD].keys()]
    context_topic_ids = [
        topic for topic in context_topic_ids if topic is not None]
    topic_categories_map = \
        create_topic_categories_map(context_topic_ids, self.topic_extractor)
    print(topic_categories_map)
    index = 0
    # for record in self.records[3:4]:
    for record in self.records:
        new_record = {
            Constants.USER_ID_FIELD:
                record[Constants.USER_INTEGER_ID_FIELD],
            Constants.ITEM_ID_FIELD:
                record[Constants.ITEM_INTEGER_ID_FIELD],
            Constants.RATING_FIELD: record[Constants.RATING_FIELD],
        }
        context_topics = record[Constants.CONTEXT_TOPICS_FIELD]
        # topic_categories = \
        #     find_predefined_context(record[Constants.BOW_FIELD])
        context_found = False
        for category in context_categories:
            category_key = 'context:' + category
            category_value = 0
            for topic_name in context_topics.keys():
                topic_id = extract_topic_id(topic_name)
                # NOTE(review): this `break` abandons the remaining topics
                # for the current category as soon as one key fails to
                # parse — presumably such keys sort last; confirm that
                # `continue` was not intended here.
                if topic_id is None:
                    break
                topic_categories = topic_categories_map[topic_id]
                if context_topics[topic_name] > 0 and category in topic_categories:
                    category_value = 1
                    context_found = True
            new_record[category_key] = category_value
        # 'context:na' marks records where no category matched at all.
        context_na_value = 0 if context_found else 1
        new_record['context:na'] = context_na_value
        new_records.append(new_record)
        index += 1
    headers = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD,
        'context:na'
    ]
    headers.extend(context_headers)
    print(new_records[0])
    # print(new_records[10])
    # print(new_records[100])
    # record_index = 0
    # all_context_headers = context_headers + ['context:na']
    # for record in new_records:
    #     context_sum = 0
    #     for header in all_context_headers:
    #         context_sum += record[header]
    #     record_index += 1
    #     print(record_index, context_sum)
    ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
    copy_to_workspace(CSV_FILE)
def evaluate_recommender_similarity_metrics(reviews, recommender):
    """Evaluate every similarity-settings combination via cross-validation and dump a CSV.

    The recommender's private similarity attributes are rewritten in
    place for each configuration before scoring.
    """
    headers = [
        "Algorithm", "Multi-cluster", "Similarity algorithm",
        "Similarity metric", "Num neighbors", "Dataset", "MAE", "RMSE",
        "Top N", "Coverage", "Execution time", "Cross validation", "Machine",
    ]

    # Only one configuration is currently enabled in the sweep.
    similarity_metrics = ["euclidean"]
    similarity_algorithms = [SingleSimilarityMatrixBuilder("euclidean")]
    ranges = [None]
    num_neighbors_list = [None]
    num_folds = 5

    results = []
    for matrix_builder in similarity_algorithms:
        for neighbor_count in num_neighbors_list:
            for metric_name in similarity_metrics:
                for criteria_range in ranges:
                    recommender._similarity_matrix_builder = matrix_builder
                    recommender._similarity_matrix_builder._similarity_metric = metric_name
                    recommender._significant_criteria_ranges = criteria_range
                    recommender._num_neighbors = neighbor_count

                    print(
                        recommender.name,
                        recommender._significant_criteria_ranges,
                        recommender._similarity_matrix_builder._name,
                        recommender._similarity_matrix_builder._similarity_metric,
                        recommender._num_neighbors,
                    )

                    result = perform_cross_validation(
                        reviews, recommender, num_folds)
                    result["Algorithm"] = recommender.name
                    result["Multi-cluster"] = recommender._significant_criteria_ranges
                    result["Similarity algorithm"] = recommender._similarity_matrix_builder._name
                    result["Similarity metric"] = recommender._similarity_matrix_builder._similarity_metric
                    result["Cross validation"] = "Folds=" + str(num_folds) + ", Iterations = " + str(num_folds)
                    result["Num neighbors"] = recommender._num_neighbors
                    # NOTE(review): kept as in the original — the
                    # 'Specific/Generic' field stores the neighbor count,
                    # which looks like a copy-paste mistake; verify intent.
                    result["Specific/Generic"] = recommender._num_neighbors
                    result["Dataset"] = "Four City"
                    result["Machine"] = "Mac"
                    results.append(result)

    file_name = "/Users/fpena/tmp/rs-test/test-delete-" + recommender.name + ".csv"
    ETLUtils.save_csv_file(file_name, results, headers)