def cli_main():
    """Analyze the topic model from the command line and save the results.

    Accepts an optional ``-t/--numtopics`` integer. When it is given, the
    ``Constants.TOPIC_MODEL_NUM_TOPICS_FIELD`` property is overridden before
    the analysis runs. A copy of the current properties, updated with the
    output of ``analyze_topics(include_stability=True)``, is then written to
    a CSV file and to a JSON file named by ``Constants.generate_file_name``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int, nargs=1,
        help='The number of topics of the topic model')
    parsed = arg_parser.parse_args()

    # nargs=1 stores a one-element list, or None when the flag is absent.
    if parsed.numtopics is not None:
        topic_count = parsed.numtopics[0]
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count})

    analysis = Constants.get_properties_copy()
    analysis.update(analyze_topics(include_stability=True))

    csv_path = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER,
        None, None, False)
    json_path = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER,
        None, None, False)

    write_results_to_csv(csv_path, analysis)
    write_results_to_json(json_path, analysis)
def main():
    """Build topic models from the command line.

    Optional integer flags:
        -c/--cycle      index of the running cycle
        -f/--fold       index of the cross validation fold
        -t/--numtopics  number of topics (overrides
                        ``Constants.TOPIC_MODEL_NUM_TOPICS_FIELD``)

    When neither a cycle nor a fold is given, the processed records file is
    loaded and a single topic model is created from it; if
    ``Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS`` is set, only the first
    half of the records is used. Otherwise ``create_single_topic_model`` is
    called with the given cycle and fold.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int, nargs=1,
        help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int, nargs=1,
        help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int, nargs=1,
        help='The number of topics of the topic model')

    args = parser.parse_args()
    # nargs=1 yields a one-element list, or None when the flag is absent.
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            num_records = len(records)
            # Floor division: a slice bound must be an int, and `/` yields
            # a float under Python 3.
            records = records[:num_records // 2]
        print('num_reviews', len(records))

        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
def preprocess_data():
    """Run the reviews preprocessing cycle for every topic count in 2..60.

    For each count, ``Constants.TOPIC_MODEL_NUM_TOPICS_FIELD`` is updated and
    a fresh ``ReviewsPreprocessor(use_cache=True)`` executes ``full_cycle()``.
    """
    for topic_count in range(2, 61):
        overrides = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count}
        Constants.update_properties(overrides)
        ReviewsPreprocessor(use_cache=True).full_cycle()
def dataset_bucket_analysis_by_field(field):
    """Print how the reviews are distributed over the values of ``field``.

    Switches the business type property to ``'fourcity_hotel'``, loads the
    processed records, counts how many records share each value of
    ``record[field]``, prints the (up to) three most frequent values, and
    then prints and plots the summary produced by
    ``ReviewsDatasetAnalyzer.summarize_reviews_by_field``.

    :param field: key of each record whose values are counted
    """
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)
    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('Loaded %d records' % len(records))

    user_frequency_map = {}
    for record in records:
        user_id = record[field]
        user_frequency_map[user_id] = user_frequency_map.get(user_id, 0) + 1

    print('There is a total of %d %ss' % (len(user_frequency_map), field))
    sorted_x = sorted(
        user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    # Slice instead of indexing [0], [1], [2] so that datasets with fewer
    # than three distinct values do not raise IndexError.
    for entry in sorted_x[:3]:
        print(entry)

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' %
          (field, float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    # Temporarily lift the row limit so the whole summary is printed; the
    # option is restored even if printing fails.
    pandas.set_option('display.max_rows', len(users_summary))
    try:
        print(users_summary)
    finally:
        pandas.reset_option('display.max_rows')
def manual_main():
    """Analyze the topic model for the configured number of topics.

    Prints the JSON and CSV output file names, then, for each entry of a
    one-element list holding ``Constants.TOPIC_MODEL_NUM_TOPICS``, updates
    the number-of-topics property, runs
    ``analyze_topics(include_stability=False)`` and writes the merged
    properties and analysis to both files.
    """
    csv_path = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER,
        None, None, False)
    json_path = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER,
        None, None, False)
    print(json_path)
    print(csv_path)

    topic_counts = [Constants.TOPIC_MODEL_NUM_TOPICS]
    total_cycles = len(topic_counts)

    for cycle_number, topic_count in enumerate(topic_counts, 1):
        print('\ncycle_index: %d/%d' % (cycle_number, total_cycles))
        overrides = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count}
        print(overrides)
        Constants.update_properties(overrides)

        analysis = Constants.get_properties_copy()
        analysis.update(analyze_topics(include_stability=False))

        write_results_to_csv(csv_path, analysis)
        write_results_to_json(json_path, analysis)
def run_recommender(args):
    """Run the context-aware top-N recommender with the given properties.

    The integer-valued entries of ``args`` are cast to ``int`` in place (the
    topic model ones only when ``args[Constants.USE_CONTEXT_FIELD]`` is
    truthy), the properties are updated with ``args`` and a
    ``ContextTopNRunner`` is run.

    :param args: dictionary of property values, keyed by field name
    :return: the runner's results with two extra keys: ``'loss'`` (the
        negated ``Constants.EVALUATION_METRIC`` value) and ``'status'``
        (always ``'ok'``)
    """
    import sys
    # Alternative developer path kept from the original:
    # sys.path.append('/Users/fpena/UCC/Thesis/projects/yelp/source/python')
    sys.path.append('/home/fpena/yelp/source/python')
    from utils.constants import Constants
    from evaluation.context_top_n_runner import ContextTopNRunner

    print('\n\n************************\n************************\n')
    print('args', args)

    # Cast integer values
    for field in (Constants.FM_ITERATIONS_FIELD,
                  Constants.FM_NUM_FACTORS_FIELD):
        args[field] = int(args[field])

    if args[Constants.USE_CONTEXT_FIELD]:
        for field in (Constants.TOPIC_MODEL_ITERATIONS_FIELD,
                      Constants.TOPIC_MODEL_PASSES_FIELD,
                      Constants.TOPIC_MODEL_NUM_TOPICS_FIELD):
            args[field] = int(args[field])

    Constants.update_properties(args)
    # Finish updating parameters

    runner = ContextTopNRunner()
    results = runner.run()
    results['loss'] = -results[Constants.EVALUATION_METRIC]
    results['status'] = 'ok'

    print('loss', results['loss'])

    return results
def create_topic_models():
    """Run the reviews preprocessing cycle for every topic count in 2..60.

    Each iteration sets ``Constants.TOPIC_MODEL_NUM_TOPICS_FIELD`` to the
    current count and calls ``full_cycle()`` on a new
    ``ReviewsPreprocessor(use_cache=True)``.
    """
    for topic_count in range(2, 61):
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count})
        preprocessor = ReviewsPreprocessor(use_cache=True)
        preprocessor.full_cycle()
def calculate_topic_stability(records):
    """Compute the stability of the topic model over resampled records.

    Both random seed properties are shifted by 10 and the seeds re-planted.
    A reference topic model is created from ``records`` (only their first
    half when ``Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS`` is set), and
    ``Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1`` further models are
    trained on samples drawn with ``sample_list`` at a ratio of 0.8. The term
    rankings of all models are passed to ``calculate_stability``.

    :param records: the list of records used to train the topic models
    :return: whatever ``calculate_stability`` returns for the term rankings
    """
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # Floor division: a slice bound must be an int, and `/` yields
        # a float under Python 3.
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    all_term_rankings = []

    # Reference model, built from all the (possibly halved) records.
    context_extractor = \
        topic_model_creator.create_topic_model(records, None, None)
    terms_matrix = get_topic_model_terms(
        context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
    all_term_rankings.append(terms_matrix)

    sample_ratio = 0.8

    print('Total iterations: %d' % Constants.TOPIC_MODEL_STABILITY_ITERATIONS)
    # The reference model counts as the first iteration.
    for _ in range(Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1):
        sampled_records = sample_list(records, sample_ratio)
        context_extractor = \
            topic_model_creator.train_context_extractor(sampled_records)
        terms_matrix = get_topic_model_terms(
            context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
        all_term_rankings.append(terms_matrix)

    return calculate_stability(all_term_rankings)
def run_recommender(args):
    """Run the context-aware top-N recommender with the given properties.

    Casts the integer-valued entries of ``args`` to ``int`` in place (the
    topic model entries only when ``args[Constants.USE_CONTEXT_FIELD]`` is
    truthy), updates the properties with ``args`` and runs a
    ``ContextTopNRunner``.

    :param args: dictionary of property values, keyed by field name
    :return: the runner's results, extended with ``'loss'`` (the negated
        ``Constants.EVALUATION_METRIC`` value) and ``'status'`` set to
        ``'ok'``
    """
    import sys
    # Alternative developer path kept from the original:
    # sys.path.append('/Users/fpena/UCC/Thesis/projects/yelp/source/python')
    sys.path.append('/home/fpena/yelp/source/python')
    from utils.constants import Constants
    from evaluation.context_top_n_runner import ContextTopNRunner

    print('\n\n************************\n************************\n')
    print('args', args)

    # Cast integer values
    fm_fields = (Constants.FM_ITERATIONS_FIELD,
                 Constants.FM_NUM_FACTORS_FIELD)
    for field in fm_fields:
        args[field] = int(args[field])

    if args[Constants.USE_CONTEXT_FIELD]:
        topic_model_fields = (Constants.TOPIC_MODEL_ITERATIONS_FIELD,
                              Constants.TOPIC_MODEL_PASSES_FIELD,
                              Constants.TOPIC_MODEL_NUM_TOPICS_FIELD)
        for field in topic_model_fields:
            args[field] = int(args[field])

    Constants.update_properties(args)
    # Finish updating parameters

    results = ContextTopNRunner().run()
    results['loss'] = -results[Constants.EVALUATION_METRIC]
    results['status'] = 'ok'

    print('loss', results['loss'])

    return results
def cli_main():
    """Analyze the topic model from the command line and save the results.

    An optional ``-t/--numtopics`` integer overrides the
    ``Constants.TOPIC_MODEL_NUM_TOPICS_FIELD`` property. The current
    properties, updated with ``analyze_topics(include_stability=True)``, are
    written to a CSV and a JSON file in ``Constants.RESULTS_FOLDER``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t', '--numtopics', metavar='int', type=int, nargs=1,
        help='The number of topics of the topic model')
    options = cli.parse_args()

    # nargs=1 stores a one-element list, or None when the flag is absent.
    if options.numtopics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: options.numtopics[0]})

    report = Constants.get_properties_copy()
    report.update(analyze_topics(include_stability=True))

    csv_path, json_path = [
        Constants.generate_file_name(
            'topic_model_analysis', extension, Constants.RESULTS_FOLDER,
            None, None, False)
        for extension in ('csv', 'json')
    ]

    write_results_to_csv(csv_path, report)
    write_results_to_json(json_path, report)
def manual_main():
    """Analyze the topic model for the configured number of topics.

    Prints the output file names (JSON first, then CSV). For each value in a
    list containing only ``Constants.TOPIC_MODEL_NUM_TOPICS`` it updates the
    number-of-topics property, runs
    ``analyze_topics(include_stability=False)`` and writes the merged
    properties and analysis to the CSV and JSON files.
    """
    csv_path, json_path = [
        Constants.generate_file_name(
            'topic_model_analysis', extension, Constants.RESULTS_FOLDER,
            None, None, False)
        for extension in ('csv', 'json')
    ]
    print(json_path)
    print(csv_path)

    topic_counts = [Constants.TOPIC_MODEL_NUM_TOPICS]
    total_cycles = len(topic_counts)

    for position, topic_count in enumerate(topic_counts, 1):
        print('\ncycle_index: %d/%d' % (position, total_cycles))
        overrides = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count}
        print(overrides)
        Constants.update_properties(overrides)

        report = Constants.get_properties_copy()
        report.update(analyze_topics(include_stability=False))

        write_results_to_csv(csv_path, report)
        write_results_to_json(json_path, report)
def full_cycle():
    """Analyze topics for every combination of the swept properties.

    Sweeps the number of topics (5, 10, 20, 40), the bag-of-words type
    (only ``'NN'``) and the target review type (``'specific'``,
    ``'generic'``). For each combination the properties are updated,
    ``analyze_topics()`` is run and its result, tagged with the bag-of-words
    and review types, is collected. All results are printed at the end.
    """
    topic_counts = [5, 10, 20, 40]
    # Other bag-of-words types that were tried: None, 'JJ', 'VB'
    bow_types = ['NN']
    review_types = ['specific', 'generic']

    results = []
    combinations = itertools.product(topic_counts, bow_types, review_types)
    for num_topics, bow_type, review_type in combinations:
        Constants.update_properties({
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.BOW_TYPE_FIELD: bow_type,
            Constants.TOPIC_MODEL_TARGET_REVIEWS_FIELD: review_type
        })
        outcome = analyze_topics()
        outcome.update({
            Constants.BOW_TYPE_FIELD: bow_type,
            Constants.TOPIC_MODEL_TARGET_REVIEWS_FIELD: review_type
        })
        results.append(outcome)

    for outcome in results:
        print(outcome)

    # Output paths and headers are computed here but not used by the code
    # visible in this function.
    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE + \
        '_topic_model_context_richness'
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    headers = sorted(results[0].keys())
def cycle_eval_topic_model(metric, num_topics_list):
    """Evaluate the topic model once per number of topics.

    :param metric: evaluation metric, passed to ``run_eval_topic_model`` and
        used as the prefix of the CSV file name
    :param num_topics_list: iterable with the numbers of topics to evaluate;
        each one is set as ``Constants.TOPIC_MODEL_NUM_TOPICS_FIELD`` before
        its evaluation, and its results are written to the CSV file
    """
    output_path = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None, None, False)

    for topic_count in num_topics_list:
        overrides = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count}
        Constants.update_properties(overrides)

        evaluation = run_eval_topic_model(metric)
        topic_model_analyzer.write_results_to_csv(output_path, evaluation)
def cycle_eval_topic_model(metric, num_topics_list):
    """Evaluate the topic model for each requested number of topics.

    :param metric: evaluation metric; it names the CSV output file and is
        forwarded to ``run_eval_topic_model``
    :param num_topics_list: iterable of topic counts; before each evaluation
        the count is stored in ``Constants.TOPIC_MODEL_NUM_TOPICS_FIELD``,
        and afterwards the results are written to the CSV file
    """
    csv_path = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None, None, False)

    for count in num_topics_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: count})
        outcome = run_eval_topic_model(metric)
        topic_model_analyzer.write_results_to_csv(csv_path, outcome)
def run_single_fold(self, parameters):
    """Run one cross-validation fold and return its evaluation metrics.

    The properties are updated with ``parameters``, the seeds are planted
    and the records are loaded. A deep copy of the original records is
    (optionally) shuffled and split into train and test sets for the
    requested fold. When ``Constants.USE_CONTEXT`` is set, the context
    topics are either loaded from cache or obtained by training a topic
    model on this fold. Finally the predictions are made and evaluated.

    :param parameters: dictionary of properties; must contain the key
        ``'fold'`` with the zero-based index of the fold to run
    :return: the value returned by ``self.evaluate()``
    """

    fold = parameters['fold']

    Constants.update_properties(parameters)

    Constants.print_properties()

    utilities.plant_seeds()
    self.load()

    records = self.original_records

    # self.plant_seeds()
    # NOTE(review): total_cycle_time is accumulated below but never read
    # afterwards in this method.
    total_cycle_time = 0.0
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    # Fraction of the records that goes to the training set.
    split = 1 - (1 / float(num_folds))
    # Work on a copy so self.original_records is left untouched.
    self.records = copy.deepcopy(records)
    if Constants.SHUFFLE_DATA:
        self.shuffle(self.records)

    fold_start = time.time()
    # Relative position of this fold, passed as ``start`` to the splitter.
    cv_start = float(fold) / num_folds
    print('\nFold: %d/%d' % ((fold + 1), num_folds))

    self.create_tmp_file_names(0, fold)
    self.train_records, self.test_records = \
        ETLUtils.split_train_test_copy(
            self.records, split=split, start=cv_start)
    # Disabled subsampling of the training set:
    # subsample_size = int(len(self.train_records)*0.5)
    # self.train_records = self.train_records[:subsample_size]
    self.get_records_to_predict(True)
    if Constants.USE_CONTEXT:
        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            # Use the cached context topics instead of training a model.
            self.load_cache_context_topics(None, None)
        else:
            context_extractor = self.train_topic_model(0, fold)
            self.find_reviews_topics(context_extractor, 0, fold)
    else:
        self.context_rich_topics = []
    self.predict()
    metrics = self.evaluate()

    fold_end = time.time()
    fold_time = fold_end - fold_start
    total_cycle_time += fold_time
    self.clear()
    print("Total fold %d time = %f seconds" % ((fold + 1), fold_time))

    return metrics
def run_tests():
    """Cross-validate a ``WordContextTopNRunner`` for each parameter set.

    The parameter sets come from
    ``parameter_combinator.get_combined_parameters()``; each one is applied
    with ``Constants.update_properties`` before a new runner is created and
    ``perform_cross_validation()`` is called. A banner with the current test
    number is printed before each run.
    """
    parameter_sets = parameter_combinator.get_combined_parameters()
    total_tests = len(parameter_sets)

    for test_number, properties in enumerate(parameter_sets, 1):
        Constants.update_properties(properties)
        runner = WordContextTopNRunner()

        print('\n\n******************\nTest %d/%d\n******************\n' %
              (test_number, total_tests))

        runner.perform_cross_validation()
def run_tests():
    """Cross-validate a ``ContextTopNRunner`` for each hotel parameter set.

    The parameter sets come from
    ``parameter_combinator.hotel_context_parameters()``; each one is applied
    with ``Constants.update_properties`` before a new runner is created and
    ``perform_cross_validation()`` is called. A banner with the current test
    number is printed before each run.
    """
    parameter_sets = parameter_combinator.hotel_context_parameters()
    total_tests = len(parameter_sets)

    for test_number, properties in enumerate(parameter_sets, 1):
        Constants.update_properties(properties)
        runner = ContextTopNRunner()

        print('\n\n******************\nTest %d/%d\n******************\n' %
              (test_number, total_tests))

        runner.perform_cross_validation()
def main():
    """Run the recommender tests for one nested cross-validation cycle.

    Requires the ``-f/--fold`` integer flag. Its value is stored in
    ``Constants.NESTED_CROSS_VALIDATION_CYCLE_FIELD``, the cross validation
    strategy is set to ``'nested_validate'`` and
    ``context_top_n_runner.run_tests()`` is called.
    """
    parser = argparse.ArgumentParser()
    # required=True: without it a missing flag left args.fold as None and
    # the indexing below failed with an unhelpful TypeError. argparse now
    # reports the missing argument and exits instead.
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int, nargs=1, required=True,
        help='The index of the cross validation fold')

    args = parser.parse_args()
    # nargs=1 stores the value in a one-element list.
    fold = args.fold[0]

    new_properties = {
        Constants.NESTED_CROSS_VALIDATION_CYCLE_FIELD: fold,
        Constants.CROSS_VALIDATION_STRATEGY_FIELD: 'nested_validate'
    }

    Constants.update_properties(new_properties)
    context_top_n_runner.run_tests()
def topic_stability_main():
    """Compute and print the topic stability for 2 and 5 topics.

    Loads the processed records, then for each number of topics updates
    ``Constants.TOPIC_MODEL_NUM_TOPICS_FIELD`` and stores the scores returned
    by ``calculate_topic_stability``. Afterwards prints, per number of
    topics, the NaN-ignoring mean, minimum and maximum of the scores.
    """
    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    # A wider sweep that was used before: range(2, 101)
    topic_counts = [2, 5]

    scores_by_count = {}
    for topic_count in topic_counts:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count})
        scores_by_count[topic_count] = calculate_topic_stability(records)

    print('Results:')
    for topic_count in topic_counts:
        scores = scores_by_count[topic_count]
        mean_score = numpy.nanmean(scores)
        lowest_score = numpy.nanmin(scores)
        highest_score = numpy.nanmax(scores)
        print('%d: %.4f [%.4f,%.4f]' %
              (topic_count, mean_score, lowest_score, highest_score))
def create_all_term_rankings(records, metric):
    """Train several topic models and collect the term ranking of each one.

    The first model is trained on all of ``records``. The remaining
    ``Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1`` models are trained on
    samples drawn with ``sample_list`` at the configured sample ratio. For
    the pairwise-stability and term-difference metrics the ratio is forced
    to ``None`` (also in the properties) and every model uses all records.

    :param records: the records used to train the topic models
    :param metric: name of the evaluation metric
    :return: a list with one terms matrix per trained model
    """
    print('%s: creating all term rankings' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    def rank_terms(training_records):
        # Train one model and extract its top terms per topic.
        extractor = topic_model_creator.train_context_extractor(
            training_records, False)
        return get_topic_model_terms(
            extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)

    # An earlier version built this first model with
    # topic_model_creator.create_topic_model(records, None, None).
    all_term_rankings = [rank_terms(records)]

    sample_ratio = Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO

    if metric in (TERM_STABILITY_PAIRWISE, TERM_DIFFERENCE):
        sample_ratio = None
        Constants.update_properties(
            {Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO_FIELD: sample_ratio})
        print('Warning: Since the metric is \'%s\' I have updated the '
              'topic_model_stability_sample_ratio value to None' % metric)

    num_iterations = Constants.TOPIC_MODEL_STABILITY_ITERATIONS
    # The model trained above counts as one of the iterations.
    for iteration in range(1, num_iterations):
        print('Iteration %d/%d' % (iteration, num_iterations))
        print('sample_ratio:', sample_ratio)
        if sample_ratio is not None:
            training_records = sample_list(records, sample_ratio)
        else:
            training_records = records
        all_term_rankings.append(rank_terms(training_records))

    return all_term_rankings
def create_all_term_rankings(records, metric):
    """Collect the term rankings of repeatedly trained topic models.

    One model is trained on all of ``records``; then
    ``Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1`` more are trained,
    each on a sample drawn with ``sample_list`` at the configured ratio.
    When ``metric`` is the pairwise-stability or the term-difference metric,
    the ratio is reset to ``None`` (in the properties too) and no sampling
    takes place.

    :param records: the records used to train the topic models
    :param metric: name of the evaluation metric
    :return: a list with one terms matrix per trained model
    """
    timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
    print('%s: creating all term rankings' % timestamp)

    def terms_of_model_trained_on(training_records):
        # Train one model and extract its top terms per topic.
        context_extractor = topic_model_creator.train_context_extractor(
            training_records, False)
        return get_topic_model_terms(
            context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)

    # An earlier version built this first model with
    # topic_model_creator.create_topic_model(records, None, None).
    rankings = [terms_of_model_trained_on(records)]

    sample_ratio = Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO

    if metric in (TERM_STABILITY_PAIRWISE, TERM_DIFFERENCE):
        sample_ratio = None
        Constants.update_properties(
            {Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO_FIELD: sample_ratio})
        warning = ('Warning: Since the metric is \'%s\' I have updated the '
                   'topic_model_stability_sample_ratio value to None'
                   % metric)
        print(warning)

    num_iterations = Constants.TOPIC_MODEL_STABILITY_ITERATIONS
    # The model trained above counts as one of the iterations.
    for index in range(num_iterations - 1):
        print('Iteration %d/%d' % (index + 1, num_iterations))
        print('sample_ratio:', sample_ratio)
        training_records = (
            records if sample_ratio is None
            else sample_list(records, sample_ratio))
        rankings.append(terms_of_model_trained_on(training_records))

    return rankings
def test():
    """Compute the divergence of the trained NMF models for 2..60 topics.

    For each number of topics the property is updated, the trained data of
    an ``NmfTopicExtractor`` is loaded and ``calculate_divergence`` is called
    with the document-term, document-topic and topic-term matrices. The
    results are printed and saved to ``<item type>_topic_model_divergence``
    CSV and JSON files in the results folder.
    """
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()

    results = []
    # A shorter sweep that was used before: range(2, 31)
    for topic_count in range(2, 61):
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count})
        extractor = NmfTopicExtractor()
        extractor.load_trained_data()

        divergence = calculate_divergence(
            document_term_matrix,
            extractor.document_topic_matrix,
            extractor.topic_term_matrix)

        results.append({
            'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
        })

        print('Num topics: %d, Divergence: %f' %
              (Constants.TOPIC_MODEL_NUM_TOPICS, divergence))

    for entry in results:
        print('%d %f' % (entry['num_topics'], entry['divergence']))

    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE + \
        '_topic_model_divergence'
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    headers = sorted(results[0].keys())
    ETLUtils.save_csv_file(csv_file_path, results, headers)
    ETLUtils.save_json_file(json_file_path, results)
def main():
    """Run the recommender tests for one nested cross-validation cycle.

    Requires the ``-f/--fold`` integer flag. Its value is stored in
    ``Constants.NESTED_CROSS_VALIDATION_CYCLE_FIELD``, the cross validation
    strategy is set to ``'nested_validate'`` and
    ``context_top_n_runner.run_tests()`` is called.
    """
    parser = argparse.ArgumentParser()
    # required=True: without it a missing flag left args.fold as None and
    # the indexing below failed with an unhelpful TypeError. argparse now
    # reports the missing argument and exits instead.
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int, nargs=1, required=True,
        help='The index of the cross validation fold')

    args = parser.parse_args()
    # nargs=1 stores the value in a one-element list.
    fold = args.fold[0]

    new_properties = {
        Constants.NESTED_CROSS_VALIDATION_CYCLE_FIELD: fold,
        Constants.CROSS_VALIDATION_STRATEGY_FIELD: 'nested_validate'
    }

    Constants.update_properties(new_properties)
    context_top_n_runner.run_tests()
def test():
    """Compute the divergence of the trained NMF models for 2..60 topics.

    Loads the document-term matrix once. Then, per number of topics, updates
    the property, loads the trained data of a new ``NmfTopicExtractor`` and
    calls ``calculate_divergence`` on its matrices. Every result is printed
    and all of them are saved to ``<item type>_topic_model_divergence`` CSV
    and JSON files in the results folder.
    """
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()

    results = []
    # A shorter sweep that was used before: range(2, 31)
    for num_topics in range(2, 61):
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})
        model = NmfTopicExtractor()
        model.load_trained_data()
        doc_topic = model.document_topic_matrix
        topic_term = model.topic_term_matrix

        divergence = calculate_divergence(
            document_term_matrix, doc_topic, topic_term)

        record = {
            'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
        }
        results.append(record)

        print('Num topics: %d, Divergence: %f' %
              (Constants.TOPIC_MODEL_NUM_TOPICS, divergence))

    for record in results:
        print('%d %f' % (record['num_topics'], record['divergence']))

    base_path = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE + \
        '_topic_model_divergence'
    csv_path = base_path + '.csv'
    json_path = base_path + '.json'
    column_names = sorted(results[0].keys())
    ETLUtils.save_csv_file(csv_path, results, column_names)
    ETLUtils.save_json_file(json_path, results)
def create_topic_model(num_topics):
    """Create the ensemble topic model for ``num_topics`` unless it exists.

    Shifts both random seed properties by 10, sets the number of topics and
    re-plants the seeds. If ``factors_final_k<NN>.pkl`` is already present in
    ``Constants.ENSEMBLED_RESULTS_FOLDER`` nothing else happens; otherwise
    the k-fold generation and the NMF combination steps of
    ``topic_ensemble_caller`` are run.

    :param num_topics: the number of topics of the model
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    overrides = {
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10,
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics
    }
    Constants.update_properties(overrides)
    utilities.plant_seeds()
    Constants.print_properties()

    model_file_name = \
        "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS
    model_path = Constants.ENSEMBLED_RESULTS_FOLDER + model_file_name

    if not os.path.exists(model_path):
        # Disabled first step: topic_ensemble_caller.run_local_parse_directory()
        topic_ensemble_caller.run_generate_kfold()
        topic_ensemble_caller.run_combine_nmf()
    else:
        print('Ensemble topic model already exists')
def run_recommender(args):
    """Export the topics of a topic model built with the given properties.

    Selected entries of ``args`` are copied into the properties (the topic
    model iterations, passes and number of topics are cast to ``int``) and
    ``topic_model_analyzer.export_topics()`` is called.

    :param args: dictionary of property values, keyed by field name
    :return: the exported results with two extra keys: ``'loss'`` (the
        negated ``'combined_score'``) and ``'status'`` (always ``'ok'``)
    """
    import sys
    # Alternative developer path kept from the original:
    # sys.path.append('/Users/fpena/UCC/Thesis/projects/yelp/source/python')
    sys.path.append('/home/fpena/yelp/source/python')
    from utils.constants import Constants
    from topicmodeling.context import topic_model_analyzer

    print('\n\n************************\n************************\n')
    print('args', args)

    # 'lda_alpha', 'lda_beta' and 'topic_weighting_method' used to be
    # forwarded as well.
    business_type = args[Constants.BUSINESS_TYPE_FIELD]
    epsilon = args[Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD]
    iterations = int(args[Constants.TOPIC_MODEL_ITERATIONS_FIELD])
    passes = int(args[Constants.TOPIC_MODEL_PASSES_FIELD])
    num_topics = int(args[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD])
    use_context = args[Constants.USE_CONTEXT_FIELD]

    parameters = {
        Constants.BUSINESS_TYPE_FIELD: business_type,
        Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD: epsilon,
        Constants.TOPIC_MODEL_ITERATIONS_FIELD: iterations,
        Constants.TOPIC_MODEL_PASSES_FIELD: passes,
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
        Constants.USE_CONTEXT_FIELD: use_context
    }

    Constants.update_properties(parameters)
    # Finish updating parameters

    results = topic_model_analyzer.export_topics()
    results['loss'] = -results['combined_score']
    results['status'] = 'ok'

    print('loss', results['loss'])

    return results
def create_topic_model_with_context_records():
    """Create topic models from the classified reviews and export them.

    Loads the classified processed reviews from the cache folder, filters
    the records whose ``'context_type'`` is ``'context'`` and, among those,
    the ones whose ``'predicted_class'`` is ``'specific'``, printing the
    size of each set and the bag of words of every context-specific record.
    Then, for every number of topics from 1 to the number of context
    records, a topic model is created and its per-topic data is handed to
    ``generate_excel_file``.

    NOTE(review): the indentation of this function was reconstructed; it is
    assumed that everything after the property update belongs to the body of
    the second loop -- confirm against the original file.
    """

    processed_records_file = Constants.generate_file_name(
        'classified_processed_reviews', 'json', Constants.CACHE_FOLDER, None,
        None, False, True)
    records = ETLUtils.load_json_file(processed_records_file)
    print('records length: %d' % len(records))

    context_records = ETLUtils.filter_records(records, 'context_type', ['context'])
    print('context records length: %d' % len(context_records))
    context_specific_records = ETLUtils.filter_records(context_records, 'predicted_class', ['specific'])
    print('context specific records length: %d' % len(context_specific_records))

    for i in range(len(context_specific_records)):
        # print('%d:\t%s' % (i, context_records[i]['text']))
        print('%d:\t%s' % (i, context_specific_records[i]['bow']))

    for i in range(1, len(context_records)+1):

        Constants.update_properties({Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        # NOTE(review): the model is created from all the records, not from
        # the filtered context records -- confirm this is intended.
        context_extractor = \
            topic_model_creator.create_topic_model(records, None, None)

        topic_data = []

        for topic in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            result = {}
            result['topic_id'] = topic
            # print_topic_model() is called once per topic and indexed with
            # the topic id; split_topic() supplies further keys of the row.
            result.update(split_topic(context_extractor.print_topic_model(
                num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic]))
            result['ratio'] = context_extractor.topic_ratio_map[topic]
            result['weighted_frequency'] = \
                context_extractor.topic_weighted_frequency_map[topic]
            topic_data.append(result)

        file_name = Constants.generate_file_name(
            'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER, None, None,
            True)
        generate_excel_file(topic_data, file_name)
def run_tests():
    """Run the recommender for each parameter set and report the best one.

    Each parameter set from
    ``parameter_combinator.get_combined_parameters()`` is applied to the
    properties and a new ``ContextTopNRunner`` is run. The set with the
    highest ``Constants.EVALUATION_METRIC`` value (strictly above the
    starting value of -1) is remembered and printed at the end together with
    that value.
    """
    parameter_sets = parameter_combinator.get_combined_parameters()
    total_tests = len(parameter_sets)

    highest_value = -1
    best_parameters = None

    for test_number, properties in enumerate(parameter_sets, 1):
        Constants.update_properties(properties)
        runner = ContextTopNRunner()

        print('\n\n******************\nTest %d/%d\n******************\n' %
              (test_number, total_tests))

        results = runner.run()
        score = results[Constants.EVALUATION_METRIC]

        if score > highest_value:
            highest_value = score
            best_parameters = properties

    print('highest %s: %f' % (Constants.EVALUATION_METRIC, highest_value))
    print(best_parameters)
def evaluate_topic_model(metric):
    """Evaluate the configured topic model with the given metric.

    Both random seed properties are shifted by 10 and the seeds re-planted.
    The processed records are loaded (only their first half is kept when
    ``Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS`` is set). The term
    rankings are created from the records for the ``'lda'`` and ``'nmf'``
    model types, or taken from the ensemble for ``'ensemble'``, and are then
    scored by the function that matches ``metric``.

    :param metric: one of ``TERM_STABILITY_REFERENCE``,
        ``TERM_STABILITY_PAIRWISE`` or ``TERM_DIFFERENCE``
    :return: the value returned by the selected evaluation function
    :raises ValueError: if the topic model type or the metric is unknown
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # Floor division: a slice bound must be an int, and `/` yields
        # a float under Python 3.
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    elif metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
def create_topic_model(num_topics):
    """Create the ensemble topic model for ``num_topics`` unless it exists.

    The two random seed properties are shifted by 10, the number of topics
    is set and the seeds are re-planted. When the file
    ``factors_final_k<NN>.pkl`` already exists in
    ``Constants.ENSEMBLED_RESULTS_FOLDER`` the function returns early;
    otherwise ``topic_ensemble_caller`` generates the k-fold models and
    combines them.

    :param num_topics: the number of topics of the model
    """
    timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
    print('%s: evaluating topic model' % timestamp)

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10,
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics
    })
    utilities.plant_seeds()
    Constants.print_properties()

    factors_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
        "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS
    already_built = os.path.exists(factors_path)

    if already_built:
        print('Ensemble topic model already exists')
        return

    # Disabled first step: topic_ensemble_caller.run_local_parse_directory()
    for step in ('run_generate_kfold', 'run_combine_nmf'):
        getattr(topic_ensemble_caller, step)()
def run_recommender(args):
    """Export the topics of a topic model built with the given properties.

    Copies selected entries of ``args`` into the properties, casting the
    topic model iterations, passes and number of topics to ``int``, and then
    calls ``topic_model_analyzer.export_topics()``.

    :param args: dictionary of property values, keyed by field name
    :return: the exported results, extended with ``'loss'`` (the negated
        ``'combined_score'``) and ``'status'`` set to ``'ok'``
    """
    import sys
    # Alternative developer path kept from the original:
    # sys.path.append('/Users/fpena/UCC/Thesis/projects/yelp/source/python')
    sys.path.append('/home/fpena/yelp/source/python')
    from utils.constants import Constants
    from topicmodeling.context import topic_model_analyzer

    print('\n\n************************\n************************\n')
    print('args', args)

    # 'lda_alpha', 'lda_beta' and 'topic_weighting_method' used to be
    # forwarded as well.
    parameters = {}
    parameters[Constants.BUSINESS_TYPE_FIELD] = \
        args[Constants.BUSINESS_TYPE_FIELD]
    parameters[Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD] = \
        args[Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD]
    parameters[Constants.TOPIC_MODEL_ITERATIONS_FIELD] = \
        int(args[Constants.TOPIC_MODEL_ITERATIONS_FIELD])
    parameters[Constants.TOPIC_MODEL_PASSES_FIELD] = \
        int(args[Constants.TOPIC_MODEL_PASSES_FIELD])
    parameters[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD] = \
        int(args[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD])
    parameters[Constants.USE_CONTEXT_FIELD] = \
        args[Constants.USE_CONTEXT_FIELD]

    Constants.update_properties(parameters)
    # Finish updating parameters

    results = topic_model_analyzer.export_topics()
    loss = -results['combined_score']
    results['loss'] = loss
    results['status'] = 'ok'

    print('loss', results['loss'])

    return results
def evaluate_topic_model(metric):
    """Evaluate the configured topic model with the given metric.

    Shifts both random seed properties by 10 and re-plants the seeds, loads
    the processed records (keeping only the first half when
    ``Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS`` is set), obtains the
    term rankings according to ``Constants.TOPIC_MODEL_TYPE`` and scores
    them with the function that matches ``metric``.

    :param metric: one of ``TERM_STABILITY_REFERENCE``,
        ``TERM_STABILITY_PAIRWISE`` or ``TERM_DIFFERENCE``
    :return: the value returned by the selected evaluation function
    :raises ValueError: if the topic model type or the metric is unknown
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # Floor division: a slice bound must be an int, and `/` yields
        # a float under Python 3.
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    all_term_rankings = None
    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        # Models are trained here from the loaded records.
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError(
            'Unrecognized topic modeling algorithm: \'%s\'' % topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    if metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
def full_cycle():
    """Analyze topics for every combination of the swept properties.

    The sweep covers the number of topics (5, 10, 20, 40), a single
    bag-of-words type (``'NN'``) and the target review types
    (``'specific'``, ``'generic'``). Each combination is applied to the
    properties before ``analyze_topics()`` runs; its result is tagged with
    the bag-of-words and review types and collected. The collected results
    are printed once the sweep finishes.
    """
    sweep = itertools.product(
        [5, 10, 20, 40],            # number of topics
        ['NN'],                     # bag-of-words type (also tried:
                                    # None, 'JJ', 'VB')
        ['specific', 'generic'])    # target review type

    results = []
    for num_topics, bow_type, review_type in sweep:
        tags = {
            Constants.BOW_TYPE_FIELD: bow_type,
            Constants.TOPIC_MODEL_TARGET_REVIEWS_FIELD: review_type
        }
        Constants.update_properties({
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.BOW_TYPE_FIELD: bow_type,
            Constants.TOPIC_MODEL_TARGET_REVIEWS_FIELD: review_type
        })
        analysis = analyze_topics()
        analysis.update(tags)
        results.append(analysis)

    for analysis in results:
        print(analysis)

    # Output paths and headers are computed here but not used by the code
    # visible in this function.
    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE + \
        '_topic_model_context_richness'
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    headers = sorted(results[0].keys())
def export_topics(cycle_index, fold_index, epsilon=None, alpha=None):
    """Load a topic model, print its statistics and analyze its topics.

    :param cycle_index: cycle index passed to ``load_topic_model``
    :param fold_index: fold index passed to ``load_topic_model``
    :param epsilon: when not None, stored as the ``'lda_epsilon'`` property
    :param alpha: when not None, stored as the ``'lda_alpha'`` property
    """
    topic_model_creator.plant_seeds()

    # Start from a copy of the current properties and apply the overrides.
    new_properties = copy.deepcopy(Constants._properties)
    for key, value in (('lda_epsilon', epsilon), ('lda_alpha', alpha)):
        if value is not None:
            new_properties[key] = value
    Constants.update_properties(new_properties)

    lda_based_context = load_topic_model(cycle_index, fold_index)

    name_parts = [
        Constants.DATASET_FOLDER,
        'all_reviews_topic_model_',
        Constants.ITEM_TYPE,
        '_',
        str(Constants.LDA_NUM_TOPICS),
        '_',
        str(Constants.LDA_MODEL_PASSES),
        '_',
        str(Constants.LDA_MODEL_ITERATIONS),
        '_',
        str(Constants.LDA_EPSILON),
        '-nouns-complete.csv',
    ]
    file_name = ''.join(name_parts)
    print(file_name)

    num_words = 10
    # Column names of the export; the disabled statistics columns
    # (words_ratio, past_verbs_ratio, frq, log_words, ...) are left out.
    headers = ['topic_id', 'ratio', 'score']
    headers.extend('word' + str(i) for i in range(num_words))

    topic_statistics_map = lda_based_context.topic_statistics_map
    topic_ratio_map = lda_based_context.topic_ratio_map

    num_reviews = len(lda_based_context.records)
    num_specific_reviews = len(lda_based_context.specific_reviews)
    num_generic_reviews = len(lda_based_context.generic_reviews)
    specific_percentage = float(num_specific_reviews) / num_reviews * 100
    generic_percentage = float(num_generic_reviews) / num_reviews * 100
    print('num reviews: %d' % num_reviews)
    print('num specific reviews: %d' % num_specific_reviews)
    print('num generic reviews: %d' % num_generic_reviews)
    print('specific reviews percentage : %f %%' % specific_percentage)
    print('generic reviews percentage : %f %%' % generic_percentage)
    print('number of contextual topics: %d' %
          len(lda_based_context.context_rich_topics))

    results = []
    for topic in topic_ratio_map.keys():
        row = {'topic_id': topic, 'ratio': topic_ratio_map[topic]}
        topic_text = lda_based_context.topic_model.print_topic(
            topic, topn=num_words)
        row.update(split_topic(topic_text))
        results.append(row)

    # A richer per-topic export based on topic_statistics_map (frequency
    # ratios, weighted frequencies, log word and past-verb frequencies) is
    # disabled; only the ratio and the topic words are exported.

    analyze_topics(results, lda_based_context)
def main():
    """Analyze the topic model for every combination of a parameter grid.

    For each combination the global properties are updated, the topics are
    analyzed (stability included) and the properties merged with the
    analysis results are written to the CSV and JSON result files.
    """
    def results_file(extension):
        # Both result files share every naming argument but the extension.
        return Constants.generate_file_name(
            'topic_model_analysis', extension, Constants.RESULTS_FOLDER,
            None, None, False)

    csv_file_name = results_file('csv')
    json_file_name = results_file('json')
    print(csv_file_name)

    # Values of the grid. The alternatives listed in the comments are the
    # ones that were left commented out in the code.
    # epsilon: [0.001, 0.005, 0.01, 0.03, 0.05, 0.07, 0.1, 0.35, 0.5]
    epsilon_values = [0.05]
    alpha_values = [0.0]
    # num topics: [5, 10, 35, 50, 75, 100, 150, 200, 300, 400, 500, 600,
    #              700, 800], [10, 20, 30, 50, 75, 100, 150, 300], [150, 300]
    topic_counts = range(1, 51)
    bow_types = ['NN']
    # document level: ['review', 'sentence', 1]
    document_levels = [1]
    # topic weighting: ['binary', 'probability']
    weighting_methods = ['probability']
    # review type: ['specific', 'generic', 'all_reviews']
    review_types = ['specific']
    # passes: [1, 10, 20, 50, 75, 100, 200, 500], [1, 10]
    passes_values = [100]
    # iterations: [50, 100, 200, 400, 800, 2000], [50, 100, 200, 500]
    iterations_values = [200]
    # model type: ['lda', 'nmf']
    model_types = ['nmf']

    # Materialize the grid so that its size is known before the first cycle.
    parameter_grid = list(itertools.product(
        epsilon_values, alpha_values, topic_counts, document_levels,
        weighting_methods, review_types, passes_values, iterations_values,
        model_types, bow_types))
    num_cycles = len(parameter_grid)

    for cycle_index, parameters in enumerate(parameter_grid, 1):
        (epsilon, alpha, num_topics, document_level, topic_weighting_method,
         review_type, lda_passes, lda_iterations, topic_model_type,
         bow_type) = parameters

        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))

        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.DOCUMENT_LEVEL_FIELD: document_level,
            Constants.TOPIC_WEIGHTING_METHOD_FIELD: topic_weighting_method,
            Constants.CONTEXT_EXTRACTOR_ALPHA_FIELD: alpha,
            Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD: epsilon,
            Constants.TOPIC_MODEL_REVIEW_TYPE_FIELD: review_type,
            Constants.TOPIC_MODEL_PASSES_FIELD: lda_passes,
            Constants.TOPIC_MODEL_ITERATIONS_FIELD: lda_iterations,
            Constants.TOPIC_MODEL_TYPE_FIELD: topic_model_type,
            Constants.BOW_TYPE_FIELD: bow_type
        }
        print(new_dict)
        Constants.update_properties(new_dict)

        # One row per cycle: the active properties plus the analysis.
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=True))
        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)
def main():
    """Parse the command line, apply the overrides and run one full cycle.

    Every option is optional. The options that are present override the
    matching global property; the algorithm (default 'libfm') selects
    between the libFM cycle and the CARSKit cycle.

    :raises ValueError: if the algorithm is neither 'libfm' nor prefixed
        with 'carskit_'
    """
    # (short flag, long flag, type, metavar, help) in declaration order.
    option_table = [
        ('-k', '--numtopics', int, 'int',
         'The number of topics of the topic model'),
        ('-i', '--itemtype', str, 'string',
         'The type of items'),
        ('-s', '--strategy', str, 'string',
         'The evaluation strategy (user_test or rel_plus_n)'),
        ('-e', '--evaluationset', str, 'string',
         'The evaluation set'),
        ('-cf', '--contextformat', str, 'string',
         'The strategy to extract the contextual information'),
        ('-a', '--algorithm', str, 'string',
         'The algorithm used to produce recommendations'),
        ('-cp', '--carskitparams', str, 'string',
         'The hyperparameters for the CARSKit model'),
    ]
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, value_type, value_name, text in option_table:
        parser.add_argument(
            short_flag, long_flag, metavar=value_name, type=value_type,
            nargs=1, help=text)
    args = parser.parse_args()

    def single_value(values, default=None):
        # nargs=1 yields a one-element list, or None if the option is absent.
        return default if values is None else values[0]

    num_topics = single_value(args.numtopics)
    item_type = single_value(args.itemtype)
    strategy = single_value(args.strategy)
    evaluation_set = single_value(args.evaluationset)
    context_format = single_value(args.contextformat)
    algorithm = single_value(args.algorithm, 'libfm')
    carskit_params = single_value(args.carskitparams)

    # Each override is applied with its own update_properties() call.
    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})
    if item_type is not None:
        Constants.update_properties(
            {Constants.BUSINESS_TYPE_FIELD: item_type})
    if context_format is not None:
        Constants.update_properties(
            {Constants.CONTEXT_FORMAT_FIELD: context_format})
    if strategy is not None:
        Constants.update_properties(
            {Constants.RIVAL_EVALUATION_STRATEGY_FIELD: strategy})
    if carskit_params is not None:
        Constants.update_properties(
            {Constants.CARSKIT_PARAMETERS_FIELD: carskit_params})

    if algorithm == 'libfm':
        full_cycle_libfm(evaluation_set)
        return
    if not algorithm.startswith('carskit_'):
        raise ValueError("Unknown algorithm '%s'" % algorithm)

    # 'carskit_<name>': the text following the prefix names the recommender.
    carskit_recommender = algorithm.split('carskit_')[1]
    Constants.update_properties({'carskit_recommenders': carskit_recommender})
    full_cycle_carskit(evaluation_set)
# main(): run the CARSKit evaluation for the recommenders in `recommenders`,
# which is bound to `all_recommenders`; that list is defined in two variants
# selected by Constants.CARSKIT_ITEM_RANKING. The code stores a recommender
# name under the 'carskit_recommenders' property, calls
# modify_properties_file(fold) and full_cycle(fold) for the folds in
# range(Constants.CROSS_VALIDATION_NUM_FOLDS), and prints the elapsed time as
# "Cycle time". The other lists (slow_recommenders, rating_recommenders,
# rank_recommenders, rank_only_recommenders) are assigned but not read.
# NOTE(review): this definition is stored collapsed onto two physical lines,
# so the inline `#` markers no longer show where each commented-out entry
# ends. The exact set of active recommender names and the nesting of the fold
# loop cannot be confirmed from this text -- restore the original line breaks
# before changing the code.
def main(): # modify_properties_file() # run_carskit() if Constants.CARSKIT_ITEM_RANKING: all_recommenders = [ 'globalavg', 'useravg', 'itemavg', 'useritemavg', 'slopeone', 'pmf', 'bpmf', 'biasedmf', 'nmf', 'slim', # 'bpr', # 'rankals', 'ranksgd', 'bpr', 'lrmf', 'camf_ci', 'camf_cu', # 'camf_c', 'camf_cuci', 'cslim_c', 'cslim_ci', 'cslim_cu', # 'cslim_cuci', # 'camf_ics', ##'camf_lcs', 'camf_mcs', 'cslim_ics', 'cslim_lcs', ##'cslim_mcs', 'gcslim_ics' ] else: all_recommenders = [ 'globalavg', 'useravg', 'itemavg', 'useritemavg', 'slopeone', 'pmf', 'biasedmf', 'nmf', 'camf_ci', 'camf_cu', # 'camf_c', 'camf_cuci', 'bpmf', ] slow_recommenders = [ # 'contextavg', 'itemcontextavg', 'usercontextavg', # broken without context ## 'itemknn', ## 'userknn', ## 'cptf', ## 'gcslim_cc', ## 'gcslim_lcs', ## 'gcslim_mcs', ## 'fm' ] rating_recommenders = [ 'globalavg', 'useravg', 'itemavg', 'useritemavg', # 'contextavg', 'itemcontextavg', 'usercontextavg', 'itemknn', 'userknn', 'slopeone', 'pmf', 'biasedmf', 'nmf', 'cptf', 'camf_ci', 'camf_cu', 'camf_c', 'camf_cuci', 'camf_ics', 'camf_lcs', 'camf_mcs', 'fm' ] rank_recommenders = [ 'globalavg', 'useravg', 'itemavg', 'useritemavg', # 'contextavg', 'itemcontextavg', 'usercontextavg', 'itemknn', 'userknn', 'slopeone', 'pmf', 'bpmf', 'biasedmf', 'nmf', 'slim', 'bpr', # 'rankals', 'ranksgd', 'lrmf', 'cptf', 'camf_ci', 'camf_cu', 'camf_c', 'camf_cuci', 'cslim_c', 'cslim_ci', 'cslim_cu', # 'cslim_cuci', 'gcslim_cc', # 'camf_ics', 'camf_lcs', 'camf_mcs', 'cslim_ics', 'cslim_lcs', 'cslim_mcs', 'gcslim_ics', 'gcslim_lcs', 'gcslim_mcs', 'fm' ] rank_only_recommenders = [ 'slim', 'bpr', # 'rankals', 'ranksgd', 'lrmf', 'cslim_c', 'cslim_ci', 'cslim_cu', # 'cslim_cuci', 'gcslim_cc', 'cslim_ics', 'cslim_lcs', 'cslim_mcs', 'gcslim_ics', 'gcslim_lcs', 'gcslim_mcs' ] recommenders = all_recommenders print('num recommenders: %d' % len(recommenders)) index = 1 for recommender in recommenders: print('cycle %d/%d' % (index, len(recommenders))) 
print('Recommender: %s' % recommender) Constants.update_properties({'carskit_recommenders': recommender}) index += 1 cycle_start = time.time() num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS for fold in range(num_folds): modify_properties_file(fold) full_cycle(fold) cycle_end = time.time() cycle_time = cycle_end - cycle_start print("Cycle time = %f seconds" % cycle_time)
def old_main():
    """Run the full CARSKit cycle on every fold for the 'CAMF_CU' recommender.

    The recommender name is stored under the 'carskit_recommenders'
    property, ``full_cycle`` is called once per cross-validation fold and
    the elapsed wall-clock time of the cycle is printed.

    The recommender lists that used to be built here were never read,
    because ``recommenders`` is hard-coded, so they have been removed.
    """
    # Recommender families named in the original notes of this function:
    # baseline-Avg recommender: GlobalAvg, UserAvg, ItemAvg, UserItemAvg
    # baseline-Context average recommender: ContextAvg, ItemContextAvg,
    #   UserContextAvg
    # baseline-CF recommender: ItemKNN, UserKNN, SlopeOne, PMF, BPMF,
    #   BiasedMF, NMF, SVD++
    # baseline-Top-N ranking recommender: SLIM, BPR, RankALS, RankSGD, LRMF
    # CARS - splitting approaches: UserSplitting, ItemSplitting, UISplitting;
    #   algorithm options: e.g., usersplitting -traditional biasedmf
    #   -minlenu 2 -minleni 2
    # CARS - filtering approaches: SPF, DCR, DCW
    # CARS - independent models: CPTF
    # CARS - dependent-dev models: CAMF_CI, CAMF_CU, CAMF_C, CAMF_CUCI,
    #   CSLIM_C, CSLIM_CI, CSLIM_CU, CSLIM_CUCI, GCSLIM_CC
    # CARS - dependent-sim models: CAMF_ICS, CAMF_LCS, CAMF_MCS, CSLIM_ICS,
    #   CSLIM_LCS, CSLIM_MCS, GCSLIM_ICS, GCSLIM_LCS, GCSLIM_MCS
    recommenders = ['CAMF_CU']
    print('num recommenders: %d' % len(recommenders))

    for index, recommender in enumerate(recommenders, 1):
        print('cycle %d/%d' % (index, len(recommenders)))
        print('Recommender: %s' % recommender)
        Constants.update_properties({'carskit_recommenders': recommender})

        cycle_start = time.time()
        # The number of folds is read after the properties were updated.
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        for fold in range(num_folds):
            # modify_properties_file(fold) is left disabled here.
            full_cycle(fold)
        cycle_time = time.time() - cycle_start
        print("Cycle time = %f seconds" % cycle_time)
def old_main():
    """Run the full CARSKit cycle on every fold for the 'CAMF_CU' recommender.

    The recommender name is stored under the 'carskit_recommenders'
    property, ``full_cycle`` is called once per cross-validation fold and
    the elapsed wall-clock time of the cycle is printed.

    The recommender lists that used to be built here were never read,
    because ``recommenders`` is hard-coded, so they have been removed.
    """
    # Recommender families named in the original notes of this function:
    # baseline-Avg recommender: GlobalAvg, UserAvg, ItemAvg, UserItemAvg
    # baseline-Context average recommender: ContextAvg, ItemContextAvg,
    #   UserContextAvg
    # baseline-CF recommender: ItemKNN, UserKNN, SlopeOne, PMF, BPMF,
    #   BiasedMF, NMF, SVD++
    # baseline-Top-N ranking recommender: SLIM, BPR, RankALS, RankSGD, LRMF
    # CARS - splitting approaches: UserSplitting, ItemSplitting, UISplitting;
    #   algorithm options: e.g., usersplitting -traditional biasedmf
    #   -minlenu 2 -minleni 2
    # CARS - filtering approaches: SPF, DCR, DCW
    # CARS - independent models: CPTF
    # CARS - dependent-dev models: CAMF_CI, CAMF_CU, CAMF_C, CAMF_CUCI,
    #   CSLIM_C, CSLIM_CI, CSLIM_CU, CSLIM_CUCI, GCSLIM_CC
    # CARS - dependent-sim models: CAMF_ICS, CAMF_LCS, CAMF_MCS, CSLIM_ICS,
    #   CSLIM_LCS, CSLIM_MCS, GCSLIM_ICS, GCSLIM_LCS, GCSLIM_MCS
    recommenders = ['CAMF_CU']
    print('num recommenders: %d' % len(recommenders))

    for index, recommender in enumerate(recommenders, 1):
        print('cycle %d/%d' % (index, len(recommenders)))
        print('Recommender: %s' % recommender)
        Constants.update_properties({'carskit_recommenders': recommender})

        cycle_start = time.time()
        # The number of folds is read after the properties were updated.
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        for fold in range(num_folds):
            # modify_properties_file(fold) is left disabled here.
            full_cycle(fold)
        cycle_time = time.time() - cycle_start
        print("Cycle time = %f seconds" % cycle_time)
def main():
    """Parse the command line, apply the overrides and run one full cycle.

    Every option is optional. The options that are present override the
    matching global property; the algorithm (default 'libfm') selects
    between the libFM cycle and the CARSKit cycle.

    :raises ValueError: if the algorithm is neither 'libfm' nor prefixed
        with 'carskit_'
    """
    # (short flag, long flag, type, metavar, help) in declaration order.
    option_table = [
        ('-k', '--numtopics', int, 'int',
         'The number of topics of the topic model'),
        ('-i', '--itemtype', str, 'string',
         'The type of items'),
        ('-s', '--strategy', str, 'string',
         'The evaluation strategy (user_test or rel_plus_n)'),
        ('-e', '--evaluationset', str, 'string',
         'The evaluation set'),
        ('-cf', '--contextformat', str, 'string',
         'The strategy to extract the contextual information'),
        ('-a', '--algorithm', str, 'string',
         'The algorithm used to produce recommendations'),
        ('-cp', '--carskitparams', str, 'string',
         'The hyperparameters for the CARSKit model'),
    ]
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, value_type, value_name, text in option_table:
        parser.add_argument(
            short_flag, long_flag, metavar=value_name, type=value_type,
            nargs=1, help=text)
    args = parser.parse_args()

    def single_value(values, default=None):
        # nargs=1 yields a one-element list, or None if the option is absent.
        return default if values is None else values[0]

    num_topics = single_value(args.numtopics)
    item_type = single_value(args.itemtype)
    strategy = single_value(args.strategy)
    evaluation_set = single_value(args.evaluationset)
    context_format = single_value(args.contextformat)
    algorithm = single_value(args.algorithm, 'libfm')
    carskit_params = single_value(args.carskitparams)

    # Each override is applied with its own update_properties() call.
    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})
    if item_type is not None:
        Constants.update_properties(
            {Constants.BUSINESS_TYPE_FIELD: item_type})
    if context_format is not None:
        Constants.update_properties(
            {Constants.CONTEXT_FORMAT_FIELD: context_format})
    if strategy is not None:
        Constants.update_properties(
            {Constants.RIVAL_EVALUATION_STRATEGY_FIELD: strategy})
    if carskit_params is not None:
        Constants.update_properties(
            {Constants.CARSKIT_PARAMETERS_FIELD: carskit_params})

    if algorithm == 'libfm':
        full_cycle_libfm(evaluation_set)
        return
    if not algorithm.startswith('carskit_'):
        raise ValueError("Unknown algorithm '%s'" % algorithm)

    # 'carskit_<name>': the text following the prefix names the recommender.
    carskit_recommender = algorithm.split('carskit_')[1]
    Constants.update_properties({'carskit_recommenders': carskit_recommender})
    full_cycle_carskit(evaluation_set)
def main():
    """Analyze the topic model for every combination of a parameter grid.

    For each combination the global properties are updated, the topics are
    analyzed (stability included) and the properties merged with the
    analysis results are written to the CSV and JSON result files.
    """
    def results_file(extension):
        # Both result files share every naming argument but the extension.
        return Constants.generate_file_name(
            'topic_model_analysis', extension, Constants.RESULTS_FOLDER,
            None, None, False)

    csv_file_name = results_file('csv')
    json_file_name = results_file('json')
    print(csv_file_name)

    # Values of the grid. The alternatives listed in the comments are the
    # ones that were left commented out in the code.
    # epsilon: [0.001, 0.005, 0.01, 0.03, 0.05, 0.07, 0.1, 0.35, 0.5]
    epsilon_values = [0.05]
    alpha_values = [0.0]
    # num topics: [5, 10, 35, 50, 75, 100, 150, 200, 300, 400, 500, 600,
    #              700, 800], [10, 20, 30, 50, 75, 100, 150, 300], [150, 300]
    topic_counts = range(1, 51)
    bow_types = ['NN']
    # document level: ['review', 'sentence', 1]
    document_levels = [1]
    # topic weighting: ['binary', 'probability']
    weighting_methods = ['probability']
    # review type: ['specific', 'generic', 'all_reviews']
    review_types = ['specific']
    # passes: [1, 10, 20, 50, 75, 100, 200, 500], [1, 10]
    passes_values = [100]
    # iterations: [50, 100, 200, 400, 800, 2000], [50, 100, 200, 500]
    iterations_values = [200]
    # model type: ['lda', 'nmf']
    model_types = ['nmf']

    # Materialize the grid so that its size is known before the first cycle.
    parameter_grid = list(itertools.product(
        epsilon_values, alpha_values, topic_counts, document_levels,
        weighting_methods, review_types, passes_values, iterations_values,
        model_types, bow_types))
    num_cycles = len(parameter_grid)

    for cycle_index, parameters in enumerate(parameter_grid, 1):
        (epsilon, alpha, num_topics, document_level, topic_weighting_method,
         review_type, lda_passes, lda_iterations, topic_model_type,
         bow_type) = parameters

        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))

        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.DOCUMENT_LEVEL_FIELD: document_level,
            Constants.TOPIC_WEIGHTING_METHOD_FIELD: topic_weighting_method,
            Constants.CONTEXT_EXTRACTOR_ALPHA_FIELD: alpha,
            Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD: epsilon,
            Constants.TOPIC_MODEL_REVIEW_TYPE_FIELD: review_type,
            Constants.TOPIC_MODEL_PASSES_FIELD: lda_passes,
            Constants.TOPIC_MODEL_ITERATIONS_FIELD: lda_iterations,
            Constants.TOPIC_MODEL_TYPE_FIELD: topic_model_type,
            Constants.BOW_TYPE_FIELD: bow_type
        }
        print(new_dict)
        Constants.update_properties(new_dict)

        # One row per cycle: the active properties plus the analysis.
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=True))
        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)