def cli_main():
    """Command-line entry point for the topic model analysis.

    Parses an optional ``-t``/``--numtopics`` integer. When it is supplied,
    the value is stored under ``Constants.TOPIC_MODEL_NUM_TOPICS_FIELD``
    before the analysis runs. The output of ``analyze_topics`` (with
    stability included) is merged into a copy of the current properties and
    handed to ``write_results_to_csv`` and ``write_results_to_json``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int, nargs=1,
        help='The number of topics of the topic model')
    parsed_args = arg_parser.parse_args()

    # nargs=1 makes argparse store a one-element list, so unwrap it here.
    if parsed_args.numtopics is not None:
        requested_topics = parsed_args.numtopics[0]
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: requested_topics})

    analysis = Constants.get_properties_copy()
    analysis.update(analyze_topics(include_stability=True))

    # Both file names are generated only once the analysis has finished.
    csv_path, json_path = [
        Constants.generate_file_name(
            'topic_model_analysis', extension, Constants.RESULTS_FOLDER,
            None, None, False)
        for extension in ('csv', 'json')
    ]
    write_results_to_csv(csv_path, analysis)
    write_results_to_json(json_path, analysis)
def manual_main():
    """Run the topic model analysis for a hard-coded list of topic counts.

    The list currently holds only ``Constants.TOPIC_MODEL_NUM_TOPICS``. For
    each entry the properties are updated, ``analyze_topics`` is run without
    stability, and the merged results are passed to ``write_results_to_csv``
    and ``write_results_to_json`` with file names generated up front.
    """
    csv_path = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER,
        None, None, False)
    json_path = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER,
        None, None, False)
    print(json_path)
    print(csv_path)

    topic_counts = [Constants.TOPIC_MODEL_NUM_TOPICS]
    total_cycles = len(topic_counts)

    for position, topic_count in enumerate(topic_counts, 1):
        print('\ncycle_index: %d/%d' % (position, total_cycles))
        overrides = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count}
        print(overrides)
        Constants.update_properties(overrides)

        merged = Constants.get_properties_copy()
        merged.update(analyze_topics(include_stability=False))
        write_results_to_csv(csv_path, merged)
        write_results_to_json(json_path, merged)
def cli_main():
    """Analyze the topic model, optionally overriding the number of topics.

    The ``-t``/``--numtopics`` command-line option is optional. If present,
    its integer value replaces the property stored under
    ``Constants.TOPIC_MODEL_NUM_TOPICS_FIELD``. The current properties,
    extended with the output of ``analyze_topics(include_stability=True)``,
    are then passed to ``write_results_to_csv`` and
    ``write_results_to_json``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t', '--numtopics', metavar='int', type=int, nargs=1,
        help='The number of topics of the topic model')
    options = cli.parse_args()

    # With nargs=1 the parsed value is a single-item list (or None).
    topics_option = options.numtopics
    if topics_option is not None:
        override = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topics_option[0]}
        Constants.update_properties(override)

    report = Constants.get_properties_copy()
    report.update(analyze_topics(include_stability=True))

    def output_path(extension):
        # Same base name and folder for every output format.
        return Constants.generate_file_name(
            'topic_model_analysis', extension, Constants.RESULTS_FOLDER,
            None, None, False)

    csv_target = output_path('csv')
    json_target = output_path('json')
    write_results_to_csv(csv_target, report)
    write_results_to_json(json_target, report)
def manual_main():
    """Analyze the topic model once per entry of a fixed topic-count list.

    File names for the CSV and JSON outputs are generated and printed first.
    Then, for every topic count (at present just
    ``Constants.TOPIC_MODEL_NUM_TOPICS``), the properties are updated, the
    analysis is run with ``include_stability=False`` and the combined
    dictionary is passed to both result writers.
    """
    def output_path(extension):
        # Same base name and folder for every output format.
        return Constants.generate_file_name(
            'topic_model_analysis', extension, Constants.RESULTS_FOLDER,
            None, None, False)

    csv_target = output_path('csv')
    json_target = output_path('json')
    print(json_target)
    print(csv_target)

    candidates = [Constants.TOPIC_MODEL_NUM_TOPICS]
    how_many = len(candidates)

    for cycle_number, candidate in enumerate(candidates, start=1):
        print('\ncycle_index: %d/%d' % (cycle_number, how_many))
        property_update = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: candidate,
        }
        print(property_update)
        Constants.update_properties(property_update)

        report = Constants.get_properties_copy()
        report.update(analyze_topics(include_stability=False))
        write_results_to_csv(csv_target, report)
        write_results_to_json(json_target, report)
def summarize_results(metrics_list):
    """Average the collected metrics, print a report and save the summary.

    :param metrics_list: sequence of mappings, each indexed by the name in
        ``Constants.EVALUATION_METRIC`` and by that name prefixed with the
        specific, generic, has-context and has-no-context constants
    :return: a copy of the properties extended with the averaged metrics,
        the standard deviation of the main metric and a timestamp
    """
    base_name = Constants.EVALUATION_METRIC
    specific_key = Constants.SPECIFIC + '_' + base_name
    generic_key = Constants.GENERIC + '_' + base_name
    has_context_key = Constants.HAS_CONTEXT + '_' + base_name
    has_no_context_key = Constants.HAS_NO_CONTEXT + '_' + base_name

    def values_of(key):
        # One value per entry of metrics_list for the given metric key.
        return [entry[key] for entry in metrics_list]

    overall_mean = numpy.mean(values_of(base_name))
    overall_stdev = numpy.std(values_of(base_name))
    specific_mean = numpy.mean(values_of(specific_key))
    generic_mean = numpy.mean(values_of(generic_key))
    has_context_mean = numpy.mean(values_of(has_context_key))
    has_no_context_mean = numpy.mean(values_of(has_no_context_key))

    report_lines = [
        ('average %s:\t\t\t%f', overall_mean),
        ('average specific %s:\t%f', specific_mean),
        ('average generic %s:\t%f', generic_mean),
        ('average has context %s:\t%f', has_context_mean),
        ('average has no context %s:\t%f', has_no_context_mean),
    ]
    for template, value in report_lines:
        print(template % (base_name, value))

    # The deviation is also shown as a percentage of the average.
    stdev_percentage = overall_stdev / overall_mean * 100
    print('standard deviation %s:\t%f (%f%%)' %
          (base_name, overall_stdev, stdev_percentage))
    print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    results = Constants.get_properties_copy()
    summary_entries = [
        (base_name, overall_mean),
        (specific_key, specific_mean),
        (generic_key, generic_mean),
        (has_context_key, has_context_mean),
        (has_no_context_key, has_no_context_mean),
        (base_name + '_stdev', overall_stdev),
    ]
    for key, value in summary_entries:
        results[key] = value
    results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

    write_results_to_csv(results)
    write_results_to_json(results)

    return results
def save_results(results):
    """Extend each result with the current properties and save it as JSON.

    Every dictionary in ``results`` is updated in place with a copy of the
    ``Constants`` properties (property values replace clashing keys) and is
    then passed to ``write_results_to_json`` together with the
    'carskit_results' JSON file name generated under ``OUTPUT_FOLDER``.

    :type results: list[dict]
    :param results: the results to extend and save
    """
    current_properties = Constants.get_properties_copy()
    target_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)

    for entry in results:
        entry.update(current_properties)
        write_results_to_json(target_file, entry)
def full_cycle(metric):
    """Evaluate the topic model for ``metric`` and store the outcome.

    :param metric: passed to ``evaluate_topic_model`` and also used as the
        first argument of ``Constants.generate_file_name`` for the CSV and
        JSON result files
    """
    def results_path(extension):
        # Same metric and results folder for every output format.
        return Constants.generate_file_name(
            metric, extension, Constants.RESULTS_FOLDER, None, None, False)

    csv_path = results_path('csv')
    json_path = results_path('json')
    print(json_path)
    print(csv_path)

    # The properties are copied before the evaluation runs.
    snapshot = Constants.get_properties_copy()
    outcome = evaluate_topic_model(metric)
    print(outcome)

    # Properties are merged last, so they replace clashing result keys.
    outcome.update(snapshot)
    ETLUtils.write_row_to_csv(csv_path, outcome)
    ETLUtils.write_row_to_json(json_path, outcome)
def full_cycle(metric):
    """Run one topic model evaluation and write its row to CSV and JSON.

    :param metric: forwarded to ``evaluate_topic_model``; it is also the
        first argument given to ``Constants.generate_file_name`` when the
        two output file names are built
    """
    csv_target, json_target = [
        Constants.generate_file_name(
            metric, extension, Constants.RESULTS_FOLDER, None, None, False)
        for extension in ('csv', 'json')
    ]
    print(json_target)
    print(csv_target)

    # Copy the properties first, then evaluate.
    property_copy = Constants.get_properties_copy()
    evaluation = evaluate_topic_model(metric)
    print(evaluation)

    # Updating with the properties last lets them replace clashing keys.
    evaluation.update(property_copy)
    ETLUtils.write_row_to_csv(csv_target, evaluation)
    ETLUtils.write_row_to_json(json_target, evaluation)
def save_results(results):
    """Merge the current properties into every result and write it as JSON.

    Each dictionary of ``results`` is modified in place: the copy of the
    ``Constants`` properties is merged into it, replacing any clashing
    keys. Each merged dictionary is then handed to
    ``write_results_to_json`` with the 'carskit_results' JSON file name
    generated under ``OUTPUT_FOLDER``.

    :type results: list[dict]
    :param results: the results to extend and save
    """
    json_target = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)
    shared_properties = Constants.get_properties_copy()

    for single_result in results:
        single_result.update(shared_properties)
        write_results_to_json(json_target, single_result)
def tune_parameters():
    """Build the hyperopt search space and report on the stored trials.

    Connects to a ``MongoTrials`` store whose URL is derived from the item
    type, the context flag and the nested cross-validation cycle, builds the
    search space from the current properties, and prints the losses, the
    best trial and the number of trials. The ``fmin`` call is commented out,
    so this function does not start any new evaluation itself.
    """
    # trials = Trials()
    from utils.constants import Constants

    context_suffix = '_context' if Constants.USE_CONTEXT else '_nocontext'
    cycle_suffix = '_' + str(Constants.NESTED_CROSS_VALIDATION_CYCLE)
    mongo_url = ''.join([
        'mongo://localhost:1234/', Constants.ITEM_TYPE, context_suffix,
        '_db_nested', cycle_suffix, '/jobs'])
    trials = MongoTrials(mongo_url, exp_key='exp1')
    print('Connected to %s' % mongo_url)

    # Only the FM iterations and the FM number of factors are sampled; all
    # the other entries are fixed values taken from Constants.
    search_params = Constants.get_properties_copy()
    search_params[Constants.BUSINESS_TYPE_FIELD] = Constants.ITEM_TYPE
    search_params[Constants.TOPN_NUM_ITEMS_FIELD] = Constants.TOPN_NUM_ITEMS
    search_params[Constants.NESTED_CROSS_VALIDATION_CYCLE_FIELD] = \
        Constants.NESTED_CROSS_VALIDATION_CYCLE
    search_params[Constants.FM_ITERATIONS_FIELD] = hp.quniform(
        Constants.FM_ITERATIONS_FIELD, 1, 500, 1)
    search_params[Constants.FM_NUM_FACTORS_FIELD] = hp.quniform(
        Constants.FM_NUM_FACTORS_FIELD, 0, 200, 1)
    search_params[Constants.USE_CONTEXT_FIELD] = Constants.USE_CONTEXT

    space = hp.choice(Constants.USE_CONTEXT_FIELD, [search_params])

    if not Constants.USE_CONTEXT:
        # Without context, the context/topic-model arguments are removed
        # from the search space.
        context_only_fields = (
            Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD,
            Constants.TOPIC_MODEL_ITERATIONS_FIELD,
            Constants.TOPIC_MODEL_PASSES_FIELD,
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD,
        )
        named_args = space.pos_args[1].named_args
        # Iterate over a copy because entries are removed while looping.
        for named_arg in list(named_args):
            if named_arg[0] in context_only_fields:
                named_args.remove(named_arg)

    # best = fmin(
    #     run_recommender, space=space, algo=tpe.suggest,
    #     max_evals=100, trials=trials)

    print('losses', sorted(trials.losses()))
    best_trial = trials.best_trial
    print('best', best_trial['result']['loss'], best_trial['misc']['vals'])
    print('num trials: %d' % len(trials.losses()))
def tune_parameters():
    """Assemble the hyperopt search space and print the stored trial results.

    A ``MongoTrials`` object is created for a URL composed from
    ``Constants.ITEM_TYPE``, the context flag and the nested
    cross-validation cycle. The search space wraps the current properties
    plus two sampled factorization machine settings. Because the ``fmin``
    call is commented out, the function only reports what the trials object
    already holds.
    """
    # trials = Trials()
    from utils.constants import Constants

    if Constants.USE_CONTEXT:
        context_name = '_context'
    else:
        context_name = '_nocontext'
    cycle_name = '_' + str(Constants.NESTED_CROSS_VALIDATION_CYCLE)
    url_parts = [
        'mongo://localhost:1234/',
        Constants.ITEM_TYPE,
        context_name,
        '_db_nested',
        cycle_name,
        '/jobs',
    ]
    mongo_url = ''.join(url_parts)
    trials = MongoTrials(mongo_url, exp_key='exp1')
    print('Connected to %s' % mongo_url)

    # Sampled settings: FM iterations and FM number of factors. Everything
    # else in the space is a fixed value.
    iterations_dist = hp.quniform(Constants.FM_ITERATIONS_FIELD, 1, 500, 1)
    factors_dist = hp.quniform(Constants.FM_NUM_FACTORS_FIELD, 0, 200, 1)

    space_params = Constants.get_properties_copy()
    space_params.update({
        Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE,
        Constants.TOPN_NUM_ITEMS_FIELD: Constants.TOPN_NUM_ITEMS,
        Constants.NESTED_CROSS_VALIDATION_CYCLE_FIELD:
            Constants.NESTED_CROSS_VALIDATION_CYCLE,
        Constants.FM_ITERATIONS_FIELD: iterations_dist,
        Constants.FM_NUM_FACTORS_FIELD: factors_dist,
        Constants.USE_CONTEXT_FIELD: Constants.USE_CONTEXT,
    })

    space = hp.choice(Constants.USE_CONTEXT_FIELD, [space_params])

    if not Constants.USE_CONTEXT:
        # These arguments are removed from the space when context is off.
        to_drop = [
            Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD,
            Constants.TOPIC_MODEL_ITERATIONS_FIELD,
            Constants.TOPIC_MODEL_PASSES_FIELD,
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD,
        ]
        argument_list = space.pos_args[1].named_args
        # Loop over a snapshot since the list shrinks during the loop.
        for argument in argument_list[:]:
            if argument[0] in to_drop:
                argument_list.remove(argument)

    # best = fmin(
    #     run_recommender, space=space, algo=tpe.suggest,
    #     max_evals=100, trials=trials)

    print('losses', sorted(trials.losses()))
    winner = trials.best_trial
    print('best', winner['result']['loss'], winner['misc']['vals'])
    print('num trials: %d' % len(trials.losses()))
def perform_cross_validation(self):
    """Run repeated k-fold cross validation and append the averages to a CSV.

    For every cycle (``Constants.NUM_CYCLES``) and every fold
    (``Constants.CROSS_VALIDATION_NUM_FOLDS``) the records and reviews are
    split into train and test parts and exported. When
    ``Constants.USE_CONTEXT`` is set, the word model is trained and the
    review topics are found. Each fold then goes through
    prepare/predict/evaluate. Recall, specific recall, generic recall and
    fold time are averaged over all folds, printed, and written as one row
    to ``Constants.CSV_RESULTS_FILE``; the header row is written only when
    that file does not exist yet.
    """
    Constants.print_properties()
    utilities.plant_seeds()

    recall_sum = 0.0
    specific_recall_sum = 0.0
    generic_recall_sum = 0.0
    elapsed_sum = 0.0

    cycle_count = Constants.NUM_CYCLES
    fold_count = Constants.CROSS_VALIDATION_NUM_FOLDS
    iteration_count = cycle_count * fold_count
    # Fraction of the data that goes to the training side of each fold.
    train_fraction = 1 - (1 / float(fold_count))

    self.load()

    for cycle_number in range(1, cycle_count + 1):
        print('\n\nCycle: %d/%d' % (cycle_number, cycle_count))

        if Constants.SHUFFLE_DATA:
            self.shuffle()
        # Each cycle starts from fresh copies of the original data.
        self.records = copy.deepcopy(self.original_records)
        self.reviews = copy.deepcopy(self.original_reviews)

        for fold_index in range(fold_count):
            fold_number = fold_index + 1
            started_at = time.time()
            fold_offset = float(fold_index) / fold_count
            print('\nFold: %d/%d' % (fold_number, fold_count))

            self.create_tmp_file_names()
            record_parts = ETLUtils.split_train_test_copy(
                self.records, split=train_fraction, start=fold_offset)
            self.train_records, self.test_records = record_parts
            review_parts = ETLUtils.split_train_test_copy(
                self.reviews, split=train_fraction, start=fold_offset)
            self.train_reviews, self.test_reviews = review_parts
            self.export()

            if Constants.USE_CONTEXT:
                context_model = self.train_word_model()
                self.find_reviews_topics(context_model)

            self.prepare()
            self.predict()
            self.evaluate()

            evaluator = self.top_n_evaluator
            recall_sum += evaluator.recall
            specific_recall_sum += evaluator.specific_recall
            generic_recall_sum += evaluator.generic_recall

            fold_seconds = time.time() - started_at
            elapsed_sum += fold_seconds
            self.clear()
            print("Total fold %d time = %f seconds" %
                  (fold_number, fold_seconds))

    mean_recall = recall_sum / iteration_count
    mean_specific_recall = specific_recall_sum / iteration_count
    mean_generic_recall = generic_recall_sum / iteration_count
    mean_fold_seconds = elapsed_sum / iteration_count

    print('average recall: %f' % mean_recall)
    print('average specific recall: %f' % mean_specific_recall)
    print('average generic recall: %f' % mean_generic_recall)
    print('average cycle time: %f' % mean_fold_seconds)
    print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    results = Constants.get_properties_copy()
    results['recall'] = mean_recall
    results['specific_recall'] = mean_specific_recall
    results['generic_recall'] = mean_generic_recall
    results['cycle_time'] = mean_fold_seconds
    results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

    # A new file is opened with 'wb' and gets a header row; an existing
    # file is opened with 'a' and only receives the data row.
    results_file = Constants.CSV_RESULTS_FILE
    is_new_file = not os.path.exists(results_file)
    open_mode = 'wb' if is_new_file else 'a'
    with open(results_file, open_mode) as csv_handle:
        writer = csv.DictWriter(csv_handle, sorted(results.keys()))
        if is_new_file:
            writer.writeheader()
        writer.writerow(results)
def main():
    """Run the topic model analysis over a grid of property values.

    Every combination of the value lists below is applied to ``Constants``,
    analyzed with ``analyze_topics(include_stability=True)`` and passed to
    ``write_results_to_csv`` and ``write_results_to_json``. With the current
    lists only the number of topics varies (1 to 50).
    """
    csv_path = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER,
        None, None, False)
    json_path = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER,
        None, None, False)
    print(csv_path)

    # export_lda_topics(0, 0)

    # Single-element lists pin a setting; add values to widen the grid.
    epsilon_values = [0.05]
    alpha_values = [0.0]
    topic_counts = range(1, 51)
    bow_types = ['NN']
    document_levels = [1]
    weighting_methods = ['probability']
    review_types = ['specific']
    passes_values = [100]
    iterations_values = [200]
    model_types = ['nmf']

    # The order of the axes fixes both the iteration order of the product
    # and the unpacking order in the loop below.
    grid_axes = [
        epsilon_values, alpha_values, topic_counts, document_levels,
        weighting_methods, review_types, passes_values, iterations_values,
        model_types, bow_types,
    ]

    total_cycles = 1
    for axis in grid_axes:
        total_cycles *= len(axis)

    for position, combination in enumerate(itertools.product(*grid_axes), 1):
        (epsilon, alpha, topic_count, document_level, weighting_method,
         review_type, passes, iterations, model_type, bow_type) = combination

        print('\ncycle_index: %d/%d' % (position, total_cycles))
        overrides = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count,
            Constants.DOCUMENT_LEVEL_FIELD: document_level,
            Constants.TOPIC_WEIGHTING_METHOD_FIELD: weighting_method,
            Constants.CONTEXT_EXTRACTOR_ALPHA_FIELD: alpha,
            Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD: epsilon,
            Constants.TOPIC_MODEL_REVIEW_TYPE_FIELD: review_type,
            Constants.TOPIC_MODEL_PASSES_FIELD: passes,
            Constants.TOPIC_MODEL_ITERATIONS_FIELD: iterations,
            Constants.TOPIC_MODEL_TYPE_FIELD: model_type,
            Constants.BOW_TYPE_FIELD: bow_type,
        }
        print(overrides)
        Constants.update_properties(overrides)

        merged = Constants.get_properties_copy()
        merged.update(analyze_topics(include_stability=True))
        write_results_to_csv(csv_path, merged)
        write_results_to_json(json_path, merged)
def main():
    """Analyze the topic model for every combination of a property grid.

    The grid is the Cartesian product of the candidate lists defined below.
    For each combination the properties in ``Constants`` are updated, the
    analysis is run with stability included, and the merged dictionary is
    handed to the CSV and JSON result writers. At present every list except
    the number of topics (1 to 50) holds a single value.
    """
    def output_path(extension):
        # Same base name and folder for every output format.
        return Constants.generate_file_name(
            'topic_model_analysis', extension, Constants.RESULTS_FOLDER,
            None, None, False)

    csv_target = output_path('csv')
    json_target = output_path('json')
    print(csv_target)

    # export_lda_topics(0, 0)

    # Candidate values per setting; a one-item list keeps the setting fixed.
    epsilons = [0.05]
    alphas = [0.0]
    num_topics_options = range(1, 51)
    bow_type_options = ['NN']
    document_level_options = [1]
    weighting_options = ['probability']
    review_type_options = ['specific']
    passes_options = [100]
    iterations_options = [200]
    model_type_options = ['nmf']

    # Axis order matters: it defines how the product iterates and how each
    # combination is unpacked.
    axes = (
        epsilons, alphas, num_topics_options, document_level_options,
        weighting_options, review_type_options, passes_options,
        iterations_options, model_type_options, bow_type_options,
    )

    expected_cycles = 1
    for candidates in axes:
        expected_cycles *= len(candidates)

    cycle_number = 0
    for (epsilon, alpha, num_topics, document_level, weighting,
         review_type, passes, iterations, model_type,
         bow_type) in itertools.product(*axes):
        cycle_number += 1
        print('\ncycle_index: %d/%d' % (cycle_number, expected_cycles))

        property_update = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.DOCUMENT_LEVEL_FIELD: document_level,
            Constants.TOPIC_WEIGHTING_METHOD_FIELD: weighting,
            Constants.CONTEXT_EXTRACTOR_ALPHA_FIELD: alpha,
            Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD: epsilon,
            Constants.TOPIC_MODEL_REVIEW_TYPE_FIELD: review_type,
            Constants.TOPIC_MODEL_PASSES_FIELD: passes,
            Constants.TOPIC_MODEL_ITERATIONS_FIELD: iterations,
            Constants.TOPIC_MODEL_TYPE_FIELD: model_type,
            Constants.BOW_TYPE_FIELD: bow_type,
        }
        print(property_update)
        Constants.update_properties(property_update)

        report = Constants.get_properties_copy()
        report.update(analyze_topics(include_stability=True))
        write_results_to_csv(csv_target, report)
        write_results_to_json(json_target, report)