def calculate_topic_stability(records):
    """Measure how stable the topic model's top terms are across samples.

    Trains one reference topic model on the full record list, then trains
    ``Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1`` additional models on
    80% random subsamples and compares the resulting top-term rankings.

    :param records: list of review records used to train the topic models
    :return: the stability score produced by ``calculate_stability``
    """
    # Offset the seeds so this experiment doesn't reuse the exact random
    # stream of other runs, then re-plant to make the sampling reproducible.
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # BUG FIX: use floor division so the slice index is an int.
        # `num_records / 2` is a float on Python 3 and raises TypeError;
        # `//` behaves identically on Python 2 ints.
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    all_term_rankings = []

    # Reference model trained on the complete record set.
    context_extractor = \
        topic_model_creator.create_topic_model(records, None, None)
    terms_matrix = get_topic_model_terms(
        context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
    all_term_rankings.append(terms_matrix)

    sample_ratio = 0.8
    print('Total iterations: %d' % Constants.TOPIC_MODEL_STABILITY_ITERATIONS)
    # Remaining models are trained on independent 80% subsamples.
    for _ in range(Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1):
        sampled_records = sample_list(records, sample_ratio)
        context_extractor = \
            topic_model_creator.train_context_extractor(sampled_records)
        terms_matrix = get_topic_model_terms(
            context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
        all_term_rankings.append(terms_matrix)

    return calculate_stability(all_term_rankings)
def full_cycle(self):
    """Run the complete ETL cycle over the review records.

    Either loads previously processed records from cache or executes the
    full preprocessing pipeline (load, transform, clean, lemmatize, filter,
    classify, build corpus/dictionary, export), then prints dataset
    statistics.  The step order below is significant — each step consumes
    the output of the previous one.
    """
    Constants.print_properties()
    print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    utilities.plant_seeds()

    # Reuse the processed-records cache when allowed and present.
    if self.use_cache and \
            os.path.exists(Constants.PROCESSED_RECORDS_FILE):
        print('Records have already been processed')
        self.records = \
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    else:
        self.load_records()

        # Dataset-specific normalization chosen by item type.
        if 'yelp' in Constants.ITEM_TYPE:
            self.transform_yelp_records()
        elif 'fourcity' in Constants.ITEM_TYPE:
            self.transform_fourcity_records()

        # Cleaning/filtering pipeline; each step mutates self.records.
        self.add_integer_ids()
        self.clean_reviews()
        self.remove_duplicate_reviews()
        self.tag_reviews_language()
        self.remove_foreign_reviews()
        self.lemmatize_records()
        self.remove_users_with_low_reviews()
        self.remove_items_with_low_reviews()
        self.count_frequencies()
        self.shuffle_records()
        print('total_records: %d' % len(self.records))
        self.classify_reviews()
        self.build_bag_of_words()
        self.tag_contextual_reviews()
        # self.load_full_records()
        self.build_dictionary()
        self.build_corpus()
        self.label_review_targets()
        self.export_records()
        self.count_specific_generic_ratio()
        # self.export_to_triplet()

    # Dataset statistics (approximate density/sparsity and entity counts).
    rda = ReviewsDatasetAnalyzer(self.records)
    print('density: %f' % rda.calculate_density_approx())
    print('sparsity: %f' % rda.calculate_sparsity_approx())
    print('total_records: %d' % len(self.records))
    user_ids = \
        extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
    item_ids = \
        extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
    print('total users', len(user_ids))
    print('total items', len(item_ids))

    # Optionally split records between topic-model training and recsys use.
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        self.separate_recsys_topic_model_records()
def run_single_fold(self, parameters):
    """Evaluate one cross-validation fold described by ``parameters``.

    Applies the given properties globally, loads the data, carves out the
    requested fold as the test split, optionally attaches topic-model
    context to the reviews, and returns that fold's evaluation metrics.

    :param parameters: property dict; must contain the key ``'fold'``
    :return: the metrics dict produced by ``self.evaluate()``
    """
    fold = parameters['fold']

    Constants.update_properties(parameters)
    Constants.print_properties()
    utilities.plant_seeds()

    self.load()
    base_records = self.original_records

    elapsed_total = 0.0
    fold_count = Constants.CROSS_VALIDATION_NUM_FOLDS
    train_fraction = 1 - (1 / float(fold_count))

    # Work on a private copy so the loaded records stay pristine.
    self.records = copy.deepcopy(base_records)
    if Constants.SHUFFLE_DATA:
        self.shuffle(self.records)

    started_at = time.time()
    test_offset = float(fold) / fold_count
    print('\nFold: %d/%d' % ((fold + 1), fold_count))
    self.create_tmp_file_names(0, fold)
    self.train_records, self.test_records = \
        ETLUtils.split_train_test_copy(
            self.records, split=train_fraction, start=test_offset)
    self.get_records_to_predict(True)

    if Constants.USE_CONTEXT:
        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            # Topics were computed separately; just load them.
            self.load_cache_context_topics(None, None)
        else:
            context_extractor = self.train_topic_model(0, fold)
            self.find_reviews_topics(context_extractor, 0, fold)
    else:
        self.context_rich_topics = []

    self.predict()
    fold_metrics = self.evaluate()

    elapsed = time.time() - started_at
    elapsed_total += elapsed
    self.clear()
    print("Total fold %d time = %f seconds" % ((fold + 1), elapsed))

    return fold_metrics
def full_cycle(self):
    """Run the full preprocessing cycle, reusing cached records if allowed.

    When caching is enabled and the processed-records file exists, the
    records are loaded from disk; otherwise the whole preprocessing
    pipeline runs.  Optionally separates topic-model and recsys records.
    """
    Constants.print_properties()
    print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    utilities.plant_seeds()

    cache_is_usable = (
        self.use_cache and
        os.path.exists(Constants.PROCESSED_RECORDS_FILE))
    if cache_is_usable:
        print('Records have already been processed')
        self.records = ETLUtils.load_json_file(
            Constants.PROCESSED_RECORDS_FILE)
    else:
        self.preprocess()

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        self.separate_recsys_topic_model_records()
def create_topic_model(num_topics):
    """Build the ensemble topic model for the requested number of topics.

    Adjusts the random seeds and the topic count in the global properties,
    then runs the k-fold generation and NMF combination steps — unless the
    final factors file for this topic count already exists on disk.

    :param num_topics: number of topics for the ensemble model
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    updated_properties = {
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10,
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics
    }
    Constants.update_properties(updated_properties)
    utilities.plant_seeds()
    Constants.print_properties()

    results_file = Constants.ENSEMBLED_RESULTS_FOLDER + \
        "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS
    if os.path.exists(results_file):
        # Skip: a previous run already produced this model's factors.
        print('Ensemble topic model already exists')
        return

    # topic_ensemble_caller.run_local_parse_directory()
    topic_ensemble_caller.run_generate_kfold()
    topic_ensemble_caller.run_combine_nmf()
def create_single_topic_model(cycle_index, fold_index, check_exists=True):
    """Train a topic model for one (cycle, fold) of the cross-validation.

    Loads the processed records, optionally reduces them according to the
    nested cross-validation strategy, replays the shuffles of the previous
    cycles to reach this cycle's record order, splits out this fold's
    training records, and delegates to ``create_topic_model``.

    :param cycle_index: zero-based cross-validation cycle
    :param fold_index: zero-based fold within the cycle
    :param check_exists: forwarded to ``create_topic_model``
    :raises ValueError: if the separate-reviews property is enabled, or if
        the cross-validation strategy is unknown
    """
    Constants.print_properties()
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        msg = 'This function shouldn\'t be used when the ' \
              'separate_topic_model_recsys_reviews property is set to True'
        raise ValueError(msg)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.CROSS_VALIDATION_STRATEGY == 'nested_test':
        # Use the full record set.
        pass
    elif Constants.CROSS_VALIDATION_STRATEGY == 'nested_validate':
        # Keep only the outer-cycle training portion of the records.
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        records, _ = ETLUtils.split_train_test(records, split, cv_start)
    else:
        raise ValueError('Unknown cross-validation strategy')

    utilities.plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1/float(num_folds))

    # Shuffle once per cycle up to and including cycle_index: with the
    # seeds just planted, this appears intended to reproduce the exact
    # record order other code sees at this cycle — TODO confirm against
    # the cross-validation driver.
    for i in range(cycle_index+1):
        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)
    return create_topic_model(
        train_records, cycle_index, fold_index, check_exists)
def evaluate_topic_model(metric):
    """Evaluate the configured topic model under the given metric.

    Builds term rankings with the configured algorithm (LDA/NMF or
    ensemble) and scores them with one of the supported metrics.

    :param metric: one of TERM_STABILITY_REFERENCE, TERM_STABILITY_PAIRWISE
        or TERM_DIFFERENCE
    :return: the metric value
    :raises ValueError: on an unknown topic model type or metric
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Offset seeds from other experiments, then re-plant for repeatability.
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # BUG FIX: floor division keeps the slice index an int — `/ 2`
        # yields a float on Python 3 (TypeError); `//` is identical on
        # Python 2 ints.
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    # Consistent elif chain (first branch was a bare `if` before; all
    # branches return, so behavior is unchanged).
    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    elif metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
def evaluate_topic_model(metric):
    """Evaluate the configured topic model under the given metric.

    Builds term rankings with the configured algorithm (LDA/NMF or
    ensemble) and scores them with one of the supported metrics.

    :param metric: one of TERM_STABILITY_REFERENCE, TERM_STABILITY_PAIRWISE
        or TERM_DIFFERENCE
    :return: the metric value
    :raises ValueError: on an unknown topic model type or metric
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Offset seeds from other experiments, then re-plant for repeatability.
    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # BUG FIX: floor division keeps the slice index an int — `/ 2`
        # yields a float on Python 3 (TypeError); `//` is identical on
        # Python 2 ints.
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError(
            'Unrecognized topic modeling algorithm: \'%s\'' %
            topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    # Consistent elif chain (first branch was a bare `if` before; all
    # branches return, so behavior is unchanged).
    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    elif metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
def perform_cross_validation(self):
    """Run the full cycles-by-folds cross-validation and record the results.

    For each cycle the records/reviews are re-copied (and optionally
    shuffled); for each fold the data is split, exported, optionally
    enriched with word-model topics, and evaluated with the top-N
    evaluator.  Recall averages over all iterations are printed and
    appended to the CSV results file.
    """
    Constants.print_properties()
    utilities.plant_seeds()

    # Accumulators over all cycle x fold iterations.
    total_recall = 0.0
    total_specific_recall = 0.0
    total_generic_recall = 0.0
    total_cycle_time = 0.0
    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    total_iterations = num_cycles * num_folds
    # Fraction of records used for training in each fold.
    split = 1 - (1/float(num_folds))
    self.load()

    for i in range(num_cycles):
        print('\n\nCycle: %d/%d' % ((i+1), num_cycles))

        if Constants.SHUFFLE_DATA:
            self.shuffle()
        # Fresh copies each cycle so fold splits never see prior mutations.
        self.records = copy.deepcopy(self.original_records)
        self.reviews = copy.deepcopy(self.original_reviews)

        for j in range(num_folds):
            fold_start = time.time()
            # Offset of this fold's test window within the record list.
            cv_start = float(j) / num_folds
            print('\nFold: %d/%d' % ((j+1), num_folds))

            self.create_tmp_file_names()
            self.train_records, self.test_records = \
                ETLUtils.split_train_test_copy(
                    self.records, split=split, start=cv_start)
            self.train_reviews, self.test_reviews = \
                ETLUtils.split_train_test_copy(
                    self.reviews, split=split, start=cv_start)
            self.export()
            if Constants.USE_CONTEXT:
                lda_based_context = self.train_word_model()
                self.find_reviews_topics(lda_based_context)
            self.prepare()
            self.predict()
            self.evaluate()
            # self.evaluate() populates the top-N evaluator's recalls.
            recall = self.top_n_evaluator.recall
            specific_recall = self.top_n_evaluator.specific_recall
            generic_recall = self.top_n_evaluator.generic_recall
            total_recall += recall
            total_specific_recall += specific_recall
            total_generic_recall += generic_recall

            fold_end = time.time()
            fold_time = fold_end - fold_start
            total_cycle_time += fold_time
            self.clear()
            print("Total fold %d time = %f seconds" % ((j+1), fold_time))

    average_recall = total_recall / total_iterations
    average_specific_recall = total_specific_recall / total_iterations
    average_generic_recall = total_generic_recall / total_iterations
    average_cycle_time = total_cycle_time / total_iterations
    print('average recall: %f' % average_recall)
    print('average specific recall: %f' % average_specific_recall)
    print('average generic recall: %f' % average_generic_recall)
    print('average cycle time: %f' % average_cycle_time)
    print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Persist averages alongside a snapshot of the active properties.
    results = Constants.get_properties_copy()
    results['recall'] = average_recall
    results['specific_recall'] = average_specific_recall
    results['generic_recall'] = average_generic_recall
    results['cycle_time'] = average_cycle_time
    results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

    # Write the header only when creating the file for the first time.
    # NOTE(review): 'wb' is the Python 2 csv convention; on Python 3 this
    # would need mode 'w' with newline='' — confirm target interpreter.
    if not os.path.exists(Constants.CSV_RESULTS_FILE):
        with open(Constants.CSV_RESULTS_FILE, 'wb') as f:
            w = csv.DictWriter(f, sorted(results.keys()))
            w.writeheader()
            w.writerow(results)
    else:
        with open(Constants.CSV_RESULTS_FILE, 'a') as f:
            w = csv.DictWriter(f, sorted(results.keys()))
            w.writerow(results)
def perform_cross_validation(self, records):
    """Run the full cycles-by-folds cross-validation over ``records``.

    Each cycle works on a deep copy of ``records`` (optionally shuffled);
    each fold splits the data, optionally attaches topic-model context,
    predicts and evaluates.  The per-fold metrics are summarized, written
    to CSV and JSON, and returned.

    :param records: the complete list of review records
    :return: the summarized results dict (includes ``'cycle_time'``)
    """
    Constants.print_properties()

    metrics_list = []
    elapsed_total = 0.0
    cycle_count = Constants.NUM_CYCLES
    fold_count = Constants.CROSS_VALIDATION_NUM_FOLDS
    iteration_count = cycle_count * fold_count
    train_fraction = 1 - (1 / float(fold_count))
    metric_name = Constants.EVALUATION_METRIC

    for cycle in range(cycle_count):
        print('\n\nCycle: %d/%d' % ((cycle + 1), cycle_count))

        # Fresh copy per cycle so shuffling never touches the caller's list.
        self.records = copy.deepcopy(records)
        if Constants.SHUFFLE_DATA:
            self.shuffle(self.records)

        for fold in range(fold_count):
            started_at = time.time()
            test_offset = float(fold) / fold_count
            print('\nFold: %d/%d' % ((fold + 1), fold_count))
            self.create_tmp_file_names(cycle, fold)
            self.train_records, self.test_records = \
                ETLUtils.split_train_test_copy(
                    self.records, split=train_fraction, start=test_offset)
            self.get_records_to_predict(True)

            if Constants.USE_CONTEXT:
                if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
                    # Topics were computed separately; just load them.
                    self.load_cache_context_topics(None, None)
                else:
                    context_extractor = self.train_topic_model(cycle, fold)
                    self.find_reviews_topics(context_extractor, cycle, fold)
            else:
                self.context_rich_topics = []

            self.predict()
            metrics_list.append(self.evaluate())
            running_mean = numpy.mean(
                [k[metric_name] for k in metrics_list])
            print('Accumulated %s: %f' % (metric_name, running_mean))

            elapsed = time.time() - started_at
            elapsed_total += elapsed
            self.clear()
            print("Total fold %d time = %f seconds" % ((fold + 1), elapsed))

    results = self.summarize_results(metrics_list)

    average_cycle_time = elapsed_total / iteration_count
    results['cycle_time'] = average_cycle_time
    print('average cycle time: %f' % average_cycle_time)

    write_results_to_csv(results)
    write_results_to_json(results)

    return results