def manual_main():
    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)
    print(json_file_name)
    print(csv_file_name)

    num_topics_list = [Constants.TOPIC_MODEL_NUM_TOPICS]
    num_cycles = len(num_topics_list)
    cycle_index = 1
    for num_topics in num_topics_list:
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
        }
        print(new_dict)
        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=False))
        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)
        cycle_index += 1
def cli_main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int, nargs=1,
        help='The number of topics of the topic model')

    args = parser.parse_args()
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    results = Constants.get_properties_copy()
    results.update(analyze_topics(include_stability=True))

    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)
    write_results_to_csv(csv_file_name, results)
    write_results_to_json(json_file_name, results)
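# Hedged usage sketch: cli_main is presumably the module's entry point, so a
# typical invocation would look like the following (the script name is
# illustrative, not confirmed by the source):
#
#     python topic_model_analysis.py --numtopics 10
#
if __name__ == '__main__':
    cli_main()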
def exp_ae_visual_features():
    exp_name = 'ae_visual_features'
    out_base_dir = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/' +
        'normalized_resnet_features_recon_loss_trained_on_google')
    exp_const = ExpConstants(exp_name, out_base_dir)
    exp_const.log_dir = os.path.join(exp_const.exp_dir, 'log')
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.batch_size = 10000
    exp_const.lr = 1e-2
    exp_const.num_epochs = 1000

    feature_dir = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/' +
        'normalized_resnet_features_recon_loss_trained_on_google')
    data_const = VisualFeaturesDatasetConstants(feature_dir)

    model_const = Constants()
    model_const.encoder = EncoderConstants()
    model_const.encoder.output_dims = 300
    model_const.decoder = DecoderConstants()
    model_const.decoder.input_dims = 300

    train_ae_visual.main(exp_const, data_const, model_const)
def calculate_topic_stability(records):

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # Integer division keeps the slice index an int on Python 3
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    all_term_rankings = []

    context_extractor = \
        topic_model_creator.create_topic_model(records, None, None)
    terms_matrix = get_topic_model_terms(
        context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
    all_term_rankings.append(terms_matrix)

    sample_ratio = 0.8
    print('Total iterations: %d' % Constants.TOPIC_MODEL_STABILITY_ITERATIONS)
    for _ in range(Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1):
        sampled_records = sample_list(records, sample_ratio)
        context_extractor = \
            topic_model_creator.train_context_extractor(sampled_records)
        terms_matrix = get_topic_model_terms(
            context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
        all_term_rankings.append(terms_matrix)

    return calculate_stability(all_term_rankings)
def exp_train():
    # Template experiment: replace the EXP_NAME / EXP_GROUP placeholders and
    # the DATASET_CONSTANTS / NET_CONSTANTS classes with concrete values.
    exp_name = 'EXP_NAME'
    out_base_dir = os.path.join(os.getcwd(), 'symlinks/exp/EXP_GROUP')
    exp_const = ExpConstants(exp_name, out_base_dir)
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.log_dir = os.path.join(exp_const.exp_dir, 'log')
    exp_const.vis_dir = os.path.join(exp_const.exp_dir, 'vis')
    exp_const.log_step = 10
    exp_const.model_save_step = 1000
    exp_const.val_step = 1000
    exp_const.num_val_samples = 1000
    exp_const.batch_size = 32
    exp_const.num_epochs = 1000
    exp_const.lr = 0.01
    exp_const.momentum = 0.9
    exp_const.num_workers = 5
    exp_const.optimizer = 'SGD'
    exp_const.subset = {'training': 'train', 'validation': 'val'}

    data_const = DATASET_CONSTANTS()

    model_const = Constants()
    model_const.model_num = None
    model_const.net = NET_CONSTANTS()
    model_const.net_path = os.path.join(
        exp_const.model_dir, f'net_{model_const.model_num}')

    train.main(exp_const, data_const, model_const)
def count_frequencies(self):
    """
    Counts the number of reviews each user and item have and stores the
    results in two separate files, one for the users and another one for
    the items. Note that the integer IDs are used and not the original
    user and item IDs.
    """
    print('%s: count frequencies' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    user_frequency_map = ETLUtils.count_frequency(
        self.records, Constants.USER_INTEGER_ID_FIELD)
    item_frequency_map = ETLUtils.count_frequency(
        self.records, Constants.ITEM_INTEGER_ID_FIELD)

    user_frequency_file = Constants.generate_file_name(
        'user_frequency_map', 'json', Constants.CACHE_FOLDER, None, None,
        False)
    item_frequency_file = Constants.generate_file_name(
        'item_frequency_map', 'json', Constants.CACHE_FOLDER, None, None,
        False)

    ETLUtils.save_json_file(user_frequency_file, [user_frequency_map])
    ETLUtils.save_json_file(item_frequency_file, [item_frequency_map])
def exp_combine_glove_and_visual_features_with_ae():
    exp_name = 'ae_glove_and_visual'
    out_base_dir = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/' +
        'normalized_resnet_embeddings_recon_loss_trained_on_google')
    exp_const = ExpConstants(exp_name, out_base_dir)
    exp_const.log_dir = os.path.join(exp_const.exp_dir, 'log')
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.batch_size = 10000
    exp_const.lr = 1e-2
    exp_const.num_epochs = 1000

    concat_embeddings_dir = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/' +
        'normalized_resnet_embeddings_recon_loss_trained_on_google/' +
        'concat_glove_and_visual')
    data_const = ConcatEmbedDatasetConstants(concat_embeddings_dir)
    data_const.embeddings_h5py = os.path.join(
        data_const.concat_dir, 'subset_visual_word_vecs.h5py')
    data_const.word_to_idx_json = os.path.join(
        data_const.concat_dir, 'subset_visual_word_vecs_idx.json')

    model_const = Constants()
    model_const.encoder = EncoderConstants()
    model_const.decoder = DecoderConstants()

    train_ae.main(exp_const, data_const, model_const)
def create_topic_models():
    my_list = range(2, 61)

    for i in my_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        reviews_preprocessor = ReviewsPreprocessor(use_cache=True)
        reviews_preprocessor.full_cycle()
def full_cycle():
    num_topics_list = [5, 10, 20, 40]
    # bow_type_list = [None, 'NN', 'JJ', 'VB']
    review_type_list = ['specific', 'generic']
    # num_topics_list = [10]
    bow_type_list = ['NN']

    results = []
    for num_topics, bow_type, review_type in itertools.product(
            num_topics_list, bow_type_list, review_type_list):
        Constants.update_properties({
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.BOW_TYPE_FIELD: bow_type,
            Constants.TOPIC_MODEL_TARGET_REVIEWS_FIELD: review_type
        })
        result = analyze_topics()
        result.update({
            Constants.BOW_TYPE_FIELD: bow_type,
            Constants.TOPIC_MODEL_TARGET_REVIEWS_FIELD: review_type
        })
        results.append(result)

    for result in results:
        print(result)

    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE + \
        '_topic_model_context_richness'
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    headers = sorted(results[0].keys())
    # Persist the collected results; without these calls the file paths
    # above were never used (same save pattern as the divergence test)
    ETLUtils.save_csv_file(csv_file_path, results, headers)
    ETLUtils.save_json_file(json_file_path, results)
def run_recommender(args):

    import sys
    # sys.path.append('/Users/fpena/UCC/Thesis/projects/yelp/source/python')
    sys.path.append('/home/fpena/yelp/source/python')
    from utils.constants import Constants
    from evaluation.context_top_n_runner import ContextTopNRunner
    print('\n\n************************\n************************\n')
    print('args', args)

    # Cast integer values
    args[Constants.FM_ITERATIONS_FIELD] = \
        int(args[Constants.FM_ITERATIONS_FIELD])
    args[Constants.FM_NUM_FACTORS_FIELD] = \
        int(args[Constants.FM_NUM_FACTORS_FIELD])
    if args[Constants.USE_CONTEXT_FIELD]:
        args[Constants.TOPIC_MODEL_ITERATIONS_FIELD] = \
            int(args[Constants.TOPIC_MODEL_ITERATIONS_FIELD])
        args[Constants.TOPIC_MODEL_PASSES_FIELD] = \
            int(args[Constants.TOPIC_MODEL_PASSES_FIELD])
        args[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD] = \
            int(args[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD])

    Constants.update_properties(args)
    # Finish updating parameters

    my_context_top_n_runner = ContextTopNRunner()
    results = my_context_top_n_runner.run()
    results['loss'] = -results[Constants.EVALUATION_METRIC]
    results['status'] = 'ok'

    print('loss', results['loss'])
    return results
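# The {'loss': ..., 'status': 'ok'} return value above matches the objective
# contract of hyperopt's fmin, which is presumably what drives
# run_recommender. A minimal sketch under that assumption; the search space
# below is illustrative, not the project's actual one:
def tune_recommender_sketch():
    from hyperopt import fmin, hp, tpe
    from utils.constants import Constants

    space = {
        Constants.FM_ITERATIONS_FIELD:
            hp.quniform('fm_iterations', 50, 500, 50),
        Constants.FM_NUM_FACTORS_FIELD:
            hp.quniform('fm_num_factors', 1, 20, 1),
        # Keep the context branch off so the topic model fields are optional
        Constants.USE_CONTEXT_FIELD: False,
    }
    # run_recommender casts the sampled floats to int before using them
    return fmin(
        fn=run_recommender, space=space, algo=tpe.suggest, max_evals=50)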
def main(**kwargs):
    exp_base_dir = coco_paths['exp_dir']
    if kwargs['dataset'] == 'flickr':
        exp_base_dir = flickr_paths['exp_dir']
    exp_const = ExpConstants(kwargs['exp_name'], exp_base_dir)
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.seed = 0
    exp_const.contextualize = not kwargs['no_context']
    exp_const.random_lang = kwargs['random_lang']

    data_const = FlickrDatasetConstants(kwargs['subset'])

    model_const = Constants()
    model_const.model_num = kwargs['model_num']
    model_const.object_encoder = ObjectEncoderConstants()
    model_const.object_encoder.context_layer.output_attentions = True
    model_const.object_encoder.object_feature_dim = 2048
    model_const.cap_encoder = CapEncoderConstants()
    model_const.cap_encoder.output_attentions = True
    model_const.cap_info_nce_layers = kwargs['cap_info_nce_layers']
    if model_const.model_num == -100:
        # -100 selects the model that performed best on the validation set
        filename = os.path.join(exp_const.exp_dir, 'results_val_best.json')
        results = io.load_json_object(filename)
        model_const.model_num = results['model_num']
        print('Selected model num:', model_const.model_num)
    model_const.object_encoder_path = os.path.join(
        exp_const.model_dir, f'object_encoder_{model_const.model_num}')
    model_const.lang_sup_criterion_path = os.path.join(
        exp_const.model_dir, f'lang_sup_criterion_{model_const.model_num}')
    if exp_const.random_lang is True:
        model_const.cap_encoder_path = os.path.join(
            exp_const.model_dir, f'cap_encoder_{model_const.model_num}')

    eval_flickr_phrase_loc.main(exp_const, data_const, model_const)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int, nargs=1,
        help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int, nargs=1,
        help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int, nargs=1,
        help='The number of topics of the topic model')

    args = parser.parse_args()
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            num_records = len(records)
            # Integer division keeps the slice index an int on Python 3
            records = records[:num_records // 2]
        print('num_reviews', len(records))

        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
def preprocess_data():
    my_list = range(2, 61)

    for i in my_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        reviews_preprocessor = ReviewsPreprocessor(use_cache=True)
        reviews_preprocessor.full_cycle()
def dataset_bucket_analysis_by_field(field):
    # Set the dataset
    hotel_dataset_properties = {
        Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    print('Loaded %d records' % len(records))

    user_frequency_map = {}

    for record in records:
        user_id = record[field]
        if user_id not in user_frequency_map:
            user_frequency_map[user_id] = 0
        user_frequency_map[user_id] += 1

    print('There are a total of %d %ss' % (len(user_frequency_map), field))
    sorted_x = sorted(
        user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x[0])
    print(sorted_x[1])
    print(sorted_x[2])
    # print(user_frequency_map)

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' %
          (field, float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
def _exp_top_boxes_per_hoi(out_base_dir, data_const):
    args = parser.parse_args()
    not_specified_args = manage_required_args(
        args,
        parser,
        required_args=['model_num'],
        optional_args=[
            'verb_given_appearance',
            'verb_given_human_appearance',
            'verb_given_object_appearance',
            'verb_given_boxes_and_object_label',
            'verb_given_human_pose',
            'rcnn_det_prob'])

    exp_name = 'factors'
    if args.rcnn_det_prob:
        exp_name += '_rcnn_det_prob'
    if args.verb_given_appearance:
        exp_name += '_appearance'
    if args.verb_given_human_appearance:
        exp_name += '_human_appearance'
    if args.verb_given_object_appearance:
        exp_name += '_object_appearance'
    if args.verb_given_boxes_and_object_label:
        exp_name += '_boxes_and_object_label'
    if args.verb_given_human_pose:
        exp_name += '_human_pose'
    exp_const = ExpConstants(
        exp_name=exp_name,
        out_base_dir=out_base_dir)
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.num_to_vis = 10

    data_const.pred_hoi_dets_h5py = os.path.join(
        exp_const.exp_dir, f'pred_hoi_dets_test_{args.model_num}.hdf5')
    hoi_cand_dir = os.path.join(
        os.getcwd(), 'data_symlinks/hico_exp/hoi_candidates')
    data_const.human_pose_feats_hdf5 = os.path.join(
        hoi_cand_dir, 'human_pose_feats_test.hdf5')
    data_const.num_pose_keypoints = 18

    model_const = Constants()
    model_const.model_num = args.model_num
    model_const.hoi_classifier = HoiClassifierConstants()
    model_const.hoi_classifier.verb_given_appearance = \
        args.verb_given_appearance
    model_const.hoi_classifier.verb_given_boxes_and_object_label = \
        args.verb_given_boxes_and_object_label
    model_const.hoi_classifier.verb_given_human_pose = \
        args.verb_given_human_pose
    model_const.hoi_classifier.rcnn_det_prob = args.rcnn_det_prob
    model_const.hoi_classifier.model_pth = os.path.join(
        exp_const.model_dir, f'hoi_classifier_{model_const.model_num}')

    vis_top_boxes_per_hoi.main(exp_const, data_const, model_const)
def full_cycle(self):
    Constants.print_properties()
    print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    utilities.plant_seeds()

    if self.use_cache and \
            os.path.exists(Constants.PROCESSED_RECORDS_FILE):
        print('Records have already been processed')
        self.records = \
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    else:
        self.load_records()

        if 'yelp' in Constants.ITEM_TYPE:
            self.transform_yelp_records()
        elif 'fourcity' in Constants.ITEM_TYPE:
            self.transform_fourcity_records()

        self.add_integer_ids()
        self.clean_reviews()
        self.remove_duplicate_reviews()
        self.tag_reviews_language()
        self.remove_foreign_reviews()
        self.lemmatize_records()
        self.remove_users_with_low_reviews()
        self.remove_items_with_low_reviews()
        self.count_frequencies()
        self.shuffle_records()
        print('total_records: %d' % len(self.records))
        self.classify_reviews()
        self.build_bag_of_words()
        self.tag_contextual_reviews()
        # self.load_full_records()
        self.build_dictionary()
        self.build_corpus()
        self.label_review_targets()
        self.export_records()
        self.count_specific_generic_ratio()
        # self.export_to_triplet()

    rda = ReviewsDatasetAnalyzer(self.records)
    print('density: %f' % rda.calculate_density_approx())
    print('sparsity: %f' % rda.calculate_sparsity_approx())
    print('total_records: %d' % len(self.records))

    user_ids = \
        extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
    item_ids = \
        extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
    print('total users', len(user_ids))
    print('total items', len(item_ids))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        self.separate_recsys_topic_model_records()
def cycle_eval_topic_model(metric, num_topics_list):
    csv_file_name = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None, None, False)

    for topic in num_topics_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic})
        results = run_eval_topic_model(metric)
        topic_model_analyzer.write_results_to_csv(csv_file_name, results)
def main(**kwargs):
    exp_base_dir = coco_paths['exp_dir']
    if kwargs['dataset'] == 'flickr':
        exp_base_dir = flickr_paths['exp_dir']
    exp_const = ExpConstants(kwargs['exp_name'], exp_base_dir)
    exp_const.log_dir = os.path.join(exp_const.exp_dir, 'logs')
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.vis_dir = os.path.join(exp_const.exp_dir, 'vis')
    exp_const.dataset = kwargs['dataset']
    exp_const.optimizer = 'Adam'
    exp_const.lr = kwargs['lr']
    exp_const.momentum = None
    exp_const.num_epochs = 10
    exp_const.log_step = 20
    # Save models approx. twice every epoch (e.g. 4000 = 400000 / (2*50))
    exp_const.model_save_step = 400000 // (2 * kwargs['train_batch_size'])
    if exp_const.dataset == 'flickr':
        exp_const.model_save_step = \
            150000 // (2 * kwargs['train_batch_size'])
    val_freq_factor = 2
    if kwargs['val_frequently'] is True:
        val_freq_factor = 1
    # Set to 1 * model_save_step for plotting mi vs perf
    exp_const.val_step = val_freq_factor * exp_const.model_save_step
    exp_const.num_val_samples = None
    exp_const.train_batch_size = kwargs['train_batch_size']
    exp_const.val_batch_size = 20
    exp_const.num_workers = 10
    exp_const.seed = 0
    exp_const.neg_noun_loss_wt = kwargs['neg_noun_loss_wt']
    exp_const.self_sup_loss_wt = kwargs['self_sup_loss_wt']
    exp_const.lang_sup_loss_wt = kwargs['lang_sup_loss_wt']
    exp_const.contextualize = not kwargs['no_context']
    exp_const.random_lang = kwargs['random_lang']

    DatasetConstants = CocoDatasetConstants
    if exp_const.dataset == 'flickr':
        DatasetConstants = FlickrDatasetConstants

    data_const = {
        'train': DatasetConstants('train'),
        'val': DatasetConstants('val'),
    }

    model_const = Constants()
    model_const.model_num = kwargs['model_num']
    model_const.object_encoder = ObjectEncoderConstants()
    model_const.object_encoder.context_layer.output_attentions = True
    model_const.object_encoder.object_feature_dim = 2048
    model_const.cap_encoder = CapEncoderConstants()
    model_const.cap_encoder.output_attentions = True
    model_const.cap_info_nce_layers = kwargs['cap_info_nce_layers']
    model_const.object_encoder_path = os.path.join(
        exp_const.model_dir, f'object_encoder_{model_const.model_num}')
    model_const.self_sup_criterion_path = os.path.join(
        exp_const.model_dir, f'self_sup_criterion_{model_const.model_num}')
    model_const.lang_sup_criterion_path = os.path.join(
        exp_const.model_dir, f'lang_sup_criterion_{model_const.model_num}')

    train(exp_const, data_const, model_const)
def _exp_eval(out_base_dir, data_const):
    args = parser.parse_args()
    not_specified_args = manage_required_args(
        args,
        parser,
        required_args=['model_num'],
        optional_args=[
            'verb_given_appearance',
            'verb_given_human_appearance',
            'verb_given_object_appearance',
            'verb_given_boxes_and_object_label',
            'verb_given_human_pose',
            'rcnn_det_prob'])

    exp_name = 'factors'
    if args.rcnn_det_prob:
        exp_name += '_rcnn_det_prob'
    if args.verb_given_appearance:
        exp_name += '_appearance'
    if args.verb_given_human_appearance:
        exp_name += '_human_appearance'
    if args.verb_given_object_appearance:
        exp_name += '_object_appearance'
    if args.verb_given_boxes_and_object_label:
        exp_name += '_boxes_and_object_label'
    if args.verb_given_human_pose:
        exp_name += '_human_pose'
    exp_const = ExpConstants(
        exp_name=exp_name,
        out_base_dir=out_base_dir)
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')

    data_const.balanced_sampling = False

    model_const = Constants()
    model_const.model_num = args.model_num
    model_const.hoi_classifier = HoiClassifierConstants()
    model_const.hoi_classifier.verb_given_appearance = \
        args.verb_given_appearance
    model_const.hoi_classifier.verb_given_human_appearance = \
        args.verb_given_human_appearance
    model_const.hoi_classifier.verb_given_object_appearance = \
        args.verb_given_object_appearance
    model_const.hoi_classifier.verb_given_boxes_and_object_label = \
        args.verb_given_boxes_and_object_label
    model_const.hoi_classifier.verb_given_human_pose = \
        args.verb_given_human_pose
    model_const.hoi_classifier.rcnn_det_prob = args.rcnn_det_prob
    model_const.hoi_classifier.model_pth = os.path.join(
        exp_const.model_dir, f'hoi_classifier_{model_const.model_num}')

    if isinstance(data_const, FeatureConstantsVcoco):
        data_sign = 'vcoco'
    else:
        data_sign = 'hico'
    evaluate.main(exp_const, data_const, model_const, data_sign)
def exp_concat_random_with_glove():
    exp_name = 'concat_with_glove_100'  # alt. xformed_
    out_base_dir = os.path.join(
        os.getcwd(), 'symlinks/exp/multi_sense_cooccur/linear_100')
    exp_const = ExpConstants(exp_name, out_base_dir)
    exp_const.random_dim = 100

    data_const = Constants()
    glove_const = GloveConstantsFactory.create(dim='100')
    data_const.glove_idx = glove_const.word_to_idx_json
    data_const.glove_h5py = glove_const.embeddings_h5py

    concat_random_with_glove.main(exp_const, data_const)
def run_single_fold(self, parameters):
    fold = parameters['fold']

    Constants.update_properties(parameters)
    Constants.print_properties()
    utilities.plant_seeds()
    self.load()

    records = self.original_records
    # self.plant_seeds()
    total_cycle_time = 0.0
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    self.records = copy.deepcopy(records)
    if Constants.SHUFFLE_DATA:
        self.shuffle(self.records)

    fold_start = time.time()
    cv_start = float(fold) / num_folds
    print('\nFold: %d/%d' % ((fold + 1), num_folds))

    self.create_tmp_file_names(0, fold)
    self.train_records, self.test_records = \
        ETLUtils.split_train_test_copy(
            self.records, split=split, start=cv_start)
    # subsample_size = int(len(self.train_records)*0.5)
    # self.train_records = self.train_records[:subsample_size]
    self.get_records_to_predict(True)
    if Constants.USE_CONTEXT:
        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.load_cache_context_topics(None, None)
        else:
            context_extractor = self.train_topic_model(0, fold)
            self.find_reviews_topics(context_extractor, 0, fold)
    else:
        self.context_rich_topics = []
    self.predict()
    metrics = self.evaluate()
    fold_end = time.time()
    fold_time = fold_end - fold_start
    total_cycle_time += fold_time
    self.clear()
    print("Total fold %d time = %f seconds" % ((fold + 1), fold_time))

    return metrics
def exp_extract_embeddings():
    args = parser.parse_args()
    not_specified_args = manage_required_args(
        args,
        parser,
        required_args=[
            'embed_dim',
            'xform',
            'model_num',
            'syn'])

    exp_name = f'{args.xform}_{args.embed_dim}'
    out_base_dir = os.path.join(
        os.getcwd(), 'symlinks/exp/multi_sense_cooccur')
    exp_const = ExpConstants(exp_name, out_base_dir)
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.cooccur_types = [
        'syn',
        'attr_attr',
        'obj_attr',
        'obj_hyp',
        'context'
    ]
    if not args.syn:
        exp_const.cooccur_types = exp_const.cooccur_types[1:]

    data_const = MultiSenseCooccurDatasetConstants()
    data_const.cooccur_csv = os.path.join(
        os.getcwd(),
        'symlinks/exp/multi_sense_cooccur/cooccurrences/merged_cooccur.csv')

    model_const = Constants()
    model_const.model_num = args.model_num
    model_const.net = LogBilinearConstants()
    model_const.net.num_words = 93553
    model_const.net.embed_dims = args.embed_dim
    model_const.net.two_embedding_layers = False
    model_const.net.xform_type = args.xform
    model_const.net.xform_num_layers = None
    model_const.net.use_bias = True
    model_const.net.use_fx = False
    model_const.net.cooccur_types = copy.deepcopy(exp_const.cooccur_types)
    model_const.net_path = os.path.join(
        exp_const.model_dir, f'net_{model_const.model_num}')

    extract_embeddings.main(exp_const, data_const, model_const)
    extract_embeddings_xformed.main(exp_const, data_const, model_const)
def _exp_train(out_base_dir, data_const_train, data_const_val,
               data_sign='hico'):
    args = parser.parse_args()
    not_specified_args = manage_required_args(
        args,
        parser,
        required_args=['imgs_per_batch', 'fp_to_tp_ratio'],
        optional_args=[
            'verb_given_appearance',
            'verb_given_human_appearance',
            'verb_given_object_appearance',
            'verb_given_boxes_and_object_label',
            'verb_given_human_pose',
            'rcnn_det_prob'
        ])

    exp_name = 'factors'
    if args.rcnn_det_prob:
        exp_name += '_rcnn_det_prob'
    if args.verb_given_appearance:
        exp_name += '_appearance'
    if args.verb_given_human_appearance:
        exp_name += '_human_appearance'
    if args.verb_given_object_appearance:
        exp_name += '_object_appearance'
    if args.verb_given_boxes_and_object_label:
        exp_name += '_boxes_and_object_label'
    if args.verb_given_human_pose:
        exp_name += '_human_pose'

    exp_const = ExpConstants(exp_name=exp_name, out_base_dir=out_base_dir)
    exp_const.log_dir = os.path.join(exp_const.exp_dir, 'log')
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.num_epochs = 10
    exp_const.imgs_per_batch = args.imgs_per_batch
    exp_const.lr = 1e-3

    model_const = Constants()
    model_const.hoi_classifier = HoiClassifierConstants(data_sign)
    model_const.hoi_classifier.verb_given_appearance = \
        args.verb_given_appearance
    model_const.hoi_classifier.verb_given_human_appearance = \
        args.verb_given_human_appearance
    model_const.hoi_classifier.verb_given_object_appearance = \
        args.verb_given_object_appearance
    model_const.hoi_classifier.verb_given_boxes_and_object_label = \
        args.verb_given_boxes_and_object_label
    model_const.hoi_classifier.verb_given_human_pose = \
        args.verb_given_human_pose
    model_const.hoi_classifier.rcnn_det_prob = args.rcnn_det_prob

    train.main(exp_const, data_const_train, data_const_val, model_const,
               data_sign)
def run_tests():
    combined_parameters = parameter_combinator.get_combined_parameters()

    test_cycle = 1
    num_tests = len(combined_parameters)
    for properties in combined_parameters:
        Constants.update_properties(properties)
        context_top_n_runner = WordContextTopNRunner()
        print('\n\n******************\nTest %d/%d\n******************\n' %
              (test_cycle, num_tests))
        context_top_n_runner.perform_cross_validation()
        test_cycle += 1
def run_tests():
    combined_parameters = parameter_combinator.hotel_context_parameters()

    test_cycle = 1
    num_tests = len(combined_parameters)
    for properties in combined_parameters:
        Constants.update_properties(properties)
        context_top_n_runner = ContextTopNRunner()
        print('\n\n******************\nTest %d/%d\n******************\n' %
              (test_cycle, num_tests))
        context_top_n_runner.perform_cross_validation()
        test_cycle += 1
def full_cycle(self):
    Constants.print_properties()
    print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    utilities.plant_seeds()

    if self.use_cache and \
            os.path.exists(Constants.PROCESSED_RECORDS_FILE):
        print('Records have already been processed')
        self.records = \
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    else:
        self.preprocess()

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        self.separate_recsys_topic_model_records()
def evaluate(self, settings, time_file=None):
    constants = Constants()

    if not settings.random_agent:
        # Initialize various constants for 3D reconstruction.
        net, tmp_net, init = self.get_model(settings)
        saver = tf.train.Saver()  # Tensorboard saver.

    sess = None
    if not settings.random_agent:
        config = tf.ConfigProto()
        sess = tf.Session(config=config)

    if settings.random_agent:
        agent = RandomAgent()
    else:
        sess.run(init)
        self.load_net_weights(saver, sess, settings, latest=False)
        sess.graph.finalize()
        agent = NetAgent(net, None)
        net.set_session(sess)

    if settings.carla:
        self.evaluate_method_carla(agent, constants, sess, settings)
    else:
        self.evaluate_methods_cs(agent, constants, sess, settings)
def load_pipeline():
    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None, None,
        False)

    if not os.path.exists(best_hyperparams_file_name):
        print('Best hyperparameters file does not exist; running full cycle')
        full_cycle()

    with open(best_hyperparams_file_name, 'r') as json_file:
        file_contents = json_file.read()
        parameters = json.loads(file_contents)

    print(parameters)

    classifiers = {
        'logisticregression': LogisticRegression(),
        'svc': SVC(),
        'kneighborsclassifier': KNeighborsClassifier(),
        'decisiontreeclassifier': DecisionTreeClassifier(),
        'nusvc': NuSVC(),
        'randomforestclassifier': RandomForestClassifier()
    }
    classifier = classifiers[parameters['classifier'].lower()]
    # print(classifier)
    classifier_params = get_classifier_params(parameters)
    classifier.set_params(**classifier_params)
    print(classifier)

    resampler = sampler_factory.create_sampler(
        parameters['resampler'], Constants.DOCUMENT_CLASSIFIER_SEED)

    return Pipeline([('resampler', resampler), ('classifier', classifier)])
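# Hedged usage sketch: the returned object follows the scikit-learn estimator
# API (the 'resampler' step suggests the imbalanced-learn Pipeline is in
# use). X_train/y_train/X_test below are assumed to be the feature matrix
# and labels built elsewhere in this module:
#
# pipeline = load_pipeline()
# pipeline.fit(X_train, y_train)
# predicted_classes = pipeline.predict(X_test)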
def export_records(self):
    print('%s: exporting transformed records' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    records_to_export = []
    desired_fields = [
        Constants.USER_INTEGER_ID_FIELD,
        Constants.ITEM_INTEGER_ID_FIELD,
        Constants.RATING_FIELD,
        Constants.CONTEXT_FIELD,
    ]

    for record in self.records:
        new_record = {field: record[field] for field in desired_fields}
        records_to_export.append(new_record)

    file_name = Constants.generate_file_name(
        'recsys_formatted_context_records', 'json', Constants.CACHE_FOLDER,
        None, None, True, True, uses_carskit=False, normalize_topics=True,
        format_context=True)
    ETLUtils.save_json_file(file_name, records_to_export)
def full_cycle(metric):
    csv_file_name = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None, None, False)
    json_file_name = Constants.generate_file_name(
        metric, 'json', Constants.RESULTS_FOLDER, None, None, False)
    print(json_file_name)
    print(csv_file_name)

    properties = Constants.get_properties_copy()
    results = evaluate_topic_model(metric)
    print(results)
    results.update(properties)

    ETLUtils.write_row_to_csv(csv_file_name, results)
    ETLUtils.write_row_to_json(json_file_name, results)
def get_topic_model_prefix(folder='', seed=None):
    prefix = 'topic_model'
    if seed is not None:
        prefix += '_seed-' + str(seed)
    # generate_file_name is called with an empty extension, so the trailing
    # separator it leaves behind is stripped with [:-1]
    return Constants.generate_file_name(
        prefix, '', folder, None, None, True, True)[:-1]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int, nargs=1,
        help='The index of the cross validation fold')

    args = parser.parse_args()
    fold = args.fold[0]

    new_properties = {
        Constants.NESTED_CROSS_VALIDATION_CYCLE_FIELD: fold,
        Constants.CROSS_VALIDATION_STRATEGY_FIELD: 'nested_validate'
    }
    Constants.update_properties(new_properties)
    context_top_n_runner.run_tests()
def save_results(results):
    """
    Takes the results given by the run_carskit function, extends them with
    the Constants.get_properties() dictionary and saves them to a JSON file.

    :type results: list[dict]
    :param results: the results to save
    """
    properties = Constants.get_properties_copy()
    json_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)

    for result in results:
        result.update(properties)
        write_results_to_json(json_file, result)
def topic_stability_main():
    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    # num_topic_list = range(2, 101)
    num_topic_list = [2, 5]
    results = {}
    for num_topics in num_topic_list:
        new_properties = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics}
        Constants.update_properties(new_properties)
        results[num_topics] = calculate_topic_stability(records)

    print('Results:')
    for num_topics in num_topic_list:
        scores = results[num_topics]
        print('%d: %.4f [%.4f,%.4f]' %
              (num_topics, numpy.nanmean(scores), numpy.nanmin(scores),
               numpy.nanmax(scores)))
def load_topic_model(cycle_index, fold_index):
    file_path = \
        Constants.generate_file_name(
            'topic_model', 'pkl', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)
    print(file_path)
    with open(file_path, 'rb') as read_file:
        topic_model = pickle.load(read_file)
    return topic_model
def create_all_term_rankings(records, metric):
    print('%s: creating all term rankings' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    all_term_rankings = []
    # context_extractor =\
    #     topic_model_creator.create_topic_model(records, None, None)
    # terms_matrix = get_topic_model_terms(
    #     context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
    # all_term_rankings.append(terms_matrix)

    context_extractor = \
        topic_model_creator.train_context_extractor(records, False)
    terms_matrix = get_topic_model_terms(
        context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
    all_term_rankings.append(terms_matrix)

    sample_ratio = Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO

    if metric in [TERM_STABILITY_PAIRWISE, TERM_DIFFERENCE]:
        sample_ratio = None
        Constants.update_properties(
            {Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO_FIELD:
                sample_ratio})
        msg = 'Warning: Since the metric is \'%s\' I have updated the ' \
              'topic_model_stability_sample_ratio value to None' % metric
        print(msg)

    num_iterations = Constants.TOPIC_MODEL_STABILITY_ITERATIONS
    for i in range(num_iterations - 1):
        print('Iteration %d/%d' % (i + 1, num_iterations))
        print('sample_ratio:', sample_ratio)

        if sample_ratio is None:
            sampled_records = records
        else:
            sampled_records = sample_list(records, sample_ratio)
        context_extractor = \
            topic_model_creator.train_context_extractor(
                sampled_records, False)
        terms_matrix = get_topic_model_terms(
            context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
        all_term_rankings.append(terms_matrix)

    return all_term_rankings
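# sample_list is assumed to draw a uniform subsample of the records without
# replacement. A minimal sketch of that helper (hypothetical, shown here
# commented out so it does not shadow the project's real utility):
#
# import random
#
# def sample_list(records, sample_ratio):
#     sample_size = int(len(records) * sample_ratio)
#     return random.sample(records, sample_size)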
def test():
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()

    results = []

    # my_list = range(2, 31)
    my_list = range(2, 61)

    for i in my_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})

        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()

        document_topic_matrix = topic_model.document_topic_matrix
        topic_term_matrix = topic_model.topic_term_matrix

        divergence = calculate_divergence(
            document_term_matrix, document_topic_matrix, topic_term_matrix)

        result = {
            'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
        }
        results.append(result)

        print('Num topics: %d, Divergence: %f' %
              (Constants.TOPIC_MODEL_NUM_TOPICS, divergence))

    for result in results:
        print('%d %f' % (result['num_topics'], result['divergence']))

    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE + \
        '_topic_model_divergence'
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    headers = sorted(results[0].keys())
    ETLUtils.save_csv_file(csv_file_path, results, headers)
    ETLUtils.save_json_file(json_file_path, results)
def run_eval_topic_model(metric):
    parse_directory_command = Constants.TOPIC_ENSEMBLE_FOLDER + \
        'eval-' + metric.replace('_', '-') + '.py'
    csv_file = Constants.generate_file_name(
        metric, 'csv', BASE_FOLDER, None, None, True, True)
    dataset_file_name = Constants.generate_file_name(
        'topic_model', '', BASE_FOLDER, None, None, True, True)[:-1] + \
        '/ranks*.pkl'
    topic_model_files = glob.glob(dataset_file_name)

    command = [
        PYTHON_COMMAND,
        parse_directory_command,
    ]
    command.extend(topic_model_files)
    command.extend([
        '-o', csv_file
    ])
    print(' '.join(command))

    unique_id = uuid.uuid4().hex
    log_file_name = Constants.GENERATED_FOLDER + Constants.ITEM_TYPE + '_' + \
        Constants.TOPIC_MODEL_TARGET_REVIEWS + '_' + metric + '_' + \
        unique_id + '.log'
    # The log file must be opened before being passed as stdout below; this
    # line was commented out, which left log_file undefined
    log_file = open(log_file_name, "w")
    p = subprocess.Popen(
        command, stdout=log_file, cwd=Constants.TOPIC_ENSEMBLE_FOLDER)
    p.wait()

    results = read_csv_first_column_as_key(csv_file, metric)
    results[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD] = \
        Constants.TOPIC_MODEL_NUM_TOPICS
    results[Constants.TOPIC_MODEL_TYPE_FIELD] = Constants.TOPIC_MODEL_TYPE

    return results
def load_document_term_matrix():
    topic_model_corpus_folder = \
        Constants.CACHE_FOLDER + 'topic_models/corpus/'
    corpus_path = Constants.generate_file_name(
        'topic_ensemble_corpus', '', topic_model_corpus_folder,
        None, None, False)[:-1] + '.pkl'
    document_term_matrix, _, _, _ = load_corpus(corpus_path)

    print("Loaded document-term matrix of size %s" %
          str(document_term_matrix.shape))

    return document_term_matrix
def create_topic_model(num_topics):
    print('%s: creating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10,
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics
    })
    utilities.plant_seeds()
    Constants.print_properties()

    file_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
        "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS

    if os.path.exists(file_path):
        print('Ensemble topic model already exists')
        return

    # topic_ensemble_caller.run_local_parse_directory()
    topic_ensemble_caller.run_generate_kfold()
    topic_ensemble_caller.run_combine_nmf()
def create_single_topic_model(cycle_index, fold_index, check_exists=True):
    Constants.print_properties()
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        msg = 'This function shouldn\'t be used when the ' \
              'separate_topic_model_recsys_reviews property is set to True'
        raise ValueError(msg)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.CROSS_VALIDATION_STRATEGY == 'nested_test':
        pass
    elif Constants.CROSS_VALIDATION_STRATEGY == 'nested_validate':
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        records, _ = ETLUtils.split_train_test(records, split, cv_start)
    else:
        raise ValueError('Unknown cross-validation strategy')

    utilities.plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    # Reshuffle once per cycle so the record order matches the state of the
    # requested cycle
    for i in range(cycle_index + 1):
        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)
    return create_topic_model(
        train_records, cycle_index, fold_index, check_exists)
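# Worked example of the split arithmetic above, assuming the default of
# num_folds = 5: split = 1 - 1/5 = 0.8, and fold_index = 2 gives
# cv_start = 2/5 = 0.4, so the 20% of the records starting at the 40% mark
# become the test set and the remaining 80% the training set.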
def create_topic_model_with_context_records():
    processed_records_file = Constants.generate_file_name(
        'classified_processed_reviews', 'json', Constants.CACHE_FOLDER,
        None, None, False, True)
    records = ETLUtils.load_json_file(processed_records_file)
    print('records length: %d' % len(records))

    context_records = ETLUtils.filter_records(
        records, 'context_type', ['context'])
    print('context records length: %d' % len(context_records))
    context_specific_records = ETLUtils.filter_records(
        context_records, 'predicted_class', ['specific'])
    print('context specific records length: %d' %
          len(context_specific_records))

    for i in range(len(context_specific_records)):
        # print('%d:\t%s' % (i, context_records[i]['text']))
        print('%d:\t%s' % (i, context_specific_records[i]['bow']))

    for i in range(1, len(context_records) + 1):
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        context_extractor = \
            topic_model_creator.create_topic_model(records, None, None)

        topic_data = []

        for topic in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            result = {}
            result['topic_id'] = topic
            result.update(split_topic(context_extractor.print_topic_model(
                num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic]))
            result['ratio'] = context_extractor.topic_ratio_map[topic]
            result['weighted_frequency'] = \
                context_extractor.topic_weighted_frequency_map[topic]
            topic_data.append(result)

        file_name = Constants.generate_file_name(
            'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER, None,
            None, True)
        generate_excel_file(topic_data, file_name)
def analyze_results():
    json_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)
    records = ETLUtils.load_json_file(json_file)

    data_frame = pandas.DataFrame(records)
    print(sorted(list(data_frame.columns.values)))
    cols = [
        'ck_rec10',
        'ck_pre10',
        'ck_algorithm',
        'carskit_nominal_format',
        'topic_model_num_topics',
        'topic_model_normalize']
    data_frame = data_frame[cols]
    data_frame = data_frame.sort_values(['ck_rec10'])
    print(data_frame)
    data_frame.to_csv(
        '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_carskit.csv')
def evaluate_topic_model(metric):
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # Integer division keeps the slice index an int on Python 3
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    all_term_rankings = None
    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError(
            'Unrecognized topic modeling algorithm: \'%s\'' %
            topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    if metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
def run_recommender(args):

    import sys
    # sys.path.append('/Users/fpena/UCC/Thesis/projects/yelp/source/python')
    sys.path.append('/home/fpena/yelp/source/python')
    from utils.constants import Constants
    from topicmodeling.context import topic_model_analyzer
    print('\n\n************************\n************************\n')
    print('args', args)

    parameters = {
        Constants.BUSINESS_TYPE_FIELD: args[Constants.BUSINESS_TYPE_FIELD],
        # 'lda_alpha': args['lda_alpha'],
        # 'lda_beta': args['lda_beta'],
        Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD:
            args[Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD],
        Constants.TOPIC_MODEL_ITERATIONS_FIELD:
            int(args[Constants.TOPIC_MODEL_ITERATIONS_FIELD]),
        Constants.TOPIC_MODEL_PASSES_FIELD:
            int(args[Constants.TOPIC_MODEL_PASSES_FIELD]),
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD:
            int(args[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD]),
        # 'topic_weighting_method': args['topic_weighting_method'],
        Constants.USE_CONTEXT_FIELD: args[Constants.USE_CONTEXT_FIELD]
    }

    Constants.update_properties(parameters)
    # Finish updating parameters

    results = topic_model_analyzer.export_topics()
    results['loss'] = -results['combined_score']
    results['status'] = 'ok'

    print('loss', results['loss'])
    return results
def get_topic_ensemble_ranks_file_paths():
    num_models = Constants.TOPIC_MODEL_STABILITY_ITERATIONS
    random_seeds = range(1, num_models + 1)
    suffix = 'ranks_ensemble_k%02d.pkl' % Constants.TOPIC_MODEL_NUM_TOPICS

    file_paths = []
    for seed in random_seeds:
        prefix = 'topic_model_seed-' + str(seed)
        topic_model_folder = Constants.generate_file_name(
            prefix, '', Constants.ENSEMBLE_FOLDER, None, None, True,
            True)[:-1]
        topic_model_file = topic_model_folder + '/' + suffix
        file_paths.append(topic_model_file)

    return file_paths
def create_topic_model(records, cycle_index, fold_index, check_exists=True):
    print('%s: Create topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    topic_model_file_path = \
        Constants.generate_file_name(
            'topic_model', 'pkl', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)
    print(topic_model_file_path)

    if check_exists and os.path.exists(topic_model_file_path):
        print('WARNING: Topic model already exists')
        return load_topic_model(cycle_index, fold_index)

    topic_model = train_context_extractor(records)

    with open(topic_model_file_path, 'wb') as write_file:
        pickle.dump(topic_model, write_file, pickle.HIGHEST_PROTOCOL)

    return topic_model
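# Hedged usage sketch: create_topic_model and load_topic_model share the
# same Constants-generated pickle path, so a model trained for cycle 0 /
# fold 0 can be read straight back (records is assumed to be the list of
# preprocessed review dictionaries):
#
# topic_model = create_topic_model(records, 0, 0)
# same_model = load_topic_model(0, 0)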