def fit_models(args):
    """Fit and evaluate a BM25 ranking model.

    Loads the train/dev/test data packs, logs query/document token-length
    statistics, then runs the BM25 fitter against the dev and test queries
    (plus the hard re-ranking splits when ``args.reranking`` is set).

    Args:
        args: parsed argparse namespace; fields used include ``log``, ``path``,
            ``dataset``, ``b``, ``k1``, ``topk`` and ``reranking``.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of the
    # original os.path.exists / os.mkdir pair.
    os.makedirs(args.log, exist_ok=True)
    curr_date = datetime.datetime.now().timestamp()  # seconds
    # Folder to store all outputted files of a single run.
    secondary_log_folder = os.path.join(args.log, "log_results_%s" % (int(curr_date)))
    os.makedirs(secondary_log_folder, exist_ok=True)

    logfolder_result = os.path.join(secondary_log_folder, "%s_result.txt" % int(curr_date))
    FileHandler.init_log_files(logfolder_result)
    settings = json.dumps(vars(args), sort_keys=True, indent=2)
    FileHandler.myprint("Running script " + str(os.path.realpath(__file__)))
    FileHandler.myprint(settings)

    root = args.path
    train_pack = load_data.load_data2(root, 'train', prefix=args.dataset)
    valid_pack = load_data.load_data2(root, 'dev', prefix=args.dataset)
    predict_pack = load_data.load_data2(root, 'test', prefix=args.dataset)

    def length_stats(column):
        # (min, max) whitespace-token length of `column` across the three splits.
        lengths = [pack[column].str.lower().str.split().apply(len).max()
                   for pack in (train_pack, valid_pack, predict_pack)]
        return min(lengths), max(lengths)

    min_query_length, max_query_length = length_stats("text_left")
    min_doc_length, max_doc_length = length_stats("text_right")
    FileHandler.myprint("Min query length, " + str(min_query_length) +
                        " Min doc length " + str(min_doc_length))
    FileHandler.myprint("Max query length, " + str(max_query_length) +
                        " Max doc length " + str(max_doc_length))

    t1 = time.time()
    dev_queries = get_query_docs(valid_pack)
    test_queries = get_query_docs(predict_pack)
    additional_data = {}
    if args.reranking:
        # Hard re-ranking splits are passed to the fitter as extra kwargs.
        predict2_hard_pack = load_data.load_data2(root, 'test2_hard', prefix=args.dataset)
        predict3_hard_pack = load_data.load_data2(root, 'test3_hard', prefix=args.dataset)
        test2_queries = get_query_docs(predict2_hard_pack)
        test3_queries = get_query_docs(predict3_hard_pack)
        additional_data[KeyWordSettings.Test2Hard] = test2_queries
        additional_data[KeyWordSettings.Test3Hard] = test3_queries
    FileHandler.myprint('done extracting')
    t2 = time.time()
    FileHandler.myprint('loading data time: %d (seconds)' % (t2 - t1))

    params = {"b": args.b, "k1": args.k1}
    FileHandler.myprint("Fitting Model")
    fit_model = bm25_fit.BM25Fitter(params)
    try:
        fit_model.fit(None, verbose=True, topN=args.topk,
                      val_queries=dev_queries,
                      test_queries=test_queries,
                      **additional_data)
    except KeyboardInterrupt:
        FileHandler.myprint('Exiting from training early')
    t10 = time.time()
    FileHandler.myprint('Total time: %d (seconds)' % (t10 - t1))
def fit_models(args):
    """Train and evaluate the multimodal (text + visual) attention network.

    Seeds all RNGs for reproducibility, loads ELMo-contextualized data packs
    (and optionally the hard re-ranking split), optionally attaches image
    features, builds a MultiModalAttentionNetwork on L2-normalized GloVe
    embeddings, and fits it with either the visual or the text-only fitter.

    Args:
        args: parsed argparse namespace (paths, model hyper-parameters,
            ``use_visual``, ``cuda``, ``seed``, etc.).
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of the
    # original os.path.exists / os.mkdir pair.
    os.makedirs(args.log, exist_ok=True)
    curr_date = datetime.datetime.now().timestamp()  # seconds
    # Folder to store all outputted files of a single run.
    secondary_log_folder = os.path.join(args.log, "log_results_%s" % (int(curr_date)))
    os.makedirs(secondary_log_folder, exist_ok=True)

    logfolder_result = os.path.join(secondary_log_folder, "%s_result.txt" % int(curr_date))
    FileHandler.init_log_files(logfolder_result)
    settings = json.dumps(vars(args), sort_keys=True, indent=2)
    FileHandler.myprint("Running script " + str(os.path.realpath(__file__)))
    FileHandler.myprint(settings)

    # Fix every RNG (python, numpy, torch CPU/GPU) so runs are reproducible.
    FileHandler.myprint("Setting seed to " + str(args.seed))
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
    if args.cuda:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # Invert the id -> index mappings so indices can be reported as raw ids.
    index2queries = dict((y, x) for x, y in json.loads(open(args.query_mapped).read()).items())
    index2docs = dict((y, x) for x, y in json.loads(open(args.article_mapped).read()).items())
    root = args.path
    use_reranking = "reranking" in root  # re-ranking mode is inferred from the data path
    t1 = time.time()

    elmo_queries_path = os.path.join(args.elmo_feats, "queries_feats.pth")
    elmo_docs_path = os.path.join(args.elmo_feats, "articles_feats.pth")
    elmo_loader = load_data.ElmoLoader(elmo_queries_path, elmo_docs_path,
                                       args.fixed_length_left, args.fixed_length_right)
    load_data_func = elmo_loader.elmo_load_data
    train_pack = load_data_func(root, 'train', prefix=args.dataset)
    valid_pack = load_data_func(root, 'dev', prefix=args.dataset)
    predict_pack = load_data_func(root, 'test', prefix=args.dataset)
    if use_reranking:
        FileHandler.myprint("Using Re-Ranking Dataset..........")
        predict2_hard_pack = load_data_func(root, 'test2_hard', prefix=args.dataset)

    def length_stats(side, column):
        # (min, max) whitespace-token length of `column` across the three splits.
        lengths = [getattr(pack, side)[column].str.lower().str.split().apply(len).max()
                   for pack in (train_pack, valid_pack, predict_pack)]
        return min(lengths), max(lengths)

    min_query_length, max_query_length = length_stats("left", "text_left")
    min_doc_length, max_doc_length = length_stats("right", "text_right")
    FileHandler.myprint("Min query length, " + str(min_query_length) +
                        " Min doc length " + str(min_doc_length))
    FileHandler.myprint("Max query length, " + str(max_query_length) +
                        " Max doc length " + str(max_doc_length))

    if args.use_visual:
        image_loader = load_data.ImagesLoader(
            left_pth_file=args.left_images_features,
            max_num_left_images=args.n_img_in_query,
            right_pth_file=args.right_images_features,
            max_num_right_images=args.n_img_in_doc,
            use_cuda=args.cuda)
        data_packs = [train_pack, valid_pack, predict_pack]
        if use_reranking:
            data_packs.append(predict2_hard_pack)
        image_loader.fit(data_packs)  # memory-intensive (~10Gb RAM)
        train_pack = image_loader.transform(train_pack)
        valid_pack = image_loader.transform(valid_pack)
        predict_pack = image_loader.transform(predict_pack)
        if use_reranking:
            predict2_hard_pack = image_loader.transform(predict2_hard_pack)
        print(image_loader.left_tensor.size(), image_loader.right_tensor.size())

    preprocessor = mz.preprocessors.ElmoPreprocessor(args.fixed_length_left,
                                                     args.fixed_length_right)
    print('parsing data')
    train_processed = preprocessor.fit_transform(train_pack)  # This is a DataPack
    valid_processed = preprocessor.transform(valid_pack)
    predict_processed = preprocessor.transform(predict_pack)

    train_interactions = MatchInteractionVisual(train_processed)
    valid_interactions = MatchInteractionVisual(valid_processed)
    test_interactions = MatchInteractionVisual(predict_processed)
    if use_reranking:
        predict2_processed = preprocessor.transform(predict2_hard_pack)
        predict2_interactions = MatchInteractionVisual(predict2_processed)

    FileHandler.myprint('done extracting')
    t2 = time.time()
    FileHandler.myprint('loading data time: %d (seconds)' % (t2 - t1))
    FileHandler.myprint("Building model")

    print("Loading word embeddings......")
    t1_emb = time.time()
    term_index = preprocessor.context['vocab_unit'].state['term_index']
    glove_embedding = mz.datasets.embeddings.load_glove_embedding(
        dimension=args.word_embedding_size, term_index=term_index)
    embedding_matrix = glove_embedding.build_matrix(term_index)
    # Row-wise L2 normalization. Guard against all-zero rows (e.g. padding or
    # OOV terms), which would otherwise divide 0/0 and fill the row with NaNs.
    l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
    l2_norm[l2_norm == 0] = 1.0
    embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
    t2_emb = time.time()
    print("Time to load word embeddings......", (t2_emb - t1_emb))

    match_params = {}
    match_params['embedding'] = embedding_matrix
    match_params["embedding_freeze"] = True  # freezing word embeddings
    match_params["fixed_length_left"] = args.fixed_length_left
    match_params["fixed_length_right"] = args.fixed_length_right
    match_params['dropout'] = 0.1
    match_params['filters'] = args.filters
    match_params["conv_layers"] = args.conv_layers
    match_params["filters_count_pacrr"] = args.filters_count_pacrr
    match_params["n_s"] = args.n_s
    match_params["max_ngram"] = args.max_ngram
    match_params["head_cnn_type"] = args.head_cnn_type
    match_params["use_visual"] = args.use_visual
    match_params["use_average_dcompositional_att"] = args.use_average_dcompositional_att
    match_params["attention_type"] = args.attention_type
    # Contextualized (ELMo) part.
    match_params["left_elmo_tensor"] = elmo_loader.left_tensor_feats
    match_params["right_elmo_tensor"] = elmo_loader.right_tensor_feats
    match_params["elmo_vec_size"] = 1024
    if args.use_visual:
        match_params["visual_feature_size"] = image_loader.visual_features_size
        # Move the full image-feature tensors to GPU once, up front.
        image_loader.left_tensor = torch_utils.gpu(image_loader.left_tensor, args.cuda)
        image_loader.right_tensor = torch_utils.gpu(image_loader.right_tensor, args.cuda)
        match_params["full_left_images_tensor"] = image_loader.left_tensor
        match_params["full_right_images_tensor"] = image_loader.right_tensor

    match_model = multimodal_attention_network.MultiModalAttentionNetwork(match_params)
    FileHandler.myprint("Fitting Model")
    if args.use_visual:
        FileHandler.myprint("Using both Textual and Visual features.......")
        fit_model = fitter.VisualFitter(net=match_model,
                                        loss=args.loss_type,
                                        n_iter=args.epochs,
                                        batch_size=args.batch_size,
                                        learning_rate=args.lr,
                                        early_stopping=args.early_stopping,
                                        use_cuda=args.cuda,
                                        num_negative_samples=args.num_neg,
                                        logfolder=secondary_log_folder,
                                        curr_date=curr_date,
                                        use_visual=args.use_visual,
                                        image_loader=image_loader,
                                        index2queries=index2queries,
                                        index2docs=index2docs)
    else:
        FileHandler.myprint("Using Textual content only....")
        fit_model = contextualized_fitter.ContextualizedFitter(
            net=match_model,
            loss=args.loss_type,
            n_iter=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.lr,
            early_stopping=args.early_stopping,
            use_cuda=args.cuda,
            num_negative_samples=args.num_neg,
            logfolder=secondary_log_folder,
            curr_date=curr_date)

    try:
        fit_model.fit(train_interactions, verbose=True, topN=args.topk,
                      val_interactions=valid_interactions,
                      test_interactions=test_interactions)
        fit_model.load_best_model(valid_interactions, test_interactions, topN=args.topk)
        if use_reranking:
            fit_model.load_best_model_test2_test3(predict2_interactions, None, topN=args.topk)
    except KeyboardInterrupt:
        FileHandler.myprint('Exiting from training early')
    t10 = time.time()
    FileHandler.myprint('Total time: %d (seconds)' % (t10 - t1))
def fit_models(args):
    """Train and evaluate the generative FCRG model.

    Seeds all RNGs for reproducibility, loads the train/dev/test data packs,
    preprocesses them with a SplitPreprocessor backed by the dataset's
    ``vocab.json``, builds an FCRGModel over randomly-initialized trainable
    embeddings, and fits it with the basic fitter.

    Args:
        args: parsed argparse namespace (paths, model hyper-parameters,
            ``cuda``, ``seed``, ``clip``, etc.).
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of the
    # original os.path.exists / os.mkdir pair.
    os.makedirs(args.log, exist_ok=True)
    curr_date = datetime.datetime.now().timestamp()  # seconds
    # Folder to store all outputted files of a single run.
    secondary_log_folder = os.path.join(args.log, "log_results_%s" % (int(curr_date)))
    os.makedirs(secondary_log_folder, exist_ok=True)

    logfolder_result = os.path.join(secondary_log_folder, "%s_result.txt" % int(curr_date))
    FileHandler.init_log_files(logfolder_result)
    settings = json.dumps(vars(args), sort_keys=True, indent=2)
    FileHandler.myprint("Running script " + str(os.path.realpath(__file__)))
    FileHandler.myprint(settings)

    # Fix every RNG (python, numpy, torch CPU/GPU) so runs are reproducible.
    FileHandler.myprint("Setting seed to " + str(args.seed))
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
    if args.cuda:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    root = args.path
    t1 = time.time()
    train_pack = load_data.load_data2(root, 'train', prefix=args.dataset)
    valid_pack = load_data.load_data2(root, 'dev', prefix=args.dataset)
    predict_pack = load_data.load_data2(root, 'test', prefix=args.dataset)

    def length_stats(side, column):
        # (min, max) whitespace-token length of `column` across the three splits.
        lengths = [getattr(pack, side)[column].str.lower().str.split().apply(len).max()
                   for pack in (train_pack, valid_pack, predict_pack)]
        return min(lengths), max(lengths)

    min_query_length, max_query_length = length_stats("left", "text_left")
    min_doc_length, max_doc_length = length_stats("right", "text_right")
    FileHandler.myprint("Min query length, " + str(min_query_length) +
                        " Min doc length " + str(min_doc_length))
    FileHandler.myprint("Max query length, " + str(max_query_length) +
                        " Max doc length " + str(max_doc_length))

    preprocessor = mz.preprocessors.SplitPreprocessor(
        args.fixed_length_left, args.fixed_length_right,
        vocab_file=os.path.join(args.path, "vocab.json"))
    print('parsing data')
    train_processed = preprocessor.fit_transform(train_pack)  # This is a DataPack
    valid_processed = preprocessor.transform(valid_pack)
    predict_processed = preprocessor.transform(predict_pack)

    train_interactions = MatchInteraction(train_processed)
    valid_interactions = MatchInteraction(valid_processed)
    test_interactions = MatchInteraction(predict_processed)
    FileHandler.myprint('done extracting')
    t2 = time.time()
    FileHandler.myprint('loading data time: %d (seconds)' % (t2 - t1))
    FileHandler.myprint("Building model")

    print("Loading word embeddings......")
    t1_emb = time.time()
    term_index = preprocessor.context['vocab_unit'].state['term_index']
    default_embeddings = mz.datasets.embeddings.load_default_embedding(
        dimension=args.word_embedding_size, term_index=term_index)
    # Rows are drawn from N(0, 1); the matrix stays trainable (not frozen).
    embedding_matrix = default_embeddings.build_matrix(
        term_index, initializer=lambda: np.random.normal(0, 1))
    t2_emb = time.time()
    print("Time to load word embeddings......", (t2_emb - t1_emb))

    params = dict()
    params['embedding'] = embedding_matrix
    params["embedding_freeze"] = False  # trainable word embeddings
    params["fixed_length_left"] = args.fixed_length_left
    params["fixed_length_right"] = args.fixed_length_right
    params["embedding_output_dim"] = args.word_embedding_size
    params["embedding_dropout"] = args.embedding_dropout
    params["attention_type"] = args.attention_type
    params["hidden_size"] = args.hidden_size
    params["output_target_size"] = args.output_target_size
    params["bidirectional"] = False
    params["use_label"] = False
    params["use_input_feeding"] = args.use_input_feeding
    params["nlayers"] = 1

    generative_model = fcrg_model.FCRGModel(params)
    FileHandler.myprint("Fitting Model")
    fit_model = basic_fitter.BasicFitter(net=generative_model,
                                         loss=args.loss_type,
                                         n_iter=args.epochs,
                                         batch_size=args.batch_size,
                                         learning_rate=args.lr,
                                         early_stopping=args.early_stopping,
                                         use_cuda=args.cuda,
                                         clip=args.clip,
                                         logfolder=secondary_log_folder,
                                         curr_date=curr_date,
                                         vocab=preprocessor.context['vocab_unit'])
    try:
        fit_model.fit(train_interactions, verbose=True,
                      val_interactions=valid_interactions,
                      test_interactions=test_interactions)
        fit_model.load_best_model(valid_interactions, test_interactions)
    except KeyboardInterrupt:
        FileHandler.myprint('Exiting from training early')
    t10 = time.time()
    FileHandler.myprint('Total time: %d (seconds)' % (t10 - t1))