def _pad_and_stack_bodies(x):
    """Pad each evidence body to ``Config.max_sentences`` and stack to an ndarray.

    Records the true per-body sentence counts in ``x['b_sizes']`` before
    padding, pads short bodies with blank-sentence placeholders (" "), and
    replaces ``x['b']`` with a rectangular ``np.ndarray``.  Mutates ``x``
    in place and returns the stacked body array (for shape logging).
    """
    bodies = x['b']
    x['b_sizes'] = get_num_sents_of_bodies(bodies)
    for i, sentences in enumerate(bodies):
        # pad with blank sentences up to the fixed sentence count
        if len(sentences) < Config.max_sentences:
            sentences.extend([" "] * (Config.max_sentences - len(sentences)))
        bodies[i] = np.asarray(sentences)
    x['b'] = np.asarray(bodies)
    return x['b']


def _select_free_gpu():
    """Pin CUDA_VISIBLE_DEVICES to the first free-enough GPU unless already set."""
    if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
            os.environ['CUDA_VISIBLE_DEVICES']).strip():
        os.environ['CUDA_VISIBLE_DEVICES'] = str(
            GPUtil.getFirstAvailable(maxLoad=1.0,
                                     maxMemory=1.0 - Config.max_gpu_memory)[0])


def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate an RTE estimator on raw (un-embedded) sentence data.

    :param mode: ``RTERunPhase.train`` trains and saves a model; any other
        value runs the test pipeline and writes a submission file.
    :param config: optional path to a config file loaded into ``Config``.
    :param estimator: optional pre-built estimator; when ``None`` one is
        loaded from disk (test) or constructed (train).
    :return: the fitted or loaded estimator.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    # Snopes vs. FEVER data-layout switch; defaults to FEVER when absent.
    is_snopes = Config.is_snopes if hasattr(Config, 'is_snopes') else False
    logger.debug("is_snopes: " + str(is_snopes))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            # reuse previously pickled training/validation data
            with open(Config.training_dump, 'rb') as f:
                (X_train, Y_labels_train, X_valid,
                 Y_labels_valid) = pickle.load(f)
        else:
            # process training JSONL file
            X_train, Y_labels_train = read_data_set_from_jsonl(
                Config.training_set_file,
                Config.db_path,
                num_sentences=Config.max_sentences,
                is_snopes=is_snopes)
            X_valid, Y_labels_valid = read_data_set_from_jsonl(
                Config.dev_set_file,
                Config.db_path,
                num_sentences=Config.max_sentences,
                is_snopes=is_snopes)
            logger.debug("b_train.shape: " +
                         str(_pad_and_stack_bodies(X_train).shape))
            logger.debug("b_valid.shape: " +
                         str(_pad_and_stack_bodies(X_valid).shape))
            if hasattr(Config, 'training_dump'):
                # cache the processed data for subsequent runs
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump(
                        (X_train, Y_labels_train, X_valid, Y_labels_valid),
                        f,
                        protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        _select_free_gpu()
        estimator.fit(X_train, Y_labels_train, X_valid, Y_labels_valid)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode: restore checkpoint weights only when no estimator
        # instance was handed in by the caller
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        X_test, Y_labels_test = read_data_set_from_jsonl(
            Config.test_set_file,
            Config.db_path,
            num_sentences=Config.max_sentences,
            is_snopes=is_snopes)
        logger.debug("b_test.shape: " +
                     str(_pad_and_stack_bodies(X_test).shape))
        _select_free_gpu()
        predictions = estimator.predict(X_test, restore_param_required)
        generate_submission(predictions, X_test['id'], Config.test_set_file,
                            Config.submission_file)
        if Y_labels_test:
            print_metrics(Y_labels_test, predictions, logger)
    return estimator
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate an ESIM-family RTE estimator with GloVe embeddings.

    Supports optional inter-evidence comparison, claim-evidences comparison,
    hand-crafted extra features and a numeric feature, switched on either by
    Config flags or implied by the chosen estimator name.
    Returns the fitted or loaded estimator.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    # Resolve optional feature switches; each defaults to False when the
    # Config attribute is missing.
    if hasattr(Config, 'use_inter_evidence_comparison'):
        use_inter_evidence_comparison = Config.use_inter_evidence_comparison
    else:
        use_inter_evidence_comparison = False
    # 'esim_inter_evidence' model and 'esim_inter_evidence_claim_evidences_comparison' models need inter evidence inputs
    use_inter_evidence_comparison = use_inter_evidence_comparison or Config.estimator_name in {
        'esim_inter_evidence', 'esim_inter_evidence_claim_evidences_comparison'
    }
    if hasattr(Config, 'use_claim_evidences_comparison'):
        use_claim_evidences_comparison = Config.use_claim_evidences_comparison
    else:
        use_claim_evidences_comparison = False
    # 'esim_inter_evidence_claim_evidences_comparison' model needs claim-evidence inputs
    use_claim_evidences_comparison = use_claim_evidences_comparison or Config.estimator_name in {
        'esim_inter_evidence_claim_evidences_comparison'
    }
    if hasattr(Config, 'use_extra_features'):
        use_extra_features = Config.use_extra_features
    else:
        use_extra_features = False
    if hasattr(Config, 'use_numeric_feature'):
        use_numeric_feature = Config.use_numeric_feature
    else:
        use_numeric_feature = False
    # 'esim_num_feature' model needs numeric feature inputs
    use_numeric_feature = use_numeric_feature or Config.estimator_name in {
        'esim_num_feature'
    }
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_end_2_end_hyper_param))
    logger.info("use_inter_sentence_comparison: " +
                str(use_inter_evidence_comparison))
    logger.info("use_extra_features: " + str(use_extra_features))
    logger.info("use_numeric_feature: " + str(use_numeric_feature))
    logger.info("use_claim_evidences_comparison: " +
                str(use_claim_evidences_comparison))
    if mode == RTERunPhase.train:
        # # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            # reuse previously pickled, fully prepared training inputs
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            # Embed train set; the returned vocab/embeddings are reused for
            # the dev set so both share one index space.
            training_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
                Config.training_set_file,
                Config.db_path,
                glove_path=Config.glove_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size,
                is_snopes=is_snopes)
            # Claims are single sentences: add a length-1 sentence axis.
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_np'] = np.expand_dims(
                training_set['data']['h_np'], 1)
            valid_set, _, _, _, _ = embed_data_set_with_glove_2(
                Config.dev_set_file,
                Config.db_path,
                vocab_dict=vocab,
                glove_embeddings=embeddings,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size,
                is_snopes=is_snopes)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_np'] = np.expand_dims(
                valid_set['data']['h_np'], 1)
            if use_extra_features:
                # precomputed claim/evidence feature files are mandatory here
                assert hasattr(
                    Config, 'feature_path'
                ), "Config should has feature_path if Config.use_feature is True"
                training_claim_features, training_evidence_features = load_feature_by_data_set(
                    Config.training_set_file, Config.feature_path,
                    Config.max_sentences)
                valid_claim_features, valid_evidence_features = load_feature_by_data_set(
                    Config.dev_set_file, Config.feature_path,
                    Config.max_sentences)
                training_set['data']['h_feats'] = training_claim_features
                training_set['data']['b_feats'] = training_evidence_features
                valid_set['data']['h_feats'] = valid_claim_features
                valid_set['data']['b_feats'] = valid_evidence_features
            if use_numeric_feature:
                training_num_feat = number_feature(Config.training_set_file,
                                                   Config.db_path,
                                                   Config.max_sentences,
                                                   is_snopes)
                valid_num_feat = number_feature(Config.dev_set_file,
                                                Config.db_path,
                                                Config.max_sentences,
                                                is_snopes)
                training_set['data']['num_feat'] = training_num_feat
                valid_set['data']['num_feat'] = valid_num_feat
            if use_inter_evidence_comparison:
                # indices/sizes for concatenating evidence sentences pairwise
                training_concat_sent_indices, training_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                    training_set['data']['b_np'],
                    training_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                training_set['data'][
                    'b_concat_indices'] = training_concat_sent_indices
                training_set['data'][
                    'b_concat_sizes'] = training_concat_sent_sizes
                valid_concat_sent_indices, valid_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                    valid_set['data']['b_np'],
                    valid_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                valid_set['data'][
                    'b_concat_indices'] = valid_concat_sent_indices
                valid_set['data']['b_concat_sizes'] = valid_concat_sent_sizes
            if use_claim_evidences_comparison:
                # indices/sizes for comparing the claim against all evidences
                training_all_evidences_indices, training_all_evidences_sizes = generate_concat_indices_for_claim(
                    training_set['data']['b_np'],
                    training_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                training_set['data'][
                    'b_concat_indices_for_h'] = training_all_evidences_indices
                training_set['data'][
                    'b_concat_sizes_for_h'] = training_all_evidences_sizes
                valid_all_evidences_indices, valid_all_evidences_sizes = generate_concat_indices_for_claim(
                    valid_set['data']['b_np'],
                    valid_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                valid_set['data'][
                    'b_concat_indices_for_h'] = valid_all_evidences_indices
                valid_set['data'][
                    'b_concat_sizes_for_h'] = valid_all_evidences_sizes
            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if hasattr(Config, 'training_dump'):
                # cache the fully assembled inputs for subsequent runs
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        # pick a free GPU only if the caller has not pinned one already
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode: restore checkpoint weights only when no estimator
        # instance was handed in by the caller
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        # test-time embedding uses the full GloVe vocabulary
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.test_set_file,
            Config.db_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=is_snopes)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        if use_extra_features:
            assert hasattr(
                Config, 'feature_path'
            ), "Config should has feature_path if Config.use_feature is True"
            test_claim_features, test_evidence_features = load_feature_by_data_set(
                Config.test_set_file, Config.feature_path,
                Config.max_sentences)
            test_set['data']['h_feats'] = test_claim_features
            test_set['data']['b_feats'] = test_evidence_features
        if use_numeric_feature:
            test_num_feat = number_feature(Config.test_set_file,
                                           Config.db_path,
                                           Config.max_sentences, is_snopes)
            test_set['data']['num_feat'] = test_num_feat
        # NOTE: x_dict aliases test_set['data'], so the feature additions
        # below remain visible through x_dict.
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if use_inter_evidence_comparison:
            test_concat_sent_indices, test_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                test_set['data']['b_np'], test_set['data']['b_sent_sizes'],
                Config.max_sentence_size, Config.max_sentences)
            test_set['data']['b_concat_indices'] = test_concat_sent_indices
            test_set['data']['b_concat_sizes'] = test_concat_sent_sizes
        if use_claim_evidences_comparison:
            test_all_evidences_indices, test_all_evidences_sizes = generate_concat_indices_for_claim(
                test_set['data']['b_np'], test_set['data']['b_sent_sizes'],
                Config.max_sentence_size, Config.max_sentences)
            test_set['data'][
                'b_concat_indices_for_h'] = test_all_evidences_indices
            test_set['data']['b_concat_sizes_for_h'] = test_all_evidences_sizes
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(
            x_dict, restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate an ESIM RTE estimator with GloVe embeddings plus
    per-sentence retrieval scores (FEVER data set only).

    Returns the fitted or loaded estimator.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    logger.info("this script is only for FEVER dataset")
    if mode == RTERunPhase.train:
        # # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            # reuse previously pickled, fully prepared training inputs
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            # Embed train set; returned vocab/embeddings are reused for the
            # dev set so both share one index space.
            training_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
                Config.training_set_file,
                Config.db_path,
                glove_path=Config.glove_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size)
            # Claims are single sentences: add a length-1 sentence axis.
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_np'] = np.expand_dims(
                training_set['data']['h_np'], 1)
            # attach sentence-retrieval scores for each example
            training_set['data']['scores'] = load_scores(
                Config.training_set_file, Config.max_sentences)
            valid_set, _, _, _, _ = embed_data_set_with_glove_2(
                Config.dev_set_file,
                Config.db_path,
                vocab_dict=vocab,
                glove_embeddings=embeddings,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_np'] = np.expand_dims(
                valid_set['data']['h_np'], 1)
            valid_set['data']['scores'] = load_scores(Config.dev_set_file,
                                                      Config.max_sentences)
            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if hasattr(Config, 'training_dump'):
                # cache the fully assembled inputs for subsequent runs
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        # pick a free GPU only if the caller has not pinned one already
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode: restore checkpoint weights only when no estimator
        # instance was handed in by the caller
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        # test-time embedding uses the full GloVe vocabulary
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.test_set_file,
            Config.db_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        test_set['data']['scores'] = load_scores(Config.test_set_file,
                                                 Config.max_sentences)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(
            x_dict, restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate an ESIM RTE estimator on ELMo-tokenized data.

    Returns the fitted or loaded estimator.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    # Snopes vs. FEVER data-layout switch; defaults to FEVER when absent.
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            # reuse previously pickled [training_set, valid_set] pair
            with open(Config.training_dump, 'rb') as f:
                dataset_list = pickle.load(f)
        else:
            # process training JSONL file
            training_set, _, _ = embed_data_set_for_elmo(
                Config.training_set_file,
                Config.db_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_h_sent_size=Config.max_claim_size,
                threshold_b_sent_size=Config.max_sentence_size,
                is_snopes=is_snopes)
            # Claims are single sentences: add a length-1 sentence axis.
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_tokens'] = np.expand_dims(
                training_set['data']['h_tokens'], 1)
            # training_set['data']['h_ft_np'] = np.expand_dims(training_set['data']['h_ft_np'], 1)
            valid_set, _, _ = embed_data_set_for_elmo(
                Config.dev_set_file,
                Config.db_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size,
                is_snopes=is_snopes)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_tokens'] = np.expand_dims(
                valid_set['data']['h_tokens'], 1)
            dataset_list = [training_set, valid_set]
            # save processed training data
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump(dataset_list,
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        # pick a free GPU only if the caller has not pinned one already
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        # dataset_list[0] is the training set, dataset_list[1] the dev set
        estimator.fit(dataset_list[0]['data'], dataset_list[0]['label'],
                      dataset_list[1]['data'], dataset_list[1]['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode: restore checkpoint weights only when no estimator
        # instance was handed in by the caller
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        test_set, _, _ = embed_data_set_for_elmo(
            Config.test_set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=is_snopes)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_tokens'] = np.expand_dims(
            test_set['data']['h_tokens'], 1)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        logger.debug("CUDA_VISIBLE_DEVICES: " +
                     os.environ['CUDA_VISIBLE_DEVICES'])
        predictions = estimator.predict(
            test_set['data'], restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate the ESIM credibility multi-task-learning estimator
    (joint claim-verification + stance tasks; Snopes data set only).

    Returns the fitted or loaded estimator.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    assert hasattr(
        Config, 'page_source_file_path'
    ), "'page_source_file_path' field is needed in config file for this script"
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM credibility MTL arguments: " +
                str(Config.esim_credibility_mtl_hyper_param))
    logger.info("this script is only for Snopes dataset")
    if mode == RTERunPhase.train:
        # training sets
        # Embed the claim-task training set together with source-credibility
        # features; the returned vocabs/embeddings are reused for every
        # subsequent embedding call so all sets share one index space.
        # @formatter:off
        claim_training_set, word_vocab, word_embeddings, domain_vocab, domain_embeddings, suffix_vocab, \
        suffix_embeddings, protocol_vocab, protocol_embeddings, claim_stance_vocab, claim_stance_embeddings = \
            embed_data_set_with_glove_with_credibility(
                Config.esim_credibility_mtl_hyper_param['claim_training_set'],
                Config.db_path,
                Config.page_source_file_path,
                glove_path=Config.glove_path,
                domain_embedding_size=Config.esim_credibility_mtl_hyper_param['domain_embedding_size'],
                suffix_embedding_size=Config.esim_credibility_mtl_hyper_param['suffix_embedding_size'],
                protocol_embedding_size=Config.esim_credibility_mtl_hyper_param['protocol_embedding_size'],
                stance_embedding_size=Config.esim_credibility_mtl_hyper_param['stance_embedding_size'],
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_sentence_size)
        # @formatter:on
        # Claims are single sentences: add a length-1 sentence axis.
        claim_h_sent_sizes = claim_training_set['data']['h_sent_sizes']
        claim_h_sizes = np.ones(len(claim_h_sent_sizes), np.int32)
        claim_training_set['data']['h_sent_sizes'] = np.expand_dims(
            claim_h_sent_sizes, 1)
        claim_training_set['data']['h_sizes'] = claim_h_sizes
        claim_training_set['data']['h_np'] = np.expand_dims(
            claim_training_set['data']['h_np'], 1)
        logger.info("size of training set: " +
                    str(claim_training_set['data']['h_np'].shape[0]))
        # stance-task training set (auxiliary task), reusing the word vocab
        stance_training_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.esim_credibility_mtl_hyper_param['stance_training_set'],
            Config.db_path,
            Config.glove_path,
            vocab_dict=word_vocab,
            glove_embeddings=word_embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=True)
        stance_h_sent_sizes = stance_training_set['data']['h_sent_sizes']
        stance_h_sizes = np.ones(len(stance_h_sent_sizes), np.int32)
        stance_training_set['data']['h_sent_sizes'] = np.expand_dims(
            stance_h_sent_sizes, 1)
        stance_training_set['data']['h_sizes'] = stance_h_sizes
        stance_training_set['data']['h_np'] = np.expand_dims(
            stance_training_set['data']['h_np'], 1)
        # valid sets
        claim_valid_set, _, _ = embed_data_set_with_glove_with_credibility(
            Config.esim_credibility_mtl_hyper_param['claim_dev_set'],
            Config.db_path,
            Config.page_source_file_path,
            vocab_dict=word_vocab,
            glove_embeddings=word_embeddings,
            domain_vocab=domain_vocab,
            domain_embeddings=domain_embeddings,
            suffix_vocab=suffix_vocab,
            suffix_embeddings=suffix_embeddings,
            protocol_vocab=protocol_vocab,
            protocol_embeddings=protocol_embeddings,
            stance_vocab=claim_stance_vocab,
            stance_embeddings=claim_stance_embeddings,
            domain_embedding_size=Config.
            esim_credibility_mtl_hyper_param['domain_embedding_size'],
            suffix_embedding_size=Config.
            esim_credibility_mtl_hyper_param['suffix_embedding_size'],
            protocol_embedding_size=Config.
            esim_credibility_mtl_hyper_param['protocol_embedding_size'],
            stance_embedding_size=Config.
            esim_credibility_mtl_hyper_param['stance_embedding_size'],
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        claim_h_sent_sizes = claim_valid_set['data']['h_sent_sizes']
        claim_h_sizes = np.ones(len(claim_h_sent_sizes), np.int32)
        claim_valid_set['data']['h_sent_sizes'] = np.expand_dims(
            claim_h_sent_sizes, 1)
        claim_valid_set['data']['h_sizes'] = claim_h_sizes
        claim_valid_set['data']['h_np'] = np.expand_dims(
            claim_valid_set['data']['h_np'], 1)
        logger.info("size of dev set: " +
                    str(claim_valid_set['data']['h_np'].shape[0]))
        stance_valid_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.esim_credibility_mtl_hyper_param['stance_dev_set'],
            Config.db_path,
            Config.glove_path,
            vocab_dict=word_vocab,
            glove_embeddings=word_embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=True)
        stance_h_sent_sizes = stance_valid_set['data']['h_sent_sizes']
        stance_h_sizes = np.ones(len(stance_h_sent_sizes), np.int32)
        stance_valid_set['data']['h_sent_sizes'] = np.expand_dims(
            stance_h_sent_sizes, 1)
        stance_valid_set['data']['h_sizes'] = stance_h_sizes
        stance_valid_set['data']['h_np'] = np.expand_dims(
            stance_valid_set['data']['h_np'], 1)
        # assemble per-task input/label dicts for the MTL estimator
        X_dict_claim = {
            'train': claim_training_set['data'],
            'valid': claim_valid_set['data'],
        }
        y_claim = {
            'train': claim_training_set['label'],
            'valid': claim_valid_set['label']
        }
        X_dict_stance = {
            'train': stance_training_set['data'],
            'valid': stance_valid_set['data'],
        }
        y_stance = {
            'train': stance_training_set['label'],
            'valid': stance_valid_set['label']
        }
        X_dict = {
            'claim': X_dict_claim,
            'stance': X_dict_stance,
            'word_embedding': word_embeddings,
            'domain_embedding': domain_embeddings,
            'suffix_embedding': suffix_embeddings,
            'protocol_embedding': protocol_embeddings,
            'stance_embedding': claim_stance_embeddings
        }
        y_dict = {'claim': y_claim, 'stance': y_stance}
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        # pick a free GPU only if the caller has not pinned one already
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_dict, y_dict)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
        # persist the learned source-feature embeddings so the test branch
        # can reload them via load_source_features_embeddings
        dump_source_features_embeddings(
            Config.
            esim_credibility_mtl_hyper_param['features_embeddings_path'],
            domain_vocab, domain_embeddings, suffix_vocab, suffix_embeddings,
            protocol_vocab, protocol_embeddings, claim_stance_vocab,
            claim_stance_embeddings)
    else:
        # testing mode: restore checkpoint weights only when no estimator
        # instance was handed in by the caller
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        # test-time embedding uses the full GloVe vocabulary
        word_vocab, word_embeddings = load_whole_glove(Config.glove_path)
        word_vocab = vocab_map(word_vocab)
        # reload the source-feature embeddings dumped during training
        # @formatter:off
        domain_vocab, domain_embeddings, \
        suffix_vocab, suffix_embeddings, \
        protocol_vocab, protocol_embeddings, \
        claim_stance_vocab, claim_stance_embeddings = load_source_features_embeddings(
            Config.esim_credibility_mtl_hyper_param['features_embeddings_path'])
        # @formatter:on
        test_set, _, _ = embed_data_set_with_glove_with_credibility(
            Config.esim_credibility_mtl_hyper_param['claim_test_set'],
            Config.db_path,
            Config.page_source_file_path,
            vocab_dict=word_vocab,
            glove_embeddings=word_embeddings,
            domain_vocab=domain_vocab,
            domain_embeddings=domain_embeddings,
            suffix_vocab=suffix_vocab,
            suffix_embeddings=suffix_embeddings,
            protocol_vocab=protocol_vocab,
            protocol_embeddings=protocol_embeddings,
            stance_vocab=claim_stance_vocab,
            stance_embeddings=claim_stance_embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        claim_h_sent_sizes = test_set['data']['h_sent_sizes']
        claim_h_sizes = np.ones(len(claim_h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(
            claim_h_sent_sizes, 1)
        test_set['data']['h_sizes'] = claim_h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        logger.info("size of test set: " +
                    str(test_set['data']['h_np'].shape[0]))
        x_dict = {
            'X_test': test_set['data'],
            'word_embedding': word_embeddings,
            'domain_embedding': domain_embeddings,
            'suffix_embedding': suffix_embeddings,
            'protocol_embedding': protocol_embeddings,
            'stance_embedding': claim_stance_embeddings
        }
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
Created on Fri May 17 10:56:28 2019 @author: lukasmalik """ # =========================================================================== import numpy as np import pandas as pd from nltk.tokenize import TweetTokenizer from scripts.load_data import * from scripts.load_model import * from tqdm import tqdm df = load_data() model = load_model() # loads pretrained word2vec model with 400 dimensions # =========================================================================== # calculate the document vector as average of all the words def avg_feature_vector(tweet, model, num_features,index2word_set,tokenizer): ''' calculates the average vector ''' feature_vec = np.zeros((num_features, ), dtype='float32') n_words = 0 words = tokenizer.tokenize(tweet) for word in words: #print(word) # sanity check if word in index2word_set: n_words += 1 feature_vec = np.add(feature_vec, model[word]) if (n_words > 0):
def _expand_hypothesis_dims(data_set):
    """In place, give the hypothesis side of *data_set* an explicit singleton
    sentence axis, as the ESIM estimator expects the claim to look like a
    one-sentence "document".

    Adds 'h_sizes' (all ones: one hypothesis per claim) and inserts a new axis
    at position 1 into 'h_sent_sizes' and 'h_bert_np'.
    """
    h_sent_sizes = data_set['data']['h_sent_sizes']
    data_set['data']['h_sizes'] = np.ones(len(h_sent_sizes), np.int32)
    data_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
    data_set['data']['h_bert_np'] = np.expand_dims(data_set['data']['h_bert_np'], 1)


def _ensure_gpu_selected():
    """Pin CUDA_VISIBLE_DEVICES to the first sufficiently free GPU when the
    variable is unset or blank; an explicit caller-provided value is kept."""
    if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
        os.environ['CUDA_VISIBLE_DEVICES'] = str(
            GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])


def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate an ESIM estimator on BERT-embedded claim/evidence data.

    :param mode: run phase; ``RTERunPhase.train`` embeds the train/dev sets,
        fits the estimator and saves it, any other phase loads (or reuses) a
        model, predicts on the test set and writes a submission file.
    :param config: optional path to a config file loaded into ``Config``.
    :param estimator: optional pre-built estimator; when ``None`` one is
        created (train) or loaded from disk (test).
    :return: the fitted or loaded estimator.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    is_snopes = Config.is_snopes if hasattr(Config, 'is_snopes') else False
    logger.debug("is_snopes: " + str(is_snopes))
    if mode == RTERunPhase.train:
        # training mode: embed train/dev sets via the BERT service, fit, persist
        training_set = embed_data_set_with_bert(Config.training_set_file,
                                                Config.db_path,
                                                threshold_b_sent_num=Config.max_sentences,
                                                threshold_b_sent_size=Config.max_sentence_size,
                                                is_snopes=is_snopes,
                                                port=Config.bert_port,
                                                port_out=Config.bert_port_out)
        _expand_hypothesis_dims(training_set)
        valid_set = embed_data_set_with_bert(Config.dev_set_file,
                                             Config.db_path,
                                             threshold_b_sent_num=Config.max_sentences,
                                             threshold_b_sent_size=Config.max_sentence_size,
                                             is_snopes=is_snopes,
                                             port=Config.bert_port,
                                             port_out=Config.bert_port_out)
        _expand_hypothesis_dims(valid_set)
        X_dict = {
            'X_train': training_set['data'],
            'X_valid': valid_set['data'],
            'y_valid': valid_set['label']
        }
        if estimator is None:
            estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        _ensure_gpu_selected()
        estimator.fit(X_dict, training_set['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode: restore checkpoint weights only when no fitted
        # estimator was passed in by the caller
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        test_set = embed_data_set_with_bert(Config.test_set_file,
                                            Config.db_path,
                                            threshold_b_sent_num=Config.max_sentences,
                                            threshold_b_sent_size=Config.max_sentence_size,
                                            is_snopes=is_snopes,
                                            port=Config.bert_port,
                                            port_out=Config.bert_port_out)
        _expand_hypothesis_dims(test_set)
        x_dict = {
            'X_test': test_set['data']
        }
        _ensure_gpu_selected()
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file, Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
def _bert_encode_features(X, tag, logger):
    """In place, replace the raw sentences in *X* with BERT encodings.

    Computes 'b_sizes' from the raw bodies, then encodes the bodies
    ('b', multi-sentence) and the claims ('h', single-sentence) through the
    BERT service configured in ``Config``. *tag* only labels the debug output
    (e.g. "train", "valid", "test").
    """
    X['b_sizes'] = get_num_sents_of_bodies(X['b'])
    b_encoded = encode_multi_sentence_set_with_bert(X['b'],
                                                    Config.max_sentences,
                                                    port=Config.bert_port,
                                                    port_out=Config.bert_port_out)
    X['b'] = b_encoded
    logger.debug("b_encoded_" + tag + ".shape: " + str(b_encoded.shape))
    h_encoded = encode_single_sentence_set_with_bert(X['h'],
                                                     port=Config.bert_port,
                                                     port_out=Config.bert_port_out)
    X['h'] = h_encoded
    logger.debug("h_encoded_" + tag + ".shape: " + str(h_encoded.shape))


def _pick_free_gpu():
    """Set CUDA_VISIBLE_DEVICES to the first sufficiently free GPU when the
    variable is unset or blank; an explicit caller-provided value is kept."""
    if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
        os.environ['CUDA_VISIBLE_DEVICES'] = str(
            GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])


def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate an estimator on BERT-sentence-encoded claim/body data.

    Training embeds the train/dev JSONL sets (or reloads a pickled cache from
    ``Config.training_dump`` when present), fits the estimator and saves it.
    Any other phase loads (or reuses) a model, predicts on the test set and
    writes a submission file.

    :param mode: run phase (``RTERunPhase.train`` selects training).
    :param config: optional path to a config file loaded into ``Config``.
    :param estimator: optional pre-built estimator; when ``None`` one is
        created (train) or loaded from disk (test).
    :return: the fitted or loaded estimator.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    is_snopes = Config.is_snopes if hasattr(Config, 'is_snopes') else False
    logger.debug("is_snopes: " + str(is_snopes))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("BERT sentence embedding arguments: " + str(Config.bert_sent_hyper_parameter))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(Config.training_dump):
            # reuse the cached, already-encoded train/dev sets
            # NOTE(review): pickle cache is assumed to be locally produced and
            # trusted — do not point training_dump at untrusted files.
            with open(Config.training_dump, 'rb') as f:
                (X_train, Y_labels_train, X_valid, Y_labels_valid) = pickle.load(f)
        else:
            # process training JSONL files and encode them through BERT
            X_train, Y_labels_train = read_data_set_from_jsonl(Config.training_set_file,
                                                               Config.db_path,
                                                               num_sentences=Config.max_sentences,
                                                               is_snopes=is_snopes)
            X_valid, Y_labels_valid = read_data_set_from_jsonl(Config.dev_set_file,
                                                               Config.db_path,
                                                               num_sentences=Config.max_sentences,
                                                               is_snopes=is_snopes)
            _bert_encode_features(X_train, "train", logger)
            _bert_encode_features(X_valid, "valid", logger)
            if hasattr(Config, 'training_dump'):
                # cache the encoded sets so later runs skip the BERT service
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_train, Y_labels_train, X_valid, Y_labels_valid),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        _pick_free_gpu()
        estimator.fit(X_train, Y_labels_train, X_valid, Y_labels_valid)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode: restore checkpoint weights only when no fitted
        # estimator was passed in by the caller
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        X_test, Y_labels_test = read_data_set_from_jsonl(Config.test_set_file,
                                                         Config.db_path,
                                                         num_sentences=Config.max_sentences,
                                                         is_snopes=is_snopes)
        _bert_encode_features(X_test, "test", logger)
        _pick_free_gpu()
        predictions = estimator.predict(X_test, restore_param_required)
        generate_submission(predictions, X_test['id'], Config.test_set_file, Config.submission_file)
        if Y_labels_test is not None:
            print_metrics(Y_labels_test, predictions, logger)
    return estimator