def expand_with_mpb2(seed_terms, log_output_file=None, bert=None, bert_tokenizer=None):
    """Expand *seed_terms* with the MPB2 strategy.

    Retrieves candidate terms close to the seed, mines indicative
    masked patterns from the corpus, and scores each candidate against
    the model's top suggestions for those patterns.

    Returns the candidate terms (first element of each scored entry),
    in the order produced by score_candidates.
    """
    # Lazily build the model/tokenizer and the log file when the caller
    # did not supply them.
    if not bert or not bert_tokenizer:
        bert, bert_tokenizer = model_utils.get_model_and_tokenizer_bert(
            "bert-large-uncased")
    if not log_output_file:
        log_output_file = utils.get_output_file(seed_terms)

    # TODO: perhaps what we want to modify
    candidates = sence2vec_utils.get_candidates_closest_to_seed_terms(
        seed_terms,
        cfg.general_config['size_of_expanded'],
        cfg.MPB2_config['total_terms_to_consider'])
    log_output_file.write("Using " + str(len(candidates)) + " candidates\n")

    # Oracle mode: merge the gold terms into the candidate pool and log
    # how many of them the retrieval step had already covered.
    if cfg.MPB2_config['assume_oracle_candidates']:
        expected_terms = utils.get_first_syn_of_terms_from_file(
            cfg.general_config['set_file'])
        utils.print_candidate_stats_to_output_file(
            log_output_file, candidates, expected_terms, len(expected_terms))
        candidates = candidates.union(expected_terms)

    sentences = utils.get_masked_sentences_for_seed(
        seed_terms, log_output_file,
        cfg.general_config['num_of_sentences'],
        cfg.general_config['use_indexer'],
        cfg.general_config['corpus_dir'],
        bert_tokenizer)
    patterns_with_positions = model_utils.get_indicative_patterns(
        bert, bert_tokenizer, sentences, seed_terms,
        cfg.general_config['num_of_indicative_patterns'],
        log_output_file, cfg.general_config['batch_size'])
    # Drop the max-position component; only the pattern text is needed
    # for the top-k suggestion query.
    patterns_only = [pair[0] for pair in patterns_with_positions]
    patterns_with_suggestions = (
        model_utils.get_patterns_top_k_model_suggestions_for_each_pattern(
            bert, bert_tokenizer, patterns_only,
            cfg.MPB2_config['similarity_param'],
            cfg.general_config['batch_size']))

    scored = score_candidates(
        bert, bert_tokenizer, candidates,
        patterns_with_positions, patterns_with_suggestions)
    utils.print_expansion_with_scores_to_output_file(scored, log_output_file)
    return [entry[0] for entry in scored]
def expand_with_mpb1(seed_terms, log_output_file=None, bert=None, bert_tokenizer=None):
    """Expand *seed_terms* with the MPB1 strategy.

    Mines indicative masked patterns from corpus sentences containing
    the seed terms, then returns the model's top mask-fill suggestions
    aggregated over that group of patterns.
    """
    # Create defaults only when the caller did not provide them.
    if not bert or not bert_tokenizer:
        bert, bert_tokenizer = model_utils.get_model_and_tokenizer_bert(
            "bert-large-uncased")
    if not log_output_file:
        log_output_file = utils.get_output_file(seed_terms)

    sentences = utils.get_masked_sentences_for_seed(
        seed_terms, log_output_file,
        cfg.general_config['num_of_sentences'],
        cfg.general_config['use_indexer'],
        cfg.general_config['corpus_dir'],
        bert_tokenizer)
    patterns_with_positions = model_utils.get_indicative_patterns(
        bert, bert_tokenizer, sentences, seed_terms,
        cfg.general_config['num_of_indicative_patterns'],
        log_output_file, cfg.general_config['batch_size'])
    expansion = model_utils.get_models_top_k_suggestions_for_group_of_patterns(
        bert, bert_tokenizer, patterns_with_positions,
        cfg.general_config['size_of_expanded'],
        cfg.general_config['batch_size'])

    utils.print_expansion_to_output_file(expansion, log_output_file)
    return expansion
# NOTE(review): this span previously contained a second, byte-identical
# definition of expand_with_mpb1; the redundant duplicate (which merely
# re-bound the same name) has been removed so the function is defined
# exactly once.
if __name__ == "__main__":
    # Script entry point: expand the configured seed set with MPB1 and
    # evaluate the expansion against the configured gold-set file.
    seed = cfg.general_config['seed']
    output_file = utils.get_output_file(seed)
    model, tokenizer = model_utils.get_model_and_tokenizer_bert(
        "bert-large-uncased")
    expanded = expand_with_mpb1(seed, output_file, model, tokenizer)
    utils.evaluate(expanded, output_file, cfg.general_config['set_file'])