예제 #1
0
def expand_with_mpb2(seed_terms,
                     log_output_file=None,
                     bert=None,
                     bert_tokenizer=None):
    """Expand a seed set by scoring external candidates against patterns.

    Steps, in order: collect candidate terms via ``sence2vec_utils``,
    optionally merge in the expected terms from the configured set file,
    fetch masked sentences for the seed, select indicative patterns, ask the
    model for its top suggestions per pattern, and hand everything to
    ``score_candidates``. The scored list is written to the log file.

    Args:
        seed_terms: Seed terms to expand; passed through to the helpers.
        log_output_file: Object with a ``write`` method used for logging.
            When falsy, one is obtained from ``utils.get_output_file``.
        bert: Masked language model. When either ``bert`` or
            ``bert_tokenizer`` is falsy, both are loaded as
            "bert-large-uncased".
        bert_tokenizer: Tokenizer paired with ``bert``.

    Returns:
        A list holding element ``[0]`` of every entry returned by
        ``score_candidates`` (presumably the candidate term of a
        (term, score) pair -- confirm against ``score_candidates``).
    """
    # Model and tokenizer are always (re)loaded as a pair so they match.
    if not (bert and bert_tokenizer):
        bert, bert_tokenizer = model_utils.get_model_and_tokenizer_bert(
            "bert-large-uncased")
    if not log_output_file:
        log_output_file = utils.get_output_file(seed_terms)

    general = cfg.general_config
    mpb2 = cfg.MPB2_config

    # TODO: perhaps what we want to modify
    # NOTE: word2vec_utils.get_candidates_closest_to_seed_terms was an earlier
    # alternative here, called with the same three arguments.
    candidates = sence2vec_utils.get_candidates_closest_to_seed_terms(
        seed_terms,
        general['size_of_expanded'],
        mpb2['total_terms_to_consider'])
    log_output_file.write("Using {} candidates\n".format(len(candidates)))

    if mpb2['assume_oracle_candidates']:
        # Oracle mode: log stats for the raw candidates, then add every
        # expected term. Requires ``candidates`` to support ``.union``.
        gold_terms = utils.get_first_syn_of_terms_from_file(
            general['set_file'])
        utils.print_candidate_stats_to_output_file(
            log_output_file, candidates, gold_terms, len(gold_terms))
        candidates = candidates.union(gold_terms)

    sentences = utils.get_masked_sentences_for_seed(
        seed_terms,
        log_output_file,
        general['num_of_sentences'],
        general['use_indexer'],
        general['corpus_dir'],
        bert_tokenizer)
    patterns_with_positions = model_utils.get_indicative_patterns(
        bert,
        bert_tokenizer,
        sentences,
        seed_terms,
        general['num_of_indicative_patterns'],
        log_output_file,
        general['batch_size'])

    # Only the first element of each entry is needed for the suggestion step.
    patterns_only = []
    for entry in patterns_with_positions:
        patterns_only.append(entry[0])

    patterns_with_suggestions = (
        model_utils.get_patterns_top_k_model_suggestions_for_each_pattern(
            bert,
            bert_tokenizer,
            patterns_only,
            mpb2['similarity_param'],
            general['batch_size']))

    scored = score_candidates(
        bert,
        bert_tokenizer,
        candidates,
        patterns_with_positions,
        patterns_with_suggestions)
    utils.print_expansion_with_scores_to_output_file(scored, log_output_file)
    return [item[0] for item in scored]
예제 #2
0
def expand_with_mpb1(seed_terms, log_output_file=None, bert=None, bert_tokenizer=None):
    """Expand a seed set using the model's top suggestions for its patterns.

    Fetches masked sentences for the seed, selects indicative patterns from
    them, and asks the model for its top suggestions over that group of
    patterns. The result is written to the log file and returned.

    Args:
        seed_terms: Seed terms to expand; passed through to the helpers.
        log_output_file: Log destination handed to the helpers. When falsy,
            one is obtained from ``utils.get_output_file(seed_terms)``.
        bert: Masked language model. When either ``bert`` or
            ``bert_tokenizer`` is falsy, both are loaded as
            "bert-large-uncased".
        bert_tokenizer: Tokenizer paired with ``bert``.

    Returns:
        Whatever ``get_models_top_k_suggestions_for_group_of_patterns``
        returns, requested with size ``general_config['size_of_expanded']``.
    """
    # Model and tokenizer are always (re)loaded as a pair so they match.
    if not (bert and bert_tokenizer):
        bert, bert_tokenizer = model_utils.get_model_and_tokenizer_bert("bert-large-uncased")
    if not log_output_file:
        log_output_file = utils.get_output_file(seed_terms)

    general = cfg.general_config

    sentences = utils.get_masked_sentences_for_seed(
        seed_terms,
        log_output_file,
        general['num_of_sentences'],
        general['use_indexer'],
        general['corpus_dir'],
        bert_tokenizer)
    patterns_with_positions = model_utils.get_indicative_patterns(
        bert,
        bert_tokenizer,
        sentences,
        seed_terms,
        general['num_of_indicative_patterns'],
        log_output_file,
        general['batch_size'])
    expansion = model_utils.get_models_top_k_suggestions_for_group_of_patterns(
        bert,
        bert_tokenizer,
        patterns_with_positions,
        general['size_of_expanded'],
        general['batch_size'])

    utils.print_expansion_to_output_file(expansion, log_output_file)
    return expansion
예제 #3
0

def expand_with_mpb1(seed_terms, log_output_file=None, bert=None, bert_tokenizer=None):
    """Expand a seed set using the model's top suggestions for its patterns.

    Fetches masked sentences for the seed, selects indicative patterns from
    them, and asks the model for its top suggestions over that group of
    patterns. The result is written to the log file and returned.

    Args:
        seed_terms: Seed terms to expand; passed through to the helpers.
        log_output_file: Log destination handed to the helpers. When falsy,
            one is obtained from ``utils.get_output_file(seed_terms)``.
        bert: Masked language model. When either ``bert`` or
            ``bert_tokenizer`` is falsy, both are loaded as
            "bert-large-uncased".
        bert_tokenizer: Tokenizer paired with ``bert``.

    Returns:
        Whatever ``get_models_top_k_suggestions_for_group_of_patterns``
        returns, requested with size ``general_config['size_of_expanded']``.
    """
    # Model and tokenizer are always (re)loaded as a pair so they match.
    if not (bert and bert_tokenizer):
        bert, bert_tokenizer = model_utils.get_model_and_tokenizer_bert("bert-large-uncased")
    if not log_output_file:
        log_output_file = utils.get_output_file(seed_terms)

    general = cfg.general_config

    sentences = utils.get_masked_sentences_for_seed(
        seed_terms,
        log_output_file,
        general['num_of_sentences'],
        general['use_indexer'],
        general['corpus_dir'],
        bert_tokenizer)
    patterns_with_positions = model_utils.get_indicative_patterns(
        bert,
        bert_tokenizer,
        sentences,
        seed_terms,
        general['num_of_indicative_patterns'],
        log_output_file,
        general['batch_size'])
    expansion = model_utils.get_models_top_k_suggestions_for_group_of_patterns(
        bert,
        bert_tokenizer,
        patterns_with_positions,
        general['size_of_expanded'],
        general['batch_size'])

    utils.print_expansion_to_output_file(expansion, log_output_file)
    return expansion


if __name__ == "__main__":
    # Script entry point: expand the configured seed with MPB1 and evaluate
    # the expansion against the configured set file.
    seed = cfg.general_config['seed']
    output_file = utils.get_output_file(seed)

    # Load the model once here and pass it in, so expand_with_mpb1 does not
    # load its own copy.
    model, tokenizer = model_utils.get_model_and_tokenizer_bert(
        "bert-large-uncased")

    expanded = expand_with_mpb1(
        seed_terms=seed,
        log_output_file=output_file,
        bert=model,
        bert_tokenizer=tokenizer)
    utils.evaluate(
        expanded,
        output_file,
        cfg.general_config['set_file'])