Exemplo n.º 1
0
def process_file(solve, input_file):
    """Solve one input file, report timings and score, and write the solution.

    Args:
        solve: Callable invoked as ``solve(world)``; its return value is
            scored and then written out.
        input_file: File name, joined onto ``./input_files`` to locate the
            file to parse and also passed to ``get_output_file``.

    Returns:
        The value returned by ``calculate_score(world, solution)``.
    """
    print("processing %s" % (input_file, ))
    destination = get_output_file(input_file)

    source_path = os.path.join('./input_files', input_file)
    world = parse(input_file=source_path)
    analyze_world(world)

    solve_started = time.time()
    solution = solve(world)
    solve_finished = time.time()
    print("solution took %.1f sec" % (solve_finished - solve_started, ))

    # The scoring interval is measured from the end of solving, so it also
    # covers the print call above.
    score = calculate_score(world, solution)
    scoring_finished = time.time()
    print("calculate score took %.1f sec" % (scoring_finished - solve_finished, ))
    print("SCORE: %d" % score)

    write(solution, destination)
    return score
Exemplo n.º 2
0
def run_expander_experiment(exp_name, bert, bert_tokenizer):
    """Run the expander selected by ``exp_name`` and evaluate its output.

    ``"MPB1"``, ``"BB"`` and names starting with ``"sent_num"`` use
    ``MPB1.expand_with_mpb1``; ``"MPB2"`` and names starting with
    ``"sim_param"`` use ``MPB2.expand_with_mpb2``.

    Args:
        exp_name: Experiment name used to pick the expander; also passed to
            ``utils.get_output_file``.
        bert: Model object forwarded to the expander.
        bert_tokenizer: Tokenizer object forwarded to the expander.

    Returns:
        The result of ``utils.evaluate`` for the expansion, or ``-1`` (after
        printing a message) when ``exp_name`` matches no known experiment.
    """
    seed = cfg.general_config['seed']
    # The output file is obtained before the name is validated, exactly as
    # callers have always observed.
    output_file = utils.get_output_file(seed, exp_name)

    if exp_name in ("MPB1", "BB") or exp_name.startswith("sent_num"):
        expand = MPB1.expand_with_mpb1
    elif exp_name == "MPB2" or exp_name.startswith("sim_param"):
        expand = MPB2.expand_with_mpb2
    else:
        print("wrong exp name.")
        return -1

    expanded = expand(seed, output_file, bert, bert_tokenizer)
    return utils.evaluate(expanded, output_file,
                          cfg.general_config['set_file'])
Exemplo n.º 3
0
def expand_with_mpb2(seed_terms,
                     log_output_file=None,
                     bert=None,
                     bert_tokenizer=None):
    """Expand ``seed_terms`` by scoring candidate terms against indicative patterns.

    Args:
        seed_terms: Seed terms forwarded to the candidate, sentence and
            pattern helpers.
        log_output_file: Object with a ``write`` method used for logging;
            when falsy, one is obtained from ``utils.get_output_file``.
        bert: Model object; when it or ``bert_tokenizer`` is falsy, both are
            replaced by the "bert-large-uncased" pair.
        bert_tokenizer: Tokenizer object matching ``bert``.

    Returns:
        A list holding the first element of every entry produced by
        ``score_candidates`` (presumably the candidate term - confirm
        against ``score_candidates``).
    """
    if not (bert and bert_tokenizer):
        # Either one missing means both are replaced by the default pair.
        bert, bert_tokenizer = model_utils.get_model_and_tokenizer_bert(
            "bert-large-uncased")
    log_output_file = log_output_file or utils.get_output_file(seed_terms)

    # TODO: perhaps what we want to modify
    # A commented-out alternative called
    # word2vec_utils.get_candidates_closest_to_seed_terms with the same
    # three arguments.
    candidates = sence2vec_utils.get_candidates_closest_to_seed_terms(
        seed_terms,
        cfg.general_config['size_of_expanded'],
        cfg.MPB2_config['total_terms_to_consider'],
    )
    log_output_file.write("Using " + str(len(candidates)) + " candidates\n")

    if cfg.MPB2_config['assume_oracle_candidates']:
        # Log candidate statistics against the expected terms, then add the
        # expected terms to the candidate pool.
        oracle_terms = utils.get_first_syn_of_terms_from_file(
            cfg.general_config['set_file'])
        utils.print_candidate_stats_to_output_file(
            log_output_file, candidates, oracle_terms, len(oracle_terms))
        candidates = candidates.union(oracle_terms)

    sentences = utils.get_masked_sentences_for_seed(
        seed_terms,
        log_output_file,
        cfg.general_config['num_of_sentences'],
        cfg.general_config['use_indexer'],
        cfg.general_config['corpus_dir'],
        bert_tokenizer,
    )
    patterns_with_positions = model_utils.get_indicative_patterns(
        bert,
        bert_tokenizer,
        sentences,
        seed_terms,
        cfg.general_config['num_of_indicative_patterns'],
        log_output_file,
        cfg.general_config['batch_size'],
    )
    # Only the first element of each entry is passed to the suggestion
    # helper; the full entries go to score_candidates.
    patterns_only = [entry[0] for entry in patterns_with_positions]
    suggestions_per_pattern = (
        model_utils.get_patterns_top_k_model_suggestions_for_each_pattern(
            bert,
            bert_tokenizer,
            patterns_only,
            cfg.MPB2_config['similarity_param'],
            cfg.general_config['batch_size'],
        ))

    scored = score_candidates(bert, bert_tokenizer, candidates,
                              patterns_with_positions,
                              suggestions_per_pattern)
    utils.print_expansion_with_scores_to_output_file(scored, log_output_file)
    return [entry[0] for entry in scored]
Exemplo n.º 4
0
def expand_with_mpb1(seed_terms, log_output_file=None, bert=None, bert_tokenizer=None):
    """Expand ``seed_terms`` using the model's suggestions for a group of patterns.

    Args:
        seed_terms: Seed terms forwarded to the sentence and pattern helpers.
        log_output_file: Log target handed to the helpers; when falsy, one is
            obtained from ``utils.get_output_file(seed_terms)``.
        bert: Model object; when it or ``bert_tokenizer`` is falsy, both are
            replaced by the "bert-large-uncased" pair.
        bert_tokenizer: Tokenizer object matching ``bert``.

    Returns:
        Whatever ``get_models_top_k_suggestions_for_group_of_patterns``
        returns; the same value is also printed to the log file.
    """
    if not (bert and bert_tokenizer):
        # Either one missing means both are replaced by the default pair.
        bert, bert_tokenizer = model_utils.get_model_and_tokenizer_bert("bert-large-uncased")
    log_output_file = log_output_file or utils.get_output_file(seed_terms)

    sentences = utils.get_masked_sentences_for_seed(
        seed_terms,
        log_output_file,
        cfg.general_config['num_of_sentences'],
        cfg.general_config['use_indexer'],
        cfg.general_config['corpus_dir'],
        bert_tokenizer,
    )
    patterns_with_positions = model_utils.get_indicative_patterns(
        bert,
        bert_tokenizer,
        sentences,
        seed_terms,
        cfg.general_config['num_of_indicative_patterns'],
        log_output_file,
        cfg.general_config['batch_size'],
    )
    expansion = model_utils.get_models_top_k_suggestions_for_group_of_patterns(
        bert,
        bert_tokenizer,
        patterns_with_positions,
        cfg.general_config['size_of_expanded'],
        cfg.general_config['batch_size'],
    )
    utils.print_expansion_to_output_file(expansion, log_output_file)
    return expansion
Exemplo n.º 5
0

def expand_with_mpb1(seed_terms, log_output_file=None, bert=None, bert_tokenizer=None):
    """Expand ``seed_terms`` using the model's suggestions for a group of patterns.

    Args:
        seed_terms: Seed terms forwarded to the sentence and pattern helpers.
        log_output_file: Log target handed to the helpers; when falsy, one is
            obtained from ``utils.get_output_file(seed_terms)``.
        bert: Model object; when it or ``bert_tokenizer`` is falsy, both are
            replaced by the "bert-large-uncased" pair.
        bert_tokenizer: Tokenizer object matching ``bert``.

    Returns:
        Whatever ``get_models_top_k_suggestions_for_group_of_patterns``
        returns; the same value is also printed to the log file.
    """
    if not (bert and bert_tokenizer):
        # Either one missing means both are replaced by the default pair.
        bert, bert_tokenizer = model_utils.get_model_and_tokenizer_bert("bert-large-uncased")
    log_output_file = log_output_file or utils.get_output_file(seed_terms)

    sentences = utils.get_masked_sentences_for_seed(
        seed_terms,
        log_output_file,
        cfg.general_config['num_of_sentences'],
        cfg.general_config['use_indexer'],
        cfg.general_config['corpus_dir'],
        bert_tokenizer,
    )
    patterns_with_positions = model_utils.get_indicative_patterns(
        bert,
        bert_tokenizer,
        sentences,
        seed_terms,
        cfg.general_config['num_of_indicative_patterns'],
        log_output_file,
        cfg.general_config['batch_size'],
    )
    expansion = model_utils.get_models_top_k_suggestions_for_group_of_patterns(
        bert,
        bert_tokenizer,
        patterns_with_positions,
        cfg.general_config['size_of_expanded'],
        cfg.general_config['batch_size'],
    )
    utils.print_expansion_to_output_file(expansion, log_output_file)
    return expansion


if __name__ == "__main__":
    # Script entry point: expand the configured seed with MPB1 and evaluate
    # the expansion against the configured set file.
    seed = cfg.general_config['seed']
    output_file = utils.get_output_file(seed)
    # Load the model and tokenizer here and pass them in, so that
    # expand_with_mpb1 does not load its own pair.
    model, tokenizer = model_utils.get_model_and_tokenizer_bert("bert-large-uncased")
    expanded = expand_with_mpb1(seed, output_file, model, tokenizer)
    utils.evaluate(expanded, output_file, cfg.general_config['set_file'])
Exemplo n.º 6
0
        cfg.general_config['use_indexer'], cfg.general_config['corpus_dir'],
        bert_tokenizer)
    indicative_patterns_and_max_positions = model_utils.get_indicative_patterns(
        bert, bert_tokenizer, masked_sentences, seed_terms,
        cfg.general_config['num_of_indicative_patterns'], log_output_file,
        cfg.general_config['batch_size'])
    indicative_patterns = [i[0] for i in indicative_patterns_and_max_positions]
    indicative_patterns_and_top_suggestions = model_utils.get_patterns_top_k_model_suggestions_for_each_pattern\
        (bert, bert_tokenizer, indicative_patterns, cfg.MPB2_config['similarity_param'], cfg.general_config['batch_size'])
    candidate_score_list = score_candidates(
        bert, bert_tokenizer, candidates,
        indicative_patterns_and_max_positions,
        indicative_patterns_and_top_suggestions)
    utils.print_expansion_with_scores_to_output_file(candidate_score_list,
                                                     log_output_file)
    return [i[0] for i in candidate_score_list]


if __name__ == "__main__":
    # Script entry point: run the MPB2 expansion on the configured seed.
    #
    # The 'seed' setting is used directly when it is a list; any other value
    # is handed to utils.read_seeds (presumably a path to a seed file -
    # confirm against utils.read_seeds).
    seed = cfg.general_config['seed']
    # isinstance instead of an exact type comparison, so list subclasses are
    # also treated as ready-made seed lists.
    if not isinstance(seed, list):
        seed = utils.read_seeds(seed)
    # The output file carries a fixed description for this run; calling
    # utils.get_output_file(seed) without one is the plain variant.
    output_file = utils.get_output_file(seed,
                                        description='race_without_black_white')
    model, tokenizer = model_utils.get_model_and_tokenizer_bert(
        "bert-large-uncased")
    expanded = expand_with_mpb2(seed, output_file, model, tokenizer)
    # Evaluation (utils.evaluate(expanded, output_file,
    # cfg.general_config['set_file'])) is currently disabled for this run.