def used_func_for_fast_key_word_matching():
    # Load tokenizer
    path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])
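    # CoreNLPTokenizer shells out to the Stanford CoreNLP jars on the classpath set
    # above; 'pos', 'lemma' and 'ner' ask it to attach those annotations to each token.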

    keyword_processor = KeywordProcessor(case_sensitive=True)
    id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")

    # Done in a for loop so that the progress can be tracked
    for clean_name, keywords in tqdm(id_to_key_dict.items()):
        if not isinstance(keywords, list):
            raise AttributeError("Value of key {} should be a list".format(clean_name))

        for keyword in keywords:
            keyword_processor.add_keyword(keyword, clean_name)

    # Load data for predicting
    d_list = load_data(config.FEVER_DEV_JSONL)
    sample_answer(d_list, tok, keyword_p=keyword_processor)

    # Save the results for evaluating
    out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_06_29_17:41:14_r/dev.jsonl'
    d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode))
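

# A minimal sketch of the flashtext API used above (illustrative, not part of the
# original module): each keyword string maps to a clean document name, and
# extract_keywords() returns the mapped names found in a piece of text.
from flashtext import KeywordProcessor

_kp = KeywordProcessor(case_sensitive=True)
_kp.add_keyword('Barack Obama', 'Barack_Obama')                  # surface form -> doc id
print(_kp.extract_keywords('Barack Obama was born in Hawaii.'))  # -> ['Barack_Obama']

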
def used_func_for_fast_key_word_matching_expanded_kw():
    """
    Added on July 1.
    :return:
    """
    # Load tokenizer
    path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa_yixin.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])
    #
    keyword_processor = KeywordProcessor(case_sensitive=True)
    id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")

    id_dict_key_word_expand(id_to_key_dict, create_new_key_word_dict=False)
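    # create_new_key_word_dict=False presumably expands id_to_key_dict in place (the
    # return value is ignored here); the prioritized variant further below sets it to
    # True and keeps the noisy keywords in a separate dict instead.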

    # exit(-2)

    # Done in a for loop so that the progress can be tracked
    build_flashtext_processor_wit(keyword_processor, id_to_key_dict)

    # Load data for predicting
    d_list = load_data(config.FEVER_DEV_JSONL)
    sample_answer(d_list, tok, keyword_p=keyword_processor)

    # Save the results for evaluating
    out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:20:54_r/dev.jsonl'
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:08:06_r/dev.jsonl'
    # d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode, verbose=False))
Example No. 3
def create_instance(self):
    path_stanford_corenlp_full_2017_06_09 = \
        str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    print("Load tokenizer:", path_stanford_corenlp_full_2017_06_09)
    drqa_yixin.tokenizers.set_default(
        'corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
    _tok = CoreNLPTokenizer(annotators=['pos', 'lemma'])
    self.instance = _tok
def used_func_for_building_normalized_key_word_index_for_docids():
    path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

    did_list = get_all_doc_ids(str(config.FEVER_DB), max_ind=None)

    build_keyword_dict(did_list, tok,
                       config.DATA_ROOT / "id_dict.jsonl")
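    # The exact schema of id_dict.jsonl is not shown in this excerpt; judging from the
    # loop in used_func_for_fast_key_word_matching (doc id -> list of keyword strings),
    # a line is assumed to look roughly like {"Barack_Obama": ["Barack Obama"]}.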
Example No. 5
def tokenized_claim_list(in_list):
    path_stanford_corenlp_full_2017_06_09 = str(
        config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                      path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma'])

    for item in tqdm(in_list):
        item['claim'] = ' '.join(easy_tokenize(item['claim'], tok))

    return in_list
Example No. 6
def tokenized_claim(in_file, out_file):
    path_stanford_corenlp_full_2017_06_09 = str(
        config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    print(path_stanford_corenlp_full_2017_06_09)
    drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                      path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma'])

    d_list = load_jsonl(in_file)
    for item in tqdm(d_list):
        item['claim'] = ' '.join(easy_tokenize(item['claim'], tok))

    save_jsonl(d_list, out_file)
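
# Illustrative call (the output filename is hypothetical, not from the original repo):
# tokenized_claim(config.FEVER_DEV_JSONL,
#                 config.RESULT_PATH / "tokenized_dev.jsonl")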
Example No. 7
from utils import fever_db, check_sentences
import config
import drqa_yixin.tokenizers
from drqa_yixin.tokenizers import CoreNLPTokenizer
from tqdm import tqdm
from utils import c_scorer, text_clean
from utils import common
import json
import random

path_stanford_corenlp_full_2017_06_09 = str(
    config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
print(path_stanford_corenlp_full_2017_06_09)

drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                  path_stanford_corenlp_full_2017_06_09)
tok = CoreNLPTokenizer(annotators=['pos', 'lemma'])

random.seed(12)


def easy_tokenize(text):
    return tok.tokenize(text_clean.normalize(text)).words()
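
# Indicative behaviour of easy_tokenize (tokens shown are illustrative, not a recorded run):
#   easy_tokenize("Barack Obama was born in Hawaii.")
#   -> ['Barack', 'Obama', 'was', 'born', 'in', 'Hawaii', '.']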


def load_data(file):
    d_list = []
    with open(file, encoding='utf-8', mode='r') as in_f:
        for line in in_f:
            item = json.loads(line.strip())
            d_list.append(item)
    return d_list
Example No. 8
def initialize_tokenizer(self):
    snlp_path = str(config.PRO_ROOT /
                    'dep_packages/stanford-corenlp-full-2017-06-09/*')
    set_default('corenlp_classpath', snlp_path)
    return CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])
def used_func_for_fast_key_word_matching_prioritized_kw():
    """
    Added on July 1.
    :return:
    """
    # Load tokenizer
    path_stanford_corenlp_full_2017_06_09 = str(
        config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                      path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

    # doc_tokens, doc_lemmas = parse_doc_id('Hourglass_-LRB-James_Taylor_album-RRB-', tok)
    # print(doc_tokens)
    # print(doc_lemmas)
    # print(get_words_inside_parenthese(doc_tokens))
    # print(get_words_inside_parenthese(doc_lemmas))
    # claim_t = ['album']
    # claim_l = ['album']
    # print(check_inside_paretheses_overlap(doc_tokens, doc_lemmas, claim_t, claim_l))
    # exit(-1)

    #
    keyword_processor = KeywordProcessor(case_sensitive=True)

    id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl",
                                       filtering=True)

    exact_match_rule_dict = set_priority(id_to_key_dict, priority=5.0)
    print(len(exact_match_rule_dict))

    noisy_key_dict = id_dict_key_word_expand(id_to_key_dict,
                                             create_new_key_word_dict=True)
    noisy_parenthese_rule_dict = set_priority(noisy_key_dict, priority=1.0)
    print("Noisy_Parenthese_Rule_Dict:", len(noisy_parenthese_rule_dict))

    # exit(-2)

    # Done in a for loop so that the progress can be tracked
    build_flashtext_processor_with_prioritized_kw_dict(keyword_processor,
                                                       exact_match_rule_dict)
    build_flashtext_processor_with_prioritized_kw_dict(
        keyword_processor, noisy_parenthese_rule_dict)

    # Load data for predicting
    d_list = load_data(config.FEVER_TRAIN_JSONL)
    # d_list = load_data(config.FEVER_DEV_JSONL)
    sample_answer_with_priority(d_list, tok, keyword_processor, top_k=5)

    # Save the results for evaluating
    out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "train.jsonl"
    # out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:20:54_r/dev.jsonl'
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:08:06_r/dev.jsonl'
    # d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode, verbose=False))
import utils.wiki_term_builder
from tqdm import tqdm
import json
from utils.fever_db import get_all_doc_ids, convert_brc
from utils.c_scorer import fever_score
from utils import text_clean
from pathlib import Path
import copy
import utils
import utils.common as common
import config
import drqa_yixin.tokenizers
from drqa_yixin.tokenizers import CoreNLPTokenizer

path_stanford_corenlp_full_2017_06_09 = str(
    config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                  path_stanford_corenlp_full_2017_06_09)
global_tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])


def memodict(f):
    """ Memoization decorator for a function taking a single argument """
    class memodict(dict):
        def __missing__(self, key):
            ret = self[key] = f(key)
            return ret

    return memodict().__getitem__
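
# Illustrative use of the decorator above (the wrapped function is hypothetical):
# each distinct argument is computed at most once, then served from the dict cache.
@memodict
def _cached_normalize(text):
    return text_clean.normalize(text)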


def build_keyword_dict(did_list, tokenizer, out_filename):
    out_f = open(out_filename, encoding='utf-8',
                 mode='w') if out_filename is not None else None