Example #1
    def __init__(self,
                 base_path,
                 train_file,
                 dev_file,
                 test_file,
                 fasttext_path,
                 num_negatives,
                 h_max_length,
                 s_max_length,
                 random_seed,
                 reserve_embed=False,
                 db_filepath="data/fever/fever.db",
                 joint_learning=False):

        self.random_seed = random_seed

        self.base_path = base_path
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.fasttext_path = fasttext_path
        self.num_negatives = num_negatives
        self.h_max_length = h_max_length
        self.s_max_length = s_max_length
        self.db_filepath = os.path.join(self.base_path, db_filepath)
        self.db = FeverDocDB(self.db_filepath)
        self.reserve_embed = reserve_embed

        self.data_pipeline()
        if joint_learning:
            self.data_for_joint()
Example #2
def show_predictions(db_filename, predictions):
    """
    display claim and predicted sentences which doesn't include at least one evidence set
    :param db_filename:
    :param predictions:
    :return:
    """

    db = FeverDocDB(db_filename)

    for line in predictions:

        if line['label'].upper() != "NOT ENOUGH INFO":
            macro_rec = evidence_macro_recall(line)
            if macro_rec[0] == 1.0:
                continue
            pages = set([page for page, _ in line['predicted_evidence']])
            evidence_set = set([(page, line_num) for page, line_num in line['predicted_evidence']])
            p_lines = []
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    # print(page)
                    continue
                doc_lines = [doc_line.split("\t")[1] if len(doc_line.split("\t")[1]) > 1 else "" for doc_line in
                             doc_lines.split("\n")]
                p_lines.extend(zip(doc_lines, [page] * len(doc_lines), range(len(doc_lines))))

            print("claim: {}".format(line['claim']))
            print(evidence_set)
            count = 0
            for doc_line in p_lines:
                if (doc_line[1], doc_line[2]) in evidence_set:
                    print("the {}st evidence: {}".format(count, doc_line[0]))
                    count += 1
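A minimal usage sketch for show_predictions; the predictions file name is an assumption, and each JSONL line is expected to carry the 'claim', 'label', gold 'evidence' and 'predicted_evidence' fields that the function and evidence_macro_recall rely on.

import json

with open("data/fever/dev.sentences.jsonl") as f:  # hypothetical predictions file
    predictions = [json.loads(line) for line in f]

show_predictions("data/fever/fever.db", predictions)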
Example #3
    def __init__(self,
                 embedding_path,
                 train_file,
                 dev_file,
                 test_file,
                 fasttext_path,
                 num_negatives,
                 h_max_length,
                 s_max_length,
                 random_seed,
                 reserve_embed=False,
                 db_filepath="data/fever/fever.db"):

        self.random_seed = random_seed

        self.embedding_path = embedding_path
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.fasttext_path = fasttext_path
        self.num_negatives = num_negatives
        self.h_max_length = h_max_length
        self.s_max_length = s_max_length
        self.db_filepath = db_filepath
        self.db = FeverDocDB(self.db_filepath)
        self.reserve_embed = reserve_embed

        self.data_pipeline()
Example #4
    def __init__(self, database_path, add_claim=False, k_wiki_results=None):
        self.db = FeverDocDB(database_path)
        self.add_claim = add_claim
        self.k_wiki_results = k_wiki_results
        self.proter_stemm = nltk.PorterStemmer()
        self.tokenizer = nltk.word_tokenize
        self.predictor = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")
Example #5
def in_doc_sampling(db_filename, datapath, num_sample=1):

    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()

    X = []
    count = 0
    with open(datapath, "r") as f:
        lines = jlr.process(f)

        for line in tqdm(lines):
            count += 1
            pos_pairs = []
            # count1 += 1
            if line['label'].upper() == "NOT ENOUGH INFO":
                continue
            neg_sents = []
            claim = line['claim']

            pos_set = set()
            for evidence_set in line['evidence']:
                pos_sent = get_whole_evidence(evidence_set, db)
                if pos_sent in pos_set:
                    continue
                pos_set.add(pos_sent)

            p_lines = []
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            page_set = set([evidence[0] for evidence in evidence_set])
            for page in page_set:
                doc_lines = db.get_doc_lines(page)
                p_lines.extend(get_valid_texts(doc_lines, page))
            for doc_line in p_lines:
                if (doc_line[1], doc_line[2]) not in evidence_set:
                    neg_sents.append(doc_line[0])

            num_sampling = num_sample
            if len(neg_sents) < num_sampling:
                num_sampling = len(neg_sents)
                # print(neg_sents)
            if num_sampling == 0:
                continue
            else:
                for pos_sent in pos_set:
                    samples = random.sample(neg_sents, num_sampling)
                    for sample in samples:
                        if not sample:
                            continue
                        X.append((claim, pos_sent, sample))
                        if count % 1000 == 0:
                            print("claim:{} ,evidence :{} sample:{}".format(
                                claim, pos_sent, sample))
    return X
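A hypothetical call to the in-document sampler above; both paths are assumptions. The result is a list of (claim, positive sentence, negative sentence) triples whose negatives come from the same pages as the gold evidence.

triples = in_doc_sampling("data/fever/fever.db",
                          "data/fever-data/train.jsonl",
                          num_sample=1)
claim, pos_sent, neg_sent = triples[0]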
Example #6
def load_words(embedding_file, train_datapath, test_path, db_filename,
               num_sample, sampled_path):

    words = set()

    def _insert(iterable):
        for w in iterable:
            w = Dictionary.normalize(w)
            if valid_words and w not in valid_words:
                continue
            words.add(w)

    valid_words = index_embedding_words(embedding_file)

    X_claim, X_sents, y = load_generate_samples(db_filename, train_datapath,
                                                num_sample, sampled_path)
    X_claim = set(X_claim)
    for claim in X_claim:
        # use a distinct name so the `words` set that _insert closes over is not rebound
        claim_tokens = nltk.word_tokenize(claim)
        _insert(claim_tokens)

    for sent in X_sents:
        sent_tokens = simple_tokenizer(sent)
        _insert(sent_tokens)

    with open(test_path, "r") as f:
        jlr = JSONLineReader()
        db = FeverDocDB(db_filename)

        lines = jlr.process(f)
        for line in lines:
            claim = line['claim']
            claim_tokens = nltk.word_tokenize(claim)
            _insert(claim_tokens)
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [
                    doc_line.split("\t")[1]
                    if len(doc_line.split("\t")[1]) > 1 else ""
                    for doc_line in doc_lines.split("\n")
                ]
                doc_lines = [doc_line for doc_line in doc_lines if doc_line]
                for doc_line in doc_lines:
                    line_tokens = simple_tokenizer(doc_line)
                    _insert(line_tokens)
    return words
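A sketch of how load_words might be invoked; every path below is an assumption, and load_generate_samples, index_embedding_words and simple_tokenizer are helpers defined elsewhere in the same module.

vocab = load_words(embedding_file="data/embeddings/wiki.en.vec",  # assumed embedding file
                   train_datapath="data/fever-data/train.jsonl",
                   test_path="data/fever/dev.p5.jsonl",
                   db_filename="data/fever/fever.db",
                   num_sample=1,
                   sampled_path="data/fever/train_sampled.p")     # assumed cache path
print(len(vocab))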
Example #7
def test_data_4_siamese(db_filename, dataset_path):
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()

    X_claims = []
    X_sents = []
    all_sents_id = []

    with open(dataset_path, "r") as f:
        lines = jlr.process(f)
        # lines = lines[:1000]

        for line in tqdm(lines):
            claims = []
            sents = []
            sents_indexes = []
            p_lines = []
            claim = line['claim']
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [
                    doc_line.split("\t")[1]
                    if len(doc_line.split("\t")[1]) > 1 else ""
                    for doc_line in doc_lines.split("\n")
                ]
                p_lines.extend(
                    zip(doc_lines, [page] * len(doc_lines),
                        range(len(doc_lines))))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                else:
                    claims.append(claim)
                    sents.append(doc_line[0])
                    sents_indexes.append((doc_line[1], doc_line[2]))
            X_claims.append(claims)
            X_sents.append(sents)
            all_sents_id.append(sents_indexes)
    # print(len(X_claims))
    # print(len(X_sents))
    # print(len(all_sents_id))
    # X_claims_indexes, X_sents_indexes = [], []
    # for idx, claims in enumerate(X_claims):
    #     claims_index, sents_index = data_transformer(claims, X_sents[idx], word_dict)
    #     X_claims_indexes.append(claims_index)
    #     X_sents_indexes.append(sents_index)

    return X_claims, X_sents, all_sents_id
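A hypothetical call to the siamese test-data builder above; the dataset path is an assumption and should point at a split annotated with 'predicted_pages'. Each claim gets a parallel list of candidate sentences and their (page, line) ids.

X_claims, X_sents, all_sents_id = test_data_4_siamese(
    "data/fever/fever.db", "data/fever/dev.p5.jsonl")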
Example #8
    def __init__(self,
                 fasttext_path,
                 h_max_length,
                 s_max_length,
                 reserve_embed=False,
                 model_location=os.getcwd(),
                 db_filepath="data/fever/fever.db"):
        self.model_location = model_location
        self.fasttext_path = fasttext_path
        self.h_max_length = h_max_length
        self.s_max_length = s_max_length
        self.db = FeverDocDB(db_filepath)
        self.reserve_embed = reserve_embed
Example #9
def _create_token_set_of_db(db):
    logger.debug("start creating token set for DB...")
    if type(db) == str:
        db = FeverDocDB(db)
    _token_set = set()
    for doc_id in tqdm(db.get_non_empty_doc_ids()):
        doc_lines = db.get_doc_lines(doc_id)
        for line in doc_lines:
            tokens = tokenize(clean_text(line))
            for token in tokens:
                if token.lower() in _token_set:
                    continue
                _token_set.add(token.lower())
    return _token_set
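A small caching sketch, assuming the DB token set is built once and pickled for reuse; the cache path is an assumption.

import pickle

token_set = _create_token_set_of_db("data/fever/fever.db")
with open("data/fever/db_token_set.p", "wb") as f:  # assumed cache location
    pickle.dump(token_set, f)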
Example #10
def number_feature(data_set_path: str, db_path: str, max_sent_num: int):
    from common.dataset.reader import JSONLineReader
    db = FeverDocDB(db_path)
    jlr = JSONLineReader()
    lines = jlr.read(data_set_path)
    num_feat = np.zeros([len(lines), max_sent_num, 3], dtype=np.int32)
    for i, line in enumerate(lines):
        claim_text = line['claim']
        claim_tokens = tokenize(claim_text)
        all_nums = set()
        for token in claim_tokens:
            if is_token_numeric(token):
                all_nums.add(float(token))
        for j, evidence in enumerate(line['predicted_evidence']):
            if j >= max_sent_num:
                break
            page, line_num = evidence[-2], evidence[-1]
            all_evidence_nums = []
            evidence_text = evidence_num_to_text(db, page, line_num)
            evidence_tokens = tokenize(evidence_text)
            for token in evidence_tokens:
                if is_token_numeric(token):
                    all_evidence_nums.append(float(token))
            has_num = len(all_evidence_nums) > 0
            has_identical_num = any(n in all_nums for n in all_evidence_nums)
            has_different_num = any(n not in all_nums
                                    for n in all_evidence_nums)
            num_feat[i][j][0], num_feat[i][j][1], num_feat[i][j][
                2] = _interprete_num_result(has_num, has_identical_num,
                                            has_different_num)
    return num_feat
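A hedged sketch of calling the numeric-feature extractor above; the file paths and max_sent_num are assumptions. The three flags per candidate sentence come from _interprete_num_result(has_num, has_identical_num, has_different_num).

feats = number_feature("data/fever/dev.sentences.jsonl",  # assumed file with predicted_evidence
                       "data/fever/fever.db",
                       max_sent_num=5)
print(feats.shape)  # (num_claims, 5, 3)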
Example #11
    def __init__(self,
                 base_path,
                 train_file,
                 dev_file,
                 test_file,
                 num_negatives,
                 h_max_length,
                 s_max_length,
                 random_seed=100,
                 db_filepath="data/fever/fever.db"):

        self.random_seed = random_seed


        self.base_path = base_path
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.num_negatives = num_negatives
        self.h_max_length = h_max_length
        self.s_max_length = s_max_length
        self.db_filepath = os.path.join(self.base_path,db_filepath)
        self.db = FeverDocDB(self.db_filepath)

        self.data_pipeline()
Example #12
def _create_db_vocab_idx(db, _global_dict):
    # logger = LogHelper.get_logger("_create_db_vocab_idx")
    logger.debug("start creating vocab indices for DB...")
    if type(db) == str:
        db = FeverDocDB(db)
    _vocab_idx = {}
    for doc_id in tqdm(db.get_non_empty_doc_ids()):
        doc_lines = db.get_doc_lines(doc_id)
        for line in doc_lines:
            tokens = tokenize(clean_text(line))
            for token in tokens:
                if token.lower() in _vocab_idx:
                    continue
                if token.lower() in _global_dict:
                    _vocab_idx[token.lower()] = _global_dict[token.lower()]
    _vocab_idx = sorted(list(_vocab_idx.values()))
    return _vocab_idx
Example #13
def dev_processing(db_filename, lines):

    db = FeverDocDB(db_filename)
    claims = []
    list_sents = []
    labels = []

    for line in tqdm(lines):
        if line['label'].upper() == "NOT ENOUGH INFO":
            continue

        claims.append(line['claim'])
        sents = []
        label = []

        evidence_set = set([(evidence[2], evidence[3])
                            for evidences in line['evidence']
                            for evidence in evidences])
        pages = [
            page[0] for page in line['predicted_pages'] if page[0] is not None
        ]
        for page, num in evidence_set:
            pages.append(page)
        pages = set(pages)

        p_lines = []
        for page in pages:
            doc_lines = db.get_doc_lines(page)
            p_lines.extend(get_valid_texts(doc_lines, page))
        for doc_line in p_lines:
            if not doc_line[0]:
                continue
            if (doc_line[1], doc_line[2]) in evidence_set:
                sents.append(doc_line[0])
                label.append(1)
            else:
                sents.append(doc_line[0])
                label.append(0)
        if len(sents) == 0 or len(label) == 0:
            # drop the claim appended above so claims stays aligned with list_sents/labels
            claims.pop()
            continue
        list_sents.append(sents)
        labels.append(label)
    return claims, list_sents, labels
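A minimal sketch wiring dev_processing to the project's JSONLineReader; the path is an assumption, and the file's 'predicted_pages' entries are expected to be (page, score) pairs, matching the page[0] indexing above.

jlr = JSONLineReader()
with open("data/fever/dev.p5.jsonl") as f:  # assumed dev split with predicted pages
    lines = jlr.process(f)
claims, list_sents, labels = dev_processing("data/fever/fever.db", lines)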
Example #14
    def from_params(cls, params: Params) -> 'FEVERSentenceReader':
        claim_tokenizer = Tokenizer.from_params(params.pop('claim_tokenizer', {}))
        wiki_tokenizer = Tokenizer.from_params(params.pop('wiki_tokenizer', {}))

        token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
        db = FeverDocDB(params.pop("db_path","data/fever/fever.db"))
        params.assert_empty(cls.__name__)
        return FEVERSentenceReader(db=db,
                           claim_tokenizer=claim_tokenizer,
                           wiki_tokenizer=wiki_tokenizer,
                           token_indexers=token_indexers)
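A hedged sketch of building the reader from an AllenNLP Params blob, assuming the usual @classmethod decorator that the snippet omits; the tokenizer and indexer types below are assumptions typical of an AllenNLP 0.x configuration, not values taken from this repository.

from allennlp.common import Params

reader = FEVERSentenceReader.from_params(Params({
    "claim_tokenizer": {"type": "word"},                  # assumed tokenizer type
    "wiki_tokenizer": {"type": "word"},
    "token_indexers": {"tokens": {"type": "single_id"}},  # assumed indexer type
    "db_path": "data/fever/fever.db",
}))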
Example #15
def eval_da(dataset_to_work_on, args, operation, mithun_logger):
    LogHelper.setup()
    LogHelper.get_logger("allennlp.training.trainer")
    LogHelper.get_logger(__name__)

    params = Params.from_file(args.param_path, args.overrides)
    uofa_params = params.pop('uofa_params', {})
    path_to_saved_db = uofa_params.pop("path_to_saved_db")
    db = FeverDocDB(path_to_saved_db)

    mithun_logger.info("inside main function going to call eval on " +
                       str(dataset_to_work_on))
    mithun_logger.info("path_to_pyproc_annotated_data_folder " +
                       str(path_to_pyproc_annotated_data_folder))
    mithun_logger.info("value of name_of_trained_model_to_use: " +
                       str(name_of_trained_model_to_use))
    mithun_logger.info("value of dataset_to_work_on: " +
                       str(dataset_to_work_on))

    if (dataset_to_work_on == "fnc"):
        fever_dataset_details = uofa_params.pop('fever_dataset_details', {})
        dev_partition_details = fever_dataset_details.pop(
            'dev_partition_details', {})
        name_of_trained_model_to_use = dev_partition_details.pop(
            'name_of_trained_model_to_use', {})
        path_to_pyproc_annotated_data_folder = dev_partition_details.pop(
            'path_to_pyproc_annotated_data_folder', {})
        debug_mode = uofa_params.pop('debug_mode', {})
        path_to_trained_models_folder = uofa_params.pop(
            'path_to_trained_models_folder', {})
        path_to_fnc_annotated_data = dev_partition_details.pop(
            'path_to_pyproc_annotated_data_folder', {})
        eval_model_fnc_data(db, args, mithun_logger,
                            name_of_trained_model_to_use,
                            path_to_trained_models_folder, cuda_device,
                            operation, path_to_fnc_annotated_data)

    elif (dataset_to_work_on == "fever"):
        fever_dataset_details = uofa_params.pop('fever_dataset_details', {})
        dev_partition_details = fever_dataset_details.pop(
            'dev_partition_details', {})
        name_of_trained_model_to_use = dev_partition_details.pop(
            'name_of_trained_model_to_use', {})
        path_to_pyproc_annotated_data_folder = dev_partition_details.pop(
            'path_to_pyproc_annotated_data_folder', {})
        debug_mode = uofa_params.pop('debug_mode', {})
        path_to_trained_models_folder = uofa_params.pop(
            'path_to_trained_models_folder', {})

        eval_model(db, args, mithun_logger, path_to_trained_models_folder,
                   name_of_trained_model_to_use)
Example #16
def test_processing(db_filename, lines):
    db = FeverDocDB(db_filename)
    claims = []
    list_sents = []
    sents_indexes = []

    for line in tqdm(lines):
        # if line['label'].upper() == "NOT ENOUGH INFO":
        #     continue

        claims.append(line['claim'])
        sents = []
        sents_index = []

        evidence_set = set([(evidence[2], evidence[3])
                            for evidences in line['evidence']
                            for evidence in evidences])
        pages = set([
            page[0] for page in line['predicted_pages'] if page[0] is not None
        ])
        if len(pages) == 0:
            pages.add("Michael_Hutchence")

        p_lines = []
        for page in pages:
            doc_lines = db.get_doc_lines(page)
            p_lines.extend(get_valid_texts(doc_lines, page))
        for doc_line in p_lines:
            if not doc_line[0]:
                continue
            sents.append(doc_line[0])
            sents_index.append((doc_line[1], doc_line[2]))
        list_sents.append(sents)
        sents_indexes.append(sents_index)
    return claims, list_sents, sents_indexes
Example #17
def dev_processing(db_filename, datapath):
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()

    devs = []
    all_indexes = []

    with open(datapath, "rb") as f:
        lines = jlr.process(f)

        for line in tqdm(lines):
            dev = []
            indexes = []
            pages = set()
            pages.update(page[0] for page in line['predicted_pages'])
            if len(pages) == 0:
                pages.add("Michael_Hutchence")
            claim = line['claim']
            p_lines = []
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                p_lines.extend(get_valid_texts(doc_lines, page))

            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                dev.append((claim, doc_line[0]))
                indexes.append((doc_line[1], doc_line[2]))
            # print(len(dev))
            if len(dev) == 0:
                dev.append((claim, 'no evidence for this claim'))
                indexes.append(('empty', 0))
            devs.append(dev)
            all_indexes.append(indexes)
    return devs, all_indexes
Example #18
    def from_params(cls, params: Params) -> 'FEVERReader':
        claim_tokenizer = Tokenizer.from_params(
            params.pop('claim_tokenizer', {}))
        wiki_tokenizer = Tokenizer.from_params(params.pop(
            'wiki_tokenizer', {}))

        token_indexers = TokenIndexer.dict_from_params(
            params.pop('token_indexers', {}))
        sentence_level = params.pop("sentence_level", False)
        is_snopes = params.pop("is_snopes", False)
        if is_snopes:
            db = SnopesDocDB(params.pop("db_path",
                                        "dataset/snopes.pages.json"))
        else:
            db = FeverDocDB(params.pop("db_path", "data/fever.db"))
        params.assert_empty(cls.__name__)
        return FEVERReader(db=db,
                           sentence_level=sentence_level,
                           claim_tokenizer=claim_tokenizer,
                           wiki_tokenizer=wiki_tokenizer,
                           token_indexers=token_indexers)
Example #19
    LogHelper.get_logger("allennlp.training.trainer")
    LogHelper.get_logger(__name__)

    parser = argparse.ArgumentParser()
    parser.add_argument('db', type=str, help='/path/to/saved/db.db')
    parser.add_argument(
        'param_path',
        type=str,
        help='path to parameter file describing the model to be trained')

    parser.add_argument("logdir", type=str)

    parser.add_argument("--filtering", type=str, default=None)
    parser.add_argument("--cuda-device",
                        type=int,
                        default=None,
                        help='id of GPU to use (if any)')
    parser.add_argument(
        '-o',
        '--overrides',
        type=str,
        default="",
        help='a HOCON structure used to override the experiment configuration')

    args = parser.parse_args()

    db = FeverDocDB(args.db)

    params = Params.from_file(args.param_path, args.overrides)

    train_model(db, params, args.cuda_device, args.logdir, args.filtering)
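A hypothetical invocation of the training entry point above; the script name, config path and log directory are assumptions, while the positional and optional arguments mirror the parser defined above.

# python train_da.py data/fever/fever.db config/model_params.json logs/run1 --cuda-device 0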
Example #20
def read_data_set_from_jsonl(file_path: str,
                             db: Union[str, FeverDocDB],
                             predicted: bool = True,
                             num_sentences=None,
                             is_snopes=False):
    logger = LogHelper.get_logger("read_data_set_from_jsonl")
    if not is_snopes:
        if type(db) is str:
            db = FeverDocDB(db)
    else:
        with open(db) as f:
            db = json.load(f)
    with open(file_path, 'r') as f:
        claims = []
        evidences = []
        paths = []
        labels = []
        ids = []
        for line in tqdm(f):
            json_obj = json.loads(line)
            if predicted:
                evidences_texts = []
                if 'predicted_evidence' in json_obj:
                    _evidences = json_obj['predicted_evidence']
                elif 'predicted_sentences' in json_obj:
                    _evidences = json_obj['predicted_sentences']
                else:
                    _evidences = []
                if len(_evidences) > 0:
                    for sent in _evidences:
                        page, line_num = sent[-2], sent[-1]
                        page_title = page.replace("_", " ")
                        evidences_texts.append(
                            # page_title + " # " + clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
                            clean_text(
                                evidence_num_to_text(db, page, line_num,
                                                     is_snopes)))
            else:
                evidences_texts = set()
                _evidences = json_obj['evidence']
                for evidence in _evidences:
                    for sent in evidence:
                        page, line_num = sent[-2], sent[-1]
                        page_title = page.replace("_", " ")
                        evidences_texts.add(
                            # page_title + " # " + clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
                            clean_text(
                                evidence_num_to_text(db, page, line_num,
                                                     is_snopes)))
                evidences_texts = list(evidences_texts)
            if len(evidences_texts) == 0:
                continue
            if num_sentences is not None:
                if len(evidences_texts) > num_sentences:
                    evidences_texts = evidences_texts[:num_sentences]
            claims.append(clean_text(json_obj['claim']))
            if 'label' in json_obj:
                labels.append(label_dict.index(json_obj['label']))
            evidences.append(evidences_texts)
            if 'paths' in json_obj:
                paths_from_sent_to_claim = [
                    1.0 if p else 0.0 for p in json_obj['paths']
                ]
                if num_sentences is not None and num_sentences > len(
                        paths_from_sent_to_claim):
                    paths_from_sent_to_claim += [0.0] * (
                        num_sentences - len(paths_from_sent_to_claim))
                paths.append(paths_from_sent_to_claim)
            ids.append(json_obj['id'])
        datas = {'h': claims, 'b': evidences, 'id': ids}
        if paths:
            datas['paths'] = paths
        return datas, labels
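A minimal loading sketch for read_data_set_from_jsonl; the JSONL path and num_sentences cap are assumptions. With predicted=True it reads 'predicted_evidence' (or 'predicted_sentences'); with predicted=False it flattens the gold 'evidence' sets instead.

datas, labels = read_data_set_from_jsonl("data/fever/dev.sentences.jsonl",  # assumed predictions file
                                         "data/fever/fever.db",
                                         predicted=True,
                                         num_sentences=5)
print(len(datas['h']), len(datas['b']), len(datas['id']))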
Example #21
import spacy
import wmd

from common.dataset.reader import JSONLineReader
from retrieval.fever_doc_db import FeverDocDB
from retrieval.sentence import FEVERSentenceRelatednessFormatter, FEVERSentenceFormatter, FEVERSentenceTextFormatter
from scripts.retrieval.sentence.mlp_train import RelatedLabelSchema

nlp = spacy.load("en", create_pipeline=wmd.WMD.create_spacy_pipeline)


def wmd_sim(claim, lines):
    cl = nlp(claim)
    scores = []
    for line in lines:
        scores.append(cl.similarity(nlp(line)))
    return scores


db = FeverDocDB("data/fever/fever.db")
idx = set(db.get_doc_ids())

jlr = JSONLineReader()
formatter = FEVERSentenceTextFormatter(idx, db, RelatedLabelSchema())
dev_ds = DataSet(file="data/fever-data/dev.jsonl",
                 reader=jlr,
                 formatter=formatter)

dev_ds.read()


def doc_lines(db, doc):
    lines = db.get_doc_lines(doc)
    return [
        line.split("\t")[1] if len(line.split("\t")) > 1 else ""
        for line in lines.split("\n")
    ]
Example #22
class Data(object):
    def __init__(self,
                 embedding_path,
                 train_file,
                 dev_file,
                 test_file,
                 fasttext_path,
                 num_negatives,
                 h_max_length,
                 s_max_length,
                 random_seed,
                 reserve_embed=False,
                 db_filepath="data/fever/fever.db"):

        self.random_seed = random_seed

        self.embedding_path = embedding_path
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.fasttext_path = fasttext_path
        self.num_negatives = num_negatives
        self.h_max_length = h_max_length
        self.s_max_length = s_max_length
        self.db_filepath = db_filepath
        self.db = FeverDocDB(self.db_filepath)
        self.reserve_embed = reserve_embed

        self.data_pipeline()

    def data_pipeline(self):

        np.random.seed(self.random_seed)
        random.seed(self.random_seed)

        # create directory to store sampling data and processed data
        # store_dir = "data.h{}.s{}.seed{}".format(self.h_max_length, self.s_max_length, self.random_seed)
        # self.absou_dir = os.path.join(base_dir, store_dir)
        os.makedirs(self.embedding_path, exist_ok=True)

        train_data_path = os.path.join(self.embedding_path, "train_sample.p")
        X_train = self.train_data_loader(train_data_path,
                                         self.train_file,
                                         num_samples=self.num_negatives)
        dev_datapath = os.path.join(self.embedding_path, "dev_data.p")
        devs, self.dev_labels = self.dev_data_loader(dev_datapath,
                                                     self.dev_file)
        if self.test_file is None:
            self.test_file = self.dev_file

        test_datapath = os.path.join(self.embedding_path, "test_data.p")
        tests, self.test_location_indexes = self.predict_data_loader(
            test_datapath, self.test_file)

        words_dict_path = os.path.join(self.embedding_path, "words_dict.p")
        if os.path.exists(words_dict_path):
            with open(words_dict_path, "rb") as f:
                self.word_dict = pickle.load(f)
        else:
            self.word_dict = self.get_complete_words(words_dict_path, X_train,
                                                     devs, tests)

        self.iword_dict = self.inverse_word_dict(self.word_dict)

        train_indexes_path = os.path.join(self.embedding_path,
                                          "train_indexes.p")
        self.X_train_indexes = self.train_indexes_loader(
            train_indexes_path, X_train)
        dev_indexes_path = os.path.join(self.embedding_path, "dev_indexes.p")
        self.dev_indexes = self.predict_indexes_loader(dev_indexes_path, devs)
        test_indexes_path = os.path.join(self.embedding_path, "test_indexes.p")
        self.test_indexes = self.predict_indexes_loader(
            test_indexes_path, tests)

        embed_dict = self.load_fasttext(self.iword_dict)
        print("embed_dict size {}".format(len(embed_dict)))
        _PAD_ = len(self.word_dict)
        self.word_dict[_PAD_] = '[PAD]'
        self.iword_dict['[PAD]'] = _PAD_
        self.embed = self.embed_to_numpy(embed_dict)

        return self

    def get_whole_evidence(self, evidence_set, db):
        pos_sents = []
        for evidence in evidence_set:
            page = evidence[2]
            doc_lines = db.get_doc_lines(page)
            doc_lines = self.get_valid_texts(doc_lines, page)
            for doc_line in doc_lines:
                if doc_line[2] == evidence[3]:
                    pos_sents.append(doc_line[0])
        pos_sent = ' '.join(pos_sents)
        return pos_sent

    def get_valid_texts(self, lines, page):
        if not lines:
            return []
        doc_lines = [
            doc_line.split("\t")[1] if len(doc_line.split("\t")[1]) > 1 else ""
            for doc_line in lines.split("\n")
        ]
        doc_lines = list(
            zip(doc_lines, [page] * len(doc_lines), range(len(doc_lines))))
        return doc_lines

    def sampling(self, datapath, num_sample=1):

        jlr = JSONLineReader()

        X = []
        count = 0
        with open(datapath, "r") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                count += 1
                pos_pairs = []
                # count1 += 1
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                neg_sents = []
                claim = line['claim']

                pos_set = set()
                for evidence_set in line['evidence']:
                    pos_sent = self.get_whole_evidence(evidence_set, self.db)
                    if pos_sent in pos_set:
                        continue
                    pos_set.add(pos_sent)

                p_lines = []
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])

                pages = [
                    page for page in line['predicted_pages']
                    if page is not None
                ]

                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if (doc_line[1], doc_line[2]) not in evidence_set:
                        neg_sents.append(doc_line[0])

                num_sampling = num_sample
                if len(neg_sents) < num_sampling:
                    num_sampling = len(neg_sents)
                    # print(neg_sents)
                if num_sampling == 0:
                    continue
                else:
                    for pos_sent in pos_set:
                        samples = random.sample(neg_sents, num_sampling)
                        for sample in samples:
                            if not sample:
                                continue
                            X.append((claim, pos_sent, sample))
                            # if count % 1000 == 0:
                            #     print("claim:{} ,evidence :{} sample:{}".format(claim, pos_sent, sample))
        return X

    def predict_processing(self, datapath):

        jlr = JSONLineReader()

        devs = []
        all_indexes = []

        with open(datapath, "rb") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                dev = []
                indexes = []
                pages = set()
                # pages = line['predicted_pages']
                pages.update(page for page in line['predicted_pages'])
                # if len(pages) == 0:
                #     pages.add("Michael_Hutchence")
                claim = line['claim']
                p_lines = []
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    if not doc_lines:
                        continue
                    p_lines.extend(self.get_valid_texts(doc_lines, page))

                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    dev.append((claim, doc_line[0]))
                    indexes.append((doc_line[1], doc_line[2]))
                # print(len(dev))
                if len(dev) == 0:
                    dev.append((claim, 'no evidence for this claim'))
                    indexes.append(('empty', 0))
                devs.append(dev)
                all_indexes.append(indexes)
        return devs, all_indexes

    def dev_processing(self, data_path):

        jlr = JSONLineReader()

        with open(data_path, "r") as f:
            lines = jlr.process(f)

            devs = []
            labels = []
            for line in tqdm(lines):

                dev = []
                label = []
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])

                pages = [
                    page for page in line['predicted_pages']
                    if page is not None
                ]
                for page, num in evidence_set:
                    pages.append(page)
                pages = set(pages)

                p_lines = []
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    dev.append((line['claim'], doc_line[0]))
                    if (doc_line[1], doc_line[2]) in evidence_set:
                        label.append(1)
                    else:
                        label.append(0)
                if len(dev) == 0 or len(label) == 0:
                    continue
                devs.append(dev)
                labels.append(label)
        return devs, labels

    def train_data_loader(self, train_sampled_path, data_path, num_samples=1):

        if os.path.exists(train_sampled_path):
            with open(train_sampled_path, 'rb') as f:
                X = pickle.load(f)
        else:
            X = self.sampling(data_path, num_samples)
            with open(train_sampled_path, 'wb') as f:
                pickle.dump(X, f)
        return X

    def dev_data_loader(self, dev_data_path, data_path):

        if os.path.exists(dev_data_path):
            with open(dev_data_path, "rb") as f:
                data = pickle.load(f)
                devs, labels = zip(*data)
        else:
            devs, labels = self.dev_processing(data_path)
            data = list(zip(devs, labels))
            with open(dev_data_path, 'wb') as f:
                pickle.dump(data, f)
        return devs, labels

    def predict_data_loader(self, predict_data_path, data_path):

        if os.path.exists(predict_data_path):
            print(predict_data_path)
            with open(predict_data_path, "rb") as f:
                data = pickle.load(f)
                devs, location_indexes = zip(*data)
        else:
            devs, location_indexes = self.predict_processing(data_path)
            data = list(zip(devs, location_indexes))
            with open(predict_data_path, 'wb') as f:
                pickle.dump(data, f)
        return devs, location_indexes

    def sent_processing(self, sent):
        sent = sent.replace('\n', '')
        sent = sent.replace('-', ' ')
        sent = sent.replace('/', ' ')
        return sent

    def nltk_tokenizer(self, sent):
        # sent = sent_processing(sent)
        return nltk.word_tokenize(sent)

    def get_words(self, claims, sents):

        words = set()
        for claim in claims:
            for idx, word in enumerate(self.nltk_tokenizer(claim)):
                if idx >= self.h_max_length:
                    break
                words.add(word.lower())
        for sent in sents:
            for idx, word in enumerate(self.nltk_tokenizer(sent)):
                if idx >= self.s_max_length:
                    break
                words.add(word.lower())
        return words

    def get_train_words(self, X):
        claims = set()
        sents = []
        for claim, pos, neg in X:
            claims.add(claim)
            sents.append(pos)
            sents.append(neg)

        train_words = self.get_words(claims, sents)
        print("training words processing done!")
        return train_words

    def get_predict_words(self, devs):
        dev_words = set()
        # nlp = StanfordCoreNLP(corenlp_path)
        for dev in tqdm(devs):
            claims = set()
            sents = []
            for pair in dev:
                claims.add(pair[0])
                sents.append(pair[1])
            dev_tokens = self.get_words(claims, sents)
            dev_words.update(dev_tokens)
        print("dev_words processing done!")
        return dev_words

    def word_2_dict(self, words):
        word_dict = {}
        for idx, word in enumerate(words):
            word = word.replace('\n', '')
            word = word.replace('\t', '')
            word_dict[idx] = word

        return word_dict

    def inverse_word_dict(self, word_dict):

        iword_dict = {}
        for key, word in word_dict.items():
            iword_dict[word] = key
        return iword_dict

    def load_fasttext(self, iword_dict):

        embed_dict = {}
        print(self.fasttext_path)
        model = FastText(self.fasttext_path)
        for word, key in iword_dict.items():
            embed_dict[key] = model[word]
            # print(embed_dict[key])
        print('Embedding size: %d' % (len(embed_dict)))
        return embed_dict

    def embed_to_numpy(self, embed_dict):

        feat_size = len(embed_dict[list(embed_dict.keys())[0]])
        if self.reserve_embed:
            embed = np.zeros((len(embed_dict) + 200000, feat_size), np.float32)
        else:
            embed = np.zeros((len(embed_dict), feat_size), np.float32)
        for k in embed_dict:
            embed[k] = np.asarray(embed_dict[k])
        print('Generate numpy embed:', embed.shape)

        return embed

    def sent_2_index(self, sent, word_dict, max_length):
        words = self.nltk_tokenizer(sent)
        word_indexes = []
        for idx, word in enumerate(words):
            if idx >= max_length:
                break
            else:
                word_indexes.append(word_dict[word.lower()])
        return word_indexes

    def train_data_indexes(self, X, word_dict):

        X_indexes = []
        print("start index words into intergers")
        for claim, pos, neg in X:
            claim_indexes = self.sent_2_index(claim, word_dict,
                                              self.h_max_length)
            pos_indexes = self.sent_2_index(pos, word_dict, self.s_max_length)
            neg_indexes = self.sent_2_index(neg, word_dict, self.s_max_length)
            X_indexes.append((claim_indexes, pos_indexes, neg_indexes))
        print('Training data size:', len(X_indexes))
        return X_indexes

    def predict_data_indexes(self, data, word_dict):

        devs_indexes = []
        for dev in data:
            sent_indexes = []
            claim = dev[0][0]
            claim_index = self.sent_2_index(claim, word_dict,
                                            self.h_max_length)
            claim_indexes = [claim_index] * len(dev)
            for claim, sent in dev:
                sent_index = self.sent_2_index(sent, word_dict,
                                               self.s_max_length)
                sent_indexes.append(sent_index)
            assert len(sent_indexes) == len(claim_indexes)
            dev_indexes = list(zip(claim_indexes, sent_indexes))
            devs_indexes.append(dev_indexes)
        return devs_indexes

    def get_complete_words(self, words_dict_path, train_data, dev_data,
                           test_data):

        all_words = set()
        train_words = self.get_train_words(train_data)
        all_words.update(train_words)
        dev_words = self.get_predict_words(dev_data)
        all_words.update(dev_words)
        test_words = self.get_predict_words(test_data)
        all_words.update(test_words)
        word_dict = self.word_2_dict(all_words)
        with open(words_dict_path, "wb") as f:
            pickle.dump(word_dict, f)

        return word_dict

    def train_indexes_loader(self, train_indexes_path, train_data):

        if os.path.exists(train_indexes_path):
            with open(train_indexes_path, "rb") as f:
                X_indexes = pickle.load(f)
        else:
            X_indexes = self.train_data_indexes(train_data, self.iword_dict)
            with open(train_indexes_path, "wb") as f:
                pickle.dump(X_indexes, f)
        return X_indexes

    def predict_indexes_loader(self, predict_indexes_path, predict_data):

        if os.path.exists(predict_indexes_path):
            with open(predict_indexes_path, "rb") as f:
                predicts_indexes = pickle.load(f)
        else:
            predicts_indexes = self.predict_data_indexes(
                predict_data, self.iword_dict)
            with open(predict_indexes_path, "wb") as f:
                pickle.dump(predicts_indexes, f)
        return predicts_indexes

    def update_word_dict(self, test_path):

        self.new_test_datapath = os.path.join(self.embedding_path,
                                              "new_test_data.p")
        new_tests, self.test_location_indexes = self.predict_data_loader(
            self.new_test_datapath, test_path)

        new_test_words = self.get_predict_words(new_tests)
        print(len(self.iword_dict))
        print(len(self.word_dict))
        self.test_words_dict = {}
        for word in new_test_words:
            if word not in self.iword_dict:
                idx = len(self.word_dict)
                self.word_dict[idx] = word
                self.test_words_dict[idx] = word

        self.iword_dict = self.inverse_word_dict(self.word_dict)
        self.test_iword_dict = self.inverse_word_dict(self.test_words_dict)

        print("updated iword dict size: ", len(self.iword_dict))
        print("test iword dict size: ", len(self.test_iword_dict))

    def update_embeddings(self):

        test_embed_dict = self.load_fasttext(self.test_iword_dict)

        for k in test_embed_dict:
            self.embed[k] = np.asarray(test_embed_dict[k])
        print("updated embed size: ", self.embed.shape)

    def get_new_test_indexes(self, test_path):

        new_tests, self.new_test_location_indexes = self.predict_data_loader(
            self.new_test_datapath, test_path)

        new_tests_indexes_path = os.path.join(self.embedding_path,
                                              "new_test_indexes.p")
        self.new_tests_indexes = self.predict_indexes_loader(
            new_tests_indexes_path, new_tests)
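A hedged construction sketch for the Data pipeline above; every path and hyper-parameter below is an assumption rather than a value taken from this repository. Passing test_file=None makes data_pipeline fall back to the dev file, as shown above.

data = Data(embedding_path="data/train_data",          # assumed cache directory for pickled artifacts
            train_file="data/fever-data/train.jsonl",
            dev_file="data/fever/dev.p5.jsonl",
            test_file=None,
            fasttext_path="data/fasttext/wiki.en.bin",  # assumed fastText model file
            num_negatives=5,
            h_max_length=20,
            s_max_length=40,
            random_seed=100)
print(data.embed.shape)  # fastText embedding matrix built for the collected vocabulary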
Example #23
    parser.add_argument('--docs', type=str, default='data/tmp/doc_preds.jsonl', help='file containing the predicted documents', )
    parser.add_argument('--sen_output', type=str, default='data/tmp/sen_preds.jsonl', help='file to write predicted sentences to', )
    parser.add_argument('--train_ds', type=str, default='data/fever-data/train.jsonl', help='training dataset', )
    parser.add_argument('--dev_ds', type=str, default='data/fever-data/dev.jsonl', help='development dataset', )

    args = parser.parse_args()

    out_dir = os.path.dirname(os.path.realpath(args.output))
    os.makedirs(out_dir, exist_ok=True)

    ##### DOCUMENT RETREIVAL IMPLEMENTATION #####
    doc_retriever = DrqaDocRetriever(args.model)
    #############################################

    logger.info("Load DB")
    db = FeverDocDB(args.db)
    jlr = JSONLineReader()
    formatter = FEVERGoldFormatter(set(), FEVERLabelSchema())

    logger.info("Read datasets")
    train_ds = DataSet(file=args.train_ds, reader=jlr, formatter=formatter)
    dev_ds = DataSet(file=args.dev_ds, reader=jlr, formatter=formatter)

    train_ds.read()
    dev_ds.read()

    logger.info("Generate vocab for TF-IDF")
    tf = XTermFrequencyFeatureFunction(db)
    tf.inform(train_ds.data, dev_ds.data)

    ##### SENTENCE RETREIVAL IMPLEMENTATION ############
Example #24
            evidence_pages.add(sample)
        if len(evidence_pages) < k:
            samples = random.sample(doc_ids, k - len(evidence_pages))
            for sample in samples:
                evidence_pages.add(sample)

    elif len(evidence_pages) >= k:

        samples = random.sample(evidence_pages, k)
        evidence_pages = set(samples)
    return evidence_pages


path = os.getcwd()
path = re.sub("/src.*", "", path)
db = FeverDocDB(os.path.join(path, "data/fever/fever.db"))
doc_ids = db.get_doc_ids()
doc_ids = doc_ids[1:]
jlr = JSONLineReader()
# with open(os.path.join(path, "data/fever-data/train.jsonl"), "r") as f:
#     with open(os.path.join(path, 'data/fever/train.p5.jsonl'), "w") as f2:
#         lines = f.readlines()
#         for line in lines:
#             js = json.loads(line)
#             pages = sample_doc(js,doc_ids,k=5)
#             js['predicted_pages'] = list(pages)
#             f2.write(json.dumps(js)+"\n")

with open(os.path.join(path, "data/fever-data/dev.jsonl"), "r") as f:
    with open(os.path.join(path, "data/fever/dev.p5.jsonl"), "w") as f2:
        lines = f.readlines()
Example #25
def train_sample(db_filename, lines, list_size=15):

    db = FeverDocDB(db_filename)

    claims = []
    list_sents = []
    labels = []
    count = 0

    for idx, line in tqdm(enumerate(lines)):

        if line['label'].upper() == "NOT ENOUGH INFO":
            continue

        claim = line['claim']
        claims.append(claim)
        sents = []
        label = []

        pos_set = set()
        neg_sents = []
        for evidence_group in line['evidence']:
            pos_sent = get_whole_evidence(evidence_group, db)
            if pos_sent in pos_set:
                continue
            pos_set.add(pos_sent)

        p_lines = []
        evidence_set = set([(evidence[2], evidence[3])
                            for evidences in line['evidence']
                            for evidence in evidences])

        pages = [
            page[0] for page in line['predicted_pages'] if page[0] is not None
        ]
        for page, num in evidence_set:
            pages.append(page)
        pages = set(pages)
        for page in pages:
            doc_lines = db.get_doc_lines(page)
            p_lines.extend(get_valid_texts(doc_lines, page))
        for doc_line in p_lines:
            if not doc_line[0]:
                continue
            if (doc_line[1], doc_line[2]) not in evidence_set:
                neg_sents.append(doc_line[0])

        pos_set = list(pos_set)
        if len(pos_set) > 5:
            pos_set = random.sample(pos_set, 5)
        if len(neg_sents) < (list_size - len(pos_set)):

            count += 1
            continue
        else:
            samples = random.sample(neg_sents, list_size - len(pos_set))
            pos_indexes_sample = random.sample(range(list_size), len(pos_set))
            neg_index = 0
            pos_index = 0
            for i in range(list_size):
                if i in pos_indexes_sample:
                    sents.append(pos_set[pos_index])
                    label.append(1 / len(pos_set))
                    pos_index += 1
                else:
                    sents.append(samples[neg_index])
                    label.append(0.0)
                    neg_index += 1
            if idx % 1000 == 0:
                print(claim)
                print(sents)
                print(label)

        list_sents.append(sents)
        labels.append(label)
    print(count)
    return claims, list_sents, labels
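A sketch of feeding the listwise sampler above; the paths are assumptions, the lines are parsed with the project's JSONLineReader as in the other examples, and the split's 'predicted_pages' entries are expected to be (page, score) pairs, matching the page[0] indexing above.

jlr = JSONLineReader()
with open("data/fever/train.p5.jsonl") as f:  # assumed training split with predicted pages
    lines = jlr.process(f)
claims, list_sents, labels = train_sample("data/fever/fever.db", lines, list_size=15)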
Example #26
class ELMO_Data(object):
    def __init__(self,
                 base_path,
                 train_file,
                 dev_file,
                 test_file,
                 num_negatives,
                 h_max_length,
                 s_max_length,
                 random_seed=100,
                 db_filepath="data/fever/fever.db"):

        self.random_seed = random_seed

        self.base_path = base_path
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.num_negatives = num_negatives
        self.h_max_length = h_max_length
        self.s_max_length = s_max_length
        self.db_filepath = db_filepath
        self.db = FeverDocDB(self.db_filepath)

        self.data_pipeline()

    def data_pipeline(self):

        np.random.seed(self.random_seed)
        random.seed(self.random_seed)

        # create directory to store sampling data and processed data
        base_dir = os.path.join(self.base_path, "data/train_data")
        store_dir = "data.h{}.s{}.seed{}".format(self.h_max_length,
                                                 self.s_max_length,
                                                 self.random_seed)
        absou_dir = os.path.join(base_dir, store_dir)
        if not os.path.exists(absou_dir):
            os.makedirs(absou_dir)

        train_data_path = os.path.join(absou_dir, "train_sample.p")
        X_train = self.train_data_loader(train_data_path,
                                         self.train_file,
                                         num_samples=self.num_negatives)
        dev_datapath = os.path.join(absou_dir, "dev_data.p")
        devs, self.dev_labels = self.dev_data_loader(dev_datapath,
                                                     self.dev_file)
        test_datapath = os.path.join(absou_dir, "test_data.p")
        tests, self.test_location_indexes = self.predict_data_loader(
            test_datapath, self.test_file)

        self.X_train = self.train_data_tokenizer(X_train)
        self.devs = self.predict_data_tokenizer(devs)
        self.tests = self.predict_data_tokenizer(tests)

        return self

    def get_whole_evidence(self, evidence_set, db):
        pos_sents = []
        for evidence in evidence_set:
            page = evidence[2]
            doc_lines = db.get_doc_lines(page)
            doc_lines = self.get_valid_texts(doc_lines, page)
            for doc_line in doc_lines:
                if doc_line[2] == evidence[3]:
                    pos_sents.append(doc_line[0])
        pos_sent = ' '.join(pos_sents)
        return pos_sent

    def get_valid_texts(self, lines, page):
        if not lines:
            return []
        doc_lines = [
            doc_line.split("\t")[1] if len(doc_line.split("\t")[1]) > 1 else ""
            for doc_line in lines.split("\n")
        ]
        doc_lines = zip(doc_lines, [page] * len(doc_lines),
                        range(len(doc_lines)))
        return doc_lines

    def sampling(self, datapath, num_sample=1):

        jlr = JSONLineReader()

        X = []
        count = 0
        with open(datapath, "r") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                count += 1
                pos_pairs = []
                # count1 += 1
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                neg_sents = []
                claim = line['claim']

                pos_set = set()
                for evidence_set in line['evidence']:
                    pos_sent = self.get_whole_evidence(evidence_set, self.db)
                    if pos_sent in pos_set:
                        continue
                    pos_set.add(pos_sent)

                p_lines = []
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])

                pages = [
                    page for page in line['predicted_pages']
                    if page is not None
                ]

                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if (doc_line[1], doc_line[2]) not in evidence_set:
                        neg_sents.append(doc_line[0])

                num_sampling = num_sample
                if len(neg_sents) < num_sampling:
                    num_sampling = len(neg_sents)
                    # print(neg_sents)
                if num_sampling == 0:
                    continue
                else:
                    for pos_sent in pos_set:
                        samples = random.sample(neg_sents, num_sampling)
                        for sample in samples:
                            if not sample:
                                continue
                            X.append((claim, pos_sent, sample))
                            if count % 1000 == 0:
                                print(
                                    "claim:{} ,evidence :{} sample:{}".format(
                                        claim, pos_sent, sample))
        return X

    def predict_processing(self, datapath):

        jlr = JSONLineReader()

        devs = []
        all_indexes = []

        with open(datapath, "rb") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                dev = []
                indexes = []
                pages = set()
                # pages = line['predicted_pages']
                pages.update(page for page in line['predicted_pages'])
                # if len(pages) == 0:
                #     pages.add("Michael_Hutchence")
                claim = line['claim']
                p_lines = []
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    if not doc_lines:
                        continue
                    p_lines.extend(self.get_valid_texts(doc_lines, page))

                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    dev.append((claim, doc_line[0]))
                    indexes.append((doc_line[1], doc_line[2]))
                # print(len(dev))
                if len(dev) == 0:
                    dev.append((claim, 'no evidence for this claim'))
                    indexes.append(('empty', 0))
                devs.append(dev)
                all_indexes.append(indexes)
        return devs, all_indexes

    def dev_processing(self, data_path):

        jlr = JSONLineReader()

        with open(data_path, "r") as f:
            lines = jlr.process(f)

            devs = []
            labels = []
            for line in tqdm(lines):

                dev = []
                label = []
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])

                pages = [
                    page for page in line['predicted_pages']
                    if page is not None
                ]
                for page, num in evidence_set:
                    pages.append(page)
                pages = set(pages)

                p_lines = []
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    dev.append((line['claim'], doc_line[0]))
                    if (doc_line[1], doc_line[2]) in evidence_set:
                        label.append(1)
                    else:
                        label.append(0)
                if len(dev) == 0 or len(label) == 0:
                    continue
                devs.append(dev)
                labels.append(label)
        return devs, labels

    def train_data_loader(self, train_sampled_path, data_path, num_samples=1):

        if os.path.exists(train_sampled_path):
            with open(train_sampled_path, 'rb') as f:
                X = pickle.load(f)
        else:
            X = self.sampling(data_path, num_samples)
            with open(train_sampled_path, 'wb') as f:
                pickle.dump(X, f)
        return X

    def dev_data_loader(self, dev_data_path, data_path):

        if os.path.exists(dev_data_path):
            with open(dev_data_path, "rb") as f:
                data = pickle.load(f)
                devs, labels = zip(*data)
        else:
            devs, labels = self.dev_processing(data_path)
            data = list(zip(devs, labels))
            with open(dev_data_path, 'wb') as f:
                pickle.dump(data, f)
        return devs, labels

    def predict_data_loader(self, predict_data_path, data_path):

        if os.path.exists(predict_data_path):
            print(predict_data_path)
            with open(predict_data_path, "rb") as f:
                data = pickle.load(f)
                devs, location_indexes = zip(*data)
        else:
            devs, location_indexes = self.predict_processing(data_path)
            data = list(zip(devs, location_indexes))
            with open(predict_data_path, 'wb') as f:
                pickle.dump(data, f)
        return devs, location_indexes

    def sent_processing(self, sent):
        sent = sent.replace('\n', '')
        sent = sent.replace('-', ' ')
        sent = sent.replace('/', ' ')
        return sent

    def nltk_tokenizer(self, sent):
        # sent = sent_processing(sent)
        return nltk.word_tokenize(sent)

    def proess_sents(self, sents, max_length):
        """Tokenize and lower-case each sentence, then pad or truncate it to max_length."""
        tokenized_sents = []
        sents_lengths = []
        for sent in sents:
            words = [word.lower() for word in nltk.word_tokenize(sent)]
            if len(words) < max_length:
                sents_lengths.append(len(words))
                words.extend([""] * (max_length - len(words)))
                tokenized_sents.append(words)
            else:
                sents_lengths.append(max_length)
                words = words[:max_length]
                tokenized_sents.append(words)
        return tokenized_sents, sents_lengths
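    # Illustrative behaviour (hypothetical call, with max_length == 5):
    #   self.proess_sents(["The claim text"], 5)
    #   -> ([["the", "claim", "text", "", ""]], [3])
    # Sentences shorter than max_length are padded with "", longer ones are
    # truncated, and the unpadded token count (capped at max_length) is
    # returned alongside the tokens.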

    def train_data_tokenizer(self, X_train):

        claims = [claim for claim, _, _ in X_train]
        pos_sents = [pos_sent for _, pos_sent, _ in X_train]
        neg_sents = [neg_sent for _, _, neg_sent in X_train]

        tokenized_claims, claims_lengths = self.proess_sents(
            claims, self.h_max_length)
        tokenized_pos_sents, pos_sents_lengths = self.proess_sents(
            pos_sents, self.s_max_length)
        tokenized_neg_sents, neg_sents_lengths = self.proess_sents(
            neg_sents, self.s_max_length)

        new_claims = list(zip(tokenized_claims, claims_lengths))
        new_pos_sents = list(zip(tokenized_pos_sents, pos_sents_lengths))
        new_neg_sents = list(zip(tokenized_neg_sents, neg_sents_lengths))
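        # Each element of the returned list has the shape
        #   ((claim_tokens, claim_len), (pos_tokens, pos_len), (neg_tokens, neg_len)).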

        return list(zip(new_claims, new_pos_sents, new_neg_sents))

    def predict_data_tokenizer(self, dataset):

        predict_data = []
        for data in dataset:
            claims = [claim for claim, _ in data]
            sents = [sent for _, sent in data]

            tokenized_claims, claims_lengths = self.proess_sents(
                claims, self.h_max_length)
            tokenized_sents, sents_lengths = self.proess_sents(
                sents, self.s_max_length)

            new_claims = list(zip(tokenized_claims, claims_lengths))
            new_sents = list(zip(tokenized_sents, sents_lengths))

            tokenized_data = list(zip(new_claims, new_sents))
            predict_data.append(tokenized_data)
        return predict_data
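
A minimal usage sketch for the loaders and tokenizers above, assuming the enclosing class (whose constructor appears earlier in this example) has already been instantiated; the cache and data paths below are placeholders, not paths used by the original project:

pipeline = ...  # hypothetical, already-constructed instance of the class above

# training triples: (claim, positive evidence sentence, sampled negative sentence)
X_train = pipeline.train_data_loader("data/train_sampled.p",
                                     "data/train.predicted.jsonl",
                                     num_samples=1)
train_inputs = pipeline.train_data_tokenizer(X_train)

# dev (claim, sentence) pairs with binary evidence labels
devs, labels = pipeline.dev_data_loader("data/dev_cache.p", "data/dev.predicted.jsonl")

# prediction pairs plus (page, line) indexes for writing results back out
predict_devs, location_indexes = pipeline.predict_data_loader(
    "data/predict_cache.p", "data/test.predicted.jsonl")
predict_inputs = pipeline.predict_data_tokenizer(predict_devs)
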
Example #27
def sample_ranking_train(db_filename, datapath, k=5, num_sample=2):
    """
    Build a pointwise training set for sentence ranking.

    :param db_filename: path to the wiki-pages database
    :param datapath: path to the FEVER train set that already carries predicted pages and sentences
    :param k: number of top predicted sentences (plus a small buffer) to draw negatives from
    :param num_sample: number of negative examples to sample per claim
    :return: X_claim, X_sents, y -- parallel lists of claims, candidate sentences,
             and binary labels (1 if the sentence is gold evidence, else 0)
    """

    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()

    X_claim = []
    X_sents = []
    y = []
    count = 0

    with open(datapath, "r") as f:
        lines = jlr.process(f)
        # lines = lines[:1000]

        for line in tqdm(lines):
            num_sampling = num_sample
            if line['label'].upper() == "NOT ENOUGH INFO":
                continue
            p_lines = []
            neg_sents = []
            claim = line['claim']
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            sampled_sents_idx = [(id, number)
                                 for id, number in line['predicted_sentences']]
            sampled_sents_idx = sampled_sents_idx[0:k + 5]
            sampled_sents_idx = [
                index for index in sampled_sents_idx
                if index not in evidence_set
            ]
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [
                    doc_line.split("\t")[1]
                    if len(doc_line.split("\t")[1]) > 1 else ""
                    for doc_line in doc_lines.split("\n")
                ]
                p_lines.extend(
                    zip(doc_lines, [page] * len(doc_lines),
                        range(len(doc_lines))))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                elif (doc_line[1], doc_line[2]) in sampled_sents_idx:
                    neg_sents.append(doc_line[0])
                elif (doc_line[1], doc_line[2]) in evidence_set:
                    X_claim.append(claim)
                    X_sents.append(doc_line[0])
                    y.append(1)

            if len(sampled_sents_idx) < num_sample:
                count += 1
            # never ask random.sample for more negatives than are available
            num_sampling = min(num_sample, len(neg_sents))

            samples = random.sample(neg_sents, num_sampling)
            for neg_example in samples:
                X_claim.append(claim)
                X_sents.append(neg_example)
                y.append(0)
        print(count)

    return X_claim, X_sents, y
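
A short usage sketch for sample_ranking_train; the paths are placeholders, and the input file is assumed to already carry 'predicted_pages' and 'predicted_sentences' for every claim:

X_claim, X_sents, y = sample_ranking_train("data/fever/fever.db",
                                           "data/fever/train.sentences.jsonl",
                                           k=5,
                                           num_sample=2)
# Parallel lists: y[i] is 1 when X_sents[i] is gold evidence for X_claim[i]
# and 0 when it was drawn from the top predicted (non-evidence) sentences.
print(len(X_claim), len(X_sents), len(y))
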
Example #28
class Doc_Retrieval:

    def __init__(self, database_path, add_claim=False, k_wiki_results=None):
        self.db = FeverDocDB(database_path)
        self.add_claim = add_claim
        self.k_wiki_results = k_wiki_results
        self.proter_stemm = nltk.PorterStemmer()
        self.tokenizer = nltk.word_tokenize
        # Load the AllenNLP ELMo constituency parser used for noun-phrase extraction.
        self.predictor = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")

    def get_NP(self, tree, nps):

        if isinstance(tree, dict):
            if "children" not in tree:
                if tree['nodeType'] == "NP":
                    # print(tree['word'])
                    # print(tree)
                    nps.append(tree['word'])
            elif "children" in tree:
                if tree['nodeType'] == "NP":
                    # print(tree['word'])
                    nps.append(tree['word'])
                    self.get_NP(tree['children'], nps)
                else:
                    self.get_NP(tree['children'], nps)
        elif isinstance(tree, list):
            for sub_tree in tree:
                self.get_NP(sub_tree, nps)

        return nps
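    # The tree walked above is the parser's 'hierplane_tree' dictionary; nodes
    # look roughly like {"word": "...", "nodeType": "NP", "children": [...]},
    # and get_NP collects the surface span of every NP node it visits.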

    def get_subjects(self, tree):
        subject_words = []
        subjects = []
        for subtree in tree['children']:
            if subtree['nodeType'] in ("VP", "S", "VBZ"):
                subjects.append(' '.join(subject_words))
                subject_words.append(subtree['word'])
            else:
                subject_words.append(subtree['word'])
        return subjects

    def get_noun_phrases(self, line):

        claim = line['claim']
        tokens = self.predictor.predict(claim)
        nps = []
        tree = tokens['hierplane_tree']['root']
        noun_phrases = self.get_NP(tree, nps)
        subjects = self.get_subjects(tree)
        for subject in subjects:
            if len(subject) > 0:
                noun_phrases.append(subject)
        if self.add_claim:
            noun_phrases.append(claim)
        return list(set(noun_phrases))

    def get_doc_for_claim(self, noun_phrases):

        predicted_pages = []
        for np in noun_phrases:
            if len(np) > 300:
                continue
            i = 1
            while i < 12:
                try:
                    docs = wikipedia.search(np)
                    if self.k_wiki_results is not None:
                        predicted_pages.extend(docs[:self.k_wiki_results])
                    else:
                        predicted_pages.extend(docs)
                except (ConnectionResetError, ConnectionError, ConnectionAbortedError, ConnectionRefusedError):
                    print("Connection reset error received! Trial #" + str(i))
                    time.sleep(600 * i)
                    i += 1
                else:
                    break

            # sleep_num = random.uniform(0.1,0.7)
            # time.sleep(sleep_num)
        predicted_pages = set(predicted_pages)
        processed_pages = []
        for page in predicted_pages:
            page = page.replace(" ", "_")
            page = page.replace("(", "-LRB-")
            page = page.replace(")", "-RRB-")
            page = page.replace(":", "-COLON-")
            processed_pages.append(page)

        return processed_pages

    def np_conc(self, noun_phrases):

        noun_phrases = set(noun_phrases)
        predicted_pages = []
        for np in noun_phrases:
            page = np.replace('( ', '-LRB-')
            page = page.replace(' )', '-RRB-')
            page = page.replace(' - ', '-')
            page = page.replace(' :', '-COLON-')
            page = page.replace(' ,', ',')
            page = page.replace(" 's", "'s")
            page = page.replace(' ', '_')

            if len(page) < 1:
                continue
            doc_lines = self.db.get_doc_lines(page)
            if doc_lines is not None:
                predicted_pages.append(page)
        return predicted_pages

    def exact_match(self, line):

        noun_phrases = self.get_noun_phrases(line)
        wiki_results = self.get_doc_for_claim(noun_phrases)
        wiki_results = list(set(wiki_results))

        claim = normalize(line['claim'])
        claim = claim.replace(".", "")
        claim = claim.replace("-", " ")
        words = [self.proter_stemm.stem(word.lower()) for word in self.tokenizer(claim)]
        words = set(words)
        predicted_pages = self.np_conc(noun_phrases)

        for page in wiki_results:
            page = normalize(page)
            processed_page = re.sub("-LRB-.*?-RRB-", "", page)
            processed_page = re.sub("_", " ", processed_page)
            processed_page = re.sub("-COLON-", ":", processed_page)
            processed_page = processed_page.replace("-", " ")
            processed_page = processed_page.replace("–", " ")
            processed_page = processed_page.replace(".", "")
            page_words = [self.proter_stemm.stem(word.lower()) for word in self.tokenizer(processed_page) if
                          len(word) > 0]

            if all([item in words for item in page_words]):
                if ':' in page:
                    page = page.replace(":", "-COLON-")
                predicted_pages.append(page)
        predicted_pages = list(set(predicted_pages))
        # print("claim: ",claim)
        # print("nps: ",noun_phrases)
        # print("wiki_results: ",wiki_results)
        # print("predicted_pages: ",predicted_pages)
        # print("evidence:",line['evidence'])
        return noun_phrases, wiki_results, predicted_pages
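
A minimal sketch of running Doc_Retrieval on a single claim record; the database path is a placeholder, and constructing the class downloads the AllenNLP constituency parser, so network access is assumed:

retriever = Doc_Retrieval("data/fever/fever.db", add_claim=True, k_wiki_results=7)
line = {"claim": "Roman Atwood is a content creator."}
noun_phrases, wiki_results, predicted_pages = retriever.exact_match(line)
print(predicted_pages)  # normalized wiki page titles, e.g. with -LRB-/-RRB- escapes
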
Example #29
                        type=str,
                        default='simple',
                        help=("String option specifying tokenizer type to use "
                              "(e.g. 'corenlp')"))

    parser.add_argument('--num-workers',
                        type=int,
                        default=None,
                        help='Number of CPU processes (for tokenizing, etc)')
    args = parser.parse_args()
    doc_freqs = None
    if args.use_precomputed:
        _, metadata = utils.load_sparse_csr(args.model)
        doc_freqs = metadata['doc_freqs'].squeeze()

    db = FeverDocDB("data/fever/fever.db")
    jlr = JSONLineReader()
    formatter = FEVERGoldFormatter(set(), FEVERLabelSchema())

    with open(args.in_file, "r") as f, open(
            "data/fever/{0}.sentences.{3}.p{1}.s{2}.jsonl".format(
                args.split, args.max_page, args.max_sent,
                "precomputed" if args.use_precomputed else "not_precomputed"),
            "w+") as out_file:
        lines = jlr.process(f)
        #lines = tf_idf_claims_batch(lines)

        for line in tqdm(lines):
            line = tf_idf_claim(line)
Example #30
def eval_model(db: FeverDocDB, args) -> None:
    archive = load_archive(args.archive_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)

    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(
                             ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(
                             ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(
                             ds_params.pop('token_indexers', {})))

    while True:

        claim = input("enter claim (or q to quit) >>")
        if claim.lower() == "q":
            break

        ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

        p_lines = []
        pages, _ = ranker.closest_docs(claim, 5)

        for page in pages:
            lines = db.get_doc_lines(page)
            lines = [
                line.split("\t")[1] if len(line.split("\t")[1]) > 1 else ""
                for line in lines.split("\n")
            ]

            p_lines.extend(zip(lines, [page] * len(lines), range(len(lines))))

        scores = tf_idf_sim(claim, [pl[0] for pl in p_lines])
        scores = list(
            zip(scores, [pl[1] for pl in p_lines], [pl[2] for pl in p_lines],
                [pl[0] for pl in p_lines]))
        scores = list(filter(lambda score: len(score[3].strip()), scores))
        sentences_l = list(
            sorted(scores, reverse=True, key=lambda elem: elem[0]))

        sentences = [s[3] for s in sentences_l[:5]]
        evidence = " ".join(sentences)

        print("Best pages: {0}".format(repr(pages)))

        print("Evidence:")
        for idx, sentence in enumerate(sentences_l[:5]):
            print("{0}\t{1}\t\t{2}\t{3}".format(idx + 1, sentence[0],
                                                sentence[1], sentence[3]))

        item = reader.text_to_instance(evidence, claim)

        prediction = model.forward_on_instance(item, args.cuda_device)
        cls = model.vocab._index_to_token["labels"][np.argmax(
            prediction["label_probs"])]
        print("PREDICTED: {0}".format(cls))
        print()
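
A hedged sketch of how this interactive loop might be invoked; the real script's argument parsing is not shown in this excerpt, so the flags below are assumptions that only cover the attributes eval_model actually reads:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("archive_file", type=str, help="trained AllenNLP model archive")
    parser.add_argument("model", type=str, help="path to the TF-IDF retriever model")
    parser.add_argument("--cuda-device", type=int, default=-1)
    parser.add_argument("--overrides", type=str, default="")
    args = parser.parse_args()

    eval_model(FeverDocDB("data/fever/fever.db"), args)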