Example #1
def feature_func(sample):
    # NLP, vocab, vocab_tag, vocab_ner, charids_func and is_train are defined
    # in the enclosing scope in the original source.
    query_tokend = NLP(reform_text(sample['question']))
    doc_tokend = NLP(reform_text(sample['context']))
    # features
    fea_dict = {}
    fea_dict['uid'] = sample['uid']
    fea_dict['context'] = sample['context']
    fea_dict['label'] = sample['label']
    fea_dict['query_tok'] = tok_func(query_tokend, vocab)
    fea_dict['query_pos'] = postag_func(query_tokend, vocab_tag)
    fea_dict['query_ner'] = nertag_func(query_tokend, vocab_ner)
    fea_dict['doc_tok'] = tok_func(doc_tokend, vocab)
    fea_dict['doc_pos'] = postag_func(doc_tokend, vocab_tag)
    fea_dict['doc_ner'] = nertag_func(doc_tokend, vocab_ner)
    fea_dict['doc_fea'] = '{}'.format(match_func(
        query_tokend, doc_tokend))  # json doesn't support float
    # convert sentence to elmo input
    fea_dict['doc_char_ids'] = charids_func(doc_tokend)
    fea_dict['query_char_ids'] = charids_func(query_tokend)
    doc_toks = [t.text for t in doc_tokend]
    start, end, span = build_span(sample['context'],
                                  sample['answer'],
                                  doc_toks,
                                  sample['answer_start'],
                                  sample['answer_end'],
                                  is_train=is_train)
    if is_train and (start == -1 or end == -1): return None
    fea_dict['span'] = span
    fea_dict['start'] = start
    fea_dict['end'] = end
    return fea_dict
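
build_span is not shown in these examples; below is a minimal, self-contained sketch of the character-offset to token-span alignment it appears to perform. The function and variable names are illustrative rather than the project's API, and the exclusive answer_end convention is an assumption.

def char_span_to_token_span(context, doc_toks, answer_start, answer_end):
    # Illustrative sketch only: map a character-level answer span to token
    # indices, returning (-1, -1) when alignment fails, which mirrors the
    # drop-on-failure check in feature_func above.
    offsets, pos = [], 0
    for tok in doc_toks:
        pos = context.find(tok, pos)
        if pos < 0:
            return -1, -1
        offsets.append((pos, pos + len(tok)))
        pos += len(tok)
    start_tok = end_tok = -1
    for i, (s, e) in enumerate(offsets):
        if s <= answer_start < e:
            start_tok = i
        if s < answer_end <= e:  # answer_end assumed exclusive
            end_tok = i
    return (start_tok, end_tok) if start_tok != -1 and end_tok != -1 else (-1, -1)

# char_span_to_token_span('Paris is nice.', ['Paris', 'is', 'nice', '.'], 0, 5) -> (0, 0)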
Example #2
def build_data(data,
               vocab,
               vocab_tag,
               vocab_ner,
               fout,
               is_train,
               thread=8,
               NLP=None,
               v2_on=False):
    passages = [reform_text(sample['context']) for sample in data]
    passage_tokened = [
        doc for doc in NLP.pipe(passages, batch_size=1000, n_threads=thread)
    ]
    logger.info('Done with document tokenize')

    question_list = [reform_text(sample['question']) for sample in data]
    question_tokened = [
        question for question in NLP.pipe(
            question_list, batch_size=1000, n_threads=thread)
    ]
    logger.info('Done with query tokenize')
    dropped_sample = 0
    with open(fout, 'w', encoding='utf-8') as writer:
        for idx, sample in enumerate(data):
            if idx % 5000 == 0: logger.info('parse {}-th sample'.format(idx))
            feat_dict = feature_func(sample, question_tokened[idx],
                                     passage_tokened[idx], vocab, vocab_tag,
                                     vocab_ner, is_train, v2_on)
            if feat_dict is not None:
                writer.write('{}\n'.format(json.dumps(feat_dict)))
            else:
                dropped_sample += 1
    logger.info('dropped {} in total {}'.format(dropped_sample, len(data)))
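
The NLP.pipe call above is spaCy's standard batched-processing API. Below is a minimal, self-contained sketch of the same pattern, assuming an English model such as en_core_web_sm is installed; note that the n_threads argument used in these examples was only honoured by older spaCy releases and has since been deprecated and removed.

import spacy

# Assumes the model has been downloaded, e.g. `python -m spacy download en_core_web_sm`.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

texts = ['Paris is the capital of France.', 'Where is Paris?']
# Stream the texts through the pipeline in batches, as build_data does for
# passages and questions.
docs = list(nlp.pipe(texts, batch_size=1000))
tokens = [[t.text for t in doc] for doc in docs]
print(tokens)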
Example #3
def feature_func(sample, vocab, vocab_tag, vocab_ner, is_train=True):
    # TODO: this is too slow, how to make it fast? Especially, spaCy 2.x is much slower than 1.x.
    query_tokend = NLP(reform_text(sample['question']))
    doc_tokend = NLP(reform_text(sample['context']))
    # features
    fea_dict = {}
    fea_dict['uid'] = sample['uid']
    fea_dict['doc_tok'] = tok_func(doc_tokend, vocab)
    fea_dict['doc_pos'] = postag_func(doc_tokend, vocab_tag)
    fea_dict['doc_ner'] = nertag_func(doc_tokend, vocab_ner)
    fea_dict['doc_fea'] = '{}'.format(match_func(query_tokend, doc_tokend))
    fea_dict['query_fea'] = '{}'.format(match_func(doc_tokend, query_tokend))
    doc_toks = [t.text for t in doc_tokend if len(t.text) > 0]
    query_toks = [t.text for t in query_tokend if len(t.text) > 0]
    fea_dict['query_tok'] = tok_func(query_tokend, vocab, doc_toks)
    fea_dict['query_pos'] = postag_func(query_tokend, vocab_tag)
    fea_dict['query_ner'] = nertag_func(query_tokend, vocab_ner)
    doc_toks = [t.text for t in doc_tokend]
    start, end, span = build_span(sample['context'],
                                  sample['answer'],
                                  doc_toks,
                                  sample['answer_start'],
                                  sample['answer_end'],
                                  is_train=is_train)
    if is_train and (start == -1 or end == -1): return None
    if not is_train:
        fea_dict['context'] = sample['context']
        fea_dict['span'] = span
    fea_dict['start'] = start
    fea_dict['end'] = end
    return fea_dict
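
match_func is not included in these snippets; the sketch below shows the kind of exact-match feature such a helper typically computes for a reader model. It is a hedged stand-in, and the real match_func in this codebase may produce more than a single binary feature.

def exact_match_feature(query_tokend, doc_tokend):
    # 1.0 for document tokens whose lower-cased text also appears in the query.
    query_words = {t.text.lower() for t in query_tokend}
    return [float(t.text.lower() in query_words) for t in doc_tokend]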
Example #4
def build_data(data, vocab, vocab_tag, vocab_ner, n_threads=16):
    dropped_sample = 0
    all_data = []
    context = [reform_text(sample['context']) for sample in data]
    context_parsed = [
        doc for doc in NLP.pipe(context, batch_size=10000, n_threads=n_threads)
    ]

    query = [reform_text(sample['question']) for sample in data]
    query_parsed = [
        question
        for question in NLP.pipe(query, batch_size=10000, n_threads=n_threads)
    ]
    logger.info('Done with tokenizing')

    for sample, doc_tokend, query_tokend in tqdm.tqdm(zip(
            data, context_parsed, query_parsed),
                                                      total=len(data)):
        fd = feature_func(sample, doc_tokend, query_tokend, vocab, vocab_tag,
                          vocab_ner)
        if fd is None:
            dropped_sample += 1
            continue
        all_data.append(fd)
    logger.info('Got {} data sample in total {}'.format(
        len(all_data), len(data)))
    return all_data
Example #5
def build_data(data,
               vocab,
               vocab_tag,
               vocab_ner,
               fout,
               is_train,
               thread=16,
               NLP=None,
               v2_on=False,
               bert_tokenizer=None):
    logger.info('reforming text for passages')
    passages = [
        reform_text(sample['context'])
        for sample in tqdm.tqdm(data, total=len(data))
    ]
    logger.info('tokenizing text for passages')
    passage_tokened = [
        doc for doc in tqdm.tqdm(NLP.pipe(
            passages, batch_size=1000, n_threads=thread),
                                 total=len(passages))
    ]
    logger.info('Done with document tokenize')
    passage_bert_tokened = [
        bert_tokenizer.tokenize(doc)
        for doc in tqdm.tqdm(passages, total=len(passages))
    ]
    logger.info('reforming text for questions')
    question_list = [
        reform_text(sample['question'])
        for sample in tqdm.tqdm(data, total=len(data))
    ]
    logger.info('tokenizing text for questions')
    question_tokened = [
        question for question in tqdm.tqdm(NLP.pipe(
            question_list, batch_size=1000, n_threads=thread),
                                           total=len(question_list))
    ]
    question_bert_tokened = [
        bert_tokenizer.tokenize(question)
        for question in tqdm.tqdm(question_list, total=len(question_list))
    ]
    logger.info('Done with query tokenize')
    dropped_sample = 0
    with open(fout, 'w', encoding='utf-8') as writer:
        for idx, sample in enumerate(tqdm.tqdm(data, total=len(data))):
            # if idx % 5000 == 0: logger.info('parse {}-th sample'.format(idx))
            feat_dict = feature_func(sample, question_tokened[idx],
                                     passage_tokened[idx], vocab, vocab_tag,
                                     vocab_ner, is_train, v2_on)
            if feat_dict is not None:
                feat_dict['doc_bert_ctok'] = passage_bert_tokened[idx]
                feat_dict['query_bert_ctok'] = question_bert_tokened[idx]
                writer.write('{}\n'.format(json.dumps(feat_dict)))
            else:
                dropped_sample += 1
    logger.info('dropped {} in total {}'.format(dropped_sample, len(data)))
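
bert_tokenizer is supplied by the caller of build_data. Below is a minimal sketch of obtaining a compatible WordPiece tokenizer, assuming the Hugging Face transformers package; the original code may have relied on the older pytorch-pretrained-bert package instead.

from transformers import BertTokenizer

# Downloads the vocabulary for the uncased base model on first use.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
pieces = bert_tokenizer.tokenize('Paris is the capital of France.')
# e.g. ['paris', 'is', 'the', 'capital', 'of', 'france', '.']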
Example #6
def build_vocab(test_data, tr_vocab, n_threads=16):
    nlp = spacy.load('en',
                     disable=['vectors', 'textcat', 'parser', 'tagger', 'ner'])
    text = [reform_text(sample['context']) for sample in test_data
            ] + [reform_text(sample['question']) for sample in test_data]
    parsed = [
        doc for doc in nlp.pipe(text, batch_size=10000, n_threads=n_threads)
    ]
    tokens = [w.text for doc in parsed for w in doc if len(w.text) > 0]
    new_vocab = list(
        set([w for w in tokens if w not in tr_vocab and w in glove_vocab]))
    for w in new_vocab:
        tr_vocab.add(w)
    return tr_vocab
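
glove_vocab above is assumed to be a module-level set containing every word covered by the pretrained GloVe embeddings. The sketch below is one hedged way such a set could be built from a standard GloVe text file; the file name is illustrative and tokens are assumed not to contain spaces.

def load_glove_vocab(path='glove.840B.300d.txt'):
    # Each line of a GloVe file is a word followed by its embedding values.
    vocab = set()
    with open(path, encoding='utf-8') as f:
        for line in f:
            vocab.add(line.rstrip().split(' ')[0])
    return vocab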
Example #7
def build_data(data,
               vocab,
               vocab_tag,
               vocab_ner,
               fout,
               is_train,
               dataset_name='squad'):
    # n_threads is not defined in this snippet; it presumably comes from
    # module scope in the original source.
    with open(fout, 'w', encoding='utf-8') as writer:
        dropped_sample = 0
        all_datas = []
        all_context = [reform_text(sample['context']) for sample in data]
        all_query = [reform_text(sample['question']) for sample in data]
        context_parsed = NLP.pipe(all_context,
                                  batch_size=5000,
                                  n_threads=n_threads)
        query_parsed = NLP.pipe(all_query,
                                batch_size=5000,
                                n_threads=n_threads)

        for sample, doc_tokend, query_tokend in tqdm.tqdm(zip(
                data, context_parsed, query_parsed),
                                                          total=len(data)):
            fd = feature_func(sample,
                              doc_tokend,
                              query_tokend,
                              vocab,
                              vocab_tag,
                              vocab_ner,
                              is_train,
                              dataset_name=dataset_name)
            if fd is None:
                dropped_sample += 1
                continue
            all_datas.append(fd)
        print('writing data. filename=', fout, 'len=', len(data))
        for fd in all_datas:
            writer.write('{}\n'.format(json.dumps(fd)))

        logger.info('dropped {} in total {}'.format(dropped_sample, len(data)))
Example #8
def token(sample, key=None):
    # nlp and clean_on are defined in the enclosing scope in the original source.
    s = sample[key]
    if clean_on:
        s = reform_text(s)
    return [w.text for w in nlp(s) if len(w.text) > 0]
Example #9
def build_vocab(data,
                glove_vocab=None,
                sort_all=False,
                thread=24,
                clean_on=False,
                cl_on=True):
    if cl_on:
        nlp = spacy.load('en', disable=['vectors', 'textcat', 'parser'])
    else:
        nlp = spacy.load(
            'en', disable=['vectors', 'textcat', 'tagger', 'ner', 'parser'])

    logger.info('Collect vocab/pos counter/ner counter')
    # docs
    docs = [reform_text(sample['context']) for sample in data]
    doc_tokened = [
        doc for doc in nlp.pipe(docs, batch_size=10000, n_threads=thread)
    ]
    logger.info('Done with doc tokenize')
    questions = [reform_text(sample['question']) for sample in data]
    questions_tokened = [
        question
        for question in nlp.pipe(questions, batch_size=10000, n_threads=thread)
    ]
    logger.info('Done with question tokenize')

    tag_counter = Counter()
    ner_counter = Counter()
    if sort_all:
        counter = Counter()
        merged = doc_tokened + questions_tokened
        for tokened in tqdm.tqdm(merged, total=len(merged)):
            counter.update([
                normalize_text(w.text) for w in tokened
                if len(normalize_text(w.text)) > 0
            ])
            if cl_on:
                tag_counter.update(
                    [w.tag_ for w in tokened if len(w.text) > 0])
                ner_counter.update(
                    ['{}_{}'.format(w.ent_type_, w.ent_iob_) for w in tokened])
        vocab = sorted([w for w in counter if w in glove_vocab],
                       key=counter.get,
                       reverse=True)
    else:
        query_counter = Counter()
        doc_counter = Counter()

        for tokened in tqdm.tqdm(doc_tokened, total=len(doc_tokened)):
            doc_counter.update([
                normalize_text(w.text) for w in tokened
                if len(normalize_text(w.text)) > 0
            ])
            if cl_on:
                tag_counter.update(
                    [w.tag_ for w in tokened if len(w.text) > 0])
                ner_counter.update(
                    ['{}_{}'.format(w.ent_type_, w.ent_iob_) for w in tokened])

        for tokened in tqdm.tqdm(questions_tokened,
                                 total=len(questions_tokened)):
            query_counter.update([
                normalize_text(w.text) for w in tokened
                if len(normalize_text(w.text)) > 0
            ])
            if cl_on:
                tag_counter.update(
                    [w.tag_ for w in tokened if len(w.text) > 0])
                ner_counter.update(
                    ['{}_{}'.format(w.ent_type_, w.ent_iob_) for w in tokened])
        counter = query_counter + doc_counter
        # sort query words
        vocab = sorted([w for w in query_counter if w in glove_vocab],
                       key=query_counter.get,
                       reverse=True)
        vocab += sorted([
            w for w in doc_counter.keys() - query_counter.keys()
            if w in glove_vocab
        ],
                        key=counter.get,
                        reverse=True)
    tag_vocab, ner_vocab = None, None
    if cl_on:
        tag_counter = sorted([w for w in tag_counter],
                             key=tag_counter.get,
                             reverse=True)
        ner_counter = sorted([w for w in ner_counter],
                             key=ner_counter.get,
                             reverse=True)
        tag_vocab = Vocabulary.build(tag_counter)
        ner_vocab = Vocabulary.build(ner_counter)
        logger.info('POS Tag vocab size: {}'.format(len(tag_vocab)))
        logger.info('NER Tag vocab size: {}'.format(len(ner_vocab)))
    total = sum(counter.values())
    matched = sum(counter[w] for w in vocab)
    logger.info('Raw vocab size vs vocab in glove: {0}/{1}'.format(
        len(counter), len(vocab)))
    logger.info('OOV rate:{0:.4f}={1}/{2}'.format(
        100.0 * (total - matched) / total, (total - matched), total))
    vocab = Vocabulary.build(vocab)

    logger.info('final vocab size: {}'.format(len(vocab)))

    return vocab, tag_vocab, ner_vocab
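
Vocabulary.build belongs to the surrounding project and is not shown here. Below is a hypothetical minimal stand-in that maps tokens to integer ids, assuming indices 0 and 1 are reserved for padding and unknown tokens; the real class may differ.

class TinyVocab:
    # Hypothetical stand-in for Vocabulary, not the project's implementation.
    PAD, UNK = '<pad>', '<unk>'

    def __init__(self, tokens):
        self.itos = [self.PAD, self.UNK] + list(tokens)
        self.stoi = {t: i for i, t in enumerate(self.itos)}

    @classmethod
    def build(cls, tokens):
        return cls(tokens)

    def __len__(self):
        return len(self.itos)

    def __getitem__(self, token):
        return self.stoi.get(token, self.stoi[self.UNK])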
Example #10
def extract(data, key=None):
    # clean_on is defined in the enclosing scope in the original source.
    if clean_on:
        all_doc = [reform_text(sample[key]) for sample in data]
    else:
        all_doc = [sample[key] for sample in data]
    return all_doc