Example #1
def preprocess_eval_data(filename, output_msgpack):
    # Flatten the jsonlines file, tokenize with spaCy, build features and
    # vocabulary, and serialize everything to a msgpack file.
    EvalData = process_jsonlines(filename)

    filename = os.path.basename(filename)
    log.info(filename + ' flattened.')

    EvalDataP_iter = (pre_proc(p) for p in EvalData.P)
    EvalDataH_iter = (pre_proc(h) for h in EvalData.H)
    EvalDataP_docs = [
        doc for doc in nlp.pipe(
            EvalDataP_iter, batch_size=64, n_threads=args.threads)
    ]
    EvalDataH_docs = [
        doc for doc in nlp.pipe(
            EvalDataH_iter, batch_size=64, n_threads=args.threads)
    ]

    # tokens
    EvalDataP_tokens = [[normalize_text(w.text) for w in doc]
                        for doc in EvalDataP_docs]
    EvalDataH_tokens = [[normalize_text(w.text) for w in doc]
                        for doc in EvalDataH_docs]
    log.info('All tokens for ' + filename + ' are obtained.')

    # features
    EvalDataP_tags, EvalDataP_ents, EvalDataP_features = feature_gen(
        EvalDataP_docs, EvalDataH_docs)
    EvalDataH_tags, EvalDataH_ents, EvalDataH_features = feature_gen(
        EvalDataH_docs, EvalDataP_docs)
    log.info('features for ' + filename + ' are generated.')

    def build_EvalData_vocab(A, B):  # most vocabulary comes from tr_vocab
        existing_vocab = set(tr_vocab)
        new_vocab = list(
            set([
                w for doc in A + B for w in doc
                if w not in existing_vocab and w in glove_vocab
            ]))
        vocab = tr_vocab + new_vocab
        log.info('train vocab {0}, total vocab {1}'.format(
            len(tr_vocab), len(vocab)))
        return vocab

    # vocab
    EvalData_vocab = build_EvalData_vocab(
        EvalDataP_tokens,
        EvalDataH_tokens)  # tr_vocab is a subset of EvalData_vocab
    EvalDataP_ids = token2id(EvalDataP_tokens, EvalData_vocab, unk_id=1)
    EvalDataH_ids = token2id(EvalDataH_tokens, EvalData_vocab, unk_id=1)

    # tags
    EvalDataP_tag_ids = token2id(EvalDataP_tags, vocab_tag)
    EvalDataH_tag_ids = token2id(EvalDataH_tags,
                                 vocab_tag)  # vocab_tag same as training

    # entities
    EvalDataP_ent_ids = token2id(EvalDataP_ents,
                                 vocab_ent)  # vocab_ent same as training
    EvalDataH_ent_ids = token2id(EvalDataH_ents,
                                 vocab_ent)  # vocab_ent same as training
    log.info('vocabulary for ' + filename + ' is built.')

    EvalData_embedding = build_embedding(
        wv_file, EvalData_vocab,
        wv_dim)  # tr_embedding is a submatrix of EvalData_embedding
    log.info('got embedding matrix for ' + filename)

    result = {
        'premise_ids': EvalDataP_ids,
        'premise_features': EvalDataP_features,  # exact match, tf
        'premise_tags': EvalDataP_tag_ids,  # POS tagging
        'premise_ents': EvalDataP_ent_ids,  # Entity recognition
        'hypothesis_ids': EvalDataH_ids,
        'hypothesis_features': EvalDataH_features,  # exact match, tf
        'hypothesis_tags': EvalDataH_tag_ids,  # POS tagging
        'hypothesis_ents': EvalDataH_ent_ids,  # Entity recognition
        'vocab': EvalData_vocab,
        'embedding': EvalData_embedding.tolist(),
        'answers': EvalData.label
    }
    with open(output_msgpack, 'wb') as f:
        msgpack.dump(result, f)

    log.info('saved ' + output_msgpack + ' to disk.')
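
The function depends on module-level state (nlp, args, tr_vocab, glove_vocab, vocab_tag, vocab_ent, wv_file, wv_dim) defined elsewhere in the preprocessing script. A minimal, hypothetical invocation, with placeholder file paths, might look like this:

# Hypothetical call: the paths below are placeholders, and all globals used
# inside preprocess_eval_data must already be set up by the enclosing script.
preprocess_eval_data('data/snli_1.0_dev.jsonl', 'data/dev_preprocessed.msgpack')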
Example #2
logging.basicConfig(format='%(asctime)s %(message)s',
                    level=logging.DEBUG,
                    datefmt='%m/%d/%Y %I:%M:%S')
log = logging.getLogger(__name__)

log.info('start data preparation... (using {} threads)'.format(args.threads))

glove_vocab = load_glove_vocab(wv_file, wv_dim)  # return a "set" of vocabulary
log.info('glove loaded.')

#===============================================================
#=================== Work on training data =====================
#===============================================================

train = process_jsonlines(trn_file)
log.info('train jsonline data flattened.')

trP_iter = (pre_proc(p) for p in train.P)
trH_iter = (pre_proc(h) for h in train.H)
trP_docs = [
    doc for doc in nlp.pipe(trP_iter, batch_size=64, n_threads=args.threads)
]
trH_docs = [
    doc for doc in nlp.pipe(trH_iter, batch_size=64, n_threads=args.threads)
]

# tokens
trP_tokens = [[normalize_text(w.text) for w in doc] for doc in trP_docs]
trH_tokens = [[normalize_text(w.text) for w in doc] for doc in trH_docs]
log.info('All tokens for training are obtained.')
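
Note that the n_threads keyword of nlp.pipe only has an effect on older spaCy releases; it became a no-op in spaCy 2.1 and was removed in spaCy 3. A sketch of the equivalent call on spaCy 3, assuming the same args.threads setting (this is an adaptation, not part of the original script):

# spaCy 3.x variant (assumption about the runtime environment): nlp.pipe no
# longer accepts n_threads; multiprocessing is controlled by n_process instead.
trP_docs = list(nlp.pipe(trP_iter, batch_size=64, n_process=args.threads))
trH_docs = list(nlp.pipe(trH_iter, batch_size=64, n_process=args.threads))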