def preprocess_eval_data(filename, output_msgpack):
    EvalData = process_jsonlines(filename)
    filename = os.path.basename(filename)
    log.info(filename + ' flattened.')

    EvalDataP_iter = (pre_proc(p) for p in EvalData.P)
    EvalDataH_iter = (pre_proc(h) for h in EvalData.H)
    EvalDataP_docs = [
        doc for doc in nlp.pipe(
            EvalDataP_iter, batch_size=64, n_threads=args.threads)
    ]
    EvalDataH_docs = [
        doc for doc in nlp.pipe(
            EvalDataH_iter, batch_size=64, n_threads=args.threads)
    ]

    # tokens
    EvalDataP_tokens = [[normalize_text(w.text) for w in doc]
                        for doc in EvalDataP_docs]
    EvalDataH_tokens = [[normalize_text(w.text) for w in doc]
                        for doc in EvalDataH_docs]
    log.info('All tokens for ' + filename + ' are obtained.')

    # features
    EvalDataP_tags, EvalDataP_ents, EvalDataP_features = feature_gen(
        EvalDataP_docs, EvalDataH_docs)
    EvalDataH_tags, EvalDataH_ents, EvalDataH_features = feature_gen(
        EvalDataH_docs, EvalDataP_docs)
    log.info('features for ' + filename + ' are generated.')

    def build_EvalData_vocab(A, B):
        # most of the vocabulary comes from tr_vocab; only new words that
        # also appear in GloVe are appended
        existing_vocab = set(tr_vocab)
        new_vocab = list(
            set(w for doc in A + B for w in doc
                if w not in existing_vocab and w in glove_vocab))
        vocab = tr_vocab + new_vocab
        log.info('train vocab {0}, total vocab {1}'.format(
            len(tr_vocab), len(vocab)))
        return vocab

    # vocab (tr_vocab is a subset of EvalData_vocab)
    EvalData_vocab = build_EvalData_vocab(EvalDataP_tokens, EvalDataH_tokens)
    EvalDataP_ids = token2id(EvalDataP_tokens, EvalData_vocab, unk_id=1)
    EvalDataH_ids = token2id(EvalDataH_tokens, EvalData_vocab, unk_id=1)

    # tags (vocab_tag is the same as in training)
    EvalDataP_tag_ids = token2id(EvalDataP_tags, vocab_tag)
    EvalDataH_tag_ids = token2id(EvalDataH_tags, vocab_tag)

    # entities (vocab_ent is the same as in training)
    EvalDataP_ent_ids = token2id(EvalDataP_ents, vocab_ent)
    EvalDataH_ent_ids = token2id(EvalDataH_ents, vocab_ent)
    log.info('vocabulary for ' + filename + ' is built.')

    # tr_embedding is a submatrix of EvalData_embedding
    EvalData_embedding = build_embedding(wv_file, EvalData_vocab, wv_dim)
    log.info('got embedding matrix for ' + filename)

    result = {
        'premise_ids': EvalDataP_ids,
        'premise_features': EvalDataP_features,  # exact match, tf
        'premise_tags': EvalDataP_tag_ids,  # POS tagging
        'premise_ents': EvalDataP_ent_ids,  # entity recognition
        'hypothesis_ids': EvalDataH_ids,
        'hypothesis_features': EvalDataH_features,  # exact match, tf
        'hypothesis_tags': EvalDataH_tag_ids,  # POS tagging
        'hypothesis_ents': EvalDataH_ent_ids,  # entity recognition
        'vocab': EvalData_vocab,
        'embedding': EvalData_embedding.tolist(),
        'answers': EvalData.label
    }
    with open(output_msgpack, 'wb') as f:
        msgpack.dump(result, f)
    log.info('saved ' + output_msgpack + ' to disk.')
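# Usage sketch (an assumption, not part of the original flow shown here): the
# function above would typically be invoked once per evaluation split, with
# file paths coming from the surrounding script's configuration, e.g.
#
#     preprocess_eval_data(dev_file, 'dev_preprocessed.msgpack')
#     preprocess_eval_data(test_file, 'test_preprocessed.msgpack')
#
# 'dev_file' and 'test_file' are hypothetical names used only for illustration.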
logging.basicConfig(
    format='%(asctime)s %(message)s',
    level=logging.DEBUG,
    datefmt='%m/%d/%Y %I:%M:%S')
log = logging.getLogger(__name__)

log.info('start data preparing... (using {} threads)'.format(args.threads))

glove_vocab = load_glove_vocab(wv_file, wv_dim)  # return a "set" of vocabulary
log.info('glove loaded.')

#===============================================================
#=================== Work on training data =====================
#===============================================================
train = process_jsonlines(trn_file)
log.info('train jsonline data flattened.')

trP_iter = (pre_proc(p) for p in train.P)
trH_iter = (pre_proc(h) for h in train.H)
trP_docs = [
    doc for doc in nlp.pipe(trP_iter, batch_size=64, n_threads=args.threads)
]
trH_docs = [
    doc for doc in nlp.pipe(trH_iter, batch_size=64, n_threads=args.threads)
]

# tokens
trP_tokens = [[normalize_text(w.text) for w in doc] for doc in trP_docs]
trH_tokens = [[normalize_text(w.text) for w in doc] for doc in trH_docs]
log.info('All tokens for training are obtained.')
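# Illustrative sketch (an assumption, not the original implementation):
# token2id, used in preprocess_eval_data above and on the training tokens
# later, is expected to map each list of tokens to vocabulary indices, falling
# back to unk_id for out-of-vocabulary words. A minimal version could look like
#
#     w2id = {w: i for i, w in enumerate(vocab)}
#     ids = [[w2id.get(w, unk_id) for w in doc] for doc in tokens]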