def feature_func(sample):
    query_tokend = NLP(reform_text(sample['question']))
    doc_tokend = NLP(reform_text(sample['context']))
    # features
    fea_dict = {}
    fea_dict['uid'] = sample['uid']
    fea_dict['context'] = sample['context']
    fea_dict['label'] = sample['label']
    fea_dict['query_tok'] = tok_func(query_tokend, vocab)
    fea_dict['query_pos'] = postag_func(query_tokend, vocab_tag)
    fea_dict['query_ner'] = nertag_func(query_tokend, vocab_ner)
    fea_dict['doc_tok'] = tok_func(doc_tokend, vocab)
    fea_dict['doc_pos'] = postag_func(doc_tokend, vocab_tag)
    fea_dict['doc_ner'] = nertag_func(doc_tokend, vocab_ner)
    fea_dict['doc_fea'] = '{}'.format(match_func(
        query_tokend, doc_tokend))  # stringified; json doesn't support these float features directly
    # convert sentences to ELMo character-id input
    fea_dict['doc_char_ids'] = charids_func(doc_tokend)
    fea_dict['query_char_ids'] = charids_func(query_tokend)
    doc_toks = [t.text for t in doc_tokend]
    start, end, span = build_span(sample['context'],
                                  sample['answer'],
                                  doc_toks,
                                  sample['answer_start'],
                                  sample['answer_end'],
                                  is_train=is_train)
    if is_train and (start == -1 or end == -1):
        return None
    fea_dict['span'] = span
    fea_dict['start'] = start
    fea_dict['end'] = end
    return fea_dict
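# NOTE: build_span is referenced above but not defined in this file. The sketch
# below is only an assumption of its behavior: map the answer's character
# offsets onto the spaCy token sequence and return (start_token, end_token,
# token_offsets). Names and details are illustrative, not the repository's
# actual implementation.
def build_span_sketch(context, answer, doc_toks, answer_start, answer_end,
                      is_train=True):
    """Hypothetical helper: map character offsets to token indices."""
    offsets, pos = [], 0
    for tok in doc_toks:
        idx = context.find(tok, pos)
        if idx == -1:  # token not recoverable verbatim; give up on this sample
            return -1, -1, []
        offsets.append((idx, idx + len(tok)))
        pos = idx + len(tok)
    if not is_train:
        # at prediction time only the token offsets are needed to recover text
        return -1, -1, offsets
    start = end = -1
    for i, (s, e) in enumerate(offsets):
        if s <= answer_start < e:
            start = i
        if s < answer_end <= e:
            end = i
    if start == -1 or end == -1:
        return -1, -1, offsets
    return start, end, offsets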
def build_data(data,
               vocab,
               vocab_tag,
               vocab_ner,
               fout,
               is_train,
               thread=8,
               NLP=None,
               v2_on=False):
    passages = [reform_text(sample['context']) for sample in data]
    passage_tokened = [
        doc for doc in NLP.pipe(passages, batch_size=1000, n_threads=thread)
    ]
    logger.info('Done with document tokenize')
    question_list = [reform_text(sample['question']) for sample in data]
    question_tokened = [
        question for question in NLP.pipe(
            question_list, batch_size=1000, n_threads=thread)
    ]
    logger.info('Done with query tokenize')
    dropped_sample = 0
    with open(fout, 'w', encoding='utf-8') as writer:
        for idx, sample in enumerate(data):
            if idx % 5000 == 0:
                logger.info('parse {}-th sample'.format(idx))
            feat_dict = feature_func(sample, question_tokened[idx],
                                     passage_tokened[idx], vocab, vocab_tag,
                                     vocab_ner, is_train, v2_on)
            if feat_dict is not None:
                writer.write('{}\n'.format(json.dumps(feat_dict)))
            else:
                dropped_sample += 1
    logger.info('dropped {} in total {}'.format(dropped_sample, len(data)))
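# NOTE: tok_func, postag_func and nertag_func are used throughout but not
# defined in this file. The sketches below show what they are assumed to do
# (map spaCy tokens to ids through the corresponding Vocabulary); the real
# helpers may take extra arguments, e.g. the doc_toks parameter passed to
# tok_func in a later version of feature_func. normalize_text and the
# Vocabulary lookup (unknown tokens fall back to an UNK id) are assumed.
def tok_func_sketch(tokens, vocab):
    # Hypothetical: look up each normalized token in the word vocabulary.
    return [vocab[normalize_text(t.text)] for t in tokens if len(t.text) > 0]


def postag_func_sketch(tokens, vocab_tag):
    # Hypothetical: map fine-grained POS tags to ids.
    return [vocab_tag[t.tag_] for t in tokens if len(t.text) > 0]


def nertag_func_sketch(tokens, vocab_ner):
    # Hypothetical: map '<entity type>_<IOB>' strings to ids, mirroring the
    # '{}_{}'.format(w.ent_type_, w.ent_iob_) keys built in build_vocab below.
    return [
        vocab_ner['{}_{}'.format(t.ent_type_, t.ent_iob_)] for t in tokens
        if len(t.text) > 0
    ]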
def feature_func(sample, vocab, vocab_tag, vocab_ner, is_train=True):
    # TODO: this is too slow; how can it be made faster? In particular,
    # spacy 2.x is much slower than 1.x.
    query_tokend = NLP(reform_text(sample['question']))
    doc_tokend = NLP(reform_text(sample['context']))
    # features
    fea_dict = {}
    fea_dict['uid'] = sample['uid']
    fea_dict['doc_tok'] = tok_func(doc_tokend, vocab)
    fea_dict['doc_pos'] = postag_func(doc_tokend, vocab_tag)
    fea_dict['doc_ner'] = nertag_func(doc_tokend, vocab_ner)
    fea_dict['doc_fea'] = '{}'.format(match_func(query_tokend, doc_tokend))
    fea_dict['query_fea'] = '{}'.format(match_func(doc_tokend, query_tokend))
    doc_toks = [t.text for t in doc_tokend if len(t.text) > 0]
    query_toks = [t.text for t in query_tokend if len(t.text) > 0]
    fea_dict['query_tok'] = tok_func(query_tokend, vocab, doc_toks)
    fea_dict['query_pos'] = postag_func(query_tokend, vocab_tag)
    fea_dict['query_ner'] = nertag_func(query_tokend, vocab_ner)
    # build_span works on the unfiltered token sequence
    doc_toks = [t.text for t in doc_tokend]
    start, end, span = build_span(sample['context'],
                                  sample['answer'],
                                  doc_toks,
                                  sample['answer_start'],
                                  sample['answer_end'],
                                  is_train=is_train)
    if is_train and (start == -1 or end == -1):
        return None
    if not is_train:
        fea_dict['context'] = sample['context']
    fea_dict['span'] = span
    fea_dict['start'] = start
    fea_dict['end'] = end
    return fea_dict
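# NOTE: match_func is not shown in this file. Its output is stringified above
# because it contains float values. The sketch below assumes the usual
# reading-comprehension match features (exact, lowercase and lemma match plus
# a normalized term frequency, per document token); the exact feature set is
# an assumption, not the repository's implementation. Counter is assumed to be
# imported from collections, as in build_vocab below.
def match_func_sketch(query_tokend, doc_tokend):
    # Hypothetical: for every document token, emit match features w.r.t. the query.
    q_exact = {t.text for t in query_tokend}
    q_lower = {t.text.lower() for t in query_tokend}
    q_lemma = {t.lemma_ for t in query_tokend}
    counts = Counter(t.text.lower() for t in doc_tokend)
    n = len(doc_tokend)
    features = []
    for t in doc_tokend:
        features.append([
            float(t.text in q_exact),          # exact match
            float(t.text.lower() in q_lower),  # lowercase match
            float(t.lemma_ in q_lemma),        # lemma match
            counts[t.text.lower()] / n,        # normalized term frequency
        ])
    return features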
def build_data(data, vocab, vocab_tag, vocab_ner, n_threads=16):
    dropped_sample = 0
    all_data = []
    context = [reform_text(sample['context']) for sample in data]
    context_parsed = [
        doc
        for doc in NLP.pipe(context, batch_size=10000, n_threads=n_threads)
    ]
    query = [reform_text(sample['question']) for sample in data]
    query_parsed = [
        question
        for question in NLP.pipe(query, batch_size=10000, n_threads=n_threads)
    ]
    logger.info('Done with tokenizing')
    for sample, doc_tokend, query_tokend in tqdm.tqdm(
            zip(data, context_parsed, query_parsed), total=len(data)):
        fd = feature_func(sample, doc_tokend, query_tokend, vocab, vocab_tag,
                          vocab_ner)
        if fd is None:
            dropped_sample += 1
            continue
        all_data.append(fd)
    logger.info('Got {} data samples in total {}'.format(
        len(all_data), len(data)))
    return all_data
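# Example (illustrative only): how this in-memory variant of build_data would
# typically be driven. load_squad, the file paths and glove_vocab are
# placeholders/assumptions, not names defined in this file.
if __name__ == '__main__':
    train_data = load_squad('data/train-v1.1.json')  # hypothetical loader
    vocab, vocab_tag, vocab_ner = build_vocab(train_data,
                                              glove_vocab=glove_vocab)
    train_features = build_data(train_data, vocab, vocab_tag, vocab_ner,
                                n_threads=16)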
def build_data(data,
               vocab,
               vocab_tag,
               vocab_ner,
               fout,
               is_train,
               thread=16,
               NLP=None,
               v2_on=False,
               bert_tokenizer=None):
    logger.info('reforming text for passages')
    passages = [
        reform_text(sample['context'])
        for sample in tqdm.tqdm(data, total=len(data))
    ]
    logger.info('tokenizing text for passages')
    passage_tokened = [
        doc for doc in tqdm.tqdm(NLP.pipe(
            passages, batch_size=1000, n_threads=thread),
                                 total=len(passages))
    ]
    logger.info('Done with document tokenize')
    passage_bert_tokened = [
        bert_tokenizer.tokenize(doc)
        for doc in tqdm.tqdm(passages, total=len(passages))
    ]
    logger.info('reforming text for questions')
    question_list = [
        reform_text(sample['question'])
        for sample in tqdm.tqdm(data, total=len(data))
    ]
    logger.info('tokenizing text for questions')
    question_tokened = [
        question for question in tqdm.tqdm(NLP.pipe(
            question_list, batch_size=1000, n_threads=thread),
                                           total=len(question_list))
    ]
    question_bert_tokened = [
        bert_tokenizer.tokenize(question)
        for question in tqdm.tqdm(question_list, total=len(question_list))
    ]
    logger.info('Done with query tokenize')
    dropped_sample = 0
    with open(fout, 'w', encoding='utf-8') as writer:
        for idx, sample in enumerate(tqdm.tqdm(data, total=len(data))):
            # if idx % 5000 == 0: logger.info('parse {}-th sample'.format(idx))
            feat_dict = feature_func(sample, question_tokened[idx],
                                     passage_tokened[idx], vocab, vocab_tag,
                                     vocab_ner, is_train, v2_on)
            # check for dropped samples before attaching the BERT tokens,
            # otherwise indexing into a None dict would raise
            if feat_dict is None:
                dropped_sample += 1
                continue
            feat_dict['doc_bert_ctok'] = passage_bert_tokened[idx]
            feat_dict['query_bert_ctok'] = question_bert_tokened[idx]
            writer.write('{}\n'.format(json.dumps(feat_dict)))
    logger.info('dropped {} in total {}'.format(dropped_sample, len(data)))
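# NOTE: this variant only requires a WordPiece tokenizer exposing a
# .tokenize(str) method. Assuming the pytorch-pretrained-bert package (an
# assumption; the repository may use a different BERT implementation), it
# could be created and passed in like this:
from pytorch_pretrained_bert import BertTokenizer

bert_tok = BertTokenizer.from_pretrained('bert-base-uncased',
                                         do_lower_case=True)
# build_data(train_data, vocab, vocab_tag, vocab_ner, 'train.jsonl',
#            is_train=True, NLP=NLP, bert_tokenizer=bert_tok)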
def build_vocab(test_data, tr_vocab, n_threads=16):
    nlp = spacy.load(
        'en', disable=['vectors', 'textcat', 'parser', 'tagger', 'ner'])
    text = [reform_text(sample['context']) for sample in test_data
            ] + [reform_text(sample['question']) for sample in test_data]
    parsed = [
        doc for doc in nlp.pipe(text, batch_size=10000, n_threads=n_threads)
    ]
    tokens = [w.text for doc in parsed for w in doc if len(w.text) > 0]
    new_vocab = list(
        set([w for w in tokens if w not in tr_vocab and w in glove_vocab]))
    for w in new_vocab:
        tr_vocab.add(w)
    return tr_vocab
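# Example (illustrative): extending a training-time vocabulary with unseen
# test tokens that are covered by GloVe. squad_dev and tr_vocab are assumed to
# come from earlier preprocessing steps; tr_vocab must support 'in' and .add,
# as the Vocabulary class sketched further below does.
# tr_vocab = build_vocab(squad_dev, tr_vocab, n_threads=16)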
def build_data(data,
               vocab,
               vocab_tag,
               vocab_ner,
               fout,
               is_train,
               dataset_name='squad',
               n_threads=16):
    with open(fout, 'w', encoding='utf-8') as writer:
        dropped_sample = 0
        all_datas = []
        all_context = [reform_text(sample['context']) for sample in data]
        all_query = [reform_text(sample['question']) for sample in data]
        context_parsed = NLP.pipe(all_context,
                                  batch_size=5000,
                                  n_threads=n_threads)
        query_parsed = NLP.pipe(all_query,
                                batch_size=5000,
                                n_threads=n_threads)
        for sample, doc_tokend, query_tokend in tqdm.tqdm(
                zip(data, context_parsed, query_parsed), total=len(data)):
            fd = feature_func(sample,
                              doc_tokend,
                              query_tokend,
                              vocab,
                              vocab_tag,
                              vocab_ner,
                              is_train,
                              dataset_name=dataset_name)
            if fd is None:
                dropped_sample += 1
                continue
            all_datas.append(fd)
        print('writing data. filename=', fout, 'len=', len(data))
        for fd in all_datas:
            writer.write('{}\n'.format(json.dumps(fd)))
        logger.info('dropped {} in total {}'.format(dropped_sample,
                                                    len(data)))
def token(sample, key=None):
    s = sample[key]
    if clean_on:
        s = reform_text(s)
    return [w.text for w in nlp(s) if len(w.text) > 0]
def build_vocab(data,
                glove_vocab=None,
                sort_all=False,
                thread=24,
                clean_on=False,
                cl_on=True):
    if cl_on:
        nlp = spacy.load('en', disable=['vectors', 'textcat', 'parser'])
    else:
        nlp = spacy.load(
            'en', disable=['vectors', 'textcat', 'tagger', 'ner', 'parser'])
    logger.info('Collect vocab/pos counter/ner counter')
    # docs
    docs = [reform_text(sample['context']) for sample in data]
    doc_tokened = [
        doc for doc in nlp.pipe(docs, batch_size=10000, n_threads=thread)
    ]
    logger.info('Done with doc tokenize')
    questions = [reform_text(sample['question']) for sample in data]
    questions_tokened = [
        question for question in nlp.pipe(
            questions, batch_size=10000, n_threads=thread)
    ]
    logger.info('Done with question tokenize')

    tag_counter = Counter()
    ner_counter = Counter()
    if sort_all:
        counter = Counter()
        merged = doc_tokened + questions_tokened
        for tokened in tqdm.tqdm(merged, total=len(merged)):
            counter.update([
                normalize_text(w.text) for w in tokened
                if len(normalize_text(w.text)) > 0
            ])
            if cl_on:
                tag_counter.update(
                    [w.tag_ for w in tokened if len(w.text) > 0])
                ner_counter.update(
                    ['{}_{}'.format(w.ent_type_, w.ent_iob_) for w in tokened])
        vocab = sorted([w for w in counter if w in glove_vocab],
                       key=counter.get,
                       reverse=True)
    else:
        query_counter = Counter()
        doc_counter = Counter()
        for tokened in tqdm.tqdm(doc_tokened, total=len(doc_tokened)):
            doc_counter.update([
                normalize_text(w.text) for w in tokened
                if len(normalize_text(w.text)) > 0
            ])
            if cl_on:
                tag_counter.update(
                    [w.tag_ for w in tokened if len(w.text) > 0])
                ner_counter.update(
                    ['{}_{}'.format(w.ent_type_, w.ent_iob_) for w in tokened])
        for tokened in tqdm.tqdm(questions_tokened,
                                 total=len(questions_tokened)):
            query_counter.update([
                normalize_text(w.text) for w in tokened
                if len(normalize_text(w.text)) > 0
            ])
            if cl_on:
                tag_counter.update(
                    [w.tag_ for w in tokened if len(w.text) > 0])
                ner_counter.update(
                    ['{}_{}'.format(w.ent_type_, w.ent_iob_) for w in tokened])
        counter = query_counter + doc_counter
        # sort query words first, then document-only words
        vocab = sorted([w for w in query_counter if w in glove_vocab],
                       key=query_counter.get,
                       reverse=True)
        vocab += sorted([
            w for w in doc_counter.keys() - query_counter.keys()
            if w in glove_vocab
        ],
                        key=counter.get,
                        reverse=True)
    tag_vocab, ner_vocab = None, None
    if cl_on:
        tag_counter = sorted([w for w in tag_counter],
                             key=tag_counter.get,
                             reverse=True)
        ner_counter = sorted([w for w in ner_counter],
                             key=ner_counter.get,
                             reverse=True)
        tag_vocab = Vocabulary.build(tag_counter)
        ner_vocab = Vocabulary.build(ner_counter)
        logger.info('POS Tag vocab size: {}'.format(len(tag_vocab)))
        logger.info('NER Tag vocab size: {}'.format(len(ner_vocab)))
    total = sum(counter.values())
    matched = sum(counter[w] for w in vocab)
    logger.info('Raw vocab size vs vocab in glove: {0}/{1}'.format(
        len(counter), len(vocab)))
    logger.info('OOV rate: {0:.4f}% = {1}/{2}'.format(
        100.0 * (total - matched) / total, (total - matched), total))
    vocab = Vocabulary.build(vocab)
    logger.info('final vocab size: {}'.format(len(vocab)))
    return vocab, tag_vocab, ner_vocab
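# NOTE: the Vocabulary class used above (Vocabulary.build here, tr_vocab.add
# earlier) is defined elsewhere. The sketch below only captures the interface
# this file relies on, under the assumption that indices 0/1 are reserved for
# padding/unknown tokens; it is not the repository's actual class.
class VocabularySketch(object):
    PAD, UNK = '<PAD>', '<UNK>'

    def __init__(self, tokens):
        self._tok2ind = {self.PAD: 0, self.UNK: 1}
        for tok in tokens:
            self._tok2ind.setdefault(tok, len(self._tok2ind))

    @classmethod
    def build(cls, tokens):
        return cls(tokens)

    def add(self, tok):
        self._tok2ind.setdefault(tok, len(self._tok2ind))

    def __contains__(self, tok):
        return tok in self._tok2ind

    def __getitem__(self, tok):
        return self._tok2ind.get(tok, 1)  # unknown tokens map to UNK

    def __len__(self):
        return len(self._tok2ind)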
def extract(data, key=None):
    if clean_on:
        all_doc = [reform_text(sample[key]) for sample in data]
    else:
        all_doc = [sample[key] for sample in data]
    return all_doc