def build_bert_passage_x(query, passage):
    """Build BERT inputs for a query paired with a multi-sentence passage.

    :param query: raw query; tokenized via ``dataset_parser.parse_query``
    :param passage: a list of sentences
    :return: dict containing the outputs of ``_build_bert_tokens_for_passage``
        plus ``seg_indices``, an int32 array of shape (ns_passage, 2) holding
        the [start, end) token span of each sentence in the concatenated
        input; rows for absent sentences stay at -1.
    :raises ValueError: if ``passage`` holds more than ``ns_passage`` sentences
    """
    query_tokens = dataset_parser.parse_query(query)
    passage_tokens = []
    # -1 marks span slots for sentences absent from this passage
    seg_indices = -np.ones((config_model['ns_passage'], 2), dtype=np.int32)
    start = len(query_tokens) + 2  # skip [CLS] + query + [SEP]
    if len(passage) > config_model['ns_passage']:
        raise ValueError('Invalid #sents: {}'.format(len(passage)))
    for sent_idx, sent in enumerate(passage):
        sent_tokens = dataset_parser.sent2words(sent)[:config_model['max_nw_sent']]
        passage_tokens.extend(sent_tokens)
        end = start + len(sent_tokens)
        seg_indices[sent_idx, 0] = start
        seg_indices[sent_idx, 1] = end
        start = end
    # fix: dropped the redundant np.array(seg_indices, dtype=np.int32) —
    # seg_indices is already an int32 ndarray from the -np.ones(...) above.
    passage_in = _build_bert_tokens_for_passage(query_tokens, passage_tokens)
    res = {
        **passage_in,
        'seg_indices': seg_indices,
    }
    return res
def build_bert_x(query, doc_fp, window=None):
    """Build per-sentence BERT inputs for a document, each paired with the query.

    :param query: raw query; tokenized via ``dataset_parser.parse_query``
    :param doc_fp: document file path, parsed via ``parse_doc2sents``
    :param window: context window forwarded to
        ``build_instance_tokens_with_context`` (semantics defined there)
    :return: dict of arrays shaped (max_ns_doc, max_n_tokens) —
        ``token_ids``/``seg_ids`` (int32) and ``token_masks`` (float32) —
        plus the parser-provided ``doc_masks``.
    """
    # prep resources: query and document
    query_tokens = dataset_parser.parse_query(query)
    doc_res = dataset_parser.parse_doc2sents(doc_fp)

    in_size = [config_model['max_ns_doc'], config_model['max_n_tokens']]
    token_ids = np.zeros(in_size, dtype=np.int32)
    seg_ids = np.zeros(in_size, dtype=np.int32)
    token_masks = np.zeros(in_size, dtype=np.float32)

    # concat sentence with query
    # fix: doc_res['sents'] is a sequence (it is passed as doc_sents= below),
    # so range() needs its length; range(doc_res['sents']) raised TypeError.
    for sent_idx in range(len(doc_res['sents'])):
        instance_tokens = build_instance_tokens_with_context(
            sent_idx, doc_sents=doc_res['sents'], window=window)
        sent_in = _build_bert_tokens_for_passage(
            query_tokens=query_tokens, instance_tokens=instance_tokens)
        token_ids[sent_idx] = sent_in['token_ids']
        seg_ids[sent_idx] = sent_in['seg_ids']
        token_masks[sent_idx] = sent_in['token_masks']

    xx = {
        'token_ids': token_ids,
        'seg_ids': seg_ids,
        'token_masks': token_masks,
        'doc_masks': doc_res['doc_masks'],
    }
    return xx
def build_bert_x_sep(query, doc_fp):
    """Build BERT inputs with the query and document paragraphs kept separate.

    :param query: raw query; parsed via ``dataset_parser.parse_query``
    :param doc_fp: document file path; parsed via ``dataset_parser.parse_doc``
    :return: dict with ``query_*`` arrays for the query side and ``para_*``
        arrays (one row per paragraph) for the document side, plus
        ``doc_masks`` from the parser.
    """
    # todo: move initial sentence masks here for query and paras
    # --- query side ---
    n_query_tokens = config_model['max_n_query_tokens']
    query_res = dataset_parser.parse_query(query)
    query_bert_in = _build_bert_tokens(words=query_res['words'],
                                       max_n_tokens=n_query_tokens)

    # --- paragraph side ---
    doc_res = dataset_parser.parse_doc(doc_fp, concat_paras=False, offset=1)

    n_paras = config_model['max_n_article_paras']
    n_sents = config_model['max_n_para_sents']
    n_tokens = config_model['max_n_para_tokens']

    para_token_ids = np.zeros([n_paras, n_tokens], dtype=np.int32)
    para_seg_ids = np.zeros([n_paras, n_tokens], dtype=np.int32)
    para_token_masks = np.zeros([n_paras, n_tokens])  # default float64, as before
    para_sent_masks = np.zeros([n_paras, n_sents, n_tokens], dtype=np.float32)
    para_masks = np.zeros([n_paras, n_sents], dtype=np.float32)

    # fill one row per parsed paragraph; trailing rows stay zero (padding)
    for idx, para_res in enumerate(doc_res['paras']):
        bert_in = _build_bert_tokens(words=para_res['words'],
                                     max_n_tokens=n_tokens)
        para_token_ids[idx] = bert_in['token_ids']
        para_seg_ids[idx] = bert_in['seg_ids']
        para_token_masks[idx] = bert_in['token_masks']
        para_sent_masks[idx] = para_res['sent_mask']
        para_masks[idx] = para_res['para_mask']

    return {
        'query_token_ids': query_bert_in['token_ids'],
        'query_seg_ids': query_bert_in['seg_ids'],
        'query_token_masks': query_bert_in['token_masks'],
        'query_sent_masks': query_res['sent_mask'],
        'query_masks': query_res['para_mask'],
        'para_token_ids': para_token_ids,
        'para_seg_ids': para_seg_ids,
        'para_token_masks': para_token_masks,
        'para_sent_masks': para_sent_masks,
        'para_masks': para_masks,
        'doc_masks': doc_res['doc_masks'],
    }
def build_query(query):
    """Parse *query* into tokens and convert them to BERT model inputs."""
    return _build_bert_in(dataset_parser.parse_query(query))
def build_bert_sentence_x(query, sentence):
    """Build BERT inputs for a single query/sentence pair.

    The sentence is word-tokenized and truncated to ``max_nw_sent`` words
    before being combined with the query tokens.
    """
    q_tokens = dataset_parser.parse_query(query)
    s_tokens = dataset_parser.sent2words(sentence)
    s_tokens = s_tokens[:config_model['max_nw_sent']]
    return _build_bert_tokens_for_sent(q_tokens, s_tokens)
def build_bert_x(query, doc_fp):
    """Build BERT inputs pairing the query with each paragraph of a document.

    NOTE(review): this redefines ``build_bert_x`` from earlier in the file;
    the later definition shadows the earlier one — confirm whether these
    belong in separate modules.

    :param query: raw query; parsed via ``dataset_parser.parse_query``
    :param doc_fp: document file path; parsed via ``dataset_parser.parse_doc``
    :return: dict of per-paragraph arrays (``token_ids``/``seg_ids`` int32,
        mask arrays float32) plus ``query_masks`` and ``doc_masks`` from the
        parser results.
    """
    # prep resources: query and document
    query_res = dataset_parser.parse_query(query)
    # 2 additional tokens for CLS and SEP
    para_offset = len(query_res['words']) + 2
    doc_res = dataset_parser.parse_doc(doc_fp,
                                       concat_paras=False,
                                       offset=para_offset)

    # hoist repeated config lookups (fix: also deleted the commented-out
    # float32 duplicate of the initialization below — dead code)
    max_n_paras = config_model['max_n_article_paras']
    max_n_tokens = config_model['max_n_tokens']
    max_n_para_sents = config_model['max_n_para_sents']

    # init arrays
    token_ids = np.zeros([max_n_paras, max_n_tokens], dtype=np.int32)
    seg_ids = np.zeros([max_n_paras, max_n_tokens], dtype=np.int32)
    token_masks = np.zeros([max_n_paras, max_n_tokens], dtype=np.float32)
    query_sent_masks = np.zeros(
        [max_n_paras, config_model['max_n_query_sents'], max_n_tokens],
        dtype=np.float32)
    para_sent_masks = np.zeros(
        [max_n_paras, max_n_para_sents, max_n_tokens], dtype=np.float32)
    para_masks = np.zeros([max_n_paras, max_n_para_sents], dtype=np.float32)

    # concat paras with query
    for para_idx, para_res in enumerate(doc_res['paras']):
        # input tokens
        para_in = _build_bert_tokens_for_para(query_words=query_res['words'],
                                              para_words=para_res['words'])
        token_ids[para_idx] = para_in['token_ids']
        seg_ids[para_idx] = para_in['seg_ids']
        token_masks[para_idx] = para_in['token_masks']
        # masks
        query_sent_masks[para_idx] = query_res['sent_mask']
        para_sent_masks[para_idx] = para_res['sent_mask']
        para_masks[para_idx] = para_res['para_mask']

    xx = {
        'token_ids': token_ids,
        'seg_ids': seg_ids,
        'token_masks': token_masks,
        'query_sent_masks': query_sent_masks,
        'query_masks': query_res['para_mask'],
        'para_sent_masks': para_sent_masks,
        'para_masks': para_masks,
        'doc_masks': doc_res['doc_masks'],
    }
    return xx