예제 #1
0
def build_bert_passage_x(query, passage):
    """Build BERT inputs for a query paired with a multi-sentence passage.

    :param query: raw query; tokenized via ``dataset_parser.parse_query``
    :param passage: a list of sentences (at most ``config_model['ns_passage']``)
    :return: dict with the token inputs from ``_build_bert_tokens_for_passage``
        plus ``'seg_indices'``, an int32 array of shape (ns_passage, 2) holding
        the [start, end) token span of each sentence; unused rows stay at -1.
    :raises ValueError: if *passage* has more than ns_passage sentences.
    """
    # Validate before doing any tokenization work.
    if len(passage) > config_model['ns_passage']:
        raise ValueError('Invalid #sents: {}'.format(len(passage)))

    query_tokens = dataset_parser.parse_query(query)

    passage_tokens = []
    seg_indices = -np.ones((config_model['ns_passage'], 2), dtype=np.int32)
    # Sentence spans are offset by the query tokens plus 2 special tokens
    # (presumably [CLS] and [SEP] -- confirm against the tokenizer).
    start = len(query_tokens) + 2

    for sent_idx, sent in enumerate(passage):
        sent_tokens = dataset_parser.sent2words(sent)[:config_model['max_nw_sent']]
        passage_tokens.extend(sent_tokens)

        end = start + len(sent_tokens)
        seg_indices[sent_idx] = (start, end)
        start = end

    # NOTE: the original re-wrapped seg_indices with np.array(..., dtype=np.int32);
    # it is already an int32 ndarray, so that no-op conversion was dropped.
    passage_in = _build_bert_tokens_for_passage(query_tokens, passage_tokens)

    return {
        **passage_in,
        'seg_indices': seg_indices,
    }
예제 #2
0
def build_bert_x(query, doc_fp, window=None):
    """Build per-sentence BERT inputs for a document against a query.

    :param query: raw query; tokenized with ``dataset_parser.parse_query``
    :param doc_fp: document file path, parsed via ``dataset_parser.parse_doc2sents``
    :param window: context window forwarded to ``build_instance_tokens_with_context``
    :return: dict of stacked arrays -- ``token_ids``/``seg_ids`` (int32) and
        ``token_masks`` (float32), each (max_ns_doc, max_n_tokens) -- plus the
        parser-provided ``doc_masks``.
    """
    # prep resources: query and document
    query_tokens = dataset_parser.parse_query(query)

    doc_res = dataset_parser.parse_doc2sents(doc_fp)
    in_size = [config_model['max_ns_doc'], config_model['max_n_tokens']]
    token_ids = np.zeros(in_size, dtype=np.int32)
    seg_ids = np.zeros(in_size, dtype=np.int32)
    token_masks = np.zeros(in_size, dtype=np.float32)

    # Concat each sentence (with its context window) with the query.
    # FIX(review): the original did ``range(doc_res['sents'])``; 'sents' appears
    # to be the sentence list itself (it is passed as doc_sents= below), so
    # range() on it would raise TypeError. Iterating its indices instead --
    # confirm against parse_doc2sents.
    for sent_idx in range(len(doc_res['sents'])):
        instance_tokens = build_instance_tokens_with_context(sent_idx,
                                                             doc_sents=doc_res['sents'],
                                                             window=window)
        sent_in = _build_bert_tokens_for_passage(query_tokens=query_tokens, instance_tokens=instance_tokens)
        token_ids[sent_idx] = sent_in['token_ids']
        seg_ids[sent_idx] = sent_in['seg_ids']
        token_masks[sent_idx] = sent_in['token_masks']

    return {
        'token_ids': token_ids,
        'seg_ids': seg_ids,
        'token_masks': token_masks,
        'doc_masks': doc_res['doc_masks'],
    }
예제 #3
0
def build_bert_x_sep(query, doc_fp):
    """Build BERT inputs with the query and paragraphs encoded separately.

    :param query: raw query; tokenized with ``dataset_parser.parse_query``
    :param doc_fp: document file path, parsed via ``dataset_parser.parse_doc``
    :return: dict of query arrays (token ids/seg ids/masks from
        ``_build_bert_tokens`` plus the parser's sentence/para masks) and the
        corresponding per-paragraph arrays stacked over max_n_article_paras,
        together with the parser-provided ``doc_masks``.
    """
    # todo: move initial sentence masks here for query and paras
    # build query x
    max_n_query_tokens = config_model['max_n_query_tokens']
    query_res = dataset_parser.parse_query(query)
    query_bert_in = _build_bert_tokens(words=query_res['words'], max_n_tokens=max_n_query_tokens)

    # build para x
    doc_res = dataset_parser.parse_doc(doc_fp, concat_paras=False, offset=1)
    # init paras arrays
    max_n_article_paras = config_model['max_n_article_paras']
    max_n_para_sents = config_model['max_n_para_sents']
    max_n_para_tokens = config_model['max_n_para_tokens']
    basic_para_size = [max_n_article_paras, max_n_para_tokens]

    para_token_ids = np.zeros(basic_para_size, dtype=np.int32)
    para_seg_ids = np.zeros(basic_para_size, dtype=np.int32)
    # FIX: dtype was previously omitted here (defaulting to float64), which is
    # inconsistent with every other mask array in this module (float32).
    para_token_masks = np.zeros(basic_para_size, dtype=np.float32)

    # init sentence and para masks
    para_sent_masks = np.zeros([max_n_article_paras, max_n_para_sents, max_n_para_tokens], dtype=np.float32)
    para_masks = np.zeros([max_n_article_paras, max_n_para_sents], dtype=np.float32)

    # build para
    for para_idx, para_res in enumerate(doc_res['paras']):
        # bert inputs
        para_bert_in = _build_bert_tokens(words=para_res['words'], max_n_tokens=max_n_para_tokens)
        para_token_ids[para_idx] = para_bert_in['token_ids']
        para_seg_ids[para_idx] = para_bert_in['seg_ids']
        para_token_masks[para_idx] = para_bert_in['token_masks']
        # masks
        para_sent_masks[para_idx] = para_res['sent_mask']
        para_masks[para_idx] = para_res['para_mask']

    xx = {
        'query_token_ids': query_bert_in['token_ids'],
        'query_seg_ids': query_bert_in['seg_ids'],
        'query_token_masks': query_bert_in['token_masks'],
        'query_sent_masks': query_res['sent_mask'],
        'query_masks': query_res['para_mask'],

        'para_token_ids': para_token_ids,
        'para_seg_ids': para_seg_ids,
        'para_token_masks': para_token_masks,
        'para_sent_masks': para_sent_masks,
        'para_masks': para_masks,
        'doc_masks': doc_res['doc_masks'],
    }

    return xx
예제 #4
0
def build_query(query):
    """Tokenize *query* and convert the tokens to BERT model inputs."""
    return _build_bert_in(dataset_parser.parse_query(query))
예제 #5
0
def build_bert_sentence_x(query, sentence):
    """Build BERT inputs pairing *query* with a single *sentence*.

    The sentence is word-tokenized and truncated to
    ``config_model['max_nw_sent']`` tokens before being combined with the
    parsed query tokens.
    """
    query_tokens = dataset_parser.parse_query(query)
    sent_words = dataset_parser.sent2words(sentence)
    instance_tokens = sent_words[:config_model['max_nw_sent']]
    return _build_bert_tokens_for_sent(query_tokens, instance_tokens)
예제 #6
0
def build_bert_x(query, doc_fp):
    """Build per-paragraph BERT inputs for a document against a query.

    Each paragraph is concatenated with the query via
    ``_build_bert_tokens_for_para`` and the results are stacked into fixed-size
    arrays of max_n_article_paras rows.

    :param query: raw query; tokenized with ``dataset_parser.parse_query``
    :param doc_fp: document file path, parsed via ``dataset_parser.parse_doc``
    :return: dict of stacked arrays (token_ids/seg_ids int32, masks float32)
        plus the parser-provided query and doc masks.
    """
    # prep resources: query and document
    query_res = dataset_parser.parse_query(query)

    # 2 additional tokens for CLS and SEP
    para_offset = len(query_res['words']) + 2
    doc_res = dataset_parser.parse_doc(doc_fp,
                                       concat_paras=False,
                                       offset=para_offset)

    # Hoist repeated config lookups; the shapes below all share these bounds.
    # (Dead commented-out float32 variants of the id arrays were removed.)
    max_n_paras = config_model['max_n_article_paras']
    max_n_tokens = config_model['max_n_tokens']
    max_n_query_sents = config_model['max_n_query_sents']
    max_n_para_sents = config_model['max_n_para_sents']

    token_ids = np.zeros([max_n_paras, max_n_tokens], dtype=np.int32)
    seg_ids = np.zeros([max_n_paras, max_n_tokens], dtype=np.int32)
    token_masks = np.zeros([max_n_paras, max_n_tokens], dtype=np.float32)

    query_sent_masks = np.zeros([max_n_paras, max_n_query_sents, max_n_tokens],
                                dtype=np.float32)
    para_sent_masks = np.zeros([max_n_paras, max_n_para_sents, max_n_tokens],
                               dtype=np.float32)
    para_masks = np.zeros([max_n_paras, max_n_para_sents], dtype=np.float32)

    # concat paras with query
    for para_idx, para_res in enumerate(doc_res['paras']):
        # input tokens
        para_in = _build_bert_tokens_for_para(query_words=query_res['words'],
                                              para_words=para_res['words'])
        token_ids[para_idx] = para_in['token_ids']
        seg_ids[para_idx] = para_in['seg_ids']
        token_masks[para_idx] = para_in['token_masks']

        # masks (query sentence mask is replicated for every paragraph row)
        query_sent_masks[para_idx] = query_res['sent_mask']
        para_sent_masks[para_idx] = para_res['sent_mask']
        para_masks[para_idx] = para_res['para_mask']

    xx = {
        'token_ids': token_ids,
        'seg_ids': seg_ids,
        'token_masks': token_masks,
        'query_sent_masks': query_sent_masks,
        'query_masks': query_res['para_mask'],
        'para_sent_masks': para_sent_masks,
        'para_masks': para_masks,
        'doc_masks': doc_res['doc_masks'],
    }

    return xx