Example #1
import random  # config and util are project-level modules assumed to be imported

def get_batch(index, contexts, start_token):
    """Build one ICT (Inverse Cloze Task) training batch."""
    sentence = [contexts[i]["sentence"]
                for i in index]  # sentences of the selected paragraphs
    target_sentence = [
        random.randint(0,
                       len(sen) - 1) for sen in sentence
    ]  # pick one target sentence index per paragraph
    remove_target = [
        random.random() < (1 - config.remove_percent)
        for _ in range(len(target_sentence))
    ]  # decide whether to drop the target sentence from its context, as in the ICT paper
    target_context = [
        sen[:i] + sen[i + remove:]  # bool `remove` acts as a 0/1 slice offset
        for i, sen, remove in zip(target_sentence, sentence, remove_target)
    ]  # build the target context for each paragraph
    target_context = [[y for x in context for y in x]
                      for context in target_context
                      ]  # flatten the context sentences into one token list
    target_context = [[start_token] + context
                      for context in target_context]
    target_sentence = [sen[i] for i, sen in zip(target_sentence, sentence)]
    target_sentence = [[start_token] + sen for sen in target_sentence]
    s, s_mask = util.pad_sequence(target_sentence,
                                  max_seq=config.max_seq,
                                  device=config.device)  # pad to fixed length
    c, c_mask = util.pad_sequence(target_context,
                                  max_seq=config.max_seq,
                                  device=config.device)
    return s, s_mask, c, c_mask
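A minimal sketch of how such a batch builder might be driven during training; `train_epoch`, `batch_size`, and the shuffling scheme are assumptions, not part of the original snippet:

import random

def train_epoch(model, contexts, start_token, batch_size=32):
    # hypothetical driver: shuffle paragraph indices, then batch them
    order = list(range(len(contexts)))
    random.shuffle(order)
    for i in range(0, len(order), batch_size):
        index = order[i:i + batch_size]
        s, s_mask, c, c_mask = get_batch(index, contexts, start_token)
        # a two-tower ICT model would score (sentence, context) pairs here,
        # e.g. via dot products trained against in-batch negatives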
Example #2
import numpy as np
import torch
from tqdm import tqdm  # config, util, contexts, q_wordpiece are module-level in the source project

def get_semantic_sim(model):
    """Embed every context and question, then return their similarity matrix."""
    context_embedding = []
    question_embedding = []
    model.eval()
    with torch.no_grad():
        for i in tqdm(range(0, len(contexts), config.test_batch_size)):
            c = [[y for x in context["sentence"] for y in x]
                 for context in contexts[i:i + config.test_batch_size]]  # flatten each paragraph
            c, c_mask = util.pad_sequence(c,
                                          max_seq=config.max_seq,
                                          device=config.device)
            c_encode = model(x=c, x_mask=c_mask)
            context_embedding.append(c_encode.detach().cpu().numpy())
        for i in tqdm(range(0, len(q_wordpiece), config.test_batch_size)):
            q = q_wordpiece[i:i + config.test_batch_size]  # questions are pre-tokenized
            q, q_mask = util.pad_sequence(q,
                                          max_seq=config.max_seq,
                                          device=config.device)
            q_encode = model(x=q, x_mask=q_mask)
            question_embedding.append(q_encode.detach().cpu().numpy())
    context_embedding = np.concatenate(context_embedding, axis=0)
    question_embedding = np.concatenate(question_embedding, axis=0)
    return util.get_sim(question_embedding, context_embedding)
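Given that util.get_sim takes the question embeddings first, the returned matrix is presumably [num_questions, num_contexts]; a sketch of consuming it for top-k retrieval (the shape is an assumption):

import numpy as np

sim = get_semantic_sim(model)            # assumed shape: [num_questions, num_contexts]
top1 = np.argmax(sim, axis=1)            # best-matching context per question
top5 = np.argsort(-sim, axis=1)[:, :5]   # top-5 candidates, e.g. for recall@5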
Example #3
import numpy as np  # util is a project module; the globals below are defined at module level

def preprocess_batch(a_batch):
    """Vectorize a batch: embed string features, pad each sequence, and collect lengths."""
    global FEATURE_LENGTH, VECTOR_EMBEDDINGS, NB_ARGUMENT_LABELS
    _input_seqs = []
    _output_seqs = []
    _lengths = []
    for _ds in a_batch:
        _lengths.append(len(_ds.seq_features))
    _max_length = max(_lengths)

    for _ds in a_batch:
        _seq = []
        for feature in _ds.seq_features:
            element = []
            for l in feature:
                if isinstance(l, str):
                    # replace a string feature with its embedding vector,
                    # falling back to the unknown-token vector
                    element.extend(
                        list(
                            VECTOR_EMBEDDINGS.get(l,
                                                  VECTOR_EMBEDDINGS["_unk_"])))
                else:
                    element.append(l)
            _seq.append(element)

        _input_seqs.append(util.pad_sequence(_seq, _max_length,
                                             FEATURE_LENGTH))
        _output_seqs.append(
            util.pad_sequence(list(_ds.seq_labels), _max_length, 1))
    _input_seqs = np.array(_input_seqs)
    _output_seqs = np.array(_output_seqs)
    _lengths = np.array(_lengths)
    return _input_seqs, _output_seqs, _lengths, _max_length
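For reference, a minimal sketch of the util.pad_sequence(seq, max_length, width) signature these call sites imply: pad a list of fixed-width rows with zero rows up to max_length. This is inferred from usage, not the project's actual implementation:

def pad_sequence_sketch(seq, max_length, width):
    # append all-zero rows so every sequence in the batch has equal length
    return seq + [[0] * width for _ in range(max_length - len(seq))]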
Example #4
import numpy as np
from tqdm import tqdm  # FLAGS, pad_sequence, vectorize, convert_data_to_list are project helpers

def generate_answers(sess, model, dataset, rev_vocab):
    """
    Loop over the dev or test dataset and generate answers.

    Note: the output format must be answers[uuid] = "real answer".
    You must provide a string of words, not a list of tokens or a pair of
    start/end indices.

    In main() we dump this dict to a JSON file.

    evaluate.py takes the output JSON along with the original JSON file
    and reports F1 and EM scores.

    You must implement this function in order to submit to the leaderboard.

    :param sess: active TF session
    :param model: a built QASystem model
    :param dataset: (context, question, question_uuid_data) tuple
    :param rev_vocab: a list mapping vocabulary indices back to words
    :return: dict mapping each question uuid to its predicted answer string
    """
    answers = {}
    (context, question, question_uuid_data) = dataset
    context_data = convert_data_to_list(context)
    question_data = convert_data_to_list(question)
    context_padded, context_mask = pad_sequence(context_data,
                                                FLAGS.max_context_len)
    question_padded, question_mask = pad_sequence(question_data,
                                                  FLAGS.max_question_len)
    input_data = vectorize(context_padded, context_mask, question_padded,
                           question_mask)

    minibatch_size = 20
    for start in tqdm(range(0, len(question_uuid_data), minibatch_size),
                      desc="predicting on test"):
        h_s, h_e = model.decode(sess, input_data[start:start + minibatch_size])
        iter_num = min(minibatch_size,
                       len(question_uuid_data) - start)  # size of the final, possibly short batch
        for i in range(iter_num):
            a_s = np.argmax(h_s[i])
            a_e = np.argmax(h_e[i])
            if a_s > a_e:
                a_s, a_e = a_e, a_s  # ensure the start index precedes the end index

            uuid = question_uuid_data[start + i]
            context = input_data[start + i][0]  # padded context tokens for this example
            predicted_answer = model.formulate_answer(context, rev_vocab, a_s,
                                                      a_e)
            answers[uuid] = predicted_answer
    return answers
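The JSON dump the docstring refers to might look like the following sketch; the filename is an assumption:

import json

answers = generate_answers(sess, model, dataset, rev_vocab)
with open("predictions.json", "w") as f:
    json.dump(answers, f)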
Example #5
import numpy as np  # util is a project module; the flags below are defined at module level

def preprocess_batch(a_batch):
    """
    Preprocess a batch and return the input/output sequences, the actual
    length of each sequence in the batch, and the maximum length in the batch.
    :param a_batch: a batch of data samples
    :return: (inputs, outputs, lengths, max_length); when USE_BI_LSTM is set,
        forward and backward input sequences are returned in place of inputs
    """
    global FEATURE_LENGTH, VECTOR_EMBEDDINGS, NB_ARGUMENT_LABELS, USE_BI_LSTM
    _input_seqs = []
    _output_seqs = []
    _lengths = []
    _input_fw_seqs = []
    _input_bw_seqs = []
    for _ds in a_batch:
        _lengths.append(len(_ds.seq_features))
    _max_length = max(_lengths)

    for _ds in a_batch:
        _seq = []
        for feature in _ds.seq_features:
            element = []
            for l in feature:
                if isinstance(l, str):
                    # replace a string feature with its embedding vector,
                    # falling back to the unknown-token vector
                    element.extend(
                        list(
                            VECTOR_EMBEDDINGS.get(l,
                                                  VECTOR_EMBEDDINGS["_unk_"])))
                else:
                    element.append(l)
            _seq.append(element)
        if USE_BI_LSTM:
            _input_fw_seqs.append(
                util.pad_sequence(_seq, _max_length, FEATURE_LENGTH))
            _input_bw_seqs.append(
                util.pad_sequence(list(reversed(_seq)), _max_length,
                                  FEATURE_LENGTH))
        else:
            _input_seqs.append(
                util.pad_sequence(_seq, _max_length, FEATURE_LENGTH))
        _output_seqs.append(
            util.pad_sequence(list(_ds.seq_labels), _max_length, 1))
    _input_seqs = np.array(_input_seqs)
    _input_fw_seqs = np.array(_input_fw_seqs)
    _input_bw_seqs = np.array(_input_bw_seqs)
    _output_seqs = np.array(_output_seqs)
    _lengths = np.array(_lengths)
    if USE_BI_LSTM:
        return _input_fw_seqs, _input_bw_seqs, _output_seqs, _lengths, _max_length
    else:
        return _input_seqs, _output_seqs, _lengths, _max_length
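Callers unpack the result according to USE_BI_LSTM; a brief usage sketch, where the batch variable is a stand-in:

if USE_BI_LSTM:
    fw_x, bw_x, y, lengths, max_len = preprocess_batch(batch)
else:
    x, y, lengths, max_len = preprocess_batch(batch)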
Example #6
def generate_answers(sess, model, dataset, rev_vocab):
    """
    Loop over the dev or test dataset and generate answer.

    Note: output format must be answers[uuid] = "real answer"
    You must provide a string of words instead of just a list, or start and end index

    In main() function we are dumping onto a JSON file

    evaluate.py will take the output JSON along with the original JSON file
    and output a F1 and EM

    You must implement this function in order to submit to Leaderboard.

    :param sess: active TF session
    :param model: a built QASystem model
    :param rev_vocab: this is a list of vocabulary that maps index to actual words
    :return:
    """
    answers = {}
    (context, question, question_uuid_data) = dataset
    context_data = convert_data_to_list(context)
    question_data = convert_data_to_list(question)
    context_padded, context_mask = pad_sequence(context_data,
                                                FLAGS.max_context_len)
    question_padded, question_mask = pad_sequence(question_data,
                                                  FLAGS.max_question_len)
    input_data = vectorize(context_padded, context_mask, question_padded,
                           question_mask, question_uuid_data)

    batch_size = 32
    num_batches = (len(input_data) + batch_size - 1) // batch_size  # ceiling division
    prog = Progbar(target=num_batches)
    for i, batch in enumerate(minibatches(input_data, batch_size)):
        a_s_vec, a_e_vec = model.answer(sess, batch)
        prog.update(i + 1)
        for (a_s, a_e, context, uuid) in zip(a_s_vec, a_e_vec, batch[0],
                                             batch[4]):
            if a_s > a_e:
                a_s, a_e = a_e, a_s  # ensure the start index precedes the end index
            predicted_answer = model.formulate_answer(context, rev_vocab, a_s,
                                                      a_e)
            answers[uuid] = predicted_answer

    return answers
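As the docstring notes, evaluate.py scores the dumped answers with F1 and EM. A simplified sketch of the exact-match check; the official SQuAD evaluator additionally strips articles and punctuation before comparing:

def exact_match(prediction, ground_truth):
    # simplified normalization relative to the official SQuAD script
    return prediction.strip().lower() == ground_truth.strip().lower()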
Example #7
import random

import torch
from torch.autograd import Variable  # legacy pre-0.4 API kept as in the source; see the modern sketch below
# param, get_chars, get_feats, and pad_sequence are project helpers

def get_inputs(question, evidences, word2idx):
    """Build padded, CUDA-resident question/evidence tensors, feature tags, and masks."""
    question_list = []
    evidence_list = []
    q_list = []
    e_list = []
    q_mask_list = []
    e_mask_list = []

    ques, q_len = get_chars(question, word2idx)
    question, q_mask = pad_sequence(ques, param.question_size, word2idx)

    nb_evid = len(evidences)
    for i, e in enumerate(evidences):
        e, e_len = get_chars(e, word2idx)
        if e_len == 0:
            continue  # skip empty evidence passages

        # pick a different evidence passage at random (used below for the
        # evidence-vs-evidence feature tags)
        other_id = random.randint(0, nb_evid - 1)
        if nb_evid != 1:
            while other_id == i:
                other_id = random.randint(0, nb_evid - 1)
        other_evidence = evidences[other_id]
        other_evidence, _ = get_chars(other_evidence, word2idx)

        q_feat = get_feats(ques, e)
        e_feat = get_feats(other_evidence, e)

        evidence, e_mask = pad_sequence(e, param.evidence_size, word2idx)
        q_tags, _ = pad_sequence(q_feat, param.evidence_size, word2idx)
        e_tags, _ = pad_sequence(e_feat, param.evidence_size, word2idx)

        question_list.append(question)
        evidence_list.append(evidence)
        q_list.append(q_tags)
        e_list.append(e_tags)
        q_mask_list.append(q_mask)
        e_mask_list.append(e_mask)

    question = Variable(torch.LongTensor(question_list)).cuda()
    evidence = Variable(torch.LongTensor(evidence_list)).cuda()
    e_feat = Variable(torch.LongTensor(e_list)).cuda()
    q_feat = Variable(torch.LongTensor(q_list)).cuda()
    q_mask = Variable(torch.ByteTensor(q_mask_list)).cuda()
    e_mask = Variable(torch.ByteTensor(e_mask_list)).cuda()

    return question, evidence, q_mask, e_mask, q_feat, e_feat
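Variable was merged into Tensor in PyTorch 0.4, and boolean masks have since replaced ByteTensor. A modern equivalent of the tensor construction above, assuming a CUDA device is available:

import torch

device = torch.device("cuda")
question = torch.tensor(question_list, dtype=torch.long, device=device)
evidence = torch.tensor(evidence_list, dtype=torch.long, device=device)
q_feat = torch.tensor(q_list, dtype=torch.long, device=device)
e_feat = torch.tensor(e_list, dtype=torch.long, device=device)
q_mask = torch.tensor(q_mask_list, dtype=torch.bool, device=device)
e_mask = torch.tensor(e_mask_list, dtype=torch.bool, device=device)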