示例#1
0
def get_batch_generator(word2id, qn_uuid_data, context_token_data,
                        qn_token_data, batch_size, context_len, question_len):
    """
    Generator producing padded Batch objects from in-memory token lists.

    Counterpart of get_batch_generator in data_batcher.py; the differences
    are explained in refill_batches, which does the actual example building.

    Inputs:
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data: list of strings that are unique ids
      context_token_data, qn_token_data: list of lists of strings (no UNKs, no padding)
      batch_size: int. size of batches to make
      context_len, question_len: ints. lengths the context / question ids are padded to

    Yields:
      Batch objects carrying context, question and uuid information only
      (qn_tokens, ans_span and ans_tokens are always None).
    """
    pending = []

    while True:
        if not pending:
            refill_batches(pending, word2id, qn_uuid_data, context_token_data,
                           qn_token_data, batch_size, context_len,
                           question_len)
            if not pending:
                # The refill added nothing, so there is nothing left to yield.
                break

        # Oldest batch first; each element is one batch worth of examples.
        uuids, context_tokens, raw_context_ids, raw_qn_ids = pending.pop(0)

        # Bring every question / context up to its fixed length.
        qn_rows = padded(raw_qn_ids, question_len)
        context_rows = padded(raw_context_ids, context_len)

        # Masks are 1 wherever the id differs from PAD_ID, 0 on padding.
        qn_ids = np.array(qn_rows)
        qn_mask = (qn_ids != PAD_ID).astype(np.int32)

        context_ids = np.array(context_rows)
        context_mask = (context_ids != PAD_ID).astype(np.int32)

        yield Batch(context_ids,
                    context_mask,
                    context_tokens,
                    qn_ids,
                    qn_mask,
                    qn_tokens=None,
                    ans_span=None,
                    ans_tokens=None,
                    uuids=uuids)
示例#2
0
def custom_get_batch_generator(word2id, body_token_data_list, headline_token_data_list, batch_size, context_len, question_len):
    """
    Generator producing padded CustomBatch objects of (body, headline) ids.

    Batches are obtained from custom_refill_batches; generation stops as soon
    as a refill leaves the pending list empty.
    Background on generators and the yield keyword:
    https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do

    Inputs:
      word2id: dictionary mapping word (string) to word id (int)
      body_token_data_list, headline_token_data_list: token data for the bodies
        and headlines; passed through unchanged to custom_refill_batches
      batch_size: int. how big to make the batches
      context_len: int. length every body id sequence is padded to
      question_len: int. length every headline id sequence is padded to

    Yields:
      CustomBatch objects holding body ids/mask and headline ids/mask; the
      remaining positional fields are passed as None.
    """
    pending = []

    while True:
        if not pending:  # nothing queued: ask for more batches
            custom_refill_batches(pending, word2id, body_token_data_list, headline_token_data_list, batch_size, context_len, question_len)
            if not pending:
                # The refill added nothing, so there is nothing left to yield.
                break

        # Oldest batch first.
        raw_body_ids, raw_headline_ids = pending.pop(0)

        # Pad headlines to question_len and bodies to context_len.
        headline_rows = padded(raw_headline_ids, question_len)
        body_rows = padded(raw_body_ids, context_len)

        # Masks are 1 wherever the id differs from PAD_ID, 0 on padding.
        headline_ids = np.array(headline_rows)
        headline_mask = (headline_ids != PAD_ID).astype(np.int32)

        body_ids = np.array(body_rows)
        body_mask = (body_ids != PAD_ID).astype(np.int32)

        # Positional order, per the original author's note:
        # body_ids, body_mask, body_tokens, headline_ids, headline_mask,
        # headline_tokens, ans_span (uuids is left at its default).
        yield CustomBatch(body_ids, body_mask, None, headline_ids, headline_mask, None, None)
def get_batch_generator(word2id, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len):
    """
    Yield padded Batch objects built from lists that are already in memory.

    Mirrors get_batch_generator in data_batcher.py; refill_batches documents
    how the two differ.

    Inputs:
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data: list of strings that are unique ids
      context_token_data, qn_token_data: list of lists of strings (no UNKs, no padding)
      batch_size: int. size of batches to make
      context_len, question_len: ints. lengths the context / question ids are padded to

    Yields:
      Batch objects with context, question and uuid fields filled in and all
      answer-related fields set to None.
    """
    queue = []

    while True:
        if not queue:
            refill_batches(queue, word2id, qn_uuid_data, context_token_data, qn_token_data, batch_size, context_len, question_len)
            if not queue:
                # Refilling produced no batch: stop the generator.
                break

        # Take the batch at the front of the queue.
        uuids, context_tokens, ctx_id_lists, qn_id_lists = queue.pop(0)

        # Pad to fixed lengths (questions first, then contexts).
        qn_id_lists = padded(qn_id_lists, question_len)
        ctx_id_lists = padded(ctx_id_lists, context_len)

        # Question ids as an array plus a 0/1 mask marking non-PAD positions.
        qn_ids = np.array(qn_id_lists)
        qn_mask = (qn_ids != PAD_ID).astype(np.int32)

        # Same for the context ids.
        context_ids = np.array(ctx_id_lists)
        context_mask = (context_ids != PAD_ID).astype(np.int32)

        yield Batch(
            context_ids, context_mask, context_tokens, qn_ids, qn_mask,
            qn_tokens=None, ans_span=None, ans_tokens=None, uuids=uuids)
def get_batch_generator(word2id, qn_uuid_data, context_token_data,
                        qn_token_data, batch_size, context_len, question_len):
    """
    Yield Batch objects with padded context / question ids and their masks.

    This is similar to get_batch_generator in data_batcher.py, but the
    batches come from refill_batches, which is handed the data lists
    directly, and no answer information is attached.

    Inputs:
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data, context_token_data, qn_token_data: data lists forwarded
        unchanged to refill_batches
      batch_size: int, forwarded to refill_batches
      context_len, question_len: ints. lengths the context / question ids are padded to

    Yields:
      Batch objects whose qn_tokens, ans_span and ans_tokens are None.
    """
    backlog = []

    while True:
        if not backlog:
            refill_batches(backlog, word2id, qn_uuid_data, context_token_data,
                           qn_token_data, batch_size, context_len,
                           question_len)
            if not backlog:
                # Nothing was added by the refill, so generation ends here.
                break

        uuids, context_tokens, ctx_rows, qn_rows = backlog.pop(0)

        # Pad, then convert to arrays; masks flag every non-PAD_ID position.
        qn_rows = padded(qn_rows, question_len)
        ctx_rows = padded(ctx_rows, context_len)

        qn_ids = np.array(qn_rows)
        context_ids = np.array(ctx_rows)
        qn_mask = (qn_ids != PAD_ID).astype(np.int32)
        context_mask = (context_ids != PAD_ID).astype(np.int32)

        yield Batch(context_ids,
                    context_mask,
                    context_tokens,
                    qn_ids,
                    qn_mask,
                    qn_tokens=None,
                    ans_span=None,
                    ans_tokens=None,
                    uuids=uuids)
示例#5
0
def refill_batches(batches, word2id, qn_uuid_data, context_token_data,
                   qn_token_data, batch_size, context_len, question_len,
                   word_len, mcids_dict):
    """
    Build up to batch_size examples from the provided lists and append them
    to `batches` as a single batch.

    This is similar to refill_batches in data_batcher.py, but:
      (1) instead of reading from (preprocessed) datafiles, it reads from the provided lists
      (2) it only puts the context and question information in the batches (not the answer information)
      (3) it also gets UUID information and puts it in the batches
      (4) it adds per-token features, character ids and "common word" lookups

    Inputs:
      batches: list to be refilled (mutated in place)
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data: list of strings that are unique ids
      context_token_data, qn_token_data: list of lists of strings (no UNKs, no padding)
      batch_size: int. size of batches to make
      context_len, question_len: ints. max sizes of context and question. Anything longer is truncated.
      word_len: int. max number of characters kept per token
      mcids_dict: dictionary keyed by word id; the value is used as an
        embedding index for that word (ids absent from it get index 0 and a
        False mask entry)

    Makes batches that contain, in this order:
      uuids_batch, context_tokens_batch, context_ids_batch, qn_ids_batch,
      feats_batch, char_ids_batch, commonQ_mask_batch,
      commonQ_emb_indices_batch, charQ_ids_batch, commonC_mask_batch,
      commonC_emb_indices_batch
    Each is a tuple with one entry per example.

    Returns:
      None. Nothing is appended when the input data yields no example.
    """
    examples = []

    # Floor of the smoothed term frequency: tf = a + (1 - a) * count / max_count
    tf_floor = 0.4

    # Character vocabulary: a-z -> 2..27, 0-9 -> 28..37, then . , " ? ' -> 38..42.
    # Ids start at 2 (presumably 0/1 are reserved for PAD_ID/UNK_ID -- TODO confirm).
    char2id = {
        ch: idx
        for idx, ch in enumerate(
            "abcdefghijklmnopqrstuvwxyz0123456789.,\"?'", 2)
    }

    def _chars_to_ids(tokens):
        # One list of char ids per token, lowercased and cut to word_len
        # characters, then padded to word_len by `padded`.
        rows = [[char2id.get(ch, UNK_ID) for ch in tok.lower()[:word_len]]
                for tok in tokens]
        return padded(rows, word_len)

    # Get next example
    qn_uuid, context_tokens, qn_tokens = readnext(qn_uuid_data), readnext(
        context_token_data), readnext(qn_token_data)

    while qn_uuid and context_tokens and qn_tokens:

        # Convert context_tokens and qn_tokens to context_ids and qn_ids
        context_ids = [word2id.get(w, UNK_ID) for w in context_tokens]
        qn_ids = [word2id.get(w, UNK_ID) for w in qn_tokens]

        # Character ids for context (N, word_len) and question (M, word_len)
        char_ids = _chars_to_ids(context_tokens)
        charQ_ids = _chars_to_ids(qn_tokens)

        # Common-word masks and embedding indices. Membership is tested on
        # the dict itself (O(1)); index 0 for missing ids is masked out.
        commonQ_mask = [x in mcids_dict for x in qn_ids]  # (M)
        commonQ_emb_indices = [mcids_dict.get(x, 0) for x in qn_ids]  # (M)

        commonC_mask = [x in mcids_dict for x in context_ids]  # (N)
        commonC_emb_indices = [mcids_dict.get(x, 0) for x in context_ids]  # (N)

        # Exact-match feature on the original token form.
        # NOTE(review): this is 1 only when the token occurs exactly once in
        # the question; tokens repeated in the question get 0. Kept as-is so
        # feature values stay unchanged -- confirm whether ">= 1" was intended.
        match_orig = [int(qn_tokens.count(tok) == 1) for tok in context_tokens]

        # Normalized term frequency of each context token
        fdist = FreqDist(context_tokens)
        max_count = float(max(fdist.values()))
        tf = [
            tf_floor + (1 - tf_floor) * fdist[w] / max_count
            for w in context_tokens
        ]

        # Materialize as a list so it can be sliced below on Python 3 too,
        # where zip() returns an iterator.
        feats = list(zip(tf, match_orig))  # (N, 2)

        # Truncate context_ids and qn_ids
        # Note: truncating context_ids may truncate the correct answer, meaning that it's impossible for your model to get the correct answer on this example!
        if len(qn_ids) > question_len:
            qn_ids = qn_ids[:question_len]
            commonQ_mask = commonQ_mask[:question_len]
            commonQ_emb_indices = commonQ_emb_indices[:question_len]
            charQ_ids = charQ_ids[:question_len]
        if len(context_ids) > context_len:
            context_ids = context_ids[:context_len]
            feats = feats[:context_len]
            char_ids = char_ids[:context_len]
            commonC_mask = commonC_mask[:context_len]
            commonC_emb_indices = commonC_emb_indices[:context_len]

        # Add to list of examples (context_tokens is stored untruncated)
        examples.append((qn_uuid, context_tokens, context_ids, qn_ids, feats,
                         char_ids, commonQ_mask, commonQ_emb_indices,
                         charQ_ids, commonC_mask, commonC_emb_indices))

        # Stop if you've got a batch; checked before reading again so that no
        # example is consumed and then dropped.
        if len(examples) == batch_size:
            break

        # Get next example
        qn_uuid, context_tokens, qn_tokens = readnext(qn_uuid_data), readnext(
            context_token_data), readnext(qn_token_data)

    # Make into batches: transpose the per-example tuples into per-field
    # tuples. range() (rather than xrange) works on both Python 2 and 3.
    for batch_start in range(0, len(examples), batch_size):
        chunk = examples[batch_start:batch_start + batch_size]
        batches.append(tuple(zip(*chunk)))

    return
示例#6
0
def get_batch_generator(word2id, qn_uuid_data, context_token_data,
                        qn_token_data, batch_size, context_len, question_len,
                        num_feats, word_len, mcids_dict):
    """
    Generator producing padded Batch objects, including the extra per-token
    features, character ids and common-word lookups built by refill_batches.

    This is similar to get_batch_generator in data_batcher.py, but with some
    differences (see explanation in refill_batches).

    Inputs:
      word2id: dictionary mapping word (string) to word id (int)
      qn_uuid_data: list of strings that are unique ids
      context_token_data, qn_token_data: list of lists of strings (no UNKs, no padding)
      batch_size: int. size of batches to make
      context_len, question_len: ints. max sizes of context and question. Anything longer is truncated.
      num_feats: int. passed to padded2 as the inner size for the feature vectors
      word_len: int. passed to refill_batches and to padded2 as the inner size
        for the character ids
      mcids_dict: dictionary forwarded unchanged to refill_batches

    Yields:
      Batch objects, but they only contain context and question information (no answer information)
    """
    queue = []

    while True:
        if not queue:
            refill_batches(queue, word2id, qn_uuid_data, context_token_data,
                           qn_token_data, batch_size, context_len,
                           question_len, word_len, mcids_dict)
            if not queue:
                # Refilling produced no batch: stop the generator.
                break

        # Take the batch at the front of the queue.
        (uuids, context_tokens, context_ids, qn_ids, feats, char_ids,
         commonQ_mask, commonQ_emb_indices, charQ_ids, commonC_mask,
         commonC_emb_indices) = queue.pop(0)

        # Word ids: pad to fixed length, convert, and mask out PAD_ID.
        qn_ids = np.array(padded(qn_ids, question_len))
        qn_mask = (qn_ids != PAD_ID).astype(np.int32)

        context_ids = np.array(padded(context_ids, context_len))
        context_mask = (context_ids != PAD_ID).astype(np.int32)

        # Per-token feature vectors, padded via padded2 with
        # (num_feats, context_len).
        feats = np.array(padded2(feats, num_feats, context_len))

        # Character ids for context and question, padded via padded2 with
        # (word_len, context_len / question_len), plus their non-PAD masks.
        char_ids = np.array(
            padded2(char_ids, word_len, context_len, islist=True))
        char_mask = (char_ids != PAD_ID).astype(np.int32)

        charQ_ids = np.array(
            padded2(charQ_ids, word_len, question_len, islist=True))
        charQ_mask = (charQ_ids != PAD_ID).astype(np.int32)

        # Common-word masks (paddedBool) and embedding indices (padded).
        commonQ_mask = np.array(paddedBool(commonQ_mask, question_len))
        commonQ_emb_indices = np.array(
            padded(commonQ_emb_indices, question_len))

        commonC_mask = np.array(paddedBool(commonC_mask, context_len))
        commonC_emb_indices = np.array(
            padded(commonC_emb_indices, context_len))

        yield Batch(
            context_ids,
            context_mask,
            context_tokens,
            qn_ids,
            qn_mask,
            qn_tokens=None,
            ans_span=None,
            ans_tokens=None,
            feats=feats,
            char_ids=char_ids,
            char_mask=char_mask,
            commonQ_mask=commonQ_mask,
            commonQ_emb_indices=commonQ_emb_indices,
            charQ_ids=charQ_ids,
            charQ_mask=charQ_mask,
            commonC_mask=commonC_mask,
            commonC_emb_indices=commonC_emb_indices,
            uuids=uuids)