Example #1
import random

def refill(batches, fdx, fdy, batch_size, FLAGS, sort_and_shuffle=True):
    """Read clean lines from fdx and append (noisy, clean) token batches
    to `batches` in place. `fdy` is unused in this single-file variant:
    the noisy source is synthesized from the clean target.

    Depends on module-level helpers: tokenize, reverse_vocab, vocab,
    add_noise_to_string, nlc_data, get_tokenizer.
    """
    line_pairs = []
    linex = fdx.readline()

    while linex:
        # The clean line is the target; its tokens are the source of truth.
        y_tokens = tokenize(linex)
        orig_str = "".join(reverse_vocab[x] for x in y_tokens)
        # Corrupt the clean string, then re-tokenize it as the noisy input.
        noisy_str = add_noise_to_string(orig_str, 0.2 / FLAGS['max_seq_len'],
                                        FLAGS['max_seq_len'])
        x_tokens = nlc_data.sentence_to_token_ids(
            noisy_str, vocab, tokenizer=get_tokenizer(FLAGS))

        if len(x_tokens) < FLAGS['max_seq_len'] and len(y_tokens) < FLAGS['max_seq_len']:
            line_pairs.append((x_tokens, y_tokens))
        # Buffer at most 16 batches' worth of pairs per refill.
        if len(line_pairs) == batch_size * 16:
            break
        linex = fdx.readline()

    # Sort by source length so each batch holds similar-length sequences,
    # which minimizes padding.
    if sort_and_shuffle:
        line_pairs = sorted(line_pairs, key=lambda e: len(e[0]))

    for batch_start in range(0, len(line_pairs), batch_size):
        x_batch, y_batch = zip(*line_pairs[batch_start:batch_start + batch_size])
        batches.append((x_batch, y_batch))

    # Shuffle whole batches so training order varies while each batch
    # stays length-homogeneous.
    if sort_and_shuffle:
        random.shuffle(batches)
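The corruption helper add_noise_to_string is not defined on this page. Below is a minimal sketch of what such a character-level noiser could look like, assuming it drops, duplicates, or transposes characters at rate amount_of_noise; the name and signature come from the call above, but the body is illustrative, not the original implementation.

import random

def add_noise_to_string(s, amount_of_noise, max_len):
    # Illustrative only: corrupt a string by randomly dropping,
    # duplicating, or swapping adjacent characters.
    chars = list(s[:max_len])
    out = []
    i = 0
    while i < len(chars):
        if random.random() < amount_of_noise:
            op = random.choice(("drop", "dup", "swap"))
            if op == "drop":
                i += 1
                continue
            if op == "dup":
                out.append(chars[i])      # emit once here, again below
            elif op == "swap" and i + 1 < len(chars):
                out.append(chars[i + 1])  # emit the pair in reversed order
                out.append(chars[i])
                i += 2
                continue
        out.append(chars[i])
        i += 1
    return "".join(out)

Under this sketch, a noise rate of 0.2 / max_seq_len yields roughly 0.2 corruption events per string on average, so the noisy input stays close to the clean target.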
Example #2
import numpy as np

def tokenize(sent, vocab, depth=FLAGS.num_layers):
    """Convert a sentence to token ids, padded so the length is a
    multiple of 2**(depth - 1) and the sequence can be halved
    depth - 1 times without remainder (as a pyramid encoder would need).

    Returns (source, mask) column vectors; mask is 1 for real tokens,
    0 for padding. Note the default `depth` is bound to FLAGS.num_layers
    once, at function-definition time.
    """
    align = pow(2, depth - 1)
    token_ids = nlc_data.sentence_to_token_ids(sent, vocab, get_tokenizer(FLAGS))
    ones = [1] * len(token_ids)
    # Pad count to reach the next multiple of `align` (0 if already aligned).
    pad = (align - len(token_ids)) % align

    token_ids += [nlc_data.PAD_ID] * pad
    ones += [0] * pad

    source = np.array(token_ids).reshape([-1, 1])
    mask = np.array(ones).reshape([-1, 1])

    return source, mask
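To make the alignment arithmetic concrete: with depth = 4, align = 2**3 = 8, so a 5-token sentence receives 3 pads and an 8-token sentence receives none. A self-contained check, using stand-in values (PAD_ID = 0 here is hypothetical; the real code uses nlc_data.PAD_ID):

import numpy as np

PAD_ID = 0                      # stand-in for nlc_data.PAD_ID
depth = 4
align = pow(2, depth - 1)       # 8

token_ids = [12, 7, 42, 3, 9]   # pretend output of sentence_to_token_ids
ones = [1] * len(token_ids)
pad = (align - len(token_ids)) % align   # (8 - 5) % 8 == 3

token_ids += [PAD_ID] * pad
ones += [0] * pad

source = np.array(token_ids).reshape([-1, 1])
mask = np.array(ones).reshape([-1, 1])

assert source.shape == (8, 1)   # length rounded up to a multiple of 8
assert mask.sum() == 5          # only the real tokens are unmasked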