Example #1
def demo_number_filtered_sentence_pairs(src_path, trg_path):
    """Count the sentence pairs that survive the max_length filter."""
    src_reader = smart_reader(src_path)
    trg_reader = smart_reader(trg_path)
    max_length = 30
    bitext = bitext_reader(src_reader, trg_reader, max_length=max_length)
    num_sentences = sum(1 for _ in bitext)
    print("There are {} sentences with max_length = {}".format(
        num_sentences, max_length))
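
# smart_reader is assumed rather than shown in these examples. Judging from
# how it is used (it is handed .gz paths and its output is consumed sentence
# by sentence as token lists), a minimal sketch could look like the
# following; the name smart_reader_sketch and the lowercasing are
# illustrative assumptions, not the actual implementation.
import gzip


def smart_reader_sketch(path, lower=True):
    """Yield tokenized sentences from a plain-text or gzipped file."""
    open_fn = gzip.open if path.endswith(".gz") else open
    with open_fn(path, mode="rt", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if lower:
                line = line.lower()
            yield line.split()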
Example #2
def bitext_reader_demo(src_path, trg_path):
    """Demo of the bitext reader."""

    # create a reader
    src_reader = smart_reader(src_path)
    trg_reader = smart_reader(trg_path)
    bitext = bitext_reader(src_reader, trg_reader)

    # to see that it really works, try this:
    print(next(bitext))
    print(next(bitext))
    print(next(bitext))
    print(next(bitext))
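
# bitext_reader is likewise assumed in these examples. Based on its usage
# above (two token-sequence readers in, sentence pairs out, an optional
# max_length filter), a minimal sketch could be the following; the exact
# filtering rules are an assumption.
def bitext_reader_sketch(src_reader, trg_reader, max_length=0):
    """Yield (src_tokens, trg_tokens) pairs, optionally filtered by length."""
    for src_tokens, trg_tokens in zip(src_reader, trg_reader):
        if len(src_tokens) == 0 or len(trg_tokens) == 0:
            continue
        if max_length > 0 and (len(src_tokens) > max_length
                               or len(trg_tokens) > max_length):
            continue
        yield src_tokens, trg_tokens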
Example #3
def vocabulary_demo():
    """Demo of the Vocabulary class."""
    # We used up a few lines in the previous example, so we set up
    # our data generator again.
    corpus = smart_reader(train_e_path)

    # Let's create a vocabulary given our (tokenized) corpus
    vocabulary = Vocabulary(corpus=corpus)
    print("Original vocabulary size: {}".format(len(vocabulary)))

    # Now we only keep the highest-frequency words
    vocabulary_size = 1000
    vocabulary.trim(vocabulary_size)
    print("Trimmed vocabulary size: {}".format(len(vocabulary)))

    # Now we can get word indexes using vocabulary.get_token_id():
    for t in ["<PAD>", "<UNK>", "the"]:
        print("The index of \"{}\" is: {}".format(t,
                                                  vocabulary.get_token_id(t)))

    # And the inverse too, using vocabulary.get_token():
    for i in range(10):
        print("The token with index {} is: {}".format(i,
                                                      vocabulary.get_token(i)))

    # Now let's try to get a word ID for a word that is NOT in the
    # vocabulary; we should get 1 (the index of <UNK>).
    for t in ["!@!_not_in_vocab_!@!"]:
        print("The index of \"{}\" is: {}".format(t,
                                                  vocabulary.get_token_id(t)))
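
# The Vocabulary class itself is not shown in this extract. A minimal sketch
# that is consistent with the calls made above (a corpus/max_tokens
# constructor, trim, get_token_id, get_token, len, <PAD> at index 0 and
# <UNK> at index 1) could look like this; it is an illustrative assumption,
# not the real class.
from collections import Counter


class VocabularySketch:
    """Token <-> id mapping with reserved <PAD>=0 and <UNK>=1 entries."""

    def __init__(self, corpus=None, max_tokens=0):
        self.counts = Counter()
        if corpus is not None:
            for tokens in corpus:
                self.counts.update(tokens)
        self._build(max_tokens)

    def _build(self, max_tokens=0):
        # Reserved tokens first, then words by descending frequency.
        self.i2t = ["<PAD>", "<UNK>"]
        most_common = self.counts.most_common()
        if max_tokens > 0:
            most_common = most_common[:max_tokens - len(self.i2t)]
        self.i2t.extend(token for token, _ in most_common)
        self.t2i = {token: i for i, token in enumerate(self.i2t)}

    def trim(self, max_tokens):
        self._build(max_tokens)

    def get_token_id(self, token):
        return self.t2i.get(token, 1)  # 1 is the index of <UNK>

    def get_token(self, token_id):
        return self.i2t[token_id]

    def __len__(self):
        return len(self.i2t)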
Example #4
    def __init__(self,
                 model,
                 train_e_path,
                 train_f_path,
                 dev_e_path,
                 dev_f_path,
                 dev_wa,
                 num_epochs=5,
                 batch_size=16,
                 max_length=30,
                 lr=0.1,
                 lr_decay=0.001,
                 model_path="./model.ckpt",
                 session=None):
        """Initialize the trainer with a model."""

        self.model = model
        self.train_e_path = train_e_path
        self.train_f_path = train_f_path
        self.dev_e_path = dev_e_path
        self.dev_f_path = dev_f_path
        self.dev_wa = dev_wa

        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.max_length = max_length
        self.lr = lr
        self.lr_decay = lr_decay
        self.session = session
        self.model_path = model_path

        print("Training with B={} max_length={} lr={} lr_decay={}".format(
            batch_size, max_length, lr, lr_decay))

        self._build_optimizer()

        # This loads the data into memory so that we can easily shuffle it.
        # If this takes too much memory, shuffle the data on disk
        # and use bitext_reader directly.
        self.corpus = list(
            bitext_reader(smart_reader(train_e_path),
                          smart_reader(train_f_path),
                          max_length=max_length))
        self.dev_corpus = list(
            bitext_reader(smart_reader(dev_e_path), smart_reader(dev_f_path)))
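
    # A hypothetical helper (an assumption, not part of the original trainer)
    # that illustrates why the corpus is loaded into memory above: an
    # in-memory list can be shuffled in place at the start of each epoch and
    # then sliced into mini-batches of size self.batch_size.
    def iterate_minibatches_sketch(self):
        import random  # local import keeps the sketch self-contained
        random.shuffle(self.corpus)
        for i in range(0, len(self.corpus), self.batch_size):
            yield self.corpus[i:i + self.batch_size]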
Example #5
    for t in ["!@!_not_in_vocab_!@!"]:
        print("The index of \"{}\" is: {}".format(t,
                                                  vocabulary.get_token_id(t)))


vocabulary_demo()

# Now let's create the vocabularies that we use further on.

# Using only 1000 words will result in many UNKs, but
# it will make training a lot faster.
# If you have a fast computer, a GPU, or a lot of time,
# try with 10000 instead.
max_tokens = 1000

corpus_e = smart_reader(train_e_path)
vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens)
pickle.dump(vocabulary_e, open("vocabulary_e.pkl", mode="wb"))
print("English vocabulary size: {}".format(len(vocabulary_e)))

corpus_f = smart_reader(train_f_path)
vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens)
pickle.dump(vocabulary_f, open("vocabulary_f.pkl", mode="wb"))
print("French vocabulary size: {}".format(len(vocabulary_f)))
print()


def sample_words(vocabulary, n=5):
    """Print a few randomly chosen words from the vocabulary."""
    for _ in range(n):
        token_id = np.random.randint(0, len(vocabulary))  # high is exclusive
        print(vocabulary.get_token(token_id))
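
# Hypothetical usage of the helper above (illustration only): print five
# random words from the English vocabulary built earlier in this example.
sample_words(vocabulary_e, n=5)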
Example #6
dev_e_path = '../data/validation/dev.e.gz'
dev_f_path = '../data/validation/dev.f.gz'
dev_wa = '../data/validation/dev.wa.nonullalign'

test_e_path = '../data/test/test.e.gz'
test_f_path = '../data/test/test.f.gz'
test_wa = '../data/test/test.wa.nonullalign'

# Using only 1000 words will result in many UNKs, but
# it will make training a lot faster.
# If you have a fast computer, a GPU, or a lot of time,
# try with 10000 instead.
max_tokens = 1000
# max_tokens = 7000

corpus_e = smart_reader(train_e_path)
vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens)
pickle.dump(vocabulary_e, open("vocabulary_e.pkl", mode="wb"))
print("English vocabulary size: {}".format(len(vocabulary_e)))

corpus_f = smart_reader(train_f_path)
vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens)
pickle.dump(vocabulary_f, open("vocabulary_f.pkl", mode="wb"))
print("French vocabulary size: {}".format(len(vocabulary_f)))

# load test corpus
test_corpus = list(
    bitext_reader(smart_reader(test_e_path), smart_reader(test_f_path)))

# run
tf.reset_default_graph()
Example #7
train_f_path = 'data/training/hansards.36.2.f.gz'
dev_e_path = 'data/validation/dev.e.gz'
dev_f_path = 'data/validation/dev.f.gz'
dev_wa = 'data/validation/dev.wa.nonullalign'

# Using only 1000 words will result in many UNKs, but
# it will make training a lot faster.
# If you have a fast computer, a GPU, or a lot of time,
# try with 10000 instead.
max_tokens = 1000

corpus_e = smart_reader(train_e_path)
vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens)
pickle.dump(vocabulary_e, open("vocabulary_e.pkl", mode="wb"))
print("English vocabulary size: {}".format(len(vocabulary_e)))

corpus_f = smart_reader(train_f_path)
vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens)
pickle.dump(vocabulary_f, open("vocabulary_f.pkl", mode="wb"))
print("French vocabulary size: {}".format(len(vocabulary_f)))



dev_corpus = list(bitext_reader(
        smart_reader(dev_e_path),
        smart_reader(dev_f_path)))
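
# Quick sanity check (illustration only): report how many development pairs
# were loaded and show the first one.
print("Dev corpus size: {}".format(len(dev_corpus)))
print(dev_corpus[0])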