Example #1
# Imports needed by this snippet (not shown in the original fragment).
import sys
import timeit
from itertools import islice, izip
from random import shuffle

from gensim.models.doc2vec import Doc2Vec, LabeledSentence

# preprocess() and inspect_words() are helpers defined elsewhere in the
# original script; a sketch of plausible implementations follows this example.


class BitextWordLabelsSentence(object):
    # The original fragment starts inside __iter__; the class header and
    # __init__ are reconstructed from the call BitextWordLabelsSentence(f, n).
    def __init__(self, f, size):
        self.f = f
        self.size = size

    def __iter__(self):
        with open(self.f + 'en') as ens, open(self.f + 'de') as des:
            for en, de in islice(izip(ens, des), self.size):
                # Tag tokens with their language so the two vocabularies
                # stay distinct inside one model.
                en = ['%s_en' % w for w in preprocess(en).split()]
                de = ['%s_de' % w for w in preprocess(de).split()]
                # Randomly decide which side supplies the words and which
                # supplies the labels for this sentence pair.
                langs = [en, de]
                shuffle(langs)
                l1, l2 = langs
                yield LabeledSentence(words=l1, labels=l2)

print 'Learning to predict all l2 words from every l1 word'
start = timeit.default_timer()

f = sys.argv[1] + '/europarl-v7.de-en.'  # prefix of the parallel Europarl files
n = 50000                                # number of sentence pairs to use
sentences = BitextWordLabelsSentence(f, n)
print '%s sentences' % n

model = Doc2Vec(dm=0, alpha=0.025, min_alpha=0.025, size=256)  # PV-DBOW, 256-dim vectors
model.build_vocab(sentences)
print '%s words in vocab' % len(model.vocab)

print 'epochs'
for epoch in range(10):
    model.train(sentences)
    print epoch
    model.alpha -= 0.002           # decrease the learning rate
    model.min_alpha = model.alpha  # keep the rate constant within each pass
stop = timeit.default_timer()
print 'Running time %ss' % (stop - start)

inspect_words(model)
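
Both examples call preprocess and inspect_words, which are defined elsewhere
in the original script and are not part of the snippet. A minimal sketch of
plausible implementations, assuming preprocess only lowercases and strips
punctuation and inspect_words prints the nearest neighbours of a few probe
words via gensim's most_similar (the probe words are placeholders, not from
the source):

import re

def preprocess(line):
    # Hypothetical helper: lowercase and replace punctuation with spaces.
    return re.sub(r'[^\w\s]', ' ', line.lower())

def inspect_words(m, probes=('house_en', 'haus_de')):
    # Hypothetical helper: print the nearest neighbours of a few tagged words.
    for w in probes:
        if w in m.vocab:
            print w, m.most_similar(w, topn=5)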
Example #2
# This fragment assumes the joint model and the two monolingual models
# (model, model_en, model_de), the file prefix f, n and start were set up
# earlier in the original script.
from numpy import sqrt, newaxis, float32 as REAL

# Scale each German vector by its word count (plus one), then copy it into
# the joint model for words that also appear there.
for w in model_de.vocab:
    model_de.syn0[model_de.vocab[w].index] /= (model_de.vocab[w].count + 1.0)
    if w in model:
        model.syn0[model.vocab[w].index] = model_de.syn0[
            model_de.vocab[w].index]

# L2-normalise the embedding matrices so that cosine-similarity queries
# (e.g. most_similar) reduce to dot products.
model.syn0norm = (model.syn0 / sqrt(
    (model.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)
model_en.syn0norm = (model_en.syn0 / sqrt(
    (model_en.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)
model_de.syn0norm = (model_de.syn0 / sqrt(
    (model_de.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)

print 'Running time %ss' % (timeit.default_timer() - start)

inspect_words(model)
inspect_words(model_en)
inspect_words(model_de)

# Train the two monolingual models on the bitext, sharing the per-sentence
# label pen across languages.
print 'epochs'
for epoch in range(10):
    # Rebuild the generators every epoch, since they are exhausted after one pass.
    sentences_en = (LabeledSentence(words=en, labels=[pen])
                    for pen, en, de in BitextTriples(f, n))
    sentences_de = (LabeledSentence(words=de, labels=[pen])
                    for pen, en, de in BitextTriples(f, n))
    model_en.train(sentences_en)
    model_de.train(sentences_de)
    print epoch
    model_en.alpha -= 0.001  # decrease the learning rate
    model_de.alpha -= 0.001  # decrease the learning rate
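
Example #2 also relies on BitextTriples(f, n), which is not shown. From the
way it is unpacked, it appears to yield a per-sentence label together with the
language-tagged English and German token lists; a minimal sketch under that
assumption (the SENT_%d label format is a guess, not from the source):

from itertools import islice, izip

def BitextTriples(f, size):
    # Hypothetical reconstruction: yield (label, english_tokens, german_tokens)
    # for the first `size` sentence pairs of the bitext.
    with open(f + 'en') as ens, open(f + 'de') as des:
        for i, (en, de) in enumerate(islice(izip(ens, des), size)):
            en = ['%s_en' % w for w in preprocess(en).split()]
            de = ['%s_de' % w for w in preprocess(de).split()]
            yield 'SENT_%d' % i, en, de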