def test_make_sampling_table():
    """Compare ``sequence.make_sampling_table(3)`` with reference values.

    The three returned entries must match the hard-coded expectations
    within a relative tolerance of 10%.
    """
    table = sequence.make_sampling_table(3)
    expected = np.asarray([0.00315225, 0.00315225, 0.00547597])
    assert_allclose(table, expected, rtol=0.1)
def test_make_sampling_table():
    """Verify the size-3 sampling table against known reference numbers.

    ``assert_allclose`` is used with ``rtol=0.1``, so each entry may
    deviate from its reference by up to 10% (relative).
    """
    reference_values = [0.00315225, 0.00315225, 0.00547597]
    assert_allclose(
        sequence.make_sampling_table(3),
        np.asarray(reference_values),
        rtol=0.1,
    )
示例#3
0
    #shuffle(unpadded_x)

    #flat_list = [item for sublist in unpadded_x for item in sublist]
    flat_list = []
    for sublist in unpadded_x:
        flat_list.extend(sublist)
        flat_list.extend([0]*window_size)

    print(f'start generating skip-grams | len:{len(flat_list)}')

    #ITERATIONS = 30000

    #grams_x = []
    #grams_y = []
    #for i, doc in enumerate(unpadded_x):
    sampling_table = sequence.make_sampling_table(vocab_size)
    data, labels = skipgrams(sequence=flat_list, vocabulary_size=vocab_size, window_size=window_size,
                             negative_samples=1., sampling_table=sampling_table)
    #grams_x.extend(data)
    #grams_y.extend(labels)

    #    if i % 1000 == 0:
    #        print(f'progress: {i / ITERATIONS*100}%')

    #    if i == ITERATIONS:
    #        break

    print(f'generated {len(data)} samples')
    save_pickle(data, 'tokenized/learn/grams_x.pickle')
    save_pickle(labels, 'tokenized/learn/grams_y.pickle')