Exemplo n.º 1
0
def load_data(args):
    """Load the data splits and convert their text to embeddings.

    Fetches the train/validate/test splits from
    ``process_data.get_data(args.text_only)``, loads the pickled
    word-embedding bundle, and for each split replaces its ``'post_text'``
    entry with the first value returned by ``word2vec`` and stores the
    second value under ``'mask'``.

    Side effects: sets ``args.vocab_size`` and ``args.sequence_len`` from
    the pickle contents and prints progress messages.

    Args:
        args: Options object. ``text_only`` and ``sequence_length`` are
            read; ``vocab_size`` and ``sequence_len`` are written.

    Returns:
        Tuple ``(train, validate, test, W)`` where ``W`` is element 0 of
        the pickled bundle.
    """
    train, validate, test = process_data.get_data(args.text_only)

    word_vector_path = '../Data/weibo/word_embedding.pickle'
    # Use a context manager so the file handle is closed even if
    # unpickling fails (the handle was previously never closed).
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(word_vector_path, 'rb') as f:
        weight = pickle.load(f)  # W, W2, word_idx_map, vocab, max_len

    # Element 1 (W2) is not used by this function.
    W = weight[0]
    word_idx_map, vocab, max_len = weight[2], weight[3], weight[4]
    args.vocab_size = len(vocab)
    args.sequence_len = max_len
    print("translate data to embedding")

    word_embedding, mask = word2vec(validate['post_text'], word_idx_map, W)
    validate['post_text'] = word_embedding
    validate['mask'] = mask

    print("translate test data to embedding")
    word_embedding, mask = word2vec(test['post_text'], word_idx_map, W)
    test['post_text'] = word_embedding
    test['mask'] = mask

    word_embedding, mask = word2vec(train['post_text'], word_idx_map, W)
    train['post_text'] = word_embedding
    train['mask'] = mask

    # NOTE(review): this reads args.sequence_length, while the value set
    # above is args.sequence_len -- confirm which attribute is intended.
    print("sequence length " + str(args.sequence_length))
    print("Train Data Size is " + str(len(train['post_text'])))
    print("Finished loading data ")
    return train, validate, test, W
Exemplo n.º 2
0
        test_str = text.split(' ')
        if len(test_str) > max_:
            max_ = len(test_str)
        for v in test_str:
            if v not in vocab_idx:
                vocab_idx[v] = c
                c += 1
        datas.append((text, image, label, event))

    print(len(datas))

    save_data(datas, name)
    return vocab_idx, c, max_


# Run create_data over the train, validate and test splits in that order,
# threading the same vocabulary dict, index counter and max_ value through
# every call.
train, validate, test = process_data.get_data(False)

vocab_idx = {}
c = 0
# NOTE(review): max_ is read by the first call below but is not assigned in
# the visible top-level code -- confirm it is initialised earlier in the file.
for _split, _out_path in (
        (train, '../data_new/train_data.bin'),
        (validate, '../data_new/validate_data.bin'),
        (test, '../data_new/test_data.bin'),
):
    vocab_idx, c, max_ = create_data(_split, _out_path, vocab_idx, c, max_)
    # Report the vocabulary size and counter after each split.
    print(len(vocab_idx), c)

print(c)
# '<unk>' is stored under the counter value reached after all three splits.
vocab_idx['<unk>'] = c