# split training and validation datasets
random.shuffle(train_data)
num = len(train_data)
val_num = int(num * config['val_rate'])
val_data = train_data[num - val_num:num]
train_data = train_data[0:num - val_num]

# build vocabulary from the training split
vocab = Vocab()
for line in train_data:
    line = line.strip().split('\t')[1].split(' ')
    vocab.add_list(line)
word2index, index2word = vocab.get_vocab(max_size=config['max_size'],
                                         min_freq=config['min_freq'])
vocab_size = len(index2word)
oov_size = len(word2index) - len(index2word)

# persist the vocabulary mappings so they can be reused at inference time
with open(word2index_path, 'wb') as handle:
    pickle.dump(word2index, handle)
with open(index2word_path, 'wb') as handle:
    pickle.dump(index2word, handle)

# load pre-trained GloVe vectors for the vocabulary and cache them to disk
glove = load_glove(config['glove_path'], vocab_size, word2index)
np.save(glove_path, glove)
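
# The load_glove helper called above is defined elsewhere in the project; a
# minimal sketch of what it might look like is below. It assumes the standard
# GloVe text format (one token per line: the token followed by its vector
# components) and a fixed embedding dimension `dim`; words without a
# pre-trained vector keep small random values. The actual implementation may
# differ.
def load_glove_sketch(glove_path, vocab_size, word2index, dim=300):
    # start every row with small random values so OOV words still get a vector
    embeddings = np.random.uniform(-0.05, 0.05, (vocab_size, dim)).astype(np.float32)
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, vector = parts[0], parts[1:]
            idx = word2index.get(word)
            # only keep vectors for words that survived the max_size/min_freq cut
            if idx is not None and idx < vocab_size:
                embeddings[idx] = np.array(vector, dtype=np.float32)
    return embeddings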