def run_word_vectors():
    """Build the word-id -> embedding-vector mapping and dump it to JSON.

    Loads the pretrained binary vectors and the word->id vocabulary via the
    project ``utils`` helpers, selects the vectors for the known vocabulary,
    and writes the mapping to ``Const.words_id2vector_filename``.
    """
    print('reading nyt_vec.bin')
    all_w2vec = utils.read_vec_bin()
    words2id = utils.load_words()
    print('prepare w2vec')
    w2vec = utils.word_vectors(words2id, all_w2vec)
    print('dumping')
    # Fix: the original passed a bare open(...) to json.dump and never closed
    # it, leaking the handle and risking an unflushed buffer. A context
    # manager closes/flushes the file deterministically.
    with open(Const.words_id2vector_filename, 'w') as out_file:
        json.dump(w2vec, out_file)
# Script preamble: parse CLI options, load SPO triples, and fetch embedding
# vectors for every word mentioned in the triples (batched lookups).
import numpy as np
from chinese_whispers import chinese_whispers, aggregate_clusters
from utils import triples, grouper, word_vectors, words_vec

# NOTE(review): `argparse` is used below but not imported in this chunk —
# presumably imported earlier in the file; verify.
parser = argparse.ArgumentParser()
parser.add_argument('--neighbors', '-n', type=int, default=10)
parser.add_argument('--min-weight', type=float, default=0.)
# Optional output file for pickled results (opened in binary-write mode).
parser.add_argument('--pickle', type=argparse.FileType('wb'))
# Positional argument: the triples input file.
parser.add_argument('triples', type=argparse.FileType('r', encoding='UTF-8'))
# Exactly one vector source may be given: a local word2vec binary or a
# Pyro remote endpoint.
group = parser.add_mutually_exclusive_group()
group.add_argument('--w2v', default=None, type=argparse.FileType('rb'))
group.add_argument('--pyro', default=None, type=str)
args = parser.parse_args()

# word_vectors() resolves the embedding backend from args; the callback is
# invoked when neither --w2v nor --pyro was supplied.
w2v = word_vectors(
    args, lambda args: parser.error('Please set the --w2v or --pyro option.'))

# Load (subject, predicate, object) triples above the weight threshold;
# the index is not needed here.
spos, _ = triples(args.triples, min_weight=args.min_weight, build_index=False)

# Every distinct word appearing in any triple role.
vocabulary = {
    word
    for triple in spos
    for word in (triple.subject, triple.predicate, triple.object)
}

# Fetch vectors in batches of 512 words to limit per-request size.
vectors = {}
for words in grouper(vocabulary, 512):
    vectors.update(words_vec(w2v, words))

# NOTE(review): this assignment is truncated — the list literal continues
# past the end of this chunk.
spos = [
# Build token/trigram index maps, one-hot label matrices, and integer feature
# sequences for the train/dev/test splits, then record the longest sequence
# lengths (used later for padding — TODO confirm against downstream code).

# Index map for the pre-selected top trigrams.
trigram_map = make_map_from_nested(top_tokens['top_trigrams'].values())

# Full whitespace-token vocabulary of the training text.
results = set()
for sen in train['text']:
    results.update(sen.split())
# Fix (idiom): enumerate() iterates the set directly; the intermediate
# list() copy was redundant (ruff C4xx).
all_map = {w: i for i, w in enumerate(results)}

# Integer class labels per split (class_map is defined elsewhere — the 10
# below assumes exactly 10 classes; verify against class_map).
y_train_ints = np.array([class_map[label] for label in train['class']])
y_dev_ints = np.array([class_map[label] for label in dev['class']])
y_test_ints = np.array([class_map[label] for label in test['class']])
y_train = to_categorical(y_train_ints, 10)
y_dev = to_categorical(y_dev_ints, 10)
y_test = to_categorical(y_test_ints, 10)

# Word-, trigram-, and all-token feature sequences per split.
X_train_words = word_vectors(train['text'], word_map)
X_train_trigrams = trigram_vectors(train['text'], trigram_map)
X_train_all = all_vectors(train['text'], all_map)
X_dev_words = word_vectors(dev['text'], word_map)
X_dev_trigrams = trigram_vectors(dev['text'], trigram_map)
X_dev_all = all_vectors(dev['text'], all_map)
X_test_words = word_vectors(test['text'], word_map)
X_test_trigrams = trigram_vectors(test['text'], trigram_map)
X_test_all = all_vectors(test['text'], all_map)

# Fix (idiom/perf): pass generator expressions to max() instead of
# materializing throwaway lists (ruff PERF401/C4xx); same result, same
# ValueError on an empty split.
max_word_len = max(max(len(s) for s in X_train_words),
                   max(len(s) for s in X_dev_words),
                   max(len(s) for s in X_test_words))
max_tri_len = max(max(len(s) for s in X_train_trigrams),
                  max(len(s) for s in X_dev_trigrams),
                  max(len(s) for s in X_test_trigrams))