# -*- coding: utf-8 -*-
"""
Created on Mon Jun 19 14:38:15 2017

@author: kcarnold
"""
import numpy as np

#%% Load the review corpus.
from suggestion.analyzers import load_reviews

reviews = load_reviews()

#%% Count word frequencies over the training split only.
from collections import Counter
import itertools
import tqdm

vocab = Counter(
    itertools.chain.from_iterable(
        text.lower().split()
        for text in tqdm.tqdm(reviews[reviews.is_train].tokenized)))

#%%
MAX_SEQ_LEN = 100

#%% Vocabulary maps: special tokens first, then the NUM_WORDS most
# frequent training words; str2id inverts id2str.
NUM_WORDS = 20000
id2str = ['<PAD>', '<UNK>']
id2str.extend(word for word, _count in vocab.most_common(NUM_WORDS))
str2id = {word: idx for idx, word in enumerate(id2str)}

#%% Load ConceptNet Numberbatch word embeddings.
from suggestion import clustering

cnnb = clustering.ConceptNetNumberBatch.load()

#%%
EMBEDDING_DIM = 300
"""Fit the word-pair analyzer on the review corpus and pickle it."""
import pickle

from suggestion import analyzers
from suggestion.paths import paths

# Build the analyzer from the full set of reviews.
reviews = analyzers.load_reviews()
wordpair_analyzer = analyzers.WordPairAnalyzer.build(reviews)

# Serialize with the highest available pickle protocol (-1).
out_path = paths.models / 'wordpair_analyzer.pkl'
with open(out_path, 'wb') as f:
    pickle.dump(wordpair_analyzer, f, -1)