def get_word2vec():
    import data_manager as data
    import numpy as np
    import os
    from gensim.models import KeyedVectors
    from tqdm import tqdm
    import tables as tb

    print("loading word2vec...")
    model = data.get_pickle("data/word2vec.model.pkl")
    # model = KeyedVectors.load_word2vec_format(os.path.join(os.path.dirname(__file__), "data/word2vec.300d.txt"))
    # data.save_pickle("data/word2vec.model.pkl", model)
    print("word2vec loaded")

    texts = data.get_pickle("data/preprocessed.pkl")
    print("splitting sentences into words")
    texts = list(map(lambda t: t.split(" "), texts))
    print("done!")

    arr_file = tb.open_file("data/docvecs.hdf", "w", filters=tb.Filters(complib='zlib', complevel=0))
    document_vecs = None
    # arr_file.create_earray(arr_file.root, "docvecs")
    for sentence in tqdm(texts):
        word_vecs = []
        for word in sentence:
            try:
                word_vecs.append(model.get_vector(word))
            except KeyError:
                # Word is not in the word2vec vocabulary; skip it.
                pass
        # if len(word_vecs) == 0:
        #     word_vecs.append([0 for i in range(300)])
        # document_vecs.append(np.mean(word_vecs, axis=0))
        # Tried keeping order and padding but too heavy
        # Truncate to 300 words, then zero-pad so every document is a 300x300 matrix.
        if len(word_vecs) > 300:
            word_vecs = word_vecs[:300]
        len_vecs = len(word_vecs)
        for _ in range(300 - len_vecs):
            word_vecs.append([0 for _ in range(300)])
        word_vecs = np.array([word_vecs])
        if document_vecs is None:
            document_vecs = arr_file.create_earray(arr_file.root, "docvecs", obj=word_vecs)
        else:
            document_vecs.append(word_vecs)
    # data.save_pickle("data/wordvecs_np.pkl", document_vecs)
    return document_vecs
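# Illustrative read-back of the array written by get_word2vec() (a sketch, not part of
# the original pipeline): each stored document is a 300x300 block, i.e. up to 300 words,
# each a 300-dim word2vec vector, zero-padded at the end.
def _peek_docvecs(path="data/docvecs.hdf", index=0):
    import tables as tb
    with tb.open_file(path, "r") as f:
        return f.root.docvecs[index].shape  # expected: (300, 300)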
def get_spelling_ratios():
    if os.path.isfile("data/spelling_ratios.pkl"):
        return data.get_pickle("data/spelling_ratios.pkl")
    import tables as tb
    arr_file = tb.open_file("data/docvecs.hdf", "r", filters=tb.Filters(complib='zlib', complevel=0))
    # The file is opened read-only, so read the existing node instead of creating one.
    docvecs = arr_file.root.docvecs
    sent_lengths = get_sentence_lengths()
    sent_lengths = list(map(lambda sent: sum(sent), sent_lengths))
    print(sent_lengths[:2])
    ratios = []
    for i, embeddings in enumerate(tqdm(docvecs)):
        # Count the non-zero rows, i.e. the words that were found in the word2vec
        # vocabulary, before the zero padding starts.
        count = 0
        for embedding in embeddings:
            if not np.any(embedding):
                break
            count += 1
        ratio = count / min(sent_lengths[i], 300)
        ratios.append(ratio)
    data.save_pickle("data/spelling_ratios.pkl", ratios)
    return ratios
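# Worked example of the ratio above (illustrative only): a 10-token review of which
# only 4 tokens were found in the word2vec vocabulary has 4 leading non-zero rows in
# its padded matrix, so its spelling ratio is 4 / min(10, 300) = 0.4.
def _example_spelling_ratio():
    import numpy as np
    doc = np.zeros((300, 300))
    doc[:4] = 1.0  # pretend 4 tokens were recognised
    count = 0
    for row in doc:
        if not np.any(row):
            break
        count += 1
    return count / min(10, 300)  # 0.4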
def extract_tags(texts):
    if os.path.isfile(TAGS_DIR):
        return data.get_pickle(TAGS_DIR)
    else:
        texts = preprocess(texts)
        texts = get_pos_tags(map(lambda text: text.split(), texts))
        # Keep only the tag of each (word, tag) pair.
        tags = list(map(lambda tagged_text: [tagged[1] for tagged in tagged_text], texts))
        data.save_pickle(TAGS_DIR, tags)
        return tags
def normalize_tag_bow():
    if os.path.isfile("data/tags_bow_norm.pkl"):
        return
    tag_counts = data.get_pickle(TAGS_BOW)
    result = []
    for tag in tag_counts:
        res = tag
        if any(tag):
            res = normalize(tag)
        result.append(res)
    data.save_pickle("data/tags_bow_norm.pkl", result)
    print(result[0])
def get_sentence_lengths():
    if os.path.isfile("data/sentence_lengths.pkl"):
        return data.get_pickle("data/sentence_lengths.pkl")
    else:
        texts = data.get_pickle("data/preprocessed.pkl")
        texts = list(map(lambda t: t.split(" "), texts))
        result = []
        for review in texts:
            review_len = len(review)
            sent_lengths = []
            c = 1
            for i, word in enumerate(review):
                # Close a sentence on ".", "!" or "?" or at the last token of the review.
                if word in ".!?" or i == review_len - 1:
                    sent_lengths.append(c)
                    c = 1
                else:
                    c += 1
            result.append(sent_lengths)
        data.save_pickle("data/sentence_lengths.pkl", result)
        return result
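# Illustrative trace of the splitting logic above (a sketch, not part of the pipeline):
# the running counter is flushed on ".", "!" or "?" and at the final token, so the
# tokenized review ["great", "phone", ".", "works", "fine"] yields lengths [3, 2].
def _example_sentence_lengths():
    review = ["great", "phone", ".", "works", "fine"]
    lengths, c = [], 1
    for i, word in enumerate(review):
        if word in ".!?" or i == len(review) - 1:
            lengths.append(c)
            c = 1
        else:
            c += 1
    return lengths  # [3, 2]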
def get_tags_bow(sentences):
    if os.path.isfile(TAGS_BOW):
        return data.get_pickle(TAGS_BOW)
    else:
        from collections import Counter
        from nltk.data import load
        # Fixed column order: the full Penn Treebank tag set shipped with NLTK.
        corpus = list(load('help/tagsets/upenn_tagset.pickle').keys())
        f = lambda x: Counter([y for y in x if y in corpus])
        df = pd.DataFrame({"tags": sentences})
        df["bow"] = (pd.DataFrame(df["tags"].apply(f).values.tolist())
                     .reindex(columns=corpus)
                     .fillna(0)
                     .astype(int)
                     .values.tolist())
        result = df["bow"].tolist()
        data.save_pickle(TAGS_BOW, result)
        return result
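# Minimal sketch of the Counter -> reindex trick used above, on a toy tag set
# (illustrative, not the original data): tags absent from a sentence become 0,
# so every row has the same fixed-length bag-of-words layout.
def _example_tag_bow():
    import pandas as pd
    from collections import Counter
    corpus = ["NN", "VB", "JJ"]
    sentences = [["NN", "NN", "VB"], ["JJ"]]
    counts = (pd.DataFrame([Counter(s) for s in sentences])
              .reindex(columns=corpus)
              .fillna(0)
              .astype(int))
    return counts.values.tolist()  # [[2, 1, 0], [0, 0, 1]]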
def preprocess(texts):
    if os.path.isfile(PREPROCESSED_DIR):
        return data.get_pickle(PREPROCESSED_DIR)
    else:
        result = []
        for text in texts:
            text = clean(text)
            text = get_sentences(text)
            text = get_words(text)
            # text = get_pos_tags(text)
            # text = lemmatize(text)
            text = flatten_sentences(text)
            text = flatten_paragraph(text)
            print(text)
            result.append(text)
        data.save_pickle(PREPROCESSED_DIR, result)
        return result
def get_tfidf():
    documents = data.get_pickle("data/preprocessed.pkl")
    documents = list(map(lambda d: d.split(" "), documents))
    print(documents[:2])
    vectorizer = TfidfVectorizer(
        analyzer="word",
        token_pattern=None,
        stop_words="english",
        tokenizer=dummy,
        preprocessor=dummy,
        ngram_range=(1, 3),
        max_features=12000,
        max_df=0.99
    )
    matrix = vectorizer.fit_transform(documents)
    data.save_pickle("data/tfidf.pkl", matrix)
    print(vectorizer.get_feature_names())
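# The documents passed to the vectorizer above are already tokenized, so tokenization
# and preprocessing are bypassed with a passthrough. `dummy` is defined elsewhere in
# the project; a minimal sketch of such a passthrough (an assumption, not the original
# definition) is simply an identity function:
def _identity_passthrough(doc):
    return doc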
def get_word_counts(texts):
    if os.path.isfile("data/word_counts.pkl"):
        return data.get_pickle("data/word_counts.pkl")
    else:
        result = []
        i = 0
        for text in texts:
            text = clean(text)
            sent = get_sentences(text)
            word = get_words(sent)
            sentence_lengths = [len(sentence) for sentence in word]
            try:
                if sentence_lengths:
                    result.append(sum(sentence_lengths))
                else:
                    result.append(0)
                i += 1
            except Exception:
                print(text, i)
        data.save_pickle("data/word_counts.pkl", result)
        return result
        validation_size = len(self.validation_split)
        while True:
            if self.current_valid_index >= validation_size - self.n:
                self.current_valid_index = 0
            indices = self.validation_split[self.current_valid_index:self.current_valid_index + self.n]
            self.current_valid_index += self.n
            yield self.get_items(indices)

    def reset(self):
        self.current_train_index = 0
        self.current_valid_index = 0


arr_file = tb.open_file("data/docvecs.hdf", "r", filters=tb.Filters(complib='zlib', complevel=0))
# The file is opened read-only, so read the existing node instead of creating one.
embeddings = arr_file.root.docvecs
tfidf = data.get_pickle("data/tfidf.pkl").toarray()
product_counts = np.array(data.get_pickle("data/product_review_counts.pkl"))
reviewer_counts = np.array(data.get_pickle("data/reviewer_review_counts.pkl"))
ratings = np.array(data.get_pickle("data/ratings_disc.pkl"))
sent_counts = np.array(data.get_pickle("data/sent_counts_norm.pkl"))
word_counts = np.array(data.get_pickle("data/word_counts_norm.pkl"))
spell_ratios = np.array(data.get_pickle("data/spelling_ratios.pkl"))
pos_dist = np.array(data.get_pickle("data/tags_bow_norm.pkl"))
helpfulness = np.array(data.get_pickle("data/labels_disc.pkl"))

batch_size = 128
batcher = Batcher(product_counts, reviewer_counts, ratings, sent_counts, word_counts,
                  spell_ratios, pos_dist, tfidf, embeddings, helpfulness, batch_size)


def train_cnn():
def normalize_word_counts():
    word_counts = data.get_pickle("data/word_counts.pkl")
    normed = normalize(word_counts)
    data.save_pickle("data/word_counts_norm.pkl", normed)
    print(normed)
def normalize_sentence_lengths():
    sentence_lengths = data.get_pickle("data/sentence_lengths.pkl")
    normed = normalize(sentence_lengths)
    data.save_pickle("data/sentence_lengths_norm.pkl", normed)
    print(normed)
def test():
    import numpy as np
    matrix = data.get_pickle("data/tfidf.pkl").toarray()
    indices = [1, 2, 3]
    print([matrix[i] for i in indices])
def get_tags():
    return data.get_pickle("data/text_tags.pkl")