def get_spelling_ratios():
    if os.path.isfile("data/spelling_ratios.pkl"):
        return data.get_pickle("data/spelling_ratios.pkl")
    import tables as tb
    # Open the existing document-vector store read-only and fetch the stored
    # array (create_earray would fail on a file opened with mode "r").
    arr_file = tb.open_file("data/docvecs.hdf", "r",
                            filters=tb.Filters(complib='zlib', complevel=0))
    docvecs = arr_file.root.docvecs
    sent_lengths = get_sentence_lengths()
    sent_lengths = list(map(lambda sent: sum(sent), sent_lengths))
    print(sent_lengths[:2])
    ratios = []
    for i, embeddings in enumerate(tqdm(docvecs)):
        # Count word vectors until the first all-zero row (out-of-vocabulary
        # word or padding), then relate that count to the review's token
        # count, capped at 300.
        count = 0
        for embedding in embeddings:
            if not np.any(embedding):
                break
            count += 1
        ratio = count / min(sent_lengths[i], 300)
        ratios.append(ratio)
    data.save_pickle("data/spelling_ratios.pkl", ratios)
    return ratios
def discreticize_labels():
    if os.path.isfile("data/labels_disc.pkl"):
        return
    import data_selection as select
    ratings = select.get_selection().helpful.astype(int).tolist()
    n_values = np.max(ratings) + 1
    discrete = np.eye(n_values)[ratings]
    data.save_pickle("data/labels_disc.pkl", discrete)
    print(discrete[:2])
def extract_tags(texts):
    if os.path.isfile(TAGS_DIR):
        return data.get_pickle(TAGS_DIR)
    else:
        texts = preprocess(texts)
        texts = get_pos_tags(map(lambda text: text.split(), texts))
        tags = list(map(lambda tagged_text: [tagged[1] for tagged in tagged_text], texts))
        data.save_pickle(TAGS_DIR, tags)
        return tags
def discreticize_ratings():
    if os.path.isfile("data/ratings_disc.pkl"):
        return
    import data_selection as select
    ratings = select.get_selection().overall.astype(int).tolist()
    # Shift the 1-5 star ratings to 0-4 so they can index the identity matrix.
    ratings = list(map(lambda r: r - 1, ratings))
    n_values = np.max(ratings) + 1
    discrete = np.eye(n_values)[ratings]
    data.save_pickle("data/ratings_disc.pkl", discrete)
    print(discrete[:2])
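# Illustrative sketch only: both discreticize_* functions above rely on the
# np.eye(n)[indices] trick, which one-hot encodes integer labels by picking
# rows of the identity matrix. The helper name and sample values below are
# hypothetical and are not used anywhere else in the pipeline.
def _one_hot_example():
    sample = [4, 0, 2]                            # e.g. ratings already shifted to 0-based
    one_hot = np.eye(np.max(sample) + 1)[sample]  # shape (3, 5)
    # one_hot[0] -> array([0., 0., 0., 0., 1.])
    return one_hot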
def normalize_tag_bow():
    if os.path.isfile("data/tags_bow_norm.pkl"):
        return
    tag_counts = data.get_pickle(TAGS_BOW)
    result = []
    for tag in tag_counts:
        res = tag
        if any(tag):
            res = normalize(tag)
        result.append(res)
    data.save_pickle("data/tags_bow_norm.pkl", result)
    print(result[0])
def get_tags_bow(sentences):
    if os.path.isfile(TAGS_BOW):
        return data.get_pickle(TAGS_BOW)
    else:
        from collections import Counter
        from nltk.data import load
        # Fixed vocabulary: the Penn Treebank tag set shipped with NLTK.
        corpus = list(load('help/tagsets/upenn_tagset.pickle').keys())
        f = lambda x: Counter([y for y in x if y in corpus])
        df = pd.DataFrame({"tags": sentences})
        # Count each document's tags and expand the Counters into a table
        # whose columns follow the tag-set order, filling missing tags with 0.
        df["bow"] = (pd.DataFrame(df["tags"].apply(f).values.tolist())
                     .reindex(columns=corpus)
                     .fillna(0)
                     .astype(int)
                     .values.tolist())
        result = df["bow"].tolist()
        data.save_pickle(TAGS_BOW, result)
        return result
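# Illustrative sketch only: the reindex/fillna chain in get_tags_bow() turns
# per-document tag Counters into fixed-width count vectors aligned with the
# tag vocabulary. The tiny tag set and documents below are made up for
# demonstration; this helper is not called by the pipeline.
def _tag_bow_example():
    from collections import Counter
    corpus = ["DT", "NN", "VBZ"]                  # stand-in tag vocabulary
    docs = [["DT", "NN", "NN"], ["VBZ"]]
    counts = pd.Series(docs).apply(lambda x: Counter([t for t in x if t in corpus]))
    bow = (pd.DataFrame(counts.values.tolist())
           .reindex(columns=corpus)
           .fillna(0)
           .astype(int)
           .values.tolist())
    # bow == [[1, 2, 0], [0, 0, 1]]
    return bow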
def get_review_counts():
    import data_selection as select
    sel = select.get_selection()
    # For every review, look up how many reviews its author and its product
    # have in the selection.
    reviewer_counts = sel.reviewerID.value_counts()
    reviewer_result = []
    for _, reviewer_id in sel.reviewerID.items():
        reviewer_result.append(reviewer_counts[reviewer_id])
    product_counts = sel.asin.value_counts()
    product_result = []
    for _, product_id in sel.asin.items():
        product_result.append(product_counts[product_id])
    data.save_pickle("data/reviewer_review_counts.pkl", reviewer_result)
    data.save_pickle("data/product_review_counts.pkl", product_result)
def preprocess(texts):
    if os.path.isfile(PREPROCESSED_DIR):
        return data.get_pickle(PREPROCESSED_DIR)
    else:
        result = []
        for text in texts:
            text = clean(text)
            text = get_sentences(text)
            text = get_words(text)
            # text = get_pos_tags(text)
            # text = lemmatize(text)
            text = flatten_sentences(text)
            text = flatten_paragraph(text)
            print(text)
            result.append(text)
        data.save_pickle(PREPROCESSED_DIR, result)
        return result
def get_sent_word_distribution():
    sent_lengths = get_sentence_lengths()
    # Number of sentences per review.
    sent_nums = list(map(lambda sent: len(sent), sent_lengths))
    sent_nums_normed = normalize(sent_nums)
    # Average number of words per sentence in each review.
    word_counts = list(map(lambda sent: sum(sent) / len(sent), sent_lengths))
    word_counts_normed = normalize(word_counts)
    data.save_pickle("data/sent_counts.pkl", sent_nums)
    data.save_pickle("data/sent_counts_norm.pkl", sent_nums_normed)
    data.save_pickle("data/word_counts.pkl", word_counts)
    data.save_pickle("data/word_counts_norm.pkl", word_counts_normed)
def get_tfidf():
    documents = data.get_pickle("data/preprocessed.pkl")
    documents = list(map(lambda d: d.split(" "), documents))
    print(documents[:2])
    # The documents arrive pre-tokenised, so token_pattern is disabled and
    # both tokenizer and preprocessor are set to `dummy`, which is expected
    # to return its input unchanged.
    vectorizer = TfidfVectorizer(
        analyzer="word",
        token_pattern=None,
        stop_words="english",
        tokenizer=dummy,
        preprocessor=dummy,
        ngram_range=(1, 3),
        max_features=12000,
        max_df=0.99
    )
    matrix = vectorizer.fit_transform(documents)
    data.save_pickle("data/tfidf.pkl", matrix)
    print(vectorizer.get_feature_names())
def get_sentence_lengths():
    if os.path.isfile("data/sentence_lengths.pkl"):
        return data.get_pickle("data/sentence_lengths.pkl")
    else:
        texts = data.get_pickle("data/preprocessed.pkl")
        texts = list(map(lambda t: t.split(" "), texts))
        result = []
        for review in texts:
            review_len = len(review)
            sent_lengths = []
            c = 1
            # Walk the flat token list and close a sentence at every
            # terminating punctuation mark or at the last token.
            for i, word in enumerate(review):
                if word in ".!?" or i == review_len - 1:
                    sent_lengths.append(c)
                    c = 1
                else:
                    c += 1
            result.append(sent_lengths)
        data.save_pickle("data/sentence_lengths.pkl", result)
        return result
def get_word_counts(texts):
    from statistics import mean
    if os.path.isfile("data/word_counts.pkl"):
        return data.get_pickle("data/word_counts.pkl")
    else:
        result = []
        i = 0
        for text in texts:
            text = clean(text)
            sent = get_sentences(text)
            word = get_words(sent)
            sentence_lengths = [len(sentence) for sentence in word]
            try:
                if sentence_lengths:
                    result.append(sum(sentence_lengths))
                else:
                    result.append(0)
                i += 1
            except Exception:
                # Report the offending review and bail out without saving.
                print(text, i)
                return
        data.save_pickle("data/word_counts.pkl", result)
        return result
def normalize_word_counts():
    word_counts = data.get_pickle("data/word_counts.pkl")
    normed = normalize(word_counts)
    data.save_pickle("data/word_counts_norm.pkl", normed)
    print(normed)
def normalize_sentence_lengths():
    sentence_lengths = data.get_pickle("data/sentence_lengths.pkl")
    normed = normalize(sentence_lengths)
    data.save_pickle("data/sentence_lengths_norm.pkl", normed)
    print(normed)
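# Rough usage sketch, assuming these feature extractors are driven from a
# separate script; the ordering below only reflects the on-disk dependencies
# visible in this file (several functions read data/preprocessed.pkl, and
# get_spelling_ratios() additionally needs data/docvecs.hdf from the embedding
# step). The driver name and the `texts` argument are hypothetical.
def _feature_pipeline_example(texts):
    preprocess(texts)              # writes data/preprocessed.pkl
    get_sentence_lengths()         # per-sentence word counts per review
    get_sent_word_distribution()   # sentence/word count features (+ normalised)
    get_tfidf()                    # tf-idf matrix over the preprocessed tokens
    get_spelling_ratios()          # share of words with a non-zero embedding
    discreticize_ratings()         # one-hot star ratings
    discreticize_labels()          # one-hot helpfulness labels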