from operator import itemgetter

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.metrics import BigramAssocMeasures
from nltk.probability import ConditionalFreqDist, FreqDist
from sklearn.feature_extraction.text import CountVectorizer

# Local helper module used throughout this project for basic tokenization
import token_helpers


def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """
    Finds the words that best distinguish positive reviews from negative ones

    Parameters
    ----------
    posids : list of dicts
        Positive reviews, each with a 'text' field
    negids : list of dicts
        Negative reviews, each with a 'text' field
    cutoff : int
        The maximum number of reviews to use from each list
    score_fn : function
        The association measure used to score each word (chi-squared by default)
    min_score : int
        Currently unused

    Returns
    -------
    set of strs
        The 10,000 highest-scoring words
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    # Use at most `cutoff` reviews from each label
    for pos, review in enumerate(posids):
        if pos < cutoff:
            for word in review['text'].split(' '):
                tokens = token_helpers.tokenize_simple(word)
                word_fd.update(tokens)
                label_word_fd['pos'].update(tokens)

    for neg, review in enumerate(negids):
        if neg < cutoff:
            for word in review['text'].split(' '):
                tokens = token_helpers.tokenize_simple(word)
                word_fd.update(tokens)
                label_word_fd['neg'].update(tokens)

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # Score each word by how strongly it is associated with either label
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_fn(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_fn(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    return set(w for w, s in best)
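
# Usage sketch for high_words (not called anywhere in the module). The toy
# reviews below are invented; they only illustrate the {'text': ...} shape the
# Yelp data takes elsewhere in this project.
def _example_high_words():
    pos_reviews = [{"text": "great pasta and a friendly attentive waiter"}]
    neg_reviews = [{"text": "cold pasta and a rude slow waiter"}]
    # Returns the set of words whose chi-squared scores best separate the labels
    return high_words(pos_reviews, neg_reviews, cutoff=100)
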
def tokenize_advanced(text, weight):
    """
    Tokenizes a string and returns it as a bag of words, including bigrams

    Parameters
    ----------
    text : str
        A string of raw text to be tokenized
    weight : int
        The weight to give to each bigram

    Returns
    -------
    list of strs
        One string for each token in the document, in the original order,
        followed by each adjective bigram repeated `weight` times
    """
    tokens = token_helpers.tokenize_simple(text)
    tagged_tokens = pos_tag(tokens, tagset="universal")
    bigrams = []

    # Will be replaced later with internal functions from a pos.py file
    for index, t in enumerate(tagged_tokens):
        if t[1] == "ADJ" and index != 0:
            previous = tagged_tokens[index - 1]
            bigrams.append(previous[0] + " " + t[0])

    # Append each "previous word + adjective" bigram `weight` times
    for bigram in bigrams:
        tokens = tokens + ([bigram] * weight)
    return tokens
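
# Usage sketch for tokenize_advanced. The sentence is invented; with weight=2,
# each "previous word + adjective" bigram found by the tagger is appended twice
# after the unigrams.
def _example_tokenize_advanced():
    text = "The friendly waiter brought delicious pasta"
    # Expect bigrams such as "the friendly" and "brought delicious" (exact
    # strings depend on how token_helpers.tokenize_simple normalizes the text)
    return tokenize_advanced(text, weight=2)
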
def get_count_vect(data):
    """
    Takes json data and returns a document term matrix consisting of all the
    tokens within the text of the review data and how many times they are used
    (with stopwords removed)

    Parameters
    ----------
    data : list of dicts
        The yelp data to be analyzed

    Returns
    -------
    A transformed count vectorizer
        Returns a document term matrix of all tokens and their frequencies
    """
    tokens = []
    count_vect = CountVectorizer(stop_words=stopwords.words("english"))
    for review in data:
        tokens += token_helpers.tokenize_simple(review["text"])
    # Each token is passed to the vectorizer as its own "document", so the
    # column sums of the result give per-token frequencies
    return count_vect.fit_transform(tokens)
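
# Usage sketch for get_count_vect with invented reviews. Because each token is
# vectorized as its own "document", summing a column of the returned matrix
# gives that token's total count across all reviews.
def _example_get_count_vect():
    data = [{"text": "the pasta was amazing"}, {"text": "the soup was cold"}]
    matrix = get_count_vect(data)
    # Column sums give per-token frequencies
    return matrix.sum(axis=0)
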
def get_freq_dist(data):
    """
    Takes json data and returns its text as a Frequency Distribution

    Parameters
    ----------
    data : list of dicts
        The yelp data to be analyzed

    Returns
    -------
    An nltk Frequency Distribution
        Returns the frequency distribution of all tokens within the reviews of
        the yelp data
    """
    tokens = []
    for review in data:
        tokens += token_helpers.tokenize_simple(review["text"])
    tokens = token_helpers.remove_stopwords_inner(
        tokens,
        stopwords=stopwords.words("english")
        + ["time", "would", "got", "i'm", "-", "food", "like", "really", "service"],
    )
    return FreqDist(tokens)
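
# Usage sketch for get_freq_dist with invented reviews; most_common is a
# standard nltk FreqDist method for inspecting the top tokens.
def _example_get_freq_dist():
    data = [
        {"text": "the pasta was amazing and the staff was friendly"},
        {"text": "the soup was cold but the dessert was amazing"},
    ]
    return get_freq_dist(data).most_common(5)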