def test_compare():
    '''llr_compare on a tiny character-count fixture yields the reference scores.

    The first Counter holds the letters of 'abcabcabcababa', the second holds
    six 'c's. In the reference, 'a' and 'b' score positive and 'c' scores
    negative.
    '''
    # Local import so this test does not depend on a file-level `import math`.
    import math

    actual = llr.llr_compare(Counter('abcabcabcababa'), Counter('cccccc'))
    ref = {
        'a': 2.3050260628857417,
        'c': -3.6024043433364215,
        'b': 2.060150982796662,
    }

    # Exactly the expected keys: nothing missing, nothing extra.
    assert set(actual) == set(ref)
    # Scores are floating-point results; compare with a tight relative
    # tolerance instead of bit-for-bit equality so that a harmless change in
    # rounding or summation order does not fail the test.
    for key, expected in ref.items():
        assert math.isclose(actual[key], expected, rel_tol=1e-9)
def reduce_features(self, llr_factor, label1, label2, min_x=0):
    '''Reduce the train and test sets using the top LLR-scored tokens.

    Token counts are gathered from the training documents labelled label1 and
    label2, compared with llr.llr_compare, and the llr_factor highest- and
    lowest-scoring tokens are handed to self._reduce_helper.

    llr_factor - number of tokens to keep from each end of the score ranking
    label1     - label whose documents feed the first counter
    label2     - label whose documents feed the second counter
    min_x      - passed through unchanged to self._reduce_helper
                 (meaning defined there; not visible from this method)

    Returns a tuple (X_train_new, X_test_new): the results of
    self._reduce_helper applied to self._X_train and self._X_test.

    Documents and labels are paired positionally, so self._X_train and
    self._y_train are expected to have the same length.
    '''
    x1_counter = Counter()
    x2_counter = Counter()
    # One pass over (document, label) pairs. Updating the counters per
    # document gives the same counts as joining all documents with spaces and
    # splitting once, without building one huge intermediate string.
    for doc, label in zip(self._X_train, self._y_train):
        # Two independent checks (not elif): when label1 == label2 the
        # document must feed both counters.
        if label == label1:
            x1_counter.update(doc.split())
        if label == label2:
            x2_counter.update(doc.split())

    cmp_results = llr.llr_compare(x1_counter, x2_counter)
    scored = cmp_results.items()
    # Highest scores first; ties broken by token in ascending order.
    top_x1 = dict(sorted(scored, key=lambda kv: (-kv[1], kv[0]))[:llr_factor])
    # Lowest scores first; ties broken by token in ascending order.
    top_x2 = dict(sorted(scored, key=lambda kv: (kv[1], kv[0]))[:llr_factor])

    X_train_new = self._reduce_helper(self._X_train, top_x1, top_x2, min_x)
    X_test_new = self._reduce_helper(self._X_test, top_x1, top_x2, min_x)
    return X_train_new, X_test_new
def calculate_llr(X, y, label1, label2, n=25):
    '''Return the n highest- and n lowest-scoring tokens between two labels.

    X      - iterable of documents (strings, tokenised on whitespace)
    y      - iterable of labels, paired positionally with X
    label1 - label whose documents feed the first counter
    label2 - label whose documents feed the second counter
    n      - number of tokens to keep from each end of the score ranking

    Returns a tuple (top_x1, top_x2) of dicts mapping token -> score as
    produced by llr.llr_compare(first_counter, second_counter):
      top_x1 - the n highest scores, in descending score order
      top_x2 - the n lowest scores, in ascending score order
    Ties are broken by token in ascending order in both dicts.
    '''
    x1_counter = Counter()
    x2_counter = Counter()
    # One pass over (document, label) pairs. Updating the counters per
    # document gives the same counts as joining all documents with spaces and
    # splitting once, without building one huge intermediate string.
    for doc, label in zip(X, y):
        # Two independent checks (not elif): when label1 == label2 the
        # document must feed both counters.
        if label == label1:
            x1_counter.update(doc.split())
        if label == label2:
            x2_counter.update(doc.split())

    cmp_results = llr.llr_compare(x1_counter, x2_counter)
    scored = cmp_results.items()
    top_x1 = dict(sorted(scored, key=lambda kv: (-kv[1], kv[0]))[:n])
    top_x2 = dict(sorted(scored, key=lambda kv: (kv[1], kv[0]))[:n])
    return top_x1, top_x2
def main(focus_path, other_paths, output_path, num_terms=30):
    '''Finds the significant words in the text located at focus_path, compared
    with the rest of the corpus.

    focus_path  - string path to .txt file to focus on
    other_paths - list of string paths to .txt files comprising the rest of
                  the corpus
    output_path - string path to .txt file in which to write results
    num_terms   - number of significant terms to show; 0 writes no terms

    Writes one line per term to output_path, highest score first. Each line
    is the score, left-aligned in a 20-character column, followed by the word.
    '''
    # count() is defined elsewhere; here it is given a list of paths and its
    # result is passed straight to llr.llr_compare.
    focus_text = count([focus_path])
    other_text = count(other_paths)
    diff = llr.llr_compare(focus_text, other_text)
    ranked = sorted(diff.items(), key=lambda x: x[1])
    # Reverse first, then take a prefix. Slicing the tail with
    # ranked[-num_terms:] breaks for num_terms == 0, because ranked[-0:] is
    # the whole list. For every other value the selected terms and their
    # order are the same as with the tail slice.
    top_terms = ranked[::-1][:num_terms]
    with open(output_path, 'w') as output:
        for word, score in top_terms:
            output.write('{:<20.10} {}\n'.format(score, word))
from collections import Counter
import re

import llr


def count(file):
    '''Counts the words contained in a file

    file - path of the text file to read

    Returns a Counter mapping each word (a maximal run of \\w characters) to
    the number of times it occurs in the file.
    '''
    with open(file) as f:
        # Raw string: '\w' in a plain literal is an invalid escape sequence.
        # No newline-to-space pass is needed first: \w never matches CR or LF,
        # so line breaks already separate words.
        return Counter(re.findall(r'\w+', f.read()))


# Count words in Hamlet
hamlet = count('data/hamlet')
# and the Declaration of Independence
declaration = count('data/declaration')

# Find out which words are used more or less
diff = llr.llr_compare(hamlet, declaration)
# Ascending by score: the lowest scores come first, the highest last.
ranked = sorted(diff.items(), key=lambda x: x[1])

# Lowest scores: reported as the Declaration's (second argument's) words.
print("\nMore in Declaration of Independence")
for k, v in ranked[:10]:
    print(k, v)

# Highest scores: reported as Hamlet's (first argument's) words.
print("\nMore in Hamlet")
for k, v in ranked[-10:]:
    print(k, v)
from collections import Counter
import re

import llr


def count(file):
    '''Count the words contained in a file.

    file - path of the text file to read

    Returns a Counter mapping each word (a maximal run of \\w characters) to
    the number of times it occurs in the file.
    '''
    with open(file) as f:
        # Raw string: '\w' in a plain literal is an invalid escape sequence.
        # No newline-to-space pass is needed first: \w never matches CR or LF,
        # so line breaks already separate words.
        return Counter(re.findall(r'\w+', f.read()))


hamlet = count('data/hamlet')
declaration = count('data/declaration')

diff = llr.llr_compare(hamlet, declaration)
# Sort once, ascending by score, and reuse the ranking for both reports
# instead of sorting the same items twice.
ranked = sorted(diff.items(), key=lambda x: x[1])

# Ten lowest scores, ascending.
print("\nMore in Declaration of Independence")
for k, v in ranked[:10]:
    print(k, v)

# Ten highest scores, still in ascending order.
print("\nMore in Hamlet")
for k, v in ranked[-10:]:
    print(k, v)
def test_compare():
    '''Verify llr_compare against reference scores for a small fixture.

    Inputs are character counts of 'abcabcabcababa' (first argument) and
    'cccccc' (second argument).
    '''
    # Local import so this test does not depend on a file-level `import math`.
    import math

    actual = llr.llr_compare(Counter('abcabcabcababa'), Counter('cccccc'))
    ref = {
        'a': 2.3050260628857417,
        'c': -3.6024043433364215,
        'b': 2.060150982796662,
    }

    # The result must contain exactly the reference keys.
    assert sorted(actual) == sorted(ref)
    # Compare each score with a tight relative tolerance: exact float
    # equality would fail on last-bit differences that carry no meaning.
    mismatched = [
        key for key in ref
        if not math.isclose(actual[key], ref[key], rel_tol=1e-9)
    ]
    assert mismatched == []