def validateResponse(self, response, queryTerms):
    """Print a frequency table of the page's terms that occur in the query.

    Parameters:
        response: HTTP response object (reads .status_code, .text, .url).
        queryTerms: whitespace-separated query string to match against.

    On a non-200 response, only a diagnostic line is printed.
    """
    from bs4 import BeautifulSoup
    # Guard clause: report and bail out on anything but a 200.
    if response.status_code != 200:
        print("[-] Response for %s is %s " % (response.url, response.status_code))
        return
    soup = BeautifulSoup(response.text)
    from irlib.preprocessor import Preprocessor
    from irlib.matrix import Matrix
    prep = Preprocessor()
    mx = Matrix()
    terms = prep.ngram_tokenizer(text=soup.get_text())
    mx.add_doc(doc_id=response.url, doc_terms=terms,
               frequency=True, do_padding=True)
    # Counter(iterable) replaces the manual counting loop.
    cnt = Counter(terms)
    # Hoisted out of the loop; plain .lower() also fixes the
    # UnicodeEncodeError that .encode('ascii') raised on non-ASCII terms.
    query_words = queryTerms.lower().split()
    tableTerms = PrettyTable(["Term", "Frequency"])
    for word in sorted(cnt, key=cnt.get, reverse=True):
        if word.lower() in query_words:
            tableTerms.add_row([word, cnt[word]])
    print(tableTerms)
def __validateResponse(self, response, queryTerms):
    """Print a Texttable of the page's terms that occur in the query.

    Parameters:
        response: HTTP response object (reads .status_code, .text, .url).
        queryTerms: whitespace-separated query string to match against.

    On a non-200 response, only a diagnostic line is printed.
    """
    from bs4 import BeautifulSoup
    # Guard clause: report and bail out on anything but a 200.
    if response.status_code != 200:
        print("[-] Response for %s is %s " % (response.url, response.status_code))
        return
    soup = BeautifulSoup(response.text)
    from irlib.preprocessor import Preprocessor
    from irlib.matrix import Matrix
    prep = Preprocessor()
    mx = Matrix()
    terms = prep.ngram_tokenizer(text=soup.get_text())
    mx.add_doc(doc_id=response.url, doc_terms=terms,
               frequency=True, do_padding=True)
    # Counter(iterable) replaces the manual counting loop.
    cnt = Counter(terms)
    # Hoisted: the query was re-lowered and re-split on every iteration.
    query_words = queryTerms.lower().split()
    table = Texttable()
    table.set_cols_align(["l", "l"])
    table.set_cols_valign(["m", "m"])
    table.set_cols_width([40, 55])
    rows = [["Term", "Frequency"]]
    for word in sorted(cnt, key=cnt.get, reverse=True):
        if word.lower() in query_words:
            rows.append([word, cnt[word]])
    table.add_rows(rows)
    print(table.draw() + "\n")
def test_get_doc_by_id(self):
    """A document added with a string id can be found again via docs.index()."""
    matrix = Matrix()
    samples = ['hello', 'how are you', 'fine thank you']
    for doc_id, text in enumerate(samples):
        matrix.add_doc(doc_id=str(doc_id), doc_class='Email',
                       doc_terms=text.split(),
                       do_padding=True, frequency=True)
    position = matrix.docs.index('1')
    self.assertEqual(matrix.docs[position]['id'], '1')
def test_white_and_black_lists(self):
    """With both lists set, only the whitelisted term appears in the vocabulary."""
    mx = Matrix(whitelist=['test'], blacklist=['this', 'is', 'a'])
    mx.add_doc(doc_id='doc1', doc_class='TestClass',
               doc_terms=['this', 'is', 'a', 'new', 'test'],
               do_padding=True, frequency=True)
    self.assertItemsEqual(mx.vocabulary(), ['test'])
def test_meta_data(self):
    """Extra meta_data keys are stored on the document record."""
    mx = Matrix()
    for idx, text in enumerate(['hello', 'world']):
        mx.add_doc(doc_id=str(idx), doc_class='Email',
                   doc_terms=text.split(),
                   do_padding=True, frequency=True,
                   meta_data={'original_text': text,
                              'original_text_len': len(text)})
    second_doc = mx.docs[1]
    self.assertEqual(second_doc['original_text'], 'world')
    self.assertEqual(second_doc['original_text_len'], 5)
def test_docs_unique_ids(self):
    """add_doc with unique_ids=True must not add a duplicate document id."""
    mx = Matrix()
    for idx, text in enumerate(['hello', 'how are you', 'fine thank you']):
        mx.add_doc(doc_id=str(idx), doc_class='Email',
                   doc_terms=text.split(),
                   do_padding=True, frequency=True)
    # doc_id '1' already exists, so this insertion should be rejected.
    mx.add_doc(doc_id='1', doc_class='Email', doc_terms=['goodbye'],
               do_padding=True, frequency=True, unique_ids=True)
    self.assertEqual(len(mx), 3)
def readfiles(fold_path='all-folds/fold1/'):
    """Read every file under fold_path into a Matrix and print summary stats.

    Parameters:
        fold_path: directory whose files are each tokenized into one document.

    Fixes: files are now closed via a context manager (the original leaked
    every handle), and paths are built with os.path.join instead of
    '%s/%s' (which produced a double slash when fold_path ends with '/').
    """
    prep = Preprocessor()
    mx = Matrix()
    for filename in os.listdir(fold_path):
        with open(os.path.join(fold_path, filename), 'r') as fd:
            file_data = fd.read()
        terms = prep.ngram_tokenizer(text=file_data)
        mx.add_doc(doc_id=filename, doc_terms=terms,
                   frequency=True, do_padding=True)
    # %-formatting keeps the output identical on Python 2 and 3.
    print('Number of read documents: %d' % len(mx.docs))
    print('Number of read terms %d' % len(mx.terms))
    print(mx.terms)
    print(mx.docs)
# Load the three modules:
from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix
from irlib.metrics import Metrics
import difflib

# Create instances for their classes:
prep = Preprocessor()
mx = Matrix()
metric = Metrics()
q_vector = []


def generateMatrix():
    """Build a term-frequency Matrix from the training source file and dump it.

    Each line of the input file becomes one document whose doc_id is its
    1-based line number; progress is printed every 1000 lines.
    """
    # 'with' guarantees the file handle is closed (the original leaked it),
    # and iterating the file object streams lines instead of readlines()
    # loading the whole file into memory. enumerate(start=1) also fixes the
    # off-by-one progress report (the counter was bumped before the check,
    # so "every 1000" fired after 999 documents).
    path = './content_transfer_data/roto-sent-data.train-ONLYTAGS.src'
    with open(path, 'r') as fd:
        for count, line in enumerate(fd, 1):
            terms = [token.strip() for token in line.split(' ')]
            mx.add_doc(doc_id=str(count), doc_terms=terms,
                       frequency=True, do_padding=True)
            if count % 1000 == 0:
                print(count)
    mx.dump('IRmatrix.train.src.csv', delimiter='\t', header=True)
def setUp(self):
    # Give every test case a fresh, empty Matrix instance.
    self.m = Matrix()
def __init__(self):
    """Create the backing Matrix and a lower-casing, stemming Preprocessor."""
    self._mx = Matrix()
    # r'\W+' — raw string so '\W' reaches the regex engine untouched
    # (a bare '\W' in a normal string is a deprecated invalid escape on
    # modern interpreters; the runtime value is unchanged).
    self._prep = Preprocessor(pattern=r'\W+', lower=True, stem=True)