def _filter_terms(self):
    """Remove tokens that appear either too often or too rarely.

    Builds a temporary index over ``self.db_documents``, keeps only the
    terms whose frequency falls inside the accepted band, and rewrites
    each document's ``tokens`` and ``word_frequencies`` lists to contain
    just those terms. Documents left with no surviving tokens are
    dropped from ``self.document_dict`` entirely.

    Returns:
        list: one filtered token list per surviving document (the
        corpus), in ``self.document_dict`` iteration order.
    """
    index = Index("kmeans_index")
    index.add_documents(self.db_documents)
    index.finalize()
    # NOTE(review): thresholds presumably mean "keep terms whose
    # frequency is between 10% and 20%" -- confirm against
    # Index.get_filtered_terms. A set gives O(1) membership tests in
    # the loops below (the original used an O(n) list scan per token).
    filtered_terms = set(index.get_filtered_terms(lowestf=0.1, highestf=0.2))

    corpus = []
    # Iterate over a snapshot of the items: the original code popped
    # entries out of self.document_dict while iterating it with
    # iteritems(), which raises "dictionary changed size during
    # iteration" as soon as any document is discarded.
    # (list(...items()) works on both Python 2 and 3.)
    for doc_id, document in list(self.document_dict.items()):
        filtered_tokens = [t for t in document.tokens if t in filtered_terms]
        if not filtered_tokens:
            # Every token was filtered out: the document carries no
            # usable signal, so drop it from the collection.
            self.document_dict.pop(doc_id)
        else:
            # Keep only the surviving tokens and the word-frequency
            # entries that refer to them. ``kept`` mirrors the
            # original's "item.word in filtered_tokens" check (word
            # must survive for THIS document), just as a set.
            kept = set(filtered_tokens)
            document.tokens = filtered_tokens
            document.word_frequencies = [
                item for item in document.word_frequencies
                if item.word in kept
            ]
            corpus.append(filtered_tokens)
    return corpus
My playground! '''
# NOTE(review): the line above is the tail of the module docstring; its
# opening quotes are outside the visible portion of this file.

import unittest, os

from analysis.index import Index
from database.warehouse import WarehouseServer
from database.model.tweets import TwoGroupsTweet

# Root folder holding the project's test data.
BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
index_path = BASE_PATH + "test_index"

# Module-level fixture: fetch 100 sample tweets from the warehouse and
# build a searchable index once, shared by every test below.
ws = WarehouseServer()
sample_docs = ws.get_n_documents(100, type=TwoGroupsTweet)
index = Index(index_path)
for doc in sample_docs:
    index.add_document(doc)
index.finalize()


class TestPlayground(unittest.TestCase):
    # Exploratory tests exercising the prebuilt module-level index.

    def test_searching(self):
        # Searching for "sales" should return exactly the two known
        # documents, identified by their stored ids, in this order.
        results = index.search_by_term("sales")
        calculated = []
        for doc in results:
            calculated.append(doc.get('id'))
        expected = ['4f2d602780286c38a7000013', '4f2d603280286c38a700001e']
        self.assertEqual(expected, calculated)

    def test_top_terms_index(self):
        # NOTE(review): this method appears truncated in the visible
        # source -- it fetches the top 10 terms but asserts nothing;
        # confirm the full file for the missing assertions.
        results = index.get_top_terms(10)