Example #1
0
 def _filter_terms(self):
     '''
     Remove tokens that appear either too often or too rarely.

     Builds a throwaway index over ``self.db_documents`` and keeps only the
     tokens inside the document-frequency band reported by
     ``Index.get_filtered_terms``. Documents left with no surviving tokens
     are dropped from ``self.document_dict``; the rest have their ``tokens``
     and ``word_frequencies`` lists pruned to the surviving tokens.

     Returns:
         The corpus as a list of per-document filtered-token lists (an
         empty list is still appended for documents that were dropped,
         matching the original behavior).
     '''
     index = Index("kmeans_index")
     index.add_documents(self.db_documents)
     index.finalize()
     # Set membership is O(1); the term list may be large and is probed
     # once per token below.
     filtered_terms = set(index.get_filtered_terms(lowestf=0.1, highestf=0.2))
     corpus = []
     # Iterate over a snapshot: the original code popped entries from
     # self.document_dict while iterating it (RuntimeError on Python 3,
     # unreliable on Python 2); iteritems() also does not exist in Python 3.
     for doc_id, document in list(self.document_dict.items()):
         filtered_tokens = [t for t in document.tokens if t in filtered_terms]

         if not filtered_tokens:
             # All tokens were removed, so this document is worthless.
             self.document_dict.pop(doc_id)
         else:
             # Prune the token list and the word-frequency list so both
             # reflect only the surviving tokens.
             kept = set(filtered_tokens)
             document.tokens = filtered_tokens
             document.word_frequencies = [
                 item for item in document.word_frequencies
                 if item.word in kept
             ]

         corpus.append(filtered_tokens)
     return corpus
Example #2
0
My playground!
'''
import unittest, os
from analysis.index import Index
from database.warehouse import WarehouseServer
from database.model.tweets import TwoGroupsTweet

# Build one shared, searchable index over a fixed 100-tweet sample at import
# time so every test case in this module runs against the same data.
BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
index_path = BASE_PATH + "test_index"

ws = WarehouseServer()
sample_docs = ws.get_n_documents(100, type=TwoGroupsTweet)

index = Index(index_path)
for sample in sample_docs:
    index.add_document(sample)
index.finalize()

class TestPlayground(unittest.TestCase):
  
    def test_searching(self):
        """Searching for 'sales' returns exactly the two known tweet ids."""
        hits = index.search_by_term("sales")
        calculated = [hit.get('id') for hit in hits]

        expected = ['4f2d602780286c38a7000013', '4f2d603280286c38a700001e']
        self.assertEqual(expected, calculated)
    
    def test_top_terms_index(self):
        results = index.get_top_terms(10)
Example #3
0
My playground!
'''
import unittest, os
from analysis.index import Index
from database.warehouse import WarehouseServer
from database.model.tweets import TwoGroupsTweet

# Module-level fixture: fetch a 100-tweet sample once and index it so the
# test cases below all query the same pre-built index.
BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
index_path = BASE_PATH + "test_index"

ws = WarehouseServer()
sample_docs = ws.get_n_documents(100, type=TwoGroupsTweet)

index = Index(index_path)
for tweet in sample_docs:
    index.add_document(tweet)
index.finalize()


class TestPlayground(unittest.TestCase):
    def test_searching(self):
        """Term search for 'sales' must hit the two expected documents."""
        expected = ['4f2d602780286c38a7000013', '4f2d603280286c38a700001e']

        results = index.search_by_term("sales")
        found_ids = [doc.get('id') for doc in results]

        self.assertEqual(expected, found_ids)

    def test_top_terms_index(self):
        results = index.get_top_terms(10)