Example #1
    def test_shouldCheckForConsistencyOfLSIModel(self):
        import os
        # FileReader, TextProcessor and LSITransformation are provided by the surrounding codebase.
        filepath = os.path.join(os.path.dirname(__file__), "test_data/reuters_rupee_decline/doc1")
        text = FileReader.read(filepath)
        processor = TextProcessor()
        sentences = processor.nltk_sentences(text)
        # Map each sentence index to its stop-word-filtered token list.
        tokenised_sentence_map = {index: processor.stopped_tokenize(sentence)
                                  for index, sentence in enumerate(sentences)}

        # Rebuild the LSI transformation several times to check that the
        # model comes out consistent across runs.
        for i in range(5):
            print("\n\n************* ITERATION", i, "*************")
            lsi_transformation = LSITransformation(tokenised_sentence_map)
            lsi_transformation.print_transformation()
Example #2
    def best_community(self, community_levels, tokenised_sentences_dict):
        best_communities = self.dissimilar_sentences.find_best_community_level(community_levels)
        communities_subgraphs = best_communities.subgraphs()
        best_community_id = 0
        best_community_index = 0.0
        text_processor = TextProcessor()
        for community_id, community in enumerate(communities_subgraphs):
            vertices = community.vs["name"]
            # Average the information index over every sentence in the community.
            sigma_info_index = 0.0
            for vertex in vertices:
                sentence = tokenised_sentences_dict[vertex]
                sigma_info_index += text_processor.information_index(sentence)
            sigma_info_index /= float(len(vertices))
            # Keep the community with the highest average information index.
            if best_community_index < sigma_info_index:
                best_community_index = sigma_info_index
                best_community_id = community_id

        return communities_subgraphs[best_community_id]
Example #3
    def __init__(self, list_of_tokens):
        # Collapse the token lists into a single set, then collect synonyms for it.
        set_of_tokens = Sets.union_all(list_of_tokens)
        self.synonyms = TextProcessor.synonyms_for(set_of_tokens)