def test_bulid_word_index(self):
    # Three concepts with overlapping words (presumably the third Concept
    # argument is its word list -- confirm against the Concept definition).
    concepts = [
        Concept(0, 'title0', ['a', 'b', 'c']),
        Concept(1, 'title1', ['b', 'c']),
        Concept(2, 'title2', ['x', 'c']),
    ]

    index = dbb.build_word_index(concepts)

    # Compared as sets: only membership is checked, so neither the order of
    # the returned index nor repeated entries affect the outcome.
    self.assertEqual({'a', 'b', 'c', 'x'}, set(index))
def test_bulid_word_index(self):
    # Words 'b' and 'c' occur in more than one concept; the expected index
    # holds every distinct word once.
    wanted = {'a', 'b', 'c', 'x'}
    sample_concepts = [
        Concept(0, 'title0', ['a', 'b', 'c']),
        Concept(1, 'title1', ['b', 'c']),
        Concept(2, 'title2', ['x', 'c']),
    ]

    # The result is converted to a set before comparing, so this checks
    # membership only (ordering and duplicates are not verified).
    got = set(dbb.build_word_index(sample_concepts))
    self.assertEqual(wanted, got)
def build(self, wf=None, normalization=True):
    '''
    Build the inverted index for self.concepts_list and wrap it in a
    DatabaseWrapper.

    Steps, in order: build the word index (stored on self.word_index),
    build the word => position lookup, build the DF vector, build the
    weight table, optionally normalize it, then create the wrapper.

    @param wf: optional workflow object used for debugging; when truthy,
        the word index, the DF vector and the weight table are attached
        to it as attributes
    @param normalization: when true, normalize() is called on the weight
        table before it is handed to the DatabaseWrapper
    @returns: DatabaseWrapper
    '''
    _log.info("Start building inverted index")
    _log.info("Normalization={}".format(normalization))

    _log.info("Building word index")
    # Unique enumeration of words: a list of words in which a word's
    # index is its position in the list.
    self.word_index = build_word_index(self.concepts_list)
    _log.info("Number of terms={}".format(len(self.word_index)))
    _log.info("Number of concepts={}".format(len(self.concepts_list)))

    # Reverse lookup: word => its index in word_index.
    word_positions = build_index_by_words(self.word_index)

    # Document frequency: number of docs per word.
    doc_freq = build_df(word_positions, self.concepts_list)
    _log.info("DF vector build is DONE")

    # Weight table, not normalized at this point.
    weights = build_wieght_table_dok(doc_freq, word_positions, self.concepts_list)
    _log.info("ID-TDF vector build is DONE")

    if normalization:
        # The return value is not used; the same table object is passed on.
        normalize(weights)
        _log.info("Normalization is DONE")

    wrapper = DatabaseWrapper(weights, self.concepts_list, self.word_index, self.stemmer)
    _log.info("Database wrapper created")

    if not wf:
        return wrapper

    # Expose the intermediate results on the workflow object for debugging.
    wf.word_index = self.word_index
    # Workaround kept from the original: wrapping in matrix() is meant to
    # force the value returned through wf to be sparse.
    wf.df_vec = matrix(doc_freq)
    wf.wieghts_mat = weights
    return wrapper
def get_word_index(self):
    '''
    Return the word index.

    When self.word_index is already set (anything other than None, including
    an empty index) it is returned as is. Otherwise an index is computed
    from self.concepts_list with build_word_index and returned; the computed
    index is not stored on the instance.

    @returns: the word index
    '''
    existing = self.word_index
    if existing is not None:
        return existing
    return build_word_index(self.concepts_list)
def test_build_word_index_empty(self):
    # With no concepts at all, the word index must be an empty list.
    self.assertEqual([], dbb.build_word_index([]))
def test_build_word_index_empty(self):
    # Edge case: an empty concepts list yields an empty word index.
    no_concepts = []
    index = dbb.build_word_index(no_concepts)
    self.assertEqual([], index)