Пример #1
0
 def build_weight(self):
     """Allocate the weight matrices and register a weight per index pair.

     ``self.weight`` and ``self.weight_sum`` are created as ``n x n``
     matrices, ``n`` being ``V.shape[0]``. Afterwards ``assign_weight`` is
     called for every element of ``self.F`` and every ``i`` in
     ``range(V.shape[1])`` with the pair ``(f[i], f[(i + 1) % 3])``.
     """
     # NOTE(review): ``V`` is looked up as a module-level name, not as
     # ``self.V`` -- confirm this is intended.
     size, width = V.shape[0], V.shape[1]
     self.weight = matrix((size, size))
     self.weight_sum = matrix((size, size))

     # NOTE(review): the wrap-around uses a hard-coded 3 while the loop bound
     # is V.shape[1]; presumably both describe triangles -- verify.
     index_pairs = [(i, (i + 1) % 3) for i in range(width)]
     for entry in self.F:
         for first, second in index_pairs:
             self.assign_weight(entry[first], entry[second])
Пример #2
0
 def test__centroid(self):
     """Check ``get_vectors_centroid`` on three fixed 3-component vectors."""
     rows = ([0, 0, 8], [6, 0, 2], [3, 0, -5])
     inputs = [matrix(row) for row in rows]
     want = matrix([3, 0, 5.0 / 3])

     got = math_utils.get_vectors_centroid(inputs)

     numpy.testing.assert_array_almost_equal(
         want.todense(),
         got.todense(),
         err_msg="Centroid wrong calculations!",
     )
Пример #3
0
 def test__centroid(self):
     """The centroid of the three sample vectors must equal (3, 0, 5/3)."""
     inputs = [matrix(row) for row in ([0, 0, 8], [6, 0, 2], [3, 0, -5])]
     want = matrix([3, 0, 5.0 / 3])
     got = math_utils.get_vectors_centroid(inputs)

     # Compare dense copies of the expected and the computed result.
     want_dense = want.todense()
     got_dense = got.todense()
     numpy.testing.assert_array_almost_equal(
         want_dense, got_dense, err_msg="Centroid wrong calculations!")
Пример #4
0
def simple_wf():
    """Create a WorkFlow fixture with three tiny documents and their weights.

    The returned object carries the documents, the title and word indexes,
    the word-by-document count matrix, its term-frequency transform, the
    document-frequency vector, the log of the inverted document frequency
    and the resulting weight matrix (stored under ``wieghts_mat``).
    """
    corpus = (
        ("Testing advanced 1", "a b c c c d d d d e"),
        ("Testing advanced 2", "a a a a a b c c c d d d e e"),
        ("Testing advanced 3", "b b b b f f f f"),
    )
    # Occurrences of each word (row) in each document (column).
    counts_per_word = (
        ('a', [1, 5, 0]),
        ('b', [1, 1, 4]),
        ('c', [3, 3, 0]),
        ('d', [4, 3, 0]),
        ('e', [1, 2, 0]),
        ('f', [0, 0, 4]),
    )

    wf = WorkFlow()
    wf.docs = [Doc(title, text) for title, text in corpus]

    # Indexes: one title per document, one entry per distinct word.
    wf.title_index = [doc.title for doc in wf.docs]
    wf.word_index = ['a', 'b', 'c', 'd', 'e', 'f']

    # Raw counts and their element-wise term-frequency transform.
    wf.count_matrix = array([row for _, row in counts_per_word])
    wf.tf_mat = vectorize(math_utils.count_to_tf)(wf.count_matrix)

    # Log of the inverted document frequency, one value per word.
    wf.n_docs = float(len(wf.docs))
    wf.df_vec = array([2.0, 3.0, 2.0, 2.0, 2.0, 1.0])
    wf.idf_vec = wf.n_docs / wf.df_vec
    wf.log_idf_vec = log(wf.idf_vec)

    # Scale every word row of the tf matrix by that word's log-idf.
    per_word_scale = wf.log_idf_vec[:, None]
    wf.wieghts_mat = matrix(wf.tf_mat * per_word_scale)

    return wf
Пример #5
0
def generate_Stream_GM():
    """Build a random boolean ``Stream x GM`` matrix.

    For every row, 20 column indices are drawn with ``np.random.randint``
    from ``[0, GM)`` and the corresponding entries are set to 1.
    """
    links = matrix((Stream, GM), dtype=np.bool_)
    for row in tqdm(range(Stream)):
        for col in np.random.randint(low=0, high=GM, size=20):
            links[row, col] = 1
    return links
Пример #6
0
def simple_wf():
    """Return a WorkFlow fixture built from three small sample documents.

    Besides the documents, the fixture holds the title/word indexes, the
    count matrix (one row per word, one column per document), the tf
    matrix derived from it, the df vector, the log-idf vector and the
    weight matrix (attribute ``wieghts_mat``).
    """
    wf = WorkFlow()

    # documents
    sample_texts = [
        ("Testing advanced 1", "a b c c c d d d d e"),
        ("Testing advanced 2", "a a a a a b c c c d d d e e"),
        ("Testing advanced 3", "b b b b f f f f"),
    ]
    wf.docs = [Doc(title, body) for title, body in sample_texts]

    # preprocessing indexes
    wf.title_index = [doc.title for doc in wf.docs]
    wf.word_index = ['a', 'b', 'c', 'd', 'e', 'f']

    # tf matrix: counts first, then the vectorized count -> tf transform
    rows = (
        [1, 5, 0],  # a
        [1, 1, 4],  # b
        [3, 3, 0],  # c
        [4, 3, 0],  # d
        [1, 2, 0],  # e
        [0, 0, 4],  # f
    )
    wf.count_matrix = array(list(rows))
    to_tf = vectorize(math_utils.count_to_tf)
    wf.tf_mat = to_tf(wf.count_matrix)

    # log-idf vector: log of (number of docs / document frequency)
    wf.n_docs = float(len(wf.docs))
    wf.df_vec = array([2.0, 3.0, 2.0, 2.0, 2.0, 1.0])
    wf.idf_vec = wf.n_docs / wf.df_vec
    wf.log_idf_vec = log(wf.idf_vec)

    # weights: each tf row multiplied by the matching log-idf value
    column_of_log_idf = wf.log_idf_vec[:, None]
    wf.wieghts_mat = matrix(wf.tf_mat * column_of_log_idf)

    return wf
Пример #7
0
 def getSimpleDb(self):
     """Return a DatabaseWrapper over a fixed 3 x 2 weight matrix.

     The wrapper is created with the concepts ``c1``/``c2``, the words
     ``a``/``b``/``c`` and a ``StopWordsStemmer`` built from an empty list.
     """
     concepts = ['c1', 'c2']
     words = ['a', 'b', 'c']
     weights = matrix([
         [0.5, 0.5],
         [0.2, 0.8],
         [1.0, 0.0],
     ])
     return DatabaseWrapper(weights, concepts, words, StopWordsStemmer([]))
Пример #8
0
def generate_WM_Stream():
    """Build a random boolean ``WM x Stream`` matrix.

    For every row, ``average_stream_per_vox`` column indices are drawn with
    ``np.random.randint`` from ``[0, Stream)`` and those entries are set to 1.
    """
    # NOTE: the construction below is specific to a DOK matrix.
    links = matrix((WM, Stream), dtype=np.bool_)
    for row in tqdm(range(WM)):
        picked = np.random.randint(
            low=0, high=Stream, size=average_stream_per_vox)
        for col in picked:
            links[row, col] = 1
    return links
Пример #9
0
 def test_simple(self):
     """The centroid of the text "a b c" must be (1.7/3, 1.3/3)."""
     # arrange
     database = self.getSimpleDb()
     sample_text = "a b c"
     want = matrix([1.7 / 3, 1.3 / 3])

     # act
     got = database.get_text_centroid(sample_text)

     # assert
     numpy.testing.assert_array_almost_equal(
         want.todense(),
         got.todense(),
         err_msg="wrong centroid",
     )
Пример #10
0
def get_vectors_centroid(list_of_vectors):
    """Return the centroid of a list of equally shaped scipy vectors.

    @param list_of_vectors: list of vectors that all share one shape
    @returns: the sum of the vectors scaled by ``1.0 / len(list_of_vectors)``,
        or ``None`` when the list is empty
    """
    count = len(list_of_vectors)
    if not count:
        return None
    # Fold all vectors onto an empty matrix shaped like the first one
    # (for a 1d vector the shape holds its length).
    total = sum(list_of_vectors, matrix(list_of_vectors[0].shape))
    return total * (1.0 / count)
Пример #11
0
    def test_migration(self):
        """Check that ``simple_wf`` reproduces the weights formerly used in test__advanced_doc."""
        want_rows = [
            [0.40546511, 1.05803603, 0.],
            [0., 0., 0.],
            [0.85091406, 0.85091406, 0.],
            [0.9675591, 0.85091406, 0.],
            [0.40546511, 0.6865121, 0.],
            [0., 0., 2.62161231],
        ]
        want = matrix(want_rows)

        got = simple_wf().wieghts_mat

        assert_allclose(got.todense(), want.todense())
Пример #12
0
def get_vectors_centroid(list_of_vectors):
    """Compute the centroid of scipy vectors that share the same dimensions.

    @param list_of_vectors: list of equally shaped vectors
    @returns: the accumulated vectors multiplied by ``1.0 / n`` (``n`` being
        the list length), or ``None`` for an empty list
    """
    total_vectors = len(list_of_vectors)
    if total_vectors == 0:
        return None

    # The accumulator takes the shape of the first vector; on a 1d vector
    # the shape holds the length.
    accumulator = matrix(list_of_vectors[0].shape)
    for item in list_of_vectors:
        accumulator = accumulator + item

    scale = 1.0 / total_vectors
    return accumulator * scale
Пример #13
0
 def get_word_vector(self, word):
     """
         Row representation of the word in Wiki concepts.

         @param word: the word to look up in ``self.index_by_word``
         @returns: the row ``self.wieght_matrix[index, :]`` for a known word,
             otherwise an empty ``1 x self.concepts_num`` matrix
     """
     # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
     if word in self.index_by_word:
         index = self.index_by_word[word]
         return self.wieght_matrix[index, :]
     # word is not in the corpus: return an empty vector
     return matrix((1, self.concepts_num))
Пример #14
0
 def get_word_vector(self, word):
     """
         Row representation of the word in Wiki concepts.

         @param word: the word whose row is requested
         @returns: ``self.wieght_matrix[index, :]`` when the word is present
             in ``self.index_by_word``; an empty matrix of shape
             ``(1, self.concepts_num)`` otherwise
     """
     # Membership test via ``in``: dict.has_key() was removed in Python 3,
     # while ``in`` behaves the same on Python 2 and 3.
     if word not in self.index_by_word:
         # if word is not in corpus: return empty vector
         return matrix((1, self.concepts_num))
     row = self.index_by_word[word]
     return self.wieght_matrix[row, :]
Пример #15
0
    def test_migration(self):
        """Verify the ``simple_wf`` weights against the values formerly used in test__advanced_doc."""
        want = matrix([
            [0.40546511, 1.05803603, 0.0],
            [0.0, 0.0, 0.0],
            [0.85091406, 0.85091406, 0.0],
            [0.9675591, 0.85091406, 0.0],
            [0.40546511, 0.6865121, 0.0],
            [0.0, 0.0, 2.62161231],
        ])

        fixture = simple_wf()
        got = fixture.wieghts_mat

        got_dense = got.todense()
        want_dense = want.todense()
        assert_allclose(got_dense, want_dense)
Пример #16
0
    def test__advanced_doc(self):
        """Build the database from the ``simple_wf`` documents and compare with the fixture.

        The build runs with ``normalization=False``; the resulting df vector
        and weight matrix must match the ones stored in the fixture.
        """
        reference = simple_wf()

        db_builder = DbBuilder(StopWordsStemmer([]))
        for document in reference.docs:
            db_builder.add_document(document)

        built = WorkFlow()
        db_builder.build(wf=built, normalization=False)

        # workaround for a dimensions mismatch: wrap the expected df vector
        reference.df_vec = matrix(reference.df_vec)
        assert_allclose(built.df_vec.todense(), reference.df_vec.todense())
        assert_allclose(
            built.wieghts_mat.todense(),
            reference.wieghts_mat.todense(),
        )
Пример #17
0
    def test__advanced_doc(self):
        """Feed the ``simple_wf`` documents to a DbBuilder and check the build output.

        With normalization switched off, the df vector and the weight matrix
        written into the work flow must equal the fixture values.
        """
        fixture = simple_wf()
        result_wf = WorkFlow()

        builder = DbBuilder(StopWordsStemmer([]))
        for item in fixture.docs:
            builder.add_document(item)
        builder.build(wf=result_wf, normalization=False)

        # workaround to handle dimensions mismatch
        fixture.df_vec = matrix(fixture.df_vec)

        got_df = result_wf.df_vec.todense()
        want_df = fixture.df_vec.todense()
        assert_allclose(got_df, want_df)

        got_weights = result_wf.wieghts_mat.todense()
        want_weights = fixture.wieghts_mat.todense()
        assert_allclose(got_weights, want_weights)
Пример #18
0
    def build(self, wf=None, normalization=True):
        """Build a DatabaseWrapper from ``self.concepts_list``.

        @param wf: optional workflow for debug purposes; when truthy it
            receives the word index, the df vector (wrapped in ``matrix``)
            and the weight table
        @param normalization: when true, ``normalize`` is called on the
            weight table before it is wrapped
        @returns: DatabaseWrapper
        """
        _log.info("Start building inverted index")
        _log.info("Normalization={}".format(normalization))

        _log.info("Building word index")
        # Unique enumeration of words: a word's index is its position in the list.
        self.word_index = build_word_index(self.concepts_list)
        _log.info("Number of terms={}".format(len(self.word_index)))
        _log.info("Number of concepts={}".format(len(self.concepts_list)))

        # Mapping word => index in word_index.
        word_positions = build_index_by_words(self.word_index)

        # Number of docs per word.
        doc_freq = build_df(word_positions, self.concepts_list)
        _log.info("DF vector build is DONE")

        # Weight table, not normalized yet.
        weight_table = build_wieght_table_dok(
            doc_freq, word_positions, self.concepts_list)
        _log.info("ID-TDF vector build is DONE")

        if normalization:
            # NOTE(review): the return value is discarded, so this relies on
            # normalize() changing its argument in place -- confirm.
            normalize(weight_table)
            _log.info("Normalization is DONE")

        wrapper = DatabaseWrapper(
            weight_table, self.concepts_list, self.word_index, self.stemmer)
        _log.info("Database wrapper created")

        if not wf:
            return wrapper

        wf.word_index = self.word_index
        # Workaround to force the returned wf to be sparse.
        wf.df_vec = matrix(doc_freq)
        wf.wieghts_mat = weight_table
        return wrapper
Пример #19
0
 def build(self, wf=None, normalization=True):
     """Build a DatabaseWrapper from ``self.concepts_list``.

     @param wf: optional workflow for debug purposes; when truthy it
         receives the word index, the df vector (wrapped in ``matrix``)
         and the weight table
     @param normalization: when true, ``normalize`` is called on the
         weight table before it is wrapped
     @returns: DatabaseWrapper
     """
     _log.info("Start building inverted index")
     _log.info("Normalization={}".format(normalization))

     _log.info("Building word index")
     # Unique enumeration of words: a word's index is its position in the list.
     self.word_index = build_word_index(self.concepts_list)
     _log.info("Number of terms={}".format(len(self.word_index)))
     _log.info("Number of concepts={}".format(len(self.concepts_list)))

     # Mapping word => index in word_index.
     word_positions = build_index_by_words(self.word_index)

     # Number of docs per word.
     doc_freq = build_df(word_positions, self.concepts_list)
     _log.info("DF vector build is DONE")

     # Weight table, not normalized yet.
     weight_table = build_wieght_table_dok(
         doc_freq, word_positions, self.concepts_list)
     _log.info("ID-TDF vector build is DONE")

     if normalization:
         # NOTE(review): the return value is discarded, so this relies on
         # normalize() changing its argument in place -- confirm.
         normalize(weight_table)
         _log.info("Normalization is DONE")

     wrapper = DatabaseWrapper(
         weight_table, self.concepts_list, self.word_index, self.stemmer)
     _log.info("Database wrapper created")

     if not wf:
         return wrapper

     wf.word_index = self.word_index
     # Workaround to force the returned wf to be sparse.
     wf.df_vec = matrix(doc_freq)
     wf.wieghts_mat = weight_table
     return wrapper