Пример #1
0
 def test_term_document_matrix2(self):
     """Check the leading rows of the 'term_doc_mtx' result of doc_term_mtx.

     A bag-of-words model is built from the 'words' column of ``self.df``
     and handed to ``doc_term_mtx``; the first five rows of the returned
     ``out_table`` are compared against fixed expected rows.
     """
     source_table = self.df

     bow_model = bow(
         table=source_table,
         input_col='words',
         add_words=None,
         no_below=1,
         no_above=0.8,
         keep_n=10000,
     )['model']
     outcome = doc_term_mtx(
         table=source_table,
         model=bow_model,
         input_col='words',
         result_type='term_doc_mtx',
     )
     out_table = outcome['out_table']

     # Show the produced table in the test output.
     print(out_table)

     rows = out_table.values.tolist()
     expected_rows = [
         ['What', 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         ['a', 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         ['life', 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         ['wonderful', 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         ['You', 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
     ]
     # Index explicitly (rather than zip) so a too-short table still fails
     # with IndexError instead of being silently truncated.
     for position, expected in enumerate(expected_rows):
         self.assertListEqual(rows[position], expected)
Пример #2
0
 def test_term_document_matrix1(self):
     """Check the leading rows of the 'doc_to_bow_token' result of doc_term_mtx.

     A bag-of-words model is built from the 'words' column of ``self.df``
     and handed to ``doc_term_mtx``; the first five rows of the returned
     ``out_table`` are compared against fixed [label, token-string] pairs.
     """
     source_table = self.df

     bow_model = bow(
         table=source_table,
         input_col='words',
         add_words=None,
         no_below=1,
         no_above=0.8,
         keep_n=10000,
     )['model']
     outcome = doc_term_mtx(
         table=source_table,
         model=bow_model,
         input_col='words',
         result_type='doc_to_bow_token',
     )
     out_table = outcome['out_table']

     # Show the produced table in the test output.
     print(out_table)

     rows = out_table.values.tolist()
     expected_rows = [
         ['doc_0', "['(What, 1)', '(a, 1)', '(life, 1)', '(wonderful, 1)']"],
         ['doc_1', "['(You, 1)', '(about, 1)', '(cried, 1)', '(know, 1)', '(may, 1)', '(reason, 1)', '(she, 1)', '(the, 1)', '(why, 1)']"],
         ['doc_2', "['(I, 1)', '(like, 1)', '(stop, 1)', '(to, 1)', '(working, 1)', '(would, 1)']"],
         ['doc_3', "['(the, 1)', '(I, 2)', '(could, 1)', '(hear, 1)', '(not, 1)', '(voice, 1)', '(wish, 1)']"],
         ['doc_4', "['(would, 1)', '(It, 1)', '(be, 1)', '(can, 1)', '(help, 1)', '(if, 1)', '(me, 1)', '(nice, 1)', '(this, 1)', '(with, 1)', '(you, 1)']"],
     ]
     # Index explicitly (rather than zip) so a too-short table still fails
     # with IndexError instead of being silently truncated.
     for position, expected in enumerate(expected_rows):
         self.assertListEqual(rows[position], expected)
Пример #3
0
    def test_bag_of_words2(self):
        """Check the first five rows of bow's out_table with tight filters.

        ``bow`` is run on the 'words' column of ``self.df`` with
        ``no_below=2``, ``no_above=0.7`` and ``keep_n=10``; the leading rows
        of its ``out_table`` are compared against fixed expected rows.
        """
        bow_settings = {
            'table': self.df,
            'input_col': 'words',
            'add_words': None,
            'no_below': 2,
            'no_above': 0.7,
            'keep_n': 10,
        }
        out_table = bow(**bow_settings)['out_table']

        # Show the produced table in the test output.
        print(out_table)

        rows = out_table.values.tolist()
        expected_rows = [
            ['know', 2],
            ['the', 2],
            ['I', 3],
            ['would', 2],
            ['me', 4],
        ]
        # Index explicitly (rather than zip) so a too-short table still
        # fails with IndexError instead of being silently truncated.
        for position, expected in enumerate(expected_rows):
            self.assertListEqual(rows[position], expected)
Пример #4
0
    def test_bag_of_words1(self):
        """Spot-check selected rows of bow's out_table with loose filters.

        ``bow`` is run on the 'words' column of ``self.df`` with
        ``no_below=1``, ``no_above=0.8`` and ``keep_n=10000``; rows 0, 1, 7,
        13 and 14 of its ``out_table`` are compared against fixed values.
        """
        bow_settings = {
            'table': self.df,
            'input_col': 'words',
            'add_words': None,
            'no_below': 1,
            'no_above': 0.8,
            'keep_n': 10000,
        }
        out_table = bow(**bow_settings)['out_table']

        # Show the produced table in the test output.
        print(out_table)

        rows = out_table.values.tolist()
        # (row index, expected row) pairs; the checked rows are not contiguous.
        spot_checks = (
            (0, ['What', 1]),
            (1, ['a', 1]),
            (7, ['know', 2]),
            (13, ['I', 3]),
            (14, ['like', 1]),
        )
        for row_index, expected in spot_checks:
            self.assertListEqual(rows[row_index], expected)
Пример #5
0
    def test_document_document_matrix1(self):
        """Check the leading rows of the 'sparse' result of doc_doc_mtx.

        A bag-of-words model is built from the 'words' column of ``self.df``
        and handed to ``doc_doc_mtx``; the first five rows of the returned
        ``out_table`` are compared against fixed three-element rows.
        """
        source_table = self.df

        bow_model = bow(table=source_table, input_col='words',
                        add_words=None, no_below=1, no_above=0.8,
                        keep_n=10000)['model']
        outcome = doc_doc_mtx(table=source_table, model=bow_model,
                              input_col='words', result_type='sparse')
        out_table = outcome['out_table']

        # Show the produced table in the test output.
        print(out_table)

        rows = out_table.values.tolist()
        expected_rows = [
            [1, 3, 1],
            [1, 5, 1],
            [2, 4, 1],
            [2, 6, 1],
            [2, 3, 1],
        ]
        # Index explicitly (rather than zip) so a too-short table still
        # fails with IndexError instead of being silently truncated.
        for position, expected in enumerate(expected_rows):
            self.assertListEqual(rows[position], expected)
Пример #6
0
    def test_document_document_matrix2(self):
        """Check the leading rows of the 'dense' result of doc_doc_mtx.

        A bag-of-words model is built from the 'words' column of ``self.df``
        and handed to ``doc_doc_mtx``; the first five rows of the returned
        ``out_table`` are compared against fixed expected rows, each starting
        with a 'doc_N' label.
        """
        source_table = self.df

        bow_model = bow(table=source_table, input_col='words',
                        add_words=None, no_below=1, no_above=0.8,
                        keep_n=10000)['model']
        outcome = doc_doc_mtx(table=source_table, model=bow_model,
                              input_col='words', result_type='dense')
        out_table = outcome['out_table']

        # Show the produced table in the test output.
        print(out_table)

        rows = out_table.values.tolist()
        expected_rows = [
            ['doc_0', 4, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            ['doc_1', 0, 9, 0, 1, 0, 1, 0, 0, 0, 0],
            ['doc_2', 0, 0, 6, 1, 1, 0, 1, 0, 0, 0],
            ['doc_3', 0, 1, 1, 7, 0, 0, 1, 0, 0, 0],
            ['doc_4', 0, 0, 1, 0, 11, 2, 1, 2, 0, 1],
        ]
        # Index explicitly (rather than zip) so a too-short table still
        # fails with IndexError instead of being silently truncated.
        for position, expected in enumerate(expected_rows):
            self.assertListEqual(rows[position], expected)