Example #1
    def test_filter_tokens(self):
        # string1 is a module-level fixture string defined alongside the tests.
        t = Tokenizer()
        tokens = t.tokenize(string1)
        filtered = t.filter_tokens(tokens)
        self.assertEqual(filtered, [
            'microscopy', 'use', 'microscopes', 'see', 'micro', 'sized',
            'objects'
        ])
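The expected list above implies that tokenize() lower-cases and splits on non-alphanumeric characters and that filter_tokens() drops common stop words. A minimal sketch of that behaviour, assuming a made-up input sentence and stop-word list (the project's actual string1 fixture and filtering rules are not shown on this page):

import re

STOP_WORDS = {'is', 'the', 'of', 'to', 'a', 'an', 'and'}  # assumed stop-word list

def tokenize(text):
    # Lower-case, then split on anything that is not a letter or a digit.
    return [tok for tok in re.split(r'[^a-z0-9]+', text.lower()) if tok]

def filter_tokens(tokens):
    # Keep token order, drop stop words.
    return [tok for tok in tokens if tok not in STOP_WORDS]

print(filter_tokens(tokenize(
    "Microscopy is the use of microscopes to see micro-sized objects")))
# -> ['microscopy', 'use', 'microscopes', 'see', 'micro', 'sized', 'objects']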
Example #2
    def getData():
        TRAIN_DATA = []
        TRAIN_POS, TRAIN_NEG = 0, 0

        TEST_DATA = []
        TEST_POS, TEST_NEG = 0, 0

        for i in range(1, TOTAL_FILE_COUNT + 1):
            fname = get_file_name(i)
            F = Tokenizer("labelled/" + fname + ".txt")
            F.tokenize()
            F.filter_tokens()

            # F.print_tokens()

            d, p, n = F.vectorize()
            # All vectors from one file share the same file id, so decide the
            # train/test split once per file instead of reusing the last loop
            # variable (which is undefined when a file yields no vectors).
            in_train = bool(d) and int(d[0]['fid']) in train_files
            for v in d:
                if in_train:
                    TRAIN_DATA.append(v)
                else:
                    TEST_DATA.append(v)

            if in_train:
                TRAIN_POS += p
                TRAIN_NEG += n
            else:
                TEST_POS += p
                TEST_NEG += n

        print('Generating Train Data tokens...')
        print("Token generation completed.")
        print('{0: <10} {1: <10} {2: <10}'.format("Total", "Positive",
                                                  "Negative"))
        print('{0: <10} {1: <10} {2: <10}'.format(len(TRAIN_DATA), TRAIN_POS,
                                                  TRAIN_NEG))

        print('Generating Test Data tokens...')
        print("Token generation completed.")
        print('{0: <10} {1: <10} {2: <10}'.format("Total", "Positive",
                                                  "Negative"))
        print('{0: <10} {1: <10} {2: <10}'.format(len(TEST_DATA), TEST_POS,
                                                  TEST_NEG))

        return TRAIN_DATA, TEST_DATA
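getData() depends on a module-level train_files collection to decide which labelled files go into the training split. One hypothetical way to build such a split; the 80/20 ratio, the seed, and the TOTAL_FILE_COUNT value are assumptions, not the project's code:

import random

TOTAL_FILE_COUNT = 100          # assumed size of the labelled corpus
random.seed(42)                 # fixed seed so the split is reproducible

file_ids = list(range(1, TOTAL_FILE_COUNT + 1))
random.shuffle(file_ids)
cutoff = int(0.8 * TOTAL_FILE_COUNT)
train_files = set(file_ids[:cutoff])   # looked up by getData() above
test_files = set(file_ids[cutoff:])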
Example #3
    def getData(startIndex, endIndex):
        all_data = []
        all_pos = 0
        all_neg = 0
        for i in range(startIndex, endIndex + 1):
            fname = get_file_name(i)
            F = Tokenizer("labelled/" + fname + ".txt")
            F.tokenize()
            F.filter_tokens()

            # F.print_tokens()

            d, p, n = F.vectorize()
            all_data.extend(d)
            all_pos += p
            all_neg += n

        print("Token generation completed.")
        print('{0: <10} {1: <10} {2: <10}'.format("Total", "Positive",
                                                  "Negative"))
        print('{0: <10} {1: <10} {2: <10}'.format(len(all_data), all_pos,
                                                  all_neg))

        return all_data
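Hypothetical usage of this range-based getData(), assuming it is reachable as a plain function and that the labelled files are numbered 1 through 100 (the real file count is not shown on this page):

train_vectors = getData(1, 80)     # files 1-80 as training data
test_vectors = getData(81, 100)    # remaining files for evaluation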
Example #4
    def test_tokenizer_long(self):
        # `str` here is a long module-level fixture string from the test file
        # (it shadows the built-in name in the original source).
        t = Tokenizer()
        tokens = t.tokenize(str)
        filtered = t.filter_tokens(tokens)
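Taken together, the four examples rely on the following Tokenizer interface. This skeleton only records the contract implied by the calls above; the real method bodies are not part of this page:

class Tokenizer:
    def __init__(self, path=None):
        """Optionally bind the tokenizer to a labelled text file."""

    def tokenize(self, text=None):
        """Tokenize `text`, or the bound file when called without an argument;
        returns the list of tokens (Examples #1 and #4 use the return value)."""

    def filter_tokens(self, tokens=None):
        """Filter the given tokens, or the tokenizer's own tokens when called
        without an argument; returns the filtered list."""

    def vectorize(self):
        """Return (vectors, positive_count, negative_count); each vector is a
        dict that carries at least a file id under the 'fid' key."""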