예제 #1
0
    def test_that_jaccard_index_is_calculated_correctly(self):
        """Make sure that correct jaccard index is calculated between
        two sets.

        Every case pairs two input lists with the hand-computed index that
        CodeClassifier.calc_jaccard_index is expected to return for them.
        """
        # Each entry is (set_a, set_b, expected_index).
        cases = [
            # Single element sets.
            ([1], [2], 0.0),
            ([1], [1], 1.0 / 1.0),
            # Normal sets.
            ([1], [1, 2], 1.0 / 2.0),
            ([2, 3, 4], [1, 2, 4, 6], 2.0 / 5.0),
            # Multisets (an element may occur more than once).
            ([1, 2, 2, 3], [2, 3, 4], 2.0 / 5.0),
        ]

        for set_a, set_b, expected_index in cases:
            jaccard_index = CodeClassifier.calc_jaccard_index(set_a, set_b)
            # The index is a float ratio, so compare with a tolerance rather
            # than exact equality, and name the failing inputs because all
            # cases share this single assertion.
            self.assertAlmostEqual(
                expected_index, jaccard_index,
                msg='Jaccard index of %r and %r: expected %r, got %r' % (
                    set_a, set_b, expected_index, jaccard_index))
예제 #2
0
    def test_that_correct_tokens_are_generated(self):
        """Verify the token stream that get_tokens yields for program_a.

        Every expected entry is a (token type id, token text) pair, listed
        in the order in which the tokens occur in the program text.
        """
        # NOTE(review): the ids look like Python 2 tokenize type codes
        # (53 comment, 54 non-logical newline, 1 name, 2 number, 51 operator,
        # 4 newline, 5 indent, 6 dedent, 0 end marker) -- confirm against
        # get_tokens.
        actual = list(CodeClassifier.get_tokens(self.program_a))

        # Leading comment block: a comment token and a line break per line.
        comment_part = [
            (53, '# In Python, the code'), (54, '\n'),
            (53, '#'), (54, '\n'),
            (53, "#     for letter in ['a', 'b']:"), (54, '\n'),
            (53, '#         print letter'), (54, '\n'),
            (53, '#'), (54, '\n'),
            (53, '# prints:'), (54, '\n'),
            (53, '#'), (54, '\n'),
            (53, '#     a'), (54, '\n'),
            (53, '#     b'), (54, '\n'),
        ]

        # Program body, grouped by the source line the tokens come from.
        code_part = [
            # sum = 0
            (1, 'sum'), (51, '='), (2, '0'), (4, '\n'),
            # for num in range(1000):
            (1, 'for'), (1, 'num'), (1, 'in'), (1, 'range'),
            (51, '('), (2, '1000'), (51, ')'), (51, ':'), (4, '\n'),
            # if num%3==0 and num%5==0:
            (5, '  '), (1, 'if'),
            (1, 'num'), (51, '%'), (2, '3'), (51, '=='), (2, '0'),
            (1, 'and'),
            (1, 'num'), (51, '%'), (2, '5'), (51, '=='), (2, '0'),
            (51, ':'), (4, '\n'),
            # sum = sum + num
            (5, '\t'), (1, 'sum'), (51, '='), (1, 'sum'), (51, '+'),
            (1, 'num'), (4, '\n'),
            # else:
            (6, ''), (1, 'else'), (51, ':'), (4, '\n'),
            # if num%3 == 0:
            (5, '\t'), (1, 'if'), (1, 'num'), (51, '%'), (2, '3'),
            (51, '=='), (2, '0'), (51, ':'), (4, '\n'),
            # sum = sum + num
            (5, '\t  '), (1, 'sum'), (51, '='), (1, 'sum'), (51, '+'),
            (1, 'num'), (4, '\n'),
            # if num%5 == 0:
            (6, ''), (1, 'if'), (1, 'num'), (51, '%'), (2, '5'),
            (51, '=='), (2, '0'), (51, ':'), (4, '\n'),
            # sum = sum + num
            (5, '\t  '), (1, 'sum'), (51, '='), (1, 'sum'), (51, '+'),
            (1, 'num'), (4, '\n'),
            # print sum
            (6, ''), (6, ''), (6, ''), (1, 'print'), (1, 'sum'),
            # End of input.
            (0, ''),
        ]

        self.assertListEqual(comment_part + code_part, actual)
예제 #3
0
    def setUp(self):
        """Create the sample programs, the labelled data and a classifier.

        Sets self.program_a and self.program_b (source text used by the
        preprocessing tests), self.data (both programs keyed by 1 and 2,
        each with a 'source' and a 'class' entry) and self.clf.
        """
        super(CodeClassifierTests, self).setUp()

        # Both example programs start with the same nine comment lines.
        comment_header = (
            '# In Python, the code\n'
            '#\n'
            "#     for letter in ['a', 'b']:\n"
            '#         print letter\n'
            '#\n'
            '# prints:\n'
            '#\n'
            '#     a\n'
            '#     b\n'
        )

        # Mixes space and tab indentation and has no trailing newline.
        self.program_a = comment_header + (
            'sum = 0\n'
            'for num in range(1000):\n'
            '  if num%3==0 and num%5==0:\n'
            '\tsum = sum + num\n'
            '  else:\n'
            '\tif num%3 == 0:\n'
            '\t  sum = sum + num\n'
            '\tif num%5 == 0:\n'
            '\t  sum = sum + num\n'
            'print sum'
        )

        # A blank line separates the comment header from the code here.
        self.program_b = comment_header + (
            '\n'
            'for num in range(1000):\n'
            '  if num % 3 == 0 or num % 5 == 0:\n'
            '    sum += num\n'
            'print sum'
        )

        self.data = {
            1: {'source': self.program_a, 'class': 1},
            2: {'source': self.program_b, 'class': 2},
        }

        self.clf = CodeClassifier.CodeClassifier()
예제 #4
0
 def test_that_token_to_id_is_correct(self):
     """Make sure that correct token_to_id map is generated.

     Checks that the keys of the map built from self.data are exactly the
     expected tokens; the ids assigned to them are not checked here.
     """
     token_to_id = CodeClassifier.map_tokens_to_ids(self.data, 0)
     expected_tokens = [
         'and', 'UNK', '%', 'for', ')', '(', '+', 'V', 'else', '==', '0',
         '3', '5', '1000', 'in', 'print', ':', '=', 'or', '+=', 'if'
     ]
     # Compare sorted copies: the iteration order of a mapping's keys is
     # not something this test should depend on, and keys() is not a list
     # on Python 3, which assertListEqual would reject.
     self.assertListEqual(
         sorted(token_to_id.keys()), sorted(expected_tokens))
예제 #5
0
 def test_that_k_gram_hash_generator_works(self):
     """Check the hashes k_gram_hash_generator returns for six tokens.

     The input is the tokens mapped to ids 0 through 5, hashed with k = 3.
     """
     token_to_id = CodeClassifier.map_tokens_to_ids(self.data, 0)
     # Invert the map so that tokens can be selected by their id.
     id_to_token = {
         token_id: token for token, token_id in token_to_id.items()}
     tokens = [id_to_token[token_id] for token_id in (0, 1, 2, 3, 4, 5)]

     actual_k_grams = winnowing.k_gram_hash_generator(
         tokens, token_to_id, 3)

     self.assertListEqual([23, 486, 949, 1412], actual_k_grams)
예제 #6
0
    def test_that_hash_generator_works(self):
        """Check hash_generator on the tokens mapped to ids 0, 1 and 2."""
        token_to_id = CodeClassifier.map_tokens_to_ids(self.data, 0)
        # Invert the map so that tokens can be selected by their id.
        id_to_token = {
            token_id: token for token, token_id in token_to_id.items()}
        tokens = [id_to_token[token_id] for token_id in (0, 1, 2)]
        actual_hash = winnowing.hash_generator(token_to_id, tokens)

        # Expected value: the ids 0, 1, 2 read as the digits of a number
        # whose base is the size of the token map (evaluated Horner-style).
        base = len(token_to_id)
        expected_hash = (0 * base + 1) * base + 2

        self.assertEqual(actual_hash, expected_hash)
예제 #7
0
    def test_that_tokenize_for_cv_works(self):
        """Check the token list tokenize_for_cv produces for program_a.

        The expected list has no entries for comments or whitespace, and
        shows 'V' where the program has names such as sum, num and range.
        """
        actual = CodeClassifier.tokenize_for_cv(self.program_a)

        # Tokens expected for each "sum = sum + num" line of the program.
        add_to_sum = ['V', '=', 'V', '+', 'V']

        expected = (
            # sum = 0
            ['V', '=', '0'] +
            # for num in range(1000):
            ['for', 'V', 'in', 'V', '(', '1000', ')', ':'] +
            # if num%3==0 and num%5==0:
            ['if', 'V', '%', '3', '==', '0',
             'and', 'V', '%', '5', '==', '0', ':'] +
            add_to_sum +
            # else:
            ['else', ':'] +
            # if num%3 == 0:
            ['if', 'V', '%', '3', '==', '0', ':'] +
            add_to_sum +
            # if num%5 == 0:
            ['if', 'V', '%', '5', '==', '0', ':'] +
            add_to_sum +
            # print sum
            ['print', 'V']
        )

        self.assertListEqual(actual, expected)
예제 #8
0
 def test_that_correct_fingerprints_are_obtained(self):
     """Make sure that the fingerprint generator returns the expected
     fingerprint.

     The k-gram hashes of the tokens mapped to ids 0 through 5 (k = 3) are
     passed to get_fingerprint_from_hashes with 3 as its second argument.
     """
     token_to_id = CodeClassifier.map_tokens_to_ids(self.data, 0)
     # Invert the map so that tokens can be selected by their id.
     id_to_token = {
         token_id: token for token, token_id in token_to_id.items()}
     tokens = [id_to_token[token_id] for token_id in (0, 1, 2, 3, 4, 5)]

     hashes = winnowing.k_gram_hash_generator(tokens, token_to_id, 3)
     actual_fingerprint = winnowing.get_fingerprint_from_hashes(hashes, 3)

     # NOTE(review): each pair looks like (hash, position of that hash in
     # the k-gram list) -- confirm against winnowing.
     self.assertListEqual(actual_fingerprint, [(486, 1), (23, 0)])