def test_that_jaccard_index_is_calculated_correctly(self):
    """Check calc_jaccard_index against hand-computed values."""
    # Each row is (set_a, set_b, expected_index); rows are checked in order
    # and the first mismatch fails the test.
    cases = [
        # Single element sets.
        ([1], [2], 0.0),
        ([1], [1], 1.0 / 1.0),
        # Normal sets.
        ([1], [1, 2], 1.0 / 2.0),
        ([2, 3, 4], [1, 2, 4, 6], 2.0 / 5.0),
        # Multisets (repeated elements on one side).
        ([1, 2, 2, 3], [2, 3, 4], 2.0 / 5.0),
    ]
    for set_a, set_b, expected_index in cases:
        jaccard_index = CodeClassifier.calc_jaccard_index(set_a, set_b)
        self.assertEqual(expected_index, jaccard_index)
def test_that_correct_tokens_are_generated(self):
    """Check the (type, text) pairs get_tokens yields for program_a."""
    actual = list(CodeClassifier.get_tokens(self.program_a))

    # NOTE(review): the numeric token types look like Python 2 `tokenize`
    # ids (e.g. 53 for comments, 54 for the newline after them) -- confirm.
    comment_lines = [
        '# In Python, the code',
        '#',
        "# for letter in ['a', 'b']:",
        '# print letter',
        '#',
        '# prints:',
        '#',
        '# a',
        '# b',
    ]
    # Every leading comment line is followed by its own newline token.
    expected = []
    for comment in comment_lines:
        expected.append((53, comment))
        expected.append((54, '\n'))

    # Tokens for the code part, one source line of program_a per row.
    expected.extend([
        (1, 'sum'), (51, '='), (2, '0'), (4, '\n'),
        (1, 'for'), (1, 'num'), (1, 'in'), (1, 'range'), (51, '('),
        (2, '1000'), (51, ')'), (51, ':'), (4, '\n'),
        (5, ' '), (1, 'if'), (1, 'num'), (51, '%'), (2, '3'), (51, '=='),
        (2, '0'), (1, 'and'), (1, 'num'), (51, '%'), (2, '5'), (51, '=='),
        (2, '0'), (51, ':'), (4, '\n'),
        (5, '\t'), (1, 'sum'), (51, '='), (1, 'sum'), (51, '+'),
        (1, 'num'), (4, '\n'),
        (6, ''), (1, 'else'), (51, ':'), (4, '\n'),
        (5, '\t'), (1, 'if'), (1, 'num'), (51, '%'), (2, '3'), (51, '=='),
        (2, '0'), (51, ':'), (4, '\n'),
        (5, '\t '), (1, 'sum'), (51, '='), (1, 'sum'), (51, '+'),
        (1, 'num'), (4, '\n'),
        (6, ''), (1, 'if'), (1, 'num'), (51, '%'), (2, '5'), (51, '=='),
        (2, '0'), (51, ':'), (4, '\n'),
        (5, '\t '), (1, 'sum'), (51, '='), (1, 'sum'), (51, '+'),
        (1, 'num'), (4, '\n'),
        (6, ''), (6, ''), (6, ''), (1, 'print'), (1, 'sum'), (0, ''),
    ])

    self.assertListEqual(expected, actual)
def setUp(self):
    """Create the sample programs, training data and classifier."""
    super(CodeClassifierTests, self).setUp()

    # Both sample programs start with the same block of comment lines.
    comment_header = (
        '# In Python, the code\n'
        '#\n'
        "# for letter in ['a', 'b']:\n"
        '# print letter\n'
        '#\n'
        '# prints:\n'
        '#\n'
        '# a\n'
        '# b\n'
    )

    # Sample programs used by the preprocessing tests; program_a mixes
    # space and tab indentation.
    self.program_a = comment_header + (
        'sum = 0\n'
        'for num in range(1000):\n'
        ' if num%3==0 and num%5==0:\n'
        '\tsum = sum + num\n'
        ' else:\n'
        '\tif num%3 == 0:\n'
        '\t sum = sum + num\n'
        '\tif num%5 == 0:\n'
        '\t sum = sum + num\n'
        'print sum'
    )
    self.program_b = comment_header + (
        '\n'
        'for num in range(1000):\n'
        ' if num % 3 == 0 or num % 5 == 0:\n'
        ' sum += num\n'
        'print sum'
    )

    # Labelled samples keyed by an integer id.
    sample_a = {'source': self.program_a, 'class': 1}
    sample_b = {'source': self.program_b, 'class': 2}
    self.data = {1: sample_a, 2: sample_b}

    self.clf = CodeClassifier.CodeClassifier()
def test_that_token_to_id_is_correct(self):
    """Make sure that the token_to_id map contains exactly the expected
    tokens.
    """
    token_to_id = CodeClassifier.map_tokens_to_ids(self.data, 0)
    expected_tokens = [
        'and', 'UNK', '%', 'for', ')', '(', '+', 'V', 'else', '==', '0',
        '3', '5', '1000', 'in', 'print', ':', '=', 'or', '+=', 'if'
    ]
    # Compare sorted copies: the iteration order of a mapping's keys is not
    # something this test should depend on, and keys() is not a list on
    # Python 3, so a direct assertListEqual on it is fragile.
    self.assertListEqual(
        sorted(token_to_id.keys()), sorted(expected_tokens))
def test_that_k_gram_hash_generator_works(self):
    """Check the hashes produced for 3-grams over six known tokens."""
    token_to_id = CodeClassifier.map_tokens_to_ids(self.data, 0)
    # Invert the map so tokens can be picked by their id.
    id_to_token = {
        token_id: token for token, token_id in token_to_id.items()
    }
    # The tokens whose ids are 0..5, in id order.
    tokens = [id_to_token[token_id] for token_id in range(6)]

    k_grams = winnowing.k_gram_hash_generator(tokens, token_to_id, 3)

    # NOTE(review): these values equal base-21 positional hashes of the id
    # windows (0,1,2), (1,2,3), (2,3,4), (3,4,5), i.e. they presume a
    # 21-token vocabulary -- confirm against winnowing.
    expected_k_grams = [23, 486, 949, 1412]
    self.assertListEqual(expected_k_grams, k_grams)
def test_that_hash_generator_works(self):
    """Check hash_generator against a hand-computed positional hash."""
    token_to_id = CodeClassifier.map_tokens_to_ids(self.data, 0)
    # Invert the map so tokens can be picked by their id.
    id_to_token = {
        token_id: token for token, token_id in token_to_id.items()
    }
    ids = [0, 1, 2]
    tokens = [id_to_token[token_id] for token_id in ids]

    hash_value = winnowing.hash_generator(token_to_id, tokens)

    # Expected value treats the ids as digits in base len(token_to_id),
    # most significant first: 0 * n**2 + 1 * n**1 + 2 * n**0.
    base = len(token_to_id)
    expected_hash_value = sum(
        token_id * base ** exponent
        for exponent, token_id in enumerate(reversed(ids))
    )
    self.assertEqual(hash_value, expected_hash_value)
def test_that_tokenize_for_cv_works(self):
    """Check the custom tokenizer used for CountVectorizer on program_a."""
    tokens = CodeClassifier.tokenize_for_cv(self.program_a)

    # The expected output has no comment tokens and shows every identifier
    # as 'V'. It is assembled one program_a statement at a time.
    add_to_sum = ['V', '=', 'V', '+', 'V']  # sum = sum + num
    expected_tokens = (
        ['V', '=', '0']
        + ['for', 'V', 'in', 'V', '(', '1000', ')', ':']
        + ['if', 'V', '%', '3', '==', '0',
           'and', 'V', '%', '5', '==', '0', ':']
        + add_to_sum
        + ['else', ':']
        + ['if', 'V', '%', '3', '==', '0', ':']
        + add_to_sum
        + ['if', 'V', '%', '5', '==', '0', ':']
        + add_to_sum
        + ['print', 'V']
    )
    self.assertListEqual(tokens, expected_tokens)
def test_that_correct_fingerprints_are_obtained(self):
    """Make sure that the fingerprint generator returns the correct
    fingerprint.
    """
    token_to_id = CodeClassifier.map_tokens_to_ids(self.data, 0)
    # Invert the map so tokens can be picked by their id.
    id_to_token = {
        token_id: token for token, token_id in token_to_id.items()
    }
    # The tokens whose ids are 0..5, in id order.
    tokens = [id_to_token[token_id] for token_id in range(6)]

    # Hash every 3-gram, then reduce the hashes to a fingerprint using 3 as
    # the second argument of get_fingerprint_from_hashes.
    k_grams = winnowing.k_gram_hash_generator(tokens, token_to_id, 3)
    fingerprint = winnowing.get_fingerprint_from_hashes(k_grams, 3)

    # NOTE(review): entries look like (hash, position-in-k_grams) pairs --
    # confirm against winnowing.
    expected_fingerprint = [(486, 1), (23, 0)]
    self.assertListEqual(fingerprint, expected_fingerprint)