def test2_single_full_match(self):
    """Check that a pattern occurring once, in full, yields exactly one match.

    The scenario runs twice: first with ``bytes`` inputs, then with the
    ASCII-decoded ``str`` versions of the same inputs. In both passes the
    result must be a list holding a single match, and that match is handed to
    ``assertCorrectMatchSubstringMapping`` together with the ``bytes``
    pattern and text.
    """
    pattern = b"hello"
    text = b"how delightful, hello there"

    # Pass 1: bytes inputs.
    matches = gst.match(pattern, '', text, '', len(pattern))
    self.assertIsInstance(matches, list)
    self.assertEqual(len(matches), 1)
    self.assertCorrectMatchSubstringMapping(pattern, text, matches[0])

    # Pass 2: the same inputs as str.
    pattern_str, text_str = pattern.decode("ascii"), text.decode("ascii")
    matches = gst.match(pattern_str, '', text_str, '', len(pattern_str))
    # Verify the result shape before indexing, so a wrong result is reported
    # as an assertion failure instead of surfacing as an IndexError.
    self.assertIsInstance(matches, list)
    self.assertEqual(len(matches), 1)
    self.assertCorrectMatchSubstringMapping(pattern, text, matches[0])
def test4_no_match(self):
    """Expect an empty list of matches when the pattern is absent.

    The check is performed on ``bytes`` inputs and then on their
    ASCII-decoded ``str`` counterparts.
    """
    needle = b"hello"
    haystack = b"go away, you nuisance"
    variants = (
        (needle, haystack),
        (needle.decode("ascii"), haystack.decode("ascii")),
    )
    for pat, txt in variants:
        found = gst.match(pat, '', txt, '', len(pat))
        self.assertIsInstance(found, list)
        self.assertEqual(len(found), 0)
def test3_single_partial_match(self):
    """Expect exactly one match when only part of the pattern is in the text.

    ``b"hello"`` shares the prefix ``hel`` with ``helsinki`` in the text, and
    ``gst.match`` is called with a length argument of 3. The check runs on
    ``bytes`` inputs and then on the ASCII-decoded ``str`` inputs; in both
    passes the single match is validated against the ``bytes`` originals.
    """
    needle = b"hello"
    haystack = b"we are in helsinki now"
    match_length = 3
    variants = (
        (needle, haystack),
        (needle.decode("ascii"), haystack.decode("ascii")),
    )
    for pat, txt in variants:
        found = gst.match(pat, '', txt, '', match_length)
        self.assertIsInstance(found, list)
        self.assertEqual(len(found), 1)
        self.assertCorrectMatchSubstringMapping(needle, haystack, found[0])
def test2_full_match_bytes(self, text_and_pattern):
    """Run a full-length match on ASCII-encoded inputs and validate each hit.

    ``text_and_pattern`` is a ``(text, pattern)`` pair of strings. Both are
    encoded to ASCII ``bytes`` before matching, and every returned match is
    checked against the original (un-encoded) pattern and text.
    """
    text, pattern = text_and_pattern
    encoded_pattern = pattern.encode("ascii")
    encoded_text = text.encode("ascii")
    found = gst.match(
        encoded_pattern, '', encoded_text, '', len(encoded_pattern))
    for hit in found:
        self.assertCorrectMatchSubstringMapping(pattern, text, hit)
def compare_files(filename_one, filename_two):
    """
    Tokenise two source files and score how much of their token text matches.

    Both filenames are resolved through ``utils.get_file_path`` and checked
    with ``utils.check_file``. When the first filename ends in ``.py`` the
    cleaned sources are passed to ``write_to_file`` and tokens are collected
    with ``token_generator`` from the ``type_two_dump`` files; otherwise both
    files are tokenised with ``c_token_generator``. The token lists are joined
    into comma-separated strings and compared with ``match``.

    Args:
        filename_one: Name of the first file; its extension selects the
            tokeniser used for both files.
        filename_two: Name of the second file.

    Returns:
        A 2-tuple containing the same plagiarism percentage twice, or ``None``
        (after printing an error message) when either file fails
        ``utils.check_file``.
    """
    # Guard clause: bail out early when either file is rejected.
    if not (utils.check_file(utils.get_file_path(filename_one))
            and utils.check_file(utils.get_file_path(filename_two))):
        print("Error file format not supported")
        return None

    file_one_tokens, file_two_tokens = [], []
    file_one_clean, file_two_clean = utils.extract_files(
        filename_one, filename_two, True)
    print(file_one_clean)
    print(file_two_clean)

    # Test the real extension. Splitting on the first "." misclassified names
    # such as "./dataset/a.py" and raised IndexError for names without a dot.
    if filename_one.endswith(".py"):
        write_to_file(file_one_clean, "1")
        write_to_file(file_two_clean, "2")
        # token_generator fills the list that is passed in.
        token_generator('./dataset/type_two_dump_1.py', file_one_tokens)
        token_generator('./dataset/type_two_dump_2.py', file_two_tokens)
    else:
        file_one_tokens = c_token_generator(filename_one)
        file_two_tokens = c_token_generator(filename_two)

    ignore_a = ''
    string_a = ",".join(str(e) for e in file_one_tokens)
    string_b = ",".join(str(e) for e in file_two_tokens)

    matches = match(string_a, ignore_a, string_b, ignore_a, 6)
    # Only the first match contributes, using its third element.
    # NOTE(review): presumably that element is the match length - confirm.
    # With no match at all the score is based on 0 instead of an IndexError.
    number_of_lines_plagiarised = list(matches[0])[2] if matches else 0

    file_two_plagiarism_score = utils.get_plagiarism_percentage(
        number_of_lines_plagiarised * 2, len(string_a) + len(string_b))
    return file_two_plagiarism_score, file_two_plagiarism_score
def test_edge_cases(self):
    """Every pair in ``EDGE_CASES`` must give at least one match.

    Each pair is matched with a length argument of 1.
    """
    for first, second in EDGE_CASES:
        hit_count = len(gst.match(first, '', second, '', 1))
        self.assertGreater(hit_count, 0)
def test1_full_match(self, text_and_pattern):
    """Run a full-length match of the pattern and validate each hit.

    ``text_and_pattern`` is a ``(text, pattern)`` pair; the pattern's own
    length is used as the length argument, and every returned match is
    checked with ``assertCorrectMatchSubstringMapping``.
    """
    text, pattern = text_and_pattern
    found = gst.match(pattern, '', text, '', len(pattern))
    for hit in found:
        self.assertCorrectMatchSubstringMapping(pattern, text, hit)
# } Char = collections.namedtuple("Char", ("char", "match_edge", "match_number")) output_html_path = "index.html" min_match_length = 50 num_match_colors = 4 # Load data with open("text1.txt") as f: text1 = f.read() text2 = '' for line in text1.splitlines(): text2 += ' '.join(l for l in line.split(' ') if random.random() > 0.05) # Get matches and sort in ascending order by starting indexes of matches in text1 match_list = gst.match(text1, '', text2, '', min_match_length) match_list.sort() # Annotate every character that is the first or last character of a match # This makes it trivial to generate HTML around these positions text1 = [Char(char=c, match_edge=0, match_number=0) for c in text1] text2 = [Char(char=c, match_edge=0, match_number=0) for c in text2] for match_num, (i1, i2, length) in enumerate(match_list, start=1): text1[i1] = Char(char=text1[i1].char, match_edge=-1, match_number=match_num) text2[i2] = Char(char=text2[i2].char, match_edge=-1, match_number=match_num) text1[i1+length-1] = Char(char=text1[i1+length-1].char, match_edge=1, match_number=match_num) text2[i2+length-1] = Char(char=text2[i2+length-1].char, match_edge=1, match_number=match_num) template = jinja2.Environment(loader=jinja2.FileSystemLoader(".")).get_template("template.html") html = template.render( text1=text1,