예제 #1
0
    def test2_single_full_match(self):
        """Check that a pattern occurring once in the text gives one match.

        The minimum match length is the full pattern length. The check is
        run twice: first with ``bytes`` inputs, then with the same data
        decoded to ``str``.
        """
        pattern = b"hello"
        text = b"how delightful, hello there"
        matches = gst.match(pattern, '', text, '', len(pattern))
        self.assertIsInstance(matches, list)
        self.assertEqual(len(matches), 1)
        self.assertCorrectMatchSubstringMapping(pattern, text, matches[0])

        pattern_str, text_str = pattern.decode("ascii"), text.decode("ascii")
        matches = gst.match(pattern_str, '', text_str, '', len(pattern_str))
        # Validate the result before indexing it, so that a missing match is
        # reported as an assertion failure instead of an IndexError.
        self.assertIsInstance(matches, list)
        self.assertEqual(len(matches), 1)
        self.assertCorrectMatchSubstringMapping(pattern, text, matches[0])
예제 #2
0
    def test4_no_match(self):
        """Check that unrelated pattern and text produce an empty match list.

        The same inputs are tried as ``bytes`` and then as ``str``; both
        calls must return a list with no elements.
        """
        needle = b"hello"
        haystack = b"go away, you nuisance"
        variants = (
            (needle, haystack),
            (needle.decode("ascii"), haystack.decode("ascii")),
        )
        for candidate, corpus in variants:
            found = gst.match(candidate, '', corpus, '', len(candidate))
            self.assertIsInstance(found, list)
            self.assertEqual(len(found), 0)
예제 #3
0
    def test3_single_partial_match(self):
        """Check that a minimum match length of 3 yields exactly one match.

        The inputs are tried as ``bytes`` and then as ``str``. In both cases
        the single match is validated against the original ``bytes`` pattern
        and text.
        """
        needle = b"hello"
        haystack = b"we are in helsinki now"
        min_len = 3
        variants = (
            (needle, haystack),
            (needle.decode("ascii"), haystack.decode("ascii")),
        )
        for candidate, corpus in variants:
            found = gst.match(candidate, '', corpus, '', min_len)
            self.assertIsInstance(found, list)
            self.assertEqual(len(found), 1)
            self.assertCorrectMatchSubstringMapping(needle, haystack, found[0])
예제 #4
0
 def test2_full_match_bytes(self, text_and_pattern):
     """Match ASCII-encoded inputs and validate every reported match.

     ``text_and_pattern`` is unpacked as ``(text, pattern)``. Both are
     encoded to ``bytes`` for the call to ``gst.match``, while each match
     is validated against the unencoded values.
     """
     text, pattern = text_and_pattern
     encoded_pattern = pattern.encode("ascii")
     encoded_text = text.encode("ascii")
     min_len = len(encoded_pattern)
     for found in gst.match(encoded_pattern, '', encoded_text, '', min_len):
         self.assertCorrectMatchSubstringMapping(pattern, text, found)
예제 #5
0
def compare_files(filename_one, filename_two):
    """Compare two source files and return a plagiarism score.

    Both files are tokenised, each token list is joined into a
    comma-separated string, and the two strings are passed to ``match``.
    The length of the first reported match (0 when there is none) is doubled
    and handed to ``utils.get_plagiarism_percentage`` together with the
    combined length of both token strings.

    Args:
        filename_one: Name of the first file. Its extension selects the
            tokeniser used for BOTH files.
        filename_two: Name of the second file.

    Returns:
        A 2-tuple holding the same plagiarism score twice, or ``None``
        (after printing an error message) when either file is rejected by
        ``utils.check_file``.
    """
    import os

    both_files_ok = (
        utils.check_file(utils.get_file_path(filename_one))
        and utils.check_file(utils.get_file_path(filename_two))
    )
    if not both_files_ok:
        print("Error file format not supported")
        return None

    file_one_tokens, file_two_tokens = [], []
    file_one_clean, file_two_clean = utils.extract_files(
        filename_one, filename_two, True)
    print(file_one_clean)
    print(file_two_clean)

    # Look at the real (last) extension only. Splitting on "." and taking
    # index 1 misfires on names such as "./a.py" or "a.b.py" and raises
    # IndexError when the name has no dot at all.
    if os.path.splitext(filename_one)[1] == ".py":
        write_to_file(file_one_clean, "1")
        write_to_file(file_two_clean, "2")
        # token_generator is handed the list to fill (presumably in place).
        token_generator('./dataset/type_two_dump_1.py', file_one_tokens)
        token_generator('./dataset/type_two_dump_2.py', file_two_tokens)
    else:
        file_one_tokens = c_token_generator(filename_one)
        file_two_tokens = c_token_generator(filename_two)

    ignore_a = ''
    string_a = ",".join(str(e) for e in file_one_tokens)
    string_b = ",".join(str(e) for e in file_two_tokens)

    matches = match(string_a, ignore_a, string_b, ignore_a, 6)
    # Only the first match is scored; its third field is used as the matched
    # length. With no matches, nothing was matched, so score a length of 0
    # instead of failing with IndexError.
    number_of_lines_plagiarised = list(matches[0])[2] if matches else 0

    file_two_plagiarism_score = utils.get_plagiarism_percentage(
        number_of_lines_plagiarised * 2,
        len(string_a) + len(string_b))
    return file_two_plagiarism_score, file_two_plagiarism_score
예제 #6
0
 def test_edge_cases(self):
     """Every pair in ``EDGE_CASES`` must produce at least one match.

     Each pair is matched with a minimum match length of 1.
     """
     for first, second in EDGE_CASES:
         found = gst.match(first, '', second, '', 1)
         match_count = len(found)
         self.assertGreater(match_count, 0)
예제 #7
0
 def test1_full_match(self, text_and_pattern):
     """Match the pattern against the text and validate each match.

     ``text_and_pattern`` is unpacked as ``(text, pattern)``; the minimum
     match length is the full length of the pattern.
     """
     text, pattern = text_and_pattern
     min_len = len(pattern)
     found_matches = gst.match(pattern, '', text, '', min_len)
     for found in found_matches:
         self.assertCorrectMatchSubstringMapping(pattern, text, found)
예제 #8
0
# }
# One annotated character of a text:
#   char         - the character itself
#   match_edge   - -1 on the first character of a match, 1 on the last,
#                  0 everywhere else
#   match_number - 1-based index of the match the edge belongs to, 0 if none
Char = collections.namedtuple("Char", ("char", "match_edge", "match_number"))

output_html_path = "index.html"
# Passed to gst.match below as its last argument.
min_match_length = 50
# NOTE(review): not used in the visible part of the script; presumably the
# number of highlight colours consumed by the template - confirm.
num_match_colors = 4

# Load data
with open("text1.txt") as f:
    text1 = f.read()
# Build text2 from text1 by keeping each space-separated word only when
# random.random() > 0.05, so a small random fraction of words is dropped.
# NOTE(review): splitlines() strips the line breaks and nothing re-adds them,
# so consecutive lines are concatenated without a separator in text2 -
# confirm this is intended.
text2 = ''
for line in text1.splitlines():
    text2 += ' '.join(l for l in line.split(' ') if random.random() > 0.05)

# Get matches and sort in ascending order by starting indexes of matches in text1
match_list = gst.match(text1, '', text2, '', min_match_length)
match_list.sort()

# Annotate every character that is the first or last character of a match
# This makes it trivial to generate HTML around these positions
# From here on text1/text2 are lists of Char records, not strings.
text1 = [Char(char=c, match_edge=0, match_number=0) for c in text1]
text2 = [Char(char=c, match_edge=0, match_number=0) for c in text2]
# Each match is unpacked as (start index in text1, start index in text2,
# length); match numbers start at 1 so that 0 can mean "not a match edge".
for match_num, (i1, i2, length) in enumerate(match_list, start=1):
    text1[i1] = Char(char=text1[i1].char, match_edge=-1, match_number=match_num)
    text2[i2] = Char(char=text2[i2].char, match_edge=-1, match_number=match_num)
    text1[i1+length-1] = Char(char=text1[i1+length-1].char, match_edge=1, match_number=match_num)
    text2[i2+length-1] = Char(char=text2[i2+length-1].char, match_edge=1, match_number=match_num)

# Load template.html from the current working directory.
template = jinja2.Environment(loader=jinja2.FileSystemLoader(".")).get_template("template.html")
html = template.render(
        text1=text1,