def find_prefix_suffix(list_seg):
    """
    Find common prefix and suffix in list of files
    :param list_seg: list of filenames to analyse
    :return: longest prefix and suffix (both None when the list has a
        single element, since the loop never runs)
    """
    # Compare every other name against the first one, then fold each
    # pairwise prefix/suffix into the running common prefix/suffix.
    comp_s = SequenceMatcher()
    initial = list_seg[0]
    prefix_fin = None
    suffix_fin = None
    for i in range(1, len(list_seg)):
        comp_s.set_seqs(initial, list_seg[i])
        all_poss = comp_s.get_matching_blocks()
        # A matching block anchored at a == 0 is a shared leading run.
        if all_poss[0].a == 0:
            prefix = initial[0:all_poss[0].size]
        else:
            prefix = ''
        comp_pre = SequenceMatcher()
        if prefix_fin is None:
            prefix_fin = prefix
        # Intersect this pair's prefix with the accumulated prefix.
        comp_pre.set_seqs(prefix, prefix_fin)
        pre_poss = comp_pre.get_matching_blocks()

        prefix_fin = prefix[0:pre_poss[0].size]
        # get_matching_blocks() always ends with a zero-size dummy block,
        # so the last real block sits at index -2; this branch normally
        # takes the first arm.
        if all_poss[-1].size == 0:
            suffix = initial[all_poss[-2].a:all_poss[-2].a + all_poss[-2].size]
        else:
            suffix = initial[all_poss[-1].a:]
        comp_suf = SequenceMatcher()
        if suffix_fin is None:
            suffix_fin = suffix
        # Intersect this pair's suffix with the accumulated suffix.
        comp_suf.set_seqs(suffix, suffix_fin)
        suf_poss = comp_suf.get_matching_blocks()
        suffix_fin = suffix[suf_poss[-2].a:]
    return prefix_fin, suffix_fin
示例#2
0
    def testCDifflibWithBug5Data(self):
        """Verify cdifflib and difflib agree on the bug #5 dataset
           (autojunk handling issues)."""
        from . import testdata

        # Materialize both results as lists so they compare equal on
        # Python 3.3 regardless of the concrete sequence type returned.
        expected = list(
            SequenceMatcher(None, testdata.a5,
                            testdata.b5).get_matching_blocks())
        actual = list(
            CSequenceMatcher(None, testdata.a5,
                             testdata.b5).get_matching_blocks())

        self.assertEqual(expected, actual)
示例#3
0
    def testCDifflibWithBug5Data(self):
        """Check cdifflib returns the same result for bug #5
           (autojunk handling issues)"""
        from . import testdata

        ref_matcher = SequenceMatcher(None, testdata.a5, testdata.b5)
        c_matcher = CSequenceMatcher(None, testdata.a5, testdata.b5)

        # Lists, not bare iterables, so Python 3.3 compares them equal.
        self.assertEqual(list(ref_matcher.get_matching_blocks()),
                         list(c_matcher.get_matching_blocks()))
示例#4
0
    def process(self, response):
        """
        Compare this response's body against previously stored ones to
        detect near-duplicate pages.

        :param response: response object exposing a ``status`` attribute
        :return: ``self.RESPONSE_INDEX`` when the page is judged a
            duplicate of a previously seen one, otherwise ``None``
        """

        if response.status in self.DEFAULT_STATUSES:
            super().process(response)
            length = self.__get_content_length()
            if self.MIN_CONTENT_LENGTH < length:
                # the page is allowed for comparison

                if not self.previous_item:
                    # 1st match. Push items for next compare step
                    # (previous_item is an empty dict here, so update()
                    # seeds it in place)
                    self.previous_item.update({
                        'length': length,
                        'text': self._body
                    })
                    return None
                else:
                    if length == self.previous_item.get(
                            'length') and self.MIN_CONTENT_LENGTH < length:
                        # identical, seems to drop failed for success
                        return self.RESPONSE_INDEX
                    else:
                        matcher = SequenceMatcher(a=self.previous_item['text'],
                                                  b=self._body)
                        # NOTE(review): result discarded — presumably warms
                        # SequenceMatcher's internal cache before ratio();
                        # confirm.
                        matcher.get_matching_blocks()

                        if 'length' in self.current_item:
                            next_matcher = SequenceMatcher(
                                a=self.current_item['text'], b=self._body)
                            # Same ratio against current and previous item
                            # is treated as a duplicate.
                            if next_matcher.ratio() == matcher.ratio():
                                return self.RESPONSE_INDEX
                        if self.MIN_RATIO_INDEX < matcher.ratio():
                            return self.RESPONSE_INDEX
                        else:
                            # Not similar enough: remember as candidate for
                            # the ratio-equality check above.
                            self.current_item.update({
                                'length': length,
                                'text': self._body
                            })

                    if self.MIN_CONTENT_LENGTH < length:
                        self.previous_item.update({
                            'length': length,
                            'text': self._body
                        })
        return None
示例#5
0
def align_strings(str1, str2, max_lenght=0):
    """Pad two strings with spaces so their matching parts line up.

    :param str1: first string
    :param str2: second string
    :param max_lenght: if non-zero, split the aligned strings into chunks
        of at most this many characters
    :return: the two aligned (equal-length) strings, or two lists of
        chunks when ``max_lenght`` is non-zero
    """
    from difflib import SequenceMatcher

    sm = SequenceMatcher(lambda x: x in " ")
    sm.set_seqs(str1, str2)

    # While there are matches
    # Rem: the last block is a dummy one, see doc of SequenceMatcher
    while len(sm.get_matching_blocks()) > 1:
        for m in sm.get_matching_blocks():
            # If str1 and str2 are not aligned, insert spaces before the
            # block in whichever string the block starts earlier.
            if m[0] != m[1]:
                if m[0] < m[1]:
                    str1 = str1[:m[0]] + " " * (m[1] - m[0]) + str1[m[0]:]
                if m[1] < m[0]:
                    str2 = str2[:m[1]] + " " * (m[0] - m[1]) + str2[m[1]:]
                sm.set_seqs(str1, str2)
                break
        else:
            # If all the blocks are for aligned texts
            break

    # Padding at the end so that both strings are the same size.
    # BUGFIX: the previous version padded one character short
    # (``... - 1``), leaving the strings with different lengths whenever
    # no matching block had already aligned them.
    if len(str1) < len(str2):
        str1 += " " * (len(str2) - len(str1))
    if len(str2) < len(str1):
        str2 += " " * (len(str1) - len(str2))

    # If we want to split in multiple lines
    if max_lenght != 0:
        ret_str1 = []
        ret_str2 = []

        while len(str1) > max_lenght:
            ret_str1 += [str1[:max_lenght]]
            str1 = str1[max_lenght:]
            ret_str2 += [str2[:max_lenght]]
            str2 = str2[max_lenght:]

        ret_str1 += [str1]
        ret_str2 += [str2]

        return ret_str1, ret_str2

    return str1, str2
示例#6
0
def match_question_blocks(question_obj_list):
    """Collect shared clue strings (and the sentences they occur in) from
    a list of question objects via pairwise longest-common-substring
    matching; returns (clues, clue_sentences)."""
    clues = []
    sas = []
    clue_sents = set()

    # Compare every unordered pair of questions.
    for i in range(len(question_obj_list)):
        for j in range(i + 1, len(question_obj_list)):
            q1 = clean_question(question_obj_list[i]["text"])
            q2 = clean_question(question_obj_list[j]["text"])
            blocks = SequenceMatcher(None, q1, q2).get_matching_blocks()
            best = max(blocks, key=lambda blk: blk.size)
            # Only long enough common substrings count as clues.
            if best.size > 10:
                sa, span = best.a, best.size
                clue = q1[sa:sa + span]
                complete_clue = get_complete_clue(sa, sa + span, q1)
                if clue != '':
                    clues.append(complete_clue.strip())
                    sas.append(sa)
                    clue_sent = clean_question(
                        get_clue_sent(question_obj_list[i], sa))
                    if len(clue_sent) > 1:
                        clue_sents.add(clue_sent.strip())

    # deduplicate clues
    if len(clues) > 1:
        clues = deduplicate_clues(clues)
    return clues, list(clue_sents)
示例#7
0
def align(sent):
    """Align a token's raw surface form with its morpheme/POS analysis,
    building per-syllable BIO tag fractions and printing alignment blocks."""
    # `sent` looks like "raw_word<ws>morph/TAG+morph/TAG__12"; split off the
    # surface form and the analysis, then strip the "__<n>" index suffix.
    tagged = re.split(r'\s+', sent)
    raw_word = tagged[0]
    tagged[1] = re.compile(r'__[0-9]+').sub('', tagged[1])
    # Split "morph/POS+morph/POS" on '+' only when preceded by a 2- or
    # 3-letter uppercase POS tag.
    tag_morph = re.split("(?<=/[A-Z]{2})\+|(?<=/[A-Z]{3})\+", tagged[1])
    tagged = ''.join([morph_pos[:morph_pos.rfind('/')] for morph_pos in tag_morph])
    fraction = list()
    for morph_tag in tag_morph:
        morph, tag = nltk.str2tuple(morph_tag)

        # BIO scheme: first syllable of a morpheme gets B-, the rest I-.
        for i, syl in enumerate(morph):
            if i == 0:
                fraction.append([syl, "B-"+tag])
            else:
                fraction.append([syl, "I-" + tag])
        fraction[-1][1] = fraction[-1][1] + "+"  ## append "+" after each morpheme's final tag
    # Drop the trailing "+" from the very last tag.
    fraction[-1][1] = fraction[-1][1][:-1]
    print(raw_word,tagged)
    if raw_word == tagged:
        return fraction
    SM = SequenceMatcher(None, raw_word, tagged)
    blocks = list()
    if include_delete(SM):
        blocks = make_del_block(fraction, raw_word, tagged)
    else:
        mat_blocks = SM.get_matching_blocks()
        blocks = generate_block(fraction, mat_blocks)
        if len(mat_blocks) == 1:# e.g. raw 온 vs analysis 오/vx+ㄴ/etm — guard against a possibly fully mismatched form.
            blocks = make_del_block(fraction, raw_word, tagged)

    print(blocks)
    for cur, nxt in pairwise(blocks):
        raw = raw_word[cur[0]:cur[1]]
        mor = tagged[cur[2]:cur[3]]
        print(raw,mor)
示例#8
0
def longest_substring(str1, str2=None, min_match_len=2):
    """Return the words common to *str1* and *str2*, joined with spaces.

    Word-level matching via ``SequenceMatcher``; only matching runs of at
    least ``min_match_len`` consecutive words are kept.  If ``str2`` is
    omitted, or either string is empty, the other string is returned
    unchanged.

    :param str1: first string
    :param str2: second string (optional)
    :param min_match_len: minimum length, in words, of a matching run
    :return: space-joined common words ('' when nothing qualifies)
    """
    if str2 is None:
        return str1

    if len(str1) == 0 or len(str2) == 0:
        return str1 if len(str2) == 0 else str2

    list1 = str1.split(" ")
    list2 = str2.split(" ")

    # Word-level diff: match runs of whole words, not characters.
    seq_matcher = SequenceMatcher(None, list1, list2)
    matching_blocks = seq_matcher.get_matching_blocks()

    # BUGFIX: the previous version ignored min_match_len entirely (the
    # size filter was commented out) and carried a dead "no blocks" check —
    # get_matching_blocks() always returns at least the zero-size sentinel.
    return " ".join([
        list1[match.a + i] for match in matching_blocks
        if match.size >= min_match_len
        for i in range(match.size)
    ])
def partial_ratio(s1, s2):
    """
    Return the ratio of the most similar substring
    as a number between 0 and 100.
    """
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

    blocks = SequenceMatcher(None, shorter, longer,
                             autojunk=False).get_matching_blocks()

    # Score `shorter` against the window of `longer` starting at each
    # block's b-offset; a near-perfect window short-circuits to 100.
    scores = []
    for _, long_start, _ in blocks:
        window = longer[long_start:long_start + len(shorter)]
        r = SequenceMatcher(None, shorter, window, autojunk=False).ratio()
        if r > .995:
            return 100
        scores.append(r)

    return max(scores) * 100.0
示例#10
0
    def _find_matched_module_config(xml_dir_name, file_name):
        """Find the matched module config.

        Scans ``xml_dir_name`` for the config whose file name shares the
        longest common prefix with ``file_name``; at least four leading
        characters must match (WPXX, ARXX, ...).

        e.g. wp7603.xml -> wp76xx.xml

        :param xml_dir_name: directory containing the module config files
        :param file_name: module file name to resolve
        :return: full path of the best-matching config, or None
        """
        all_modules = os.listdir(xml_dir_name)
        if not all_modules:
            # BUGFIX: max() below raises ValueError on an empty sequence;
            # an empty config directory simply resolves to nothing.
            return None
        matched_prefix_array = []
        for module_name in all_modules:
            sequence_matcher = SequenceMatcher(None, file_name, module_name)
            all_matches = sequence_matcher.get_matching_blocks()
            prefix_match_length = 0
            # Only a block anchored at the start of both names is a prefix.
            for _match in all_matches:
                if _match.a == 0 and _match.b == 0:
                    prefix_match_length = _match.size
            matched_prefix_array.append(prefix_match_length)
        max_matched = max(matched_prefix_array)
        resolved_xml_file = None
        if max_matched >= 4:
            # Match at least four digits prefix. WPXX, ARXX, etc.
            # e.g. wp7604 will match wp76xx.xml
            xml_file_name_index = matched_prefix_array.index(max_matched)
            candidate_xml_file_name = all_modules[xml_file_name_index]
            resolved_xml_file = os.path.join(xml_dir_name,
                                             candidate_xml_file_name)
            swilog.warning("{} will be used!".format(resolved_xml_file))
        return resolved_xml_file
  def on_pre_save(self, view):
    """Trim whitespace on save: everywhere for whitelisted files, and only
    on lines changed since the last snapshot for everything else."""
    settings = sublime.load_settings('Preferences.sublime-settings')
    patterns = settings.get("trim_if_present", [])

    # Trim all whitespace on my files.
    if any(view.find(pattern, 0, sublime.IGNORECASE) for pattern in patterns):
      view.run_command("erase_whitespace", {})
      return

    if view.id() not in snapshots:
      print("No snapshot present to compare")
      return

    # Trim whitespace on any new files.
    old = snapshots[view.id()].split('\n')
    new = view.substr(sublime.Region(0, view.size())).split('\n')
    # Remove the line numbers that were present before.
    # Start from "every line is new" and discard the ones matching the
    # snapshot; whatever remains was added or modified.
    new_lines = set(range(len(new)))
    sm = SequenceMatcher(None, old, new)
    for i, j, n in sm.get_matching_blocks():
      for k in range(j, j + n):
        new_lines.remove(k)
    # Trim the whitespace on the new lines:
    if new_lines:
      new_lines = ','.join(str(n) for n in new_lines)
      view.run_command("process_new_lines", dict(new_lines=new_lines))
def main(iob_file: Path):
    """Evaluate HunSentencizer segmentation against gold sentences read
    from an IOB file; print accuracy and a context diff of the mismatches."""
    # Collect gold sentences: lines matching SENT_PATTERN with the marker
    # prefix stripped off.
    sentences = []
    with iob_file.open() as f:
        for line in f:
            match = SENT_PATTERN.match(line)
            if match is not None:
                sentence: str = line[match.end():].strip()
                sentences.append(sentence)

    nlp = Hungarian()

    # noinspection PyUnresolvedReferences
    from huspacy.components import HunSentencizer
    nlp.add_pipe("hun_sentencizer")

    # Re-segment the concatenated gold text with the pipeline under test.
    doc: Doc = nlp(" ".join(sentences))
    predicted_sents = [str(s) + "\n" for s in doc.sents]
    sentences = [s + "\n" for s in sentences]

    # Accuracy = fraction of gold sentences reproduced exactly; matching
    # blocks are counted at whole-sentence granularity.
    seqmatcher = SequenceMatcher(None, sentences, predicted_sents)
    accuracy = sum(mb.size
                   for mb in seqmatcher.get_matching_blocks()) / len(sentences)
    print(f"Accuracy: {accuracy:.2%}\n\n")

    diffs = list(
        context_diff(sentences,
                     predicted_sents,
                     fromfile="gold",
                     tofile="predicted",
                     n=0))
    sys.stdout.writelines(diffs)
示例#13
0
    def getScore(self, str1, str2, limit=0.8):
        """Score the similarity between two space-separated strings.

        NOTE(review): the ``limit`` parameter is never used — the method
        reads ``self.limit`` instead; confirm whether that is intended.

        :param str1: first string (split on spaces into "interests")
        :param str2: second string (split on spaces into "keywords")
        :param limit: apparently intended ratio threshold (unused)
        :return: tuple (seq_score, nb_match, score, score_diff,
            similarity, is_similar)
        """
        # Self-similarity scores normalise the final result; the recursion
        # terminates because getScore(x, x, ...) takes the str1 == str2 path.
        score_diff = 1
        if (str1 != str2):
            str1_score = self.getScore(str1, str1, self.limit)[2]
            str2_score = self.getScore(str2, str2, self.limit)[2]
            score_diff = abs(str1_score -
                             str2_score) if abs(str1_score -
                                                str2_score) != 0 else 1

        interests = str1.split(" ")
        keywords = str2.split(" ")

        s = SequenceMatcher(None)
        seq_score = 0
        nb_match = 0
        score = 0
        # Compare every interest with every keyword; a "match" is a pair at
        # or above self.limit whose diff is a single matching block
        # (get_matching_blocks() == one real block + zero-size sentinel).
        for interest in interests:
            s.set_seq2(interest)
            for keyword in keywords:
                s.set_seq1(keyword)
                b = s.ratio() >= self.limit and len(
                    s.get_matching_blocks()) == 2
                seq_score += s.ratio()
                if b:
                    nb_match += 1
        score = math.pow(nb_match, 5) * seq_score
        similarity = round(score * nb_match / score_diff)
        is_similar = similarity >= 1

        return (seq_score, nb_match, score, score_diff, similarity, is_similar)
示例#14
0
File: tweets.py  Project: maweki/lvbstats
def get_match(haystack, needle):
    """Return the leading part of `needle` that is found in `haystack`,
    or None.

    At least 90% of `needle`'s characters must match somewhere in
    `haystack`; the returned prefix comes from the first matching block
    anchored at the start of `needle`.
    """
    if len(haystack.strip()) < len(needle):
        return None
    from difflib import SequenceMatcher

    matcher = SequenceMatcher(a=needle, b=haystack)
    blocks = matcher.get_matching_blocks()
    total_matched = sum(size for _, _, size in blocks)

    if total_matched > 0.9 * len(needle):
        for needle_start, _, size in blocks:
            if needle_start == 0 and size > 0:
                return needle[:size]
    return None
示例#15
0
def similar(string: str, sub: str) -> float:
    """Fraction of `sub`'s length covered by characters matching `string`."""
    blocks = SequenceMatcher(None, string, sub).get_matching_blocks()
    total = sum(block.size for block in blocks)
    return total / len(sub)
示例#16
0
def partial_ratio(s1,  s2):
    """Return the best partial-match ratio of `s1` and `s2` as an int 0-100.

    The shorter string is compared against same-length windows of the
    longer one, each window chosen from a matching block.

    :raises TypeError: if either argument is None
    """
    if s1 is None: raise TypeError("s1 is None")
    if s2 is None: raise TypeError("s2 is None")

    if len(s1) <= len(s2):
        shorter = s1; longer = s2;
    else:
        shorter = s2; longer = s1

    m = SequenceMatcher(None, shorter, longer)
    blocks = m.get_matching_blocks()

    # each block represents a sequence of matching characters in a string
    # of the form (idx_1, idx_2, len)
    # the best partial match will block align with at least one of those blocks
    #   e.g. shorter = "abcd", longer = XXXbcdeEEE
    #   block = (1,3,3)
    #   best score === ratio("abcd", "Xbcd")
    scores = []
    for block in blocks:
        # BUGFIX: clamp to 0 — when a block starts later in `shorter` than
        # in `longer`, block[1] - block[0] is negative and a negative slice
        # start would wrap around to the end of `longer`.
        long_start   = max(block[1] - block[0], 0)
        long_end     = long_start + len(shorter)
        long_substr  = longer[long_start:long_end]

        m2 = SequenceMatcher(None, shorter, long_substr)
        r = m2.ratio()
        if r > .995: return 100
        else: scores.append(r)

    return int(100 * max(scores))
示例#17
0
def partial_ratio(s1, s2):
    """Return the ratio of the most similar substring
    as a number between 0 and 100."""
    s1, s2 = utils.make_type_consistent(s1, s2)

    if len(s2) < len(s1):
        shorter, longer = s2, s1
    else:
        shorter, longer = s1, s2

    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    # Each block (i, j, n) marks shorter[i:i+n] == longer[j:j+n]; the best
    # partial match aligns `shorter` with the window of `longer` that such
    # a block points at (clamped to the start of the string), e.g. for
    # shorter = "abcd", longer = "XXXbcdeEEE", block (1, 3, 3) gives
    # ratio("abcd", "Xbcd").
    scores = []
    for i, j, _n in blocks:
        window_start = max(j - i, 0)
        window = longer[window_start:window_start + len(shorter)]
        r = SequenceMatcher(None, shorter, window).ratio()
        if r > .995:
            return 100
        scores.append(r)

    return utils.intr(100 * max(scores))
示例#18
0
def get_align_indexes(seqmatch: SequenceMatcher):
    """Get indexes for matching and nonmatching parts of two token tuples (from SequenceMatcher)."""
    class MatchIndexes(object):
        """Start/end indexes for a matching block of sequences a and b, with a match indicator."""
        def __init__(self, a_i: int, a_j: int, b_i: int, b_j: int,
                     match: bool):
            """[ab]i: Start index, [ab]j: End index, match: Is this a matching tuple or not?"""
            self.ai, self.aj = a_i, a_j
            self.bi, self.bj = b_i, b_j
            self.match = match

        def __repr__(self):
            attr_reprs = [
                f'{k}: {v}' for k, v in self.__dict__.items()
                if not k.startswith('__')
            ]
            return f'MatchIndexes({", ".join(attr_reprs)})'

    # Walk consecutive block pairs: the first element of each pair is a
    # real match, the gap up to the next block is a mismatch.  The
    # terminating zero-size sentinel block covers the trailing gap.
    matchblocks = seqmatch.get_matching_blocks()
    align_indexes = []
    for mpair in zip(matchblocks, matchblocks[1:]):
        ai = mpair[0].a  # Indexes from the a side
        aj = ai + mpair[0].size
        ak = mpair[1].a
        bi = mpair[0].b  # Indexes from the b side
        bj = bi + mpair[0].size
        bk = mpair[1].b
        align_indexes.append(MatchIndexes(ai, aj, bi, bj, match=True))
        align_indexes.append(MatchIndexes(aj, ak, bj, bk, match=False))
    # BUGFIX: two empty sequences yield only the sentinel block, hence no
    # pairs; return early instead of crashing on align_indexes[0] below.
    if not align_indexes:
        return align_indexes
    # Fill in any missing mismatches at the beginning
    if align_indexes[0].ai > 0 or align_indexes[0].bi > 0:
        new_aj, new_bj = align_indexes[0].ai, align_indexes[0].bi
        align_indexes = [MatchIndexes(0, new_aj, 0, new_bj, match=False)
                         ] + align_indexes
    return align_indexes
 def match(self):
     """Count n-gram matching blocks above the size threshold between the
     two texts and write the count to the report."""
     sequence = SequenceMatcher(None, self.textAgrams, self.textBgrams)
     matchingBlocks = sequence.get_matching_blocks()
     # Keep only matches longer than the configured threshold.
     highMatchingBlocks = [match for match in matchingBlocks if match.size > self.threshold]
     numBlocks = len(highMatchingBlocks)
     # NOTE(review): `report` is not defined in this method — presumably a
     # module-level writable file handle; confirm.
     report.write('Number of sentences quoted = %s ' % numBlocks)
     report.write('\n\n\n')
示例#20
0
def s(s1, s2):
    # Scan `longer` (s2) for a window highly similar to `shorter` (s1);
    # return 1 on the first window with ratio > 0.8, else 0.
    # NOTE: Python 2 code (print statements).
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    # Only defined when s1 is the non-empty, shorter (or equal) string;
    # otherwise returns None implicitly.
    if len(s1) == 0 or len(s2) == 0 or len(s1) > len(s2):
        return
    shorter = s1
    longer = s2
    m = SequenceMatcher(None, shorter, longer)
    blocks = m.get_matching_blocks()
    #print blocks
    scores = []
    for block in blocks:
        # Window of `longer` aligned with this block, clamped to index 0.
        long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
        long_end = long_start + len(shorter)
        long_substr = longer[long_start:long_end]
        # print long_substr
        m2 = SequenceMatcher(None, shorter, long_substr)
        if m2.ratio()>0.8:
            print shorter + "  :  " + long_substr
            print m2.ratio()
            print
            return 1
        #r = m2.ratio()
        # print r
        #scores.append(r)
    return 0;
示例#21
0
def substrings_en_comun(str1, str2, longitud_min=10):
    """Return the substrings two texts have in common, subject to a
    minimum length.

    :param str1: (str) First input text.
    :param str2: (str) Second input text.
    :param longitud_min: (int) Minimum number of characters a match
        between the two texts must have in order to be kept.
    :return: (list) Shared substrings meeting the length requirement;
        an empty list when no match is long enough.
    """
    matcher = SequenceMatcher(None, str1, str2)
    # Each matching block has the shape Match(a=0, b=0, size=5).
    return [
        str1[match.a:match.a + match.size]
        for match in matcher.get_matching_blocks()
        if match.size >= longitud_min
    ]
示例#22
0
def calc_similarity(s_standard, s_candidate):
    """Similarity of `s_candidate` to `s_standard` in [0, 1]; 0 when
    either argument is None.  When the candidate is longer, the best
    same-length window of the candidate is scored instead."""
    if s_standard is None or s_candidate is None:
        return 0

    base = SequenceMatcher(None, s_standard, s_candidate)
    if len(s_standard) >= len(s_candidate):
        return base.ratio()

    # Each block (i, j, n) marks a run of identical characters; the best
    # partial match aligns s_standard with the candidate window starting
    # at j - i (clamped to 0), e.g. for s_standard = "abcd" and
    # s_candidate = "XXXbcdeEEE", block (1, 3, 3) scores
    # ratio("abcd", "Xbcd").
    scores = []
    for i, j, _n in base.get_matching_blocks():
        start = max(j - i, 0)
        window = s_candidate[start:start + len(s_standard)]
        scores.append(SequenceMatcher(None, s_standard, window).ratio())

    return max(scores)
示例#23
0
    def _compare_lines(self, la, lb):
        """Diff two lists of lines and wrap each differing character run in
        an ``<em class="str-diff">`` marker; returns the two annotated
        line lists.

        NOTE(review): ``Counter`` here must be a project-local class
        exposing progress()/slice_diff()/slice_match()/next() — it is not
        collections.Counter; confirm the import.
        """
        sa = '\n'.join(la)
        sb = '\n'.join(lb)
        ta_result = ''
        tb_result = ''

        str_diff_start = '<em class="str-diff">'
        str_diff_end = '</em>'

        s = SequenceMatcher(None, sa, sb)
        cnt_a = Counter()
        cnt_b = Counter()
        # Advance both cursors block by block, emitting the unmatched run
        # (highlighted) followed by the matched run for each side.
        for block in s.get_matching_blocks():
            (a_idx, b_idx, nmatch) = block
            print("a[%d] and b[%d] match for %d elements" % block)
            cnt_a.progress(a_idx, nmatch)
            cnt_b.progress(b_idx, nmatch)

            diff_a = cnt_a.slice_diff(sa)
            same_a = cnt_a.slice_match(sa)
            diff_b = cnt_b.slice_diff(sb)
            same_b = cnt_b.slice_match(sb)

            # Highlight only when at least one side actually differs.
            if diff_a or diff_b:
                ta_result += self._enclose(str_diff_start, diff_a, str_diff_end, consider_newline = True)
            ta_result += same_a
            if diff_a or diff_b:
                tb_result += self._enclose(str_diff_start, diff_b, str_diff_end, consider_newline = True)
            tb_result += same_b

            cnt_a.next()
            cnt_b.next()
        return (ta_result.split('\n'), tb_result.split('\n'))
示例#24
0
def partial_ratio(s1, s2):
    """Best partial-match score of the shorter string against candidate
    windows of the longer one, as an integer in [0, 100]."""
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    # Each block (i, j, n) means shorter[i:i+n] == longer[j:j+n]; the best
    # partial match lines `shorter` up with the window of `longer` implied
    # by one of these blocks (clamped so the window starts inside the
    # string), e.g. shorter = "abcd", longer = "XXXbcdeEEE", block
    # (1, 3, 3) -> ratio("abcd", "Xbcd").
    scores = []
    for i, j, _ in blocks:
        start = j - i if j > i else 0
        window = longer[start:start + len(shorter)]
        r = SequenceMatcher(None, shorter, window).ratio()
        if r > .995:
            return 100
        scores.append(r)

    return int(100 * max(scores))
示例#25
0
def s(s1, s2):
    # Look for a window of `longer` (s2) highly similar to `shorter` (s1);
    # return 1 on the first window with ratio > 0.8, else 0.
    # NOTE: Python 2 code (print statements).
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    # Only defined when s1 is the non-empty, shorter (or equal) string;
    # otherwise returns None implicitly.
    if len(s1) == 0 or len(s2) == 0 or len(s1) > len(s2):
        return
    shorter = s1
    longer = s2
    m = SequenceMatcher(None, shorter, longer)
    blocks = m.get_matching_blocks()
    #print blocks
    scores = []
    for block in blocks:
        # Window of `longer` aligned with this block, clamped to index 0.
        long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
        long_end = long_start + len(shorter)
        long_substr = longer[long_start:long_end]
        # print long_substr
        m2 = SequenceMatcher(None, shorter, long_substr)
        if m2.ratio() > 0.8:
            print shorter + "  :  " + long_substr
            print m2.ratio()
            print
            return 1
        #r = m2.ratio()
        # print r
        #scores.append(r)
    return 0
示例#26
0
def extract_dynamic_content_marking(seq1: str,
                                    seq2: str,
                                    autojunk: bool = True,
                                    isjunk=None,
                                    border_length=20
                                    ) -> typing.List[typing.Tuple[str, str]]:
    """For each sufficiently large matching block, pair its trailing
    `border_length` characters (prefix) with the leading `border_length`
    characters of the next large block (suffix), marking the dynamic
    content in between."""
    blocks = list(
        SequenceMatcher(isjunk, seq1, seq2, autojunk).get_matching_blocks())

    markings = []
    while blocks:
        block = blocks.pop(0)
        if block.size < border_length:
            continue
        if not blocks:
            break

        # The first sufficiently large block after `block` supplies the
        # suffix; smaller ones are skipped.
        for candidate in blocks:
            if candidate.size < border_length:
                continue
            block_end = block.a + block.size
            prefix = seq1[block_end - border_length:block_end]
            suffix = seq1[candidate.a:candidate.a + border_length]
            markings.append((prefix, suffix))
            break

    return markings
示例#27
0
def plagerised_ratio(filename1, filename2):
    """Print the SequenceMatcher similarity ratio of two source files and
    the significant (>100 chars) plagiarised blocks from the first file.

    tokenize() yields triples (cleaned element, position in the original
    code, position in the cleaned-up code); cleaned-up code greatly
    increases the effectiveness of the plagiarism check.

    :param filename1: path of the file whose matching regions are printed
    :param filename2: path of the file compared against
    """
    tokens1 = tokenize(filename1)
    file1 = toText(tokens1)
    tokens2 = tokenize(filename2)
    file2 = toText(tokens2)

    SM = SequenceMatcher(None, file1, file2)
    similarity_ratio = SM.ratio()
    print(similarity_ratio)  # ratio of plagiarised content

    # Elements of blocks: (start-file1, start-file2, length); drop the
    # trailing zero-size sentinel block.
    blocks = list(SM.get_matching_blocks())[:-1]

    # BUGFIX: the original left the file handle open; `with` closes it.
    with open(filename1, "r") as f1:
        for i in blocks:
            flag = 0
            # BUGFIX: initialise so `end` cannot be unbound when no token
            # marks the end of the block.
            start = end = 0
            for j in range(len(tokens1)):
                if tokens1[j][2] == i[0]:
                    # Link the start of the matching block back to its
                    # position in the original code file.
                    start = tokens1[j][1]
                    flag = 1
                if tokens1[j][2] == (i[0] + i[2] - 1):
                    # Link the end of the block to the original code file.
                    end = tokens1[j][1]
                    break
            if not flag == 0 and (end - start) > 100:
                # Print significant blocks of plagiarised content, marked
                # against the original (uncleaned) source.
                f1.seek(start, 0)
                print(f1.read(end - start))
示例#28
0
def html_diff(str1, str2, max_lenght=80, html_same_class="blue", html_diff_class="red"):
    """Render two aligned strings as HTML chunk lists, colouring matching
    runs with `html_same_class` and differing runs with `html_diff_class`."""
    from difflib import SequenceMatcher
    str1, str2 = align_strings(str1, str2, max_lenght)

    sm = SequenceMatcher(lambda x: x in " ")

    same_span = "<span style='color: %s'>" % html_same_class
    diff_span = "<span style='color: %s'>" % html_diff_class
    clos_span = "</span>"

    out1 = []
    out2 = []

    # align_strings() returned chunk lists; diff each chunk pair.
    for line1, line2 in zip(str1, str2):
        html1 = ""
        html2 = ""
        done = 0
        sm.set_seqs(line1, line2)

        for a, b, size in sm.get_matching_blocks():
            # Unmatched run first (diff colour), then the matched run.
            html1 += diff_span + line1[done:a] + clos_span
            html1 += same_span + line1[a:a + size] + clos_span
            html2 += diff_span + line2[done:b] + clos_span
            html2 += same_span + line2[b:b + size] + clos_span

            done = a + size

        out1.append(html1)
        out2.append(html2)

    return out1, out2
示例#29
0
def diff_a_soup(s1, s2):
    """Flatten two soups with rec_soup() and diff the flattened lists,
    printing summary statistics; returns both lists and the match blocks."""
    # Flatten each soup starting from index 0, no limit, no indent prefix.
    s1_list, s1_ind_max = rec_soup(s1, 0, 0, None, '', [])
    s1_list_len = len(s1_list)

    s2_list, s2_ind_max = rec_soup(s2, 0, 0, None, '', [])
    s2_list_len = len(s2_list)

    seq = SequenceMatcher(None, s1_list, s2_list)
    match_block = seq.get_matching_blocks()

    print('Length of s1: ' + str(s1_list_len))
    print('s1 ind max: ' + str(s1_ind_max))
    print('Length of s2: ' + str(s2_list_len))
    print('s2 ind max: ' + str(s2_ind_max))
    print('Number of matched blocks: ' + str(len(match_block)))
    return s1_list, s2_list, match_block
示例#30
0
def print_diffs(expected, actual):
    # Print where `actual` first diverges from `expected` (Python 2 code).
    # NOTE(review): `apos` and `bpos` are both taken from block[0], and the
    # Expected/Found lines index with swapped offsets — the labels may be
    # reversed; confirm the intent before relying on this output.
    a = expected
    b = actual
    s = SequenceMatcher(None, a, b)
    print '\n'
    ctr = 0
    for block in s.get_matching_blocks():
        apos = block[0]
        bpos = block[0]
        aendpos = apos + block[2]
        bendpos = bpos + block[2]
        achunk = expected[apos:aendpos]
        bchunk = actual[bpos:bendpos]
        # print "a[%d] and b[%d] match for %d elements" % block
        print '\nACTUAL has matching Error at ' + str(aendpos)
        print 'Expected =' + expected[
            bendpos:bendpos + 100] + '\nFound    =' + actual[aendpos:aendpos +
                                                             100]
        print 'Matched values from 0 to ' + str(aendpos - 1) + ' are'
        print ' EXPECTED=' + bchunk
        print ' ACTUAL  =' + achunk
        print ''
        # ctr starts at 0, so only the first matching block is reported.
        if ctr == 0:
            break
        else:
            ctr += 1
示例#31
0
def fuzzy_partial(s1, s2):
    """
    Helper method to compare similarity of two strings.

    Slides the shorter string along the longer one, aligning it at every
    SequenceMatcher matching block, and returns the best window ratio.
    A near-perfect alignment short-circuits to 1.

    Adapted and improved from:
    http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/

    :param s1: string 1
    :param s2: string 2
    :return: similarity ratio
    """
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    ratios = []
    for short_idx, long_idx, _size in blocks:
        # Align the window so the block lines up; clamp at the left edge.
        start = max(long_idx - short_idx, 0)
        window = longer[start:start + len(shorter)]
        score = SequenceMatcher(None, shorter, window).ratio()
        if score > .995:
            return 1
        ratios.append(score)

    return max(ratios)
def create_dict_from_raw_dataset(string):
	"""Build one labelled example dict from a raw dataset line.

	Evaluates *string* into a dict, resolves the subject/object Freebase
	topic ids to display names via the Freebase topic API, replaces
	personal pronouns in the snippet sentences with the subject name,
	picks the sentence that best overlaps the organisation name, and
	majority-votes the crowd judgments into a 0/1 label.

	NOTE(review): Python 2 code (print statement, urllib.urlencode);
	``api_key`` is not defined here — presumably a module-level constant,
	verify before use.
	"""
	# SECURITY: eval() on raw input — acceptable only for trusted
	# dataset files, never for untrusted data.
	dictionary=eval(string)
	snippet=dictionary["evidences"][0]["snippet"]
#	api_key= "XXXXX"
	service_url = 'https://www.googleapis.com/freebase/v1/topic'
	params = {
	  'key': api_key,
	  "filter":"suggest"
	}
	# Resolve the subject topic id to its human-readable name.
	topic_id=dictionary["sub"]
	url = service_url + topic_id + '?' + urllib.urlencode(params)
	topic = json.loads(urllib.urlopen(url).read())
#	try:
	name=topic['property']["/type/object/name"]["values"][0]["text"]
#	except:
#		keys=[key in topic]
#		print keys
	# Resolve the object topic id (the organisation) the same way.
	topic_id=dictionary["obj"]
	url = service_url + topic_id + '?' + urllib.urlencode(params)
	topic = json.loads(urllib.urlopen(url).read())
	org=topic['property']["/type/object/name"]["values"][0]["text"]
	pseudo_list=tokenize.sent_tokenize(snippet.decode('utf-8'))
	sentence_list=[]
	# Substitute personal pronouns with the resolved subject name so the
	# sentence scoring below can match on the name itself.
	for sentence in pseudo_list:
		words=sentence.split()
		pronouns=["He","She","he","she"]
		sent=""
		for word in words:
			if(word in pronouns):
				sent=sent+" "+name
			else:
				sent=sent+" "+word
		sent=sent[1:]
		sentence_list.append(sent)
	# Keep the sentence whose characters best overlap the organisation
	# name (sum of SequenceMatcher matching-block sizes).
	max_score=0
	for sentence in sentence_list:
		s=SM(None, sentence, org)
		score=sum(n for i,j,n in s.get_matching_blocks())
		if(score>max_score):
			max_score=score
			final_sentence=sentence
	# Majority vote over the crowd judgments -> binary label.
	yes_count=0
	no_count=0
	for judgment in dictionary["judgments"]:
		if(judgment["judgment"]=="yes"):
			yes_count=yes_count+1
		else:
			no_count=no_count+1
	if(yes_count>no_count):
		value=1
	else:
		value=0
	final_dictionary={}
	final_dictionary["value"]=value
	final_dictionary["sentence"]=final_sentence
	final_dictionary["name"]=name
	final_dictionary["organisation"]=org
	print final_dictionary
	return final_dictionary
示例#33
0
class StoryComparator:
    """
    Compares two stories by massaging the data and doing a
    glorified diff, using Python's difflib.SequenceMatcher().

    The class is optimized around comparing one piece of control
    content to multiple pieces of variable content: the control tokens
    are set once as seq2 (which SequenceMatcher caches) and every
    compare() only swaps seq1.
    """
    def __init__( self, original ):
        """
        The constructor takes the control content as a file
        (or StringIO) object, and reads and tokenizes it.
        """
        self._orig = original.read()
        # tokenize() is project-local; each token appears to be a
        # (word, position) pair — TODO confirm against its definition.
        self.orig = list( tokenize( self._orig ) )
        # Just the token words, used as the diff alphabet.
        self.punk = [ x[0] for x in self.orig ]
        # isjunk: whitespace-only tokens do not seed matches.
        self.sm = SequenceMatcher( lambda x: x in " \t\n\r" )
        # seq2 (the control content) is set once and reused.
        self.sm.set_seq2( self.punk )

    def compare( self, variable ):
        """
        compare() takes a piece of variable content.

        Iterates the variable content line by line and returns a list of
        (1-based line number, winnowed matches) pairs for every line
        sharing at least one long-enough token run with the control.
        """
        results = []
        for lineno, line in enumerate( variable ):
            line_tk = tokenize( line )
            self.sm.set_seq1( [ x[ 0 ] for x in line_tk ] )
            winnowed = self._winnow( self.sm.get_matching_blocks() )
            if winnowed:
                results.append( ( lineno + 1, winnowed ) )
        return results

    def _winnow( self, matches ):
        """Drop trivial matches; decorate the rest with source positions."""
        winnowed = []
        prev = None
        for m in matches:
            # this is the dummy match at the end
            if m[ 2 ] == 0:
                continue
            # Ignore runs shorter than 6 tokens.
            if m[ 2 ] < 6:
                continue

            orig_beg_pos = self.orig[ m[1] ][1]
            # NOTE(review): this can IndexError when a match extends to
            # the very end of the control tokens
            # (m[1] + m[2] == len(self.orig)) — confirm inputs make that
            # impossible.
            orig_end_pos = self.orig[ m[1]+m[2] ][1]

            # XXX: here, the string length is getting conflated with the
            # word/token length.  BUT I HAVE THE POSITION OF THE LAST TOKEN

            g = { 'var': m[0], 'orig': m[1], 'len': m[2],
                  'orig_beg_pos': orig_beg_pos,
                  'orig_end_pos': orig_end_pos,
                  'orig_string': self._orig[ orig_beg_pos:orig_end_pos ],
                  'words': self.punk[ m[1]:m[1]+m[2] ] }
            prev = m  # NOTE(review): assigned but never read
            winnowed.append( g )
        return winnowed
示例#34
0
def matchingString(s1, s2):
    '''Compare 2 sequence of strings and return the matching sequences concatenated'''
    from difflib import SequenceMatcher

    pieces = []
    # Each matching block is (index_in_s1, index_in_s2, length); the
    # final zero-length sentinel contributes an empty slice.
    for start_a, _start_b, length in SequenceMatcher(None, s1, s2).get_matching_blocks():
        pieces.append(s1[start_a:start_a + length])
    return "".join(pieces)
示例#35
0
文件: Diff.py 项目: m-pilia/wikied
    def showDiff(self, before, after):
        """ Compute the diff and highlight changed parts.

        Removed spans in *before* are painted red (#F99) in the
        beforePTE widget; added spans in *after* are painted green
        (#CFC) in the afterPTE widget.

        Parameters
        ----------
        self : QWidget
        before : str
            Original text.
        after : str
            Edited text.
        """

        beforeCursor = self.beforePTE.textCursor()
        afterCursor = self.afterPTE.textCursor()

        textFormat = QTextCharFormat()

        # delete any previous background
        textFormat.setBackground(QBrush(QColor('transparent')))
        beforeCursor.mergeCharFormat(textFormat)
        afterCursor.mergeCharFormat(textFormat)

        self.beforePTE.setPlainText(before)
        self.afterPTE.setPlainText(after)

        # get matching sequences
        sm = SequenceMatcher(a=before, b=after)
        # (i, j, k) always holds the previous matching block; the gap
        # between it and the current block is the mismatching span.
        i, j, k = 0, 0, 0

        # highlight mismatching sequences
        # NOTE: [ii:ii+kk] and [jj:jj+kk] are the matching sequences for the
        # first and second string, while [i+k:ii] and [j+k:jj] are the
        # mismatching ones
        for ii, jj, kk in sm.get_matching_blocks():

            # highlight with red the removed parts in the before text
            beforeCursor.setPosition(i + k)
            beforeCursor.movePosition(
                    QTextCursor.Right,
                    QTextCursor.KeepAnchor,
                    ii - i - k)
            textFormat.setBackground(QBrush(QColor('#F99')))
            beforeCursor.mergeCharFormat(textFormat)

            # highlight with green the added parts in the after text
            afterCursor.setPosition(j + k)
            afterCursor.movePosition(
                    QTextCursor.Right,
                    QTextCursor.KeepAnchor,
                    jj - j - k)
            textFormat.setBackground(QBrush(QColor('#CFC')))
            afterCursor.mergeCharFormat(textFormat)

            # remember this block so the next iteration can locate the
            # gap that follows it
            i, j, k = ii, jj, kk
示例#36
0
def compare(snippet1, snippet2):
  """Yield (offset, size) pairs for every run of 5+ matching characters
  between the two snippets' lowercased text; offsets index snippet1's
  text."""
  # TODO: convert punct and stuff to spaces with translate so it doesn't screw
  # up offsets
  matcher = SequenceMatcher(lambda ch: ch in string.whitespace)
  matcher.set_seq1(snippet1.unfscked_text().lower())
  matcher.set_seq2(snippet2.unfscked_text().lower())

  # The final matching block is the zero-length sentinel, filtered out
  # by the size threshold below.
  for offset_a, _offset_b, size in matcher.get_matching_blocks():
    if size >= 5:
      yield offset_a, size
示例#37
0
def compare(textA, textB):
    """Compute change statistics between two texts.

    :param textA: original text
    :param textB: new text
    :return: dict with
        changed:    fraction of the longer text NOT covered by matches
        growth:     relative length change from textA to textB
        lenA/lenB:  raw lengths
        similarity: SequenceMatcher.ratio() of the two texts

    Empty inputs now yield 0.0 for 'changed'/'growth' instead of
    raising ZeroDivisionError.
    """
    lenA = len(textA)
    lenB = len(textB)

    matcher = SequenceMatcher(None, textA, textB)
    blocks = matcher.get_matching_blocks()

    matched = sum(m[2] for m in blocks)
    # The original read the sentinel block's offsets for this; the
    # sentinel is (lenA, lenB, 0), so this is the same value, spelled
    # directly.
    longest = max(lenA, lenB)

    return {
        'changed': (1 - float(matched) / longest) if longest else 0.0,
        'growth': float(lenB - lenA) / float(lenA) if lenA else 0.0,
        'lenA': lenA,
        'lenB': lenB,
        'similarity': matcher.ratio()
    }
示例#38
0
def annotate_filter(name, completions):
    """Score completion candidates against *name* and yield the matches.

    Each yielded completion dict (mutated in place) gains:
      markup: candidate name with matched spans wrapped in <b> tags,
              prefixed by a zero-width space
      score:  match quality, scaled down for deeper completions

    Candidates shorter than *name*, or that do not contain every typed
    character, are filtered out.
    """
    deepest = max(c["depth"] for c in completions) + 1

    name_len = len(name)
    for comp in completions:
        candidate = comp["name"]
        if name_len > len(candidate):
            # ignore shorter completions
            continue
        if name_len == 0:
            # if there is no identifier, give all equal chance
            score = 1
            comp["markup"] = candidate
        else:
            matcher = SequenceMatcher(None, name.lower(), candidate.lower())
            blocks = list(matcher.get_matching_blocks())

            if len(blocks) < 2:
                # only the zero-length sentinel block: nothing matched
                continue

            # Re-match against the candidate truncated at the last
            # matched character, so trailing unmatched text does not
            # dilute the substring score.
            tail = blocks[-2]
            cutoff = tail.b + tail.size
            matcher = SequenceMatcher(None, name.lower(),
                                      candidate[:cutoff].lower())

            total = name_len + cutoff
            matched = 0
            for tag, i1, i2, j1, j2 in matcher.get_opcodes():
                # TODO: score upper/lower casing differently from normal "replace"
                if tag == "equal":
                    matched += i2 - i1
            if matched < name_len:
                # not all chars are included
                continue
            score = 2.0 * matched / total

            # XXX: gtksourceview does not apply highlights that include
            # the first char, hence the leading zero-width space
            markup = "\u200B"
            cursor = 0
            for _, start, span in blocks:
                markup += candidate[cursor:start]
                if span > 0:
                    markup += "<b>" + candidate[start:start + span] + "</b>"
                cursor = start + span
            comp["markup"] = markup

        # deeper completions should rank lower
        comp["score"] = score * (1 - comp["depth"] / deepest)

        yield comp
示例#39
0
def find_overlaps(seq1, seq2):
    """Return the SequenceMatcher matching blocks between two sequences.

    The result includes difflib's trailing zero-length sentinel block.

    Background reading:
    https://docs.python.org/3/library/difflib.html#sequencematcher-objects
    https://pypi.python.org/pypi/pydna/0.9.9
    https://pypi.python.org/pypi/biopython
    http://emboss.sourceforge.net/apps/release/6.6/emboss/apps/
    http://emboss.sourceforge.net/apps/release/6.6/emboss/apps/diffseq.html
    http://emboss.sourceforge.net/apps/release/6.6/emboss/apps/seqmatchall.html
    http://emboss.sourceforge.net/apps/release/6.6/emboss/apps/wordcount.html
    """
    return SequenceMatcher(a=seq1, b=seq2).get_matching_blocks()
示例#40
0
 def generateRedirectRegExp(self, firstLocation, secondLocation):
     """Build a regex matching the stable parts of two redirect locations.

     The substrings common to both locations are escaped and joined with
     '.*' wildcards. Returns None when either location is missing.
     """
     if firstLocation is None or secondLocation is None:
         return None
     matcher = SequenceMatcher(None, firstLocation, secondLocation)
     marks = [
         firstLocation[i:i + size]
         for i, _j, size in matcher.get_matching_blocks()
         if size  # skip the zero-length sentinel block
     ]
     return "^.*{0}.*$".format(".*".join(map(re.escape, marks)))
示例#41
0
    def process(self, response):
        """
        Process data

        Compare the current response body with previously stored items
        to detect near-duplicate pages.

        :return: self.RESPONSE_INDEX when the response looks like a
            duplicate of an earlier one, otherwise None
            (NOTE(review): previous docstring said str — the code only
            ever returns self.RESPONSE_INDEX or None)
        """

        if response.status in self.DEFAULT_STATUSES:
            super().process(response)
            length = self.__get_content_length()
            if self.MIN_CONTENT_LENGTH < length:
                # the page is allowed for comparison

                if not self.previous_item:
                    # 1st match. Push items for next compare step
                    # (previous_item is presumably an empty dict here, so
                    # update() fills it in place — TODO confirm it is
                    # never None)
                    self.previous_item.update({'length': length, 'text': self._body})
                    return None
                else:
                    # NOTE(review): the second MIN_CONTENT_LENGTH check is
                    # redundant — already guaranteed by the outer guard.
                    if length == self.previous_item.get('length') and self.MIN_CONTENT_LENGTH < length:
                        # identical, seems to drop failed for success
                        return self.RESPONSE_INDEX
                    else:
                        # fuzzy comparison against the previous body
                        matcher = SequenceMatcher(a=self.previous_item['text'], b=self._body)
                        matcher.get_matching_blocks()

                        if 'length' in self.current_item:
                            next_matcher = SequenceMatcher(a=self.current_item['text'], b=self._body)
                            if next_matcher.ratio() == matcher.ratio():
                                return self.RESPONSE_INDEX
                        if self.MIN_RATIO_INDEX < matcher.ratio():
                            return self.RESPONSE_INDEX
                        else:
                            # remember this body for the next comparison
                            self.current_item.update({'length': length, 'text': self._body})

                    if self.MIN_CONTENT_LENGTH < length:
                        self.previous_item.update({'length': length, 'text': self._body})
        return None
示例#42
0
    def _align(self, dat_sent0, dat_nsp0, t_sent0, t_recv0, t_nsp0, i_nsp0):
        dat_sent = []; dat_nsp = []; t_sent = []; t_recv = []
        t_nsp = []; i_nsp = []
        
        diff = SequenceMatcher(None, dat_sent0, dat_nsp0) 
        for i, j, n in diff.get_matching_blocks():
            dat_sent.extend(dat_sent0[i: i+n])
            t_sent.extend(t_sent0[i: i+n])
            t_recv.extend(t_recv0[i: i+n])
            dat_nsp.extend(dat_nsp0[j: j+n])
            t_nsp.extend(t_nsp0[j: j+n])
            i_nsp.extend(i_nsp0[j: j+n])


        return dat_sent, dat_nsp, t_sent, t_recv, t_nsp, i_nsp
def calculate_scores(annotated_filepath, original_filepath):
    """Score an article extractor against a hand-annotated reference.

    Extracts the article from *original_filepath* with
    MSSArticleExtractor, tokenizes both it and the annotated reference
    into word terms, and computes ordered word-overlap precision,
    recall and F1 via SequenceMatcher matching blocks.

    Side effects: writes the cleaned HTML to ``cleaned_text.html`` and
    the extracted article to ``text.html`` in the working directory.

    :param annotated_filepath: path to the annotated reference document
    :param original_filepath: path to the raw HTML document
    :return: (precision, recall, f1) tuple of floats
    """
    text = extract_annotated_text(annotated_filepath)

    expected_terms = re.findall(r"\w+", text.lower(), flags=re.UNICODE)

    article_extractor = MSSArticleExtractor()

    with open(original_filepath, "r") as f:
        contents = f.read()

    contents = html.document_fromstring(contents)

    contents = clean_html(contents)

    # Debug artifacts: dump the intermediate representations to disk.
    with codecs.open("cleaned_text.html", "w", encoding="utf-8") as f:
        f.write(tostring(contents))

    article = article_extractor.extract_article(tostring(contents))

    with codecs.open("text.html", "w", encoding="utf-8") as f:
        f.write(article)

    terms = re.findall(r"\w+", article.lower(), flags=re.UNICODE)

    matcher = SequenceMatcher(None, expected_terms, terms)

    matches = matcher.get_matching_blocks()

    # Number of reference terms the extractor retrieved in order
    # (sum of matching-block sizes); srel is the total reference terms.
    sretsrel = sum([match.size for match in matches])
    srel = len(expected_terms)

    if terms:
        precision = float(sretsrel) / float(len(terms))
    else:
        precision = 0.0

    if srel > 0:
        recall = float(sretsrel) / float(srel)
    else:
        recall = 0.0

    try:
        f1 = 2 * ((precision * recall) / (precision + recall))
    except:
        # NOTE(review): bare except — only ZeroDivisionError (when
        # precision + recall == 0) needs catching here.
        f1 = 0.0

    return (precision, recall, f1)
示例#44
0
def partial_with_place(s1, s2):
    """Return the ratio of the most similar substring and its location.

    The result is a (ratio_0_to_100, start, end) triple locating the
    best-aligned window of the shorter string inside the longer one;
    a near-perfect window short-circuits to (100, start, end).
    Returns 0 when either input is empty.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    s1, s2 = utils.make_type_consistent(s1, s2)
    if len(s1) == 0 or len(s2) == 0:
        return 0

    if len(s1) <= len(s2):
        shorter, longer = s1, s2
    else:
        shorter, longer = s2, s1

    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    # Each block is (idx_shorter, idx_longer, length); the best partial
    # match aligns the shorter string with one of these blocks, e.g.
    # shorter="abcd", longer="XXXbcdeEEE", block=(1,3,3) -> window "Xbcd".
    scores = []
    placed = []
    for short_idx, long_idx, _length in blocks:
        window_start = max(long_idx - short_idx, 0)
        window_end = window_start + len(shorter)
        window = longer[window_start:window_end]

        ratio = SequenceMatcher(None, shorter, window).ratio()
        if ratio > .995:
            return (100, window_start, window_end)
        scores.append(ratio)
        placed.append((int(ratio * 100), window_start, window_end))

    return placed[scores.index(max(scores))]
示例#45
0
文件: diff.py 项目: fpt/webtoys
    def compare(self, ta_lines, tb_lines, linebreak = ''):
        """Render a side-by-side HTML diff of two line lists.

        :param ta_lines: lines of the first text
        :param tb_lines: lines of the second text
        :param linebreak: separator used to join the rendered lines
        :return: (html for text A, html for text B, list of
                  (a-block-key, b-block-key) pairs, one per diff block)
        """
        ta_result = []
        tb_result = []
        diff_lines = []

        line_diff_start = '<span class="line-num line-diff">'
        line_diff_end = '</span>'
        line_same_start = '<span class="line-num line-same">'
        line_same_end = '</span>'

        def _do_lines(diff, same, block_prefix, idx):
            # Wrap differing lines in a keyed diff-block div, then append
            # the matching lines after it.
            result = []
            if diff:
                key = block_prefix + 'dl' + str(idx)
                result.append('<div class="diff-block" key="' + key + '" ref="' + key + '">')
                result.extend([self._enclose(line_diff_start, s, line_diff_end) for s in diff])
                result.append('</div>')
            result.extend([self._enclose(line_same_start, s, line_same_end) for s in same])
            return result

        s = SequenceMatcher(None, ta_lines, tb_lines)
        # NOTE(review): Counter here is a project-local cursor class
        # (progress/slice_diff/slice_match/next/current) — NOT
        # collections.Counter.
        cnt_a = Counter()
        cnt_b = Counter()
        for block in s.get_matching_blocks():
            (a_idx, b_idx, nmatch) = block
            # NOTE(review): debug print left in production path.
            print("a[%d] and b[%d] match for %d elements" % block)
            cnt_a.progress(a_idx, nmatch)
            cnt_b.progress(b_idx, nmatch)

            # Differing lines precede the matching run in each text.
            diff_a = cnt_a.slice_diff(ta_lines)
            same_a = cnt_a.slice_match(ta_lines)
            diff_b = cnt_b.slice_diff(tb_lines)
            same_b = cnt_b.slice_match(tb_lines)

            if diff_a or diff_b:
                diff_a, diff_b = self._compare_lines(diff_a, diff_b)

            ta_result.extend(_do_lines(diff_a, same_a, 'a', cnt_a.current))
            tb_result.extend(_do_lines(diff_b, same_b, 'b', cnt_b.current))
            diff_lines.append(('adl' + str(cnt_a.current), 'bdl' + str(cnt_b.current)))

            cnt_a.next()
            cnt_b.next()

        return (linebreak.join(ta_result), linebreak.join(tb_result), diff_lines)
示例#46
0
    def get_initial_matches(self):
        """
        This does the main work of finding matching n-gram sequences between
        the texts.

        Returns only the matching blocks whose size is strictly greater
        than self.threshold (which also drops difflib's zero-length
        sentinel block).
        """
        matcher = SequenceMatcher(None, self.textAgrams, self.textBgrams)
        candidates = matcher.get_matching_blocks()

        # Keep matches above the user-supplied threshold.
        keepers = [blk for blk in candidates if blk.size > self.threshold]

        if len(keepers) > 0:
            print('%s total matches found.' % len(keepers), flush=True)

        return keepers
示例#47
0
文件: weave.py 项目: bramcohen/Weave
 def diff(self, a, b):
     """Return the matching blocks between cached files *a* and *b*.

     Results are memoized in self.diffcache under the key formed by the
     lexically larger name followed by the smaller; a query in the
     opposite order is served from the cache with its (a, b) offsets
     swapped.
     """
     assert a != b
     swapped = False
     if a < b:
         a, b = b, a
         swapped = True
     cache_key = a + b
     try:
         blocks = self.diffcache[cache_key]
     except KeyError:
         matcher = SequenceMatcher()
         matcher.set_seqs(self.filecache[a], self.filecache[b])
         blocks = matcher.get_matching_blocks()
         self.diffcache[cache_key] = blocks
     if swapped:
         return [(y, x, n) for (x, y, n) in blocks]
     return blocks
def supa_changed(a, b):
    """Return True when *a* and *b* differ beyond whitelisted gaps.

    Walks the matching blocks of the two strings and inspects the text
    skipped between consecutive matches.  The gap pairs ('', '-DO') and
    ('???', 'OBJ-DO') are treated as expected differences; any other
    interior gap means a real change.  Leading and trailing unmatched
    text is not examined (the zero-length sentinel block is skipped).
    """
    s = SequenceMatcher(a=a, b=b)
    a_end = -1
    b_end = -1
    for a_begin, b_begin, length in s.get_matching_blocks():
        if length == 0: continue
        if a_end != -1:
            a_skipped = a[a_end:a_begin]
            b_skipped = b[b_end:b_begin]
            if a_skipped == '' and b_skipped == '-DO':
                pass
            elif a_skipped == '???' and b_skipped == 'OBJ-DO':
                pass
            else:
                # BUG FIX: removed unreachable `break` that followed this
                # return statement.
                return True
        a_end = a_begin + length
        b_end = b_begin + length
    return False
示例#49
0
 def findPatterns(self, leftSide, rightSide, numberOfIterations, riskFactor):
     '''
     Old outdated method, possibly useful in the future

     Collects the non-empty, non-single-space matching substrings
     between leftSide and rightSide (sliced out of leftSide) and
     returns them as a list of "patterns".

     NOTE(review): Python 2 code (print statements); numberOfIterations
     and riskFactor are accepted but never used; the if(True) guard
     looks like a disabled ratio threshold check.
     '''
     patterns = []
     sequenceMatcher = SequenceMatcher()
     sequenceMatcher.set_seqs(leftSide, rightSide)
     ratio = sequenceMatcher.ratio()
     print ratio
     if(True):
         matchingBlocks = sequenceMatcher.get_matching_blocks()
         print matchingBlocks
         # block = (left_index, right_index, size); slice each match out
         # of leftSide and keep it unless it is empty or a lone space.
         for block in matchingBlocks:
             if(leftSide[block[0]:block[0] + block[2]] != '' and leftSide[block[0]:block[0] + block[2]] != ' ' ):
                 print "Found a pattern!"
                 print "Added:",leftSide[block[0]:block[0] + block[2]], "To the pattern list!"
                 patterns.append(leftSide[block[0]:block[0] + block[2]])
     
     return patterns
示例#50
0
 def test(self, result):
     """Compare *result* against self.expected and build a diff report.

     Returns BitDiffResult(True, "success") on an exact match, otherwise
     BitDiffResult(False, msg) where msg is a two-column table of
     expected vs result rows, with '!' marking mismatched rows.

     unicode_compat, _backwards_compat_match and BitDiffResult are
     project-local helpers.
     """
     result = [repr(unicode_compat(bit)) for bit in result]
     if self.expected == result:
         return BitDiffResult(True, "success")
     else:  # pragma: no cover
         # Widest cell across both columns (header included), used to
         # pad the left column of the table.
         longest = max(
             [len(x) for x in self.expected] +
             [len(x) for x in result] +
             [len('Expected')]
         )
         sm = SequenceMatcher()
         sm.set_seqs(self.expected, result)
         matches = sm.get_matching_blocks()
         lasta = 0
         lastb = 0
         data = []
         for match in [_backwards_compat_match(match) for match in matches]:
             # Rows between the previous match and this one differ.
             unmatcheda = self.expected[lasta:match.a]
             unmatchedb = result[lastb:match.b]
             unmatchedlen = max([len(unmatcheda), len(unmatchedb)])
             # Pad both sides so they can be indexed in lockstep below
             # (over-pads by unmatchedlen extras, which is harmless for
             # the range(unmatchedlen) indexing that follows).
             unmatcheda += ['' for x in range(unmatchedlen)]
             unmatchedb += ['' for x in range(unmatchedlen)]
             for i in range(unmatchedlen):
                 data.append((False, unmatcheda[i], unmatchedb[i]))
             # Then the rows that matched, side by side.
             for i in range(match.size):
                 data.append((
                     True, self.expected[match.a + i], result[match.b + i]
                 ))
             lasta = match.a + match.size
             lastb = match.b + match.size
         # Render the aligned rows as a fixed-width two-column table.
         padlen = (longest - len('Expected'))
         padding = ' ' * padlen
         line1 = '-' * padlen
         line2 = '-' * (longest - len('Result'))
         msg = '\nExpected%s |   | Result' % padding
         msg += '\n--------%s-|---|-------%s' % (line1, line2)
         for success, a, b in data:
             pad = ' ' * (longest - len(a))
             if success:
                 msg += '\n%s%s |   | %s' % (a, pad, b)
             else:
                 msg += '\n%s%s | ! | %s' % (a, pad, b)
         return BitDiffResult(False, msg)
示例#51
0
def partial_match(s1, s2):
    """Find the best partial match of the shorter string in the longer.

    Returns {"ratio": 0-100, "match": substring} for the best-aligned
    window; a near-perfect window short-circuits with ratio 100.
    Returns 0 when either input is empty.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    s1, s2 = utils.make_type_consistent(s1, s2)
    if len(s1) == 0 or len(s2) == 0:
        return 0

    if len(s1) <= len(s2):
        shorter, longer = s1, s2
    else:
        shorter, longer = s2, s1

    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    # Each block is (idx_shorter, idx_longer, length); align the shorter
    # string against each block and keep the best-scoring window,
    # deduplicated on the rounded (x1000) ratio.
    scores = []
    by_key = {}
    for short_idx, long_idx, _length in blocks:
        start = max(long_idx - short_idx, 0)
        window = longer[start:start + len(shorter)]

        ratio = SequenceMatcher(None, shorter, window).ratio()
        if ratio > .995:
            return {"ratio": 100, "match": window}
        key = int(ratio * 1000)
        if key not in by_key:
            scores.append(ratio)
            by_key[key] = {"ratio": ratio * 100, "match": window}

    return by_key[int(max(scores) * 1000)]
示例#52
0
def ordered_files(syncdir, toread):
  """Find the largest set of files in the correct order.

  :param syncdir: mapping of calibre id -> filename already on device
  :param toread: mapping of calibre ids queued for reading (its key
      order is assumed meaningful — presumably an ordered mapping,
      TODO confirm at the call site)
  :return: the first ARGS.count toread ids that already appear in
      syncdir in the same relative order
  """
  # Create a dict of files in syncdir with a valid index
  # In case of index clashes, the first candidate wins
  valid_files = {}
  for calibreid, filename in syncdir.items():
    index = file_index(filename)
    if index and index not in valid_files:
      valid_files[index] = calibreid
  # Create lists to compare to find the largest common sequence
  synclist = [valid_files[index] for index in sorted(valid_files.keys())]
  # BUG FIX: dict.keys() is a non-subscriptable view on Python 3; make a
  # list before slicing (also works on Python 2).
  toreadlist = list(toread.keys())[:ARGS.count]
  logging.debug('Comparing %r and %r', synclist, toreadlist)
  # Use difflib.SequenceMatcher to do the heavy lifting
  matcher = SequenceMatcher(None, toreadlist, synclist)
  ordered_ids = []
  for i, j, count in matcher.get_matching_blocks():
    ordered_ids.extend(toreadlist[i:i+count])
  logging.debug('Longest sorted subset: %r', ([
      syncdir[title] for title in ordered_ids],))
  return ordered_ids