Example #1
  def diff_lists( self, a, b ):
    """
    Diff two prepared lists and return the result as an HTML string.

    @type a: [ unicode, ... ]
    @type b: [ unicode, ... ]
    @rtype: unicode
    @return: composite HTML diff
    """
    matcher = SequenceMatcher( None, a, b )
    result = []
    open_tags = []

    # inspired by http://www.aaronsw.com/2002/diff/
    for ( change_type, i1, i2, j1, j2 ) in matcher.get_opcodes():
      if change_type == "replace":
        result.append(
          '<del class="diff modified">' + ''.join( a[ i1:i2 ] ) + '</del>' + \
          '<ins class="diff modified">' + ''.join( b[ j1:j2 ] ) + '</ins>'
        )
      elif change_type == "delete":
        result.append( '<del class="diff">' + ''.join( a[ i1:i2 ] ) + '</del>' )
      elif change_type == "insert":
        result.append( '<ins class="diff">' + ''.join( b[ j1:j2 ] ) + '</ins>' )
      elif change_type == "equal":
        result.append( ''.join( b[ j1:j2 ] ) )

    return "".join( result )
Example #2
def colordiff(a, b, highlight='red'):
    """Given two strings, return the same pair of strings except with
    their differences highlighted in the specified color.
    """
    a_out = []
    b_out = []
    
    matcher = SequenceMatcher(lambda x: False, a, b)
    for op, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        if op == 'equal':
            # In both strings.
            a_out.append(a[a_start:a_end])
            b_out.append(b[b_start:b_end])
        elif op == 'insert':
            # Right only.
            b_out.append(colorize(highlight, b[b_start:b_end]))
        elif op == 'delete':
            # Left only.
            a_out.append(colorize(highlight, a[a_start:a_end]))
        elif op == 'replace':
            # Right and left differ.
            a_out.append(colorize(highlight, a[a_start:a_end]))
            b_out.append(colorize(highlight, b[b_start:b_end]))
        else:
            assert(False)
    
    return ''.join(a_out), ''.join(b_out)
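The colorize helper is not part of the snippet; a minimal ANSI sketch (the name and color table are assumptions, not the original project's API) could look like:

# Hypothetical stand-in for the project's colorize(color, text) helper:
# wraps text in an ANSI escape sequence for the named color.
ANSI_CODES = {'red': '31', 'green': '32'}

def colorize(color, text):
    return '\x1b[%sm%s\x1b[0m' % (ANSI_CODES.get(color, '0'), text)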
Example #3
    def compare_strong_unequal_lengths(self, fp1, fp2):
        # Take fp1 to be shorter fingerprint
        #https://docs.python.org/2/library/difflib.html
        fp1c1 = fp1.T.iloc[0].tolist()[1:]
        fp2c1 = fp2.T.iloc[0].tolist()[1:]

        # Swap if necessary so that fp1c1 is the shorter fingerprint,
        # matching the comment above
        if len(fp1c1) > len(fp2c1):
            fp1c1, fp2c1 = fp2c1, fp1c1

        sm = SequenceMatcher(a=fp1c1, b=fp2c1)
        matched_intervals = []

        for (op, start1, end1, start2, end2) in sm.get_opcodes():
            #print (op, start1, end1, start2, end2)
            if op == 'equal':
                #This range appears in both sequences.
                for this_index, this_interval in enumerate(fp1c1[start1:end1]):
                    matched_intervals.append([this_interval, start1+this_index, start2+this_index])
            # 'delete' opcodes always have end2 == start2, so the original
            # 'delete' test here could never fire; equal-length 'replace'
            # spans are what pair up position-for-position
            if op == 'replace' and end1-start1 == end2-start2:
                for this_index in range(end1-start1):
                    matched_intervals.append([np.nan, start1+this_index, start2+this_index])

        #print "Strong Beat Comparison (different lengths): " + str(matched_intervals)

        return matched_intervals
Example #4
    def weak_matching_helper(self, first_weaks, second_weaks):
        if len(first_weaks) == 1 and len(second_weaks) == 1 and np.isnan(first_weaks[0]) and np.isnan(second_weaks[0]):
            return [1.0, 1.0]
        if set(first_weaks) & set(second_weaks):
            # Choose the shorter list as first_weaks
            if len(first_weaks) > len(second_weaks):
                first_weaks, second_weaks = second_weaks, first_weaks
            # Total weak beats that match
            total_weak_overlaps =  float(len(list(Counter(first_weaks) & Counter(second_weaks))))/float(len(first_weaks))
            # Total weak beats that match in-order
            # Match sequences
            sm = SequenceMatcher(a=first_weaks, b=second_weaks)
            total_weak_overlaps_inorder = 0

            for (op, start1, end1, start2, end2) in sm.get_opcodes():
                #print (op, start1, end1, start2, end2)
                if op == 'equal':
                    #This range appears in both sequences... add the length of the range including length 0s (1 index)
                    total_weak_overlaps_inorder += (end1 - start1)

            total_weak_overlaps_inorder = float(total_weak_overlaps_inorder)/float(len(first_weaks))
            # Tuple representation:
            return [total_weak_overlaps, total_weak_overlaps_inorder]
            #return 0.5*total_weak_overlaps + 0.5*total_weak_overlaps_inorder
        # Tuple representation:
        return [0, 0]
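Note that len(list(Counter(first_weaks) & Counter(second_weaks))) counts distinct shared values, not total shared occurrences; a quick check of the difference:

from collections import Counter

a = Counter([1, 1, 2])
b = Counter([1, 1, 1, 2])
print(list(a & b))            # [1, 2] -> 2 distinct shared values
print(sum((a & b).values()))  # 3 shared occurrences in total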
Example #5
def html_diff(old, new):
    """Generate HTML formatted diff of two strings."""
    diff = SequenceMatcher(None, old, new)
    result = []
    for tag, oldpos1, oldpos2, newpos1, newpos2 in diff.get_opcodes():
        if tag == 'replace':
            result.append(
                '<del>{0}</del><ins>{1}</ins>'.format(
                    old[oldpos1:oldpos2], new[newpos1:newpos2]
                )
            )
        elif tag == 'delete':
            result.append(
                '<del>{0}</del>'.format(
                    old[oldpos1:oldpos2]
                )
            )
        elif tag == 'insert':
            result.append(
                '<ins>{0}</ins>'.format(
                    new[newpos1:newpos2]
                )
            )
        elif tag == 'equal':
            result.append(new[newpos1:newpos2])
    return ''.join(result)
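html_diff interpolates the raw inputs into markup, so any '<' or '&' in old or new ends up unescaped. A hedged variant that escapes each slice first, using only the stdlib (a sketch, not part of the original snippet):

from difflib import SequenceMatcher
from html import escape

def html_diff_escaped(old, new):
    """Like html_diff above, but HTML-escapes each slice before wrapping it."""
    result = []
    for tag, o1, o2, n1, n2 in SequenceMatcher(None, old, new).get_opcodes():
        if tag == 'replace':
            result.append('<del>{0}</del><ins>{1}</ins>'.format(
                escape(old[o1:o2]), escape(new[n1:n2])))
        elif tag == 'delete':
            result.append('<del>{0}</del>'.format(escape(old[o1:o2])))
        elif tag == 'insert':
            result.append('<ins>{0}</ins>'.format(escape(new[n1:n2])))
        elif tag == 'equal':
            result.append(escape(new[n1:n2]))
    return ''.join(result)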
Example #6
 def _resolveDeepReplace(self, opcodes, a, b):
     """Resolves ``replace`` elements in `opcodes` pertaining to `a` and
     `b`. Returns opcodes including nested elements for these cases."""
     result = []
     for i in xrange(len(opcodes)):
         (opcode, aBeg, aEnd, bBeg, bEnd) = opcodes[i]
         if opcode != "replace":
             result.append(opcodes[i])
             continue
         self.hashableNodeImpl.pushRootOnly(True)
         try:
             sm = SequenceMatcher(self.isJunk, a[aBeg:aEnd], b[bBeg:bEnd])
             rootOpcodes = sm.get_opcodes()
             for j in xrange(len(rootOpcodes)):
                 (subOpcode, aSubBeg, aSubEnd, bSubBeg, bSubEnd) = rootOpcodes[j]
                 if subOpcode != "equal":
                     result.append((subOpcode, aBeg + aSubBeg, aBeg + aSubEnd, bBeg + bSubBeg, bBeg + bSubEnd))
                 else:
                     for k in xrange(aSubEnd - aSubBeg):
                         aIdx = aBeg + aSubBeg + k
                         bIdx = bBeg + bSubBeg + k
                         result.append(
                             ("descend", aIdx, aIdx + 1, bIdx, bIdx + 1, self._resolveRootEqual(a[aIdx], b[bIdx]))
                         )
         finally:
             self.hashableNodeImpl.popRootOnly()
     return result
Example #7
    def get_opcodes(self):
        """Return list of 5- or 6-tuples describing how to turn `a` into `b`.

Each tuple is of the form (tag, i1, i2, j1, j2, [sub]).  The first tuple
has i1 == j1 == 0, and remaining tuples have i1 == i2 from the
tuple preceding it, and likewise for j1 == the previous j2.

The tags are strings, with these meanings:

'replace':  a[i1:i2] should be replaced by b[j1:j2]
'delete':   a[i1:i2] should be deleted.
            Note that j1==j2 in this case.
'insert':   b[j1:j2] should be inserted at a[i1:i1].
            Note that i1==i2 in this case.
'equal':    a[i1:i2] == b[j1:j2]
'descend':  Descend on nodes a[i1] and b[j1]. In this case
            sub is a list of opcodes pertaining to the list of children
            of the two nodes.
            Note that i2==i1+1 and j2==j1+1 in this case.

Note that if the roots of the trees are not root-equal then the result
is only a 'replace' of one tree by the other.
"""

        self.hashableNodeImpl.pushRootOnly(True)
        try:
            sm = SequenceMatcher(self.isJunk, [self.a], [self.b])
            rootOpcodes = sm.get_opcodes()
            if rootOpcodes[0][0] == "equal":
                return [("descend", 0, 1, 0, 1, self._resolveRootEqual(self.a, self.b))]
            else:
                return rootOpcodes
        finally:
            self.hashableNodeImpl.popRootOnly()
Example #8
File: util.py Project: jas01/indico
def _diff_text(a, b, _noword_re=re.compile(r'(\W)')):
    # split the strings into words so we don't get changes involving
    # partial words.  this makes the diff much more readable to humans
    # as you don't end up with large deletions/insertions inside a word
    a = _noword_re.split(a)
    b = _noword_re.split(b)
    seqm = SequenceMatcher(a=a, b=b)
    output = []
    for opcode, a0, a1, b0, b1 in seqm.get_opcodes():
        if opcode == 'equal':
            output.append(''.join(seqm.a[a0:a1]))
        elif opcode == 'insert':
            inserted = _clean(seqm.b[b0:b1])
            output.append(Markup('<ins>{}</ins>').format(inserted))
        elif opcode == 'delete':
            deleted = _clean(seqm.a[a0:a1])
            output.append(Markup('<del>{}</del>').format(deleted))
        elif opcode == 'replace':
            deleted = _clean(seqm.a[a0:a1])
            inserted = _clean(seqm.b[b0:b1])
            output.append(Markup('<del>{}</del>').format(deleted))
            output.append(Markup('<ins>{}</ins>').format(inserted))
        else:
            raise RuntimeError('unexpected opcode: ' + opcode)
    return Markup('').join(output)
Example #9
def highlighted_ndiff(a, b):
    """Returns a highlited string, with bold charaters where different."""
    s = ''
    sm = SequenceMatcher()
    sm.set_seqs(a, b)
    linesm = SequenceMatcher()
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == REPLACE:
            for aline, bline in zip_longest(a[i1:i2], b[j1:j2]):
                if bline is None:
                    s += redline(aline)
                elif aline is None:
                    s += greenline(bline)
                else:
                    s += bold_str_diff(aline, bline, sm=linesm)
        elif tag == DELETE:
            for aline in a[i1:i2]:
                s += redline(aline)
        elif tag == INSERT:
            for bline in b[j1:j2]:
                s += greenline(bline)
        elif tag == EQUAL:
            for aline in a[i1:i2]:
                s += '  ' + aline + '\n'
        else:
            raise RuntimeError('tag not understood')
    return s
Example #10
File: diff.py Project: glanois/code
    def compare(self, a, b):
        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if alo == ahi:
                f1 = '%d' % alo
            elif alo+1 == ahi:
                f1 = '%d' % (alo+1)
            else:
                f1 = '%d,%d' % (alo+1, ahi)

            if blo == bhi:
                f2 = '%d' % blo
            elif blo+1 == bhi:
                f2 = '%d' % (blo+1)
            else:
                f2 = '%d,%d' % (blo+1, bhi)

            if tag == 'replace':
                g = itertools.chain([ '%sc%s\n' % (f1, f2) ], self._my_plain_replace(a, alo, ahi, b, blo, bhi))
            elif tag == 'delete':
                g = itertools.chain([ '%sd%s\n' % (f1, f2) ], self._dump('<', a, alo, ahi))
            elif tag == 'insert':
                g = itertools.chain([ '%sa%s\n' % (f1, f2) ], self._dump('>', b, blo, bhi))
            elif tag == 'equal':
                g = []
            else:
                raise ValueError, 'unknown tag %r' % (tag,)

            for line in g:
                yield line
Example #11
    def difftokens(self, old, new):
        oldwords = self.tokenizer.split(old)
        newwords = self.tokenizer.split(new)

        def isgarbage(string):
            return string.isspace() and '\n' not in string
        differ = SequenceMatcher(isgarbage, a=oldwords, b=newwords)

        chunks = []
        colors = []

        def append(tokens, color):
            chunks.append(tokens)
            colors.append(color)

        for op, s1beg, s1end, s2beg, s2end in differ.get_opcodes():
            w1 = [w for w in oldwords[s1beg:s1end] if w]
            w2 = [w for w in newwords[s2beg:s2end] if w]

            if op == 'equal':
                append(w1, self.equalcolor)
            elif op == 'insert':
                append(w2, self.newcolor)
            elif op == 'replace':
                append(w1, self.oldcolor)
                append(w2, self.newcolor)
            elif op == 'delete':
                append(w1, self.oldcolor)

        return chunks, colors
Example #12
def diffleaves(aleaves, bleaves):
  def junkp(elt):
    assert isinstance(elt, leaf), 'tried to test junkiness of non-leaf'
    return isinstance(elt.content, str) and (len(elt.content) == 0 or elt.content.isspace())
  
  leaves = []
  diff = SequenceMatcher(a=aleaves, b=bleaves, autojunk=False, isjunk=junkp)
  for changetype, abeg, aend, bbeg, bend in diff.get_opcodes():
    if changetype == 'equal':
      leaves += aleaves[abeg:aend]
    elif changetype == 'delete':
      for adel in aleaves[abeg:aend]:
        adel.change = '-'
        leaves.append(adel)
    elif changetype == 'insert':
      for bins in bleaves[bbeg:bend]:
        bins.change = '+'
        leaves.append(bins)
    elif changetype == 'replace':
      for adel in aleaves[abeg:aend]:
        adel.change = '-'
        leaves.append(adel)
      for bins in bleaves[bbeg:bend]:
        bins.change = '+'
        leaves.append(bins)
  
  return leaves
Example #13
def get_diff_default(a, b):
    '''Return the differences between strings ``a`` and ``b`` as a list.
    Each list element is a two-item tuple describing the action that must
    be applied to ``a`` in order to obtain ``b``:

    - ``('=', length)``: a chunk of the given length is identical in ``a`` and ``b``

    - ``('+', 'chunk')``: this chunk was added in the new text

    - ``('-', 'chunk')``: this chunk was removed from the old text

    Uses Python's built-in ``difflib.SequenceMatcher`` class, which is
    sometimes slow.
    '''

    from difflib import SequenceMatcher

    a = split_words(a)
    b = split_words(b)

    result = []

    s = SequenceMatcher(a=a, b=b, autojunk=False)
    for op, i1, i2, j1, j2 in s.get_opcodes():
        if op == 'equal':
            result.append(('=', len(''.join(a[i1:i2]))))
        if op == 'delete' or op == 'replace':
            result.append(('-', ''.join(a[i1:i2])))
        if op == 'insert' or op == 'replace':
            result.append(('+', ''.join(b[j1:j2])))

    return result
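A sketch of the resulting op stream for explicit token lists (bypassing split_words, which is project-specific); note that 'replace' deliberately falls through both the '-' and '+' branches:

from difflib import SequenceMatcher

a = ['the ', 'old ', 'text']
b = ['the ', 'new ', 'text']
ops = []
for op, i1, i2, j1, j2 in SequenceMatcher(a=a, b=b, autojunk=False).get_opcodes():
    if op == 'equal':
        ops.append(('=', len(''.join(a[i1:i2]))))
    if op in ('delete', 'replace'):
        ops.append(('-', ''.join(a[i1:i2])))
    if op in ('insert', 'replace'):
        ops.append(('+', ''.join(b[j1:j2])))
print(ops)  # [('=', 4), ('-', 'old '), ('+', 'new '), ('=', 4)]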
Example #14
 def get_from_text(self, old, new):
     """
     Gets the differences between `old` text and `new` text and returns a changeset
     :param old: old Text object
     :param new: new text string
     """
     olds = str(old)
     if olds == new:
         return None
     print repr(olds), repr(new)
     sm = SequenceMatcher(None, olds, new)
     print "    CS LENS  ", len(olds), len(new)
     csd = dict(old_len=len(olds),
                 new_len=len(new),
                 ops="",
                 char_bank="")
      opcodes = list(sm.get_opcodes())
     last_op = 0
     print "    CS OPC 1", opcodes
     for i in range(0, len(opcodes)):
         if opcodes[i][0] != "equal":
             last_op = i
     print "    CS OPC 2", opcodes[:last_op+1]
     for opcode_tup in opcodes[:last_op+1]:
         op_code_match(*opcode_tup, changeset=csd, sm=sm, text=old)
     print "    CS CSD  ", csd
     return csd
Example #15
    def get(self, snapa, snapb):
        sa = Snap.objects.get_or_404(id=snapa)
        sb = Snap.objects.get_or_404(id=snapb)

        fstcleanhtml = cleanhtml(sa.html)
        sndcleanhtml = cleanhtml(sb.html)
        # fstcleanhtml must be passed as `a` and sndcleanhtml as `b`, since
        # the i1:i2 slices below index the first snapshot and j1:j2 the second
        sm = SequenceMatcher(None,
                             fstcleanhtml,
                             sndcleanhtml)
        txtinsert = []
        txtdel = []
        txtreplace = []
        for tag, i1, i2, j1, j2 in sm.get_opcodes():
            if tag == "replace":
                txtreplace.append(
                    ("%s <-> %s" % ("".join(fstcleanhtml[i1:i2]), "".join(sndcleanhtml[j1:j2]))).strip())
            if tag == "insert":
                txtinsert.append(("%s %s" % ("".join(fstcleanhtml[i1:i2]), "".join(sndcleanhtml[j1:j2]))).strip())
            if tag == "delete":
                txtdel.append(("%s %s" % ("".join(fstcleanhtml[i1:i2]), "".join(sndcleanhtml[j1:j2]))).strip())
        return jsonify({
            'diff': {'fst': {'id': str(sa.id), 'dthr': sa.dthr},
                     'snd': {'id': str(sb.id), 'dthr': sb.dthr},
                     'ratio': sm.ratio(),
                     'insert': txtinsert,
                     'replace': txtreplace,
                     'delete': txtdel}
        })
Example #16
def WikiDocument(out, user_from, user_to, timestamp, subject, text):
    global previous
    ###url = get_url(id, prefix)
    ###header = '<doc id="%s" url="%s" title="%s">\n' % (id, url, title)
    ##############text = clean(text)
    subject = clean(subject)
    header = '%s\t%s\t%s\t%s\t' % (user_to, user_from, timestamp, subject)
    header = header.encode('utf-8')

    text = clean(text)
    ###find the diff
    s = SequenceMatcher(None, previous, text)
    opcodes = s.get_opcodes()

    diff = []
    for i in opcodes:
        if i[0] == 'insert' or i[0] == 'replace':
            j1 = i[3]
            j2 = i[4]
            diff.append(text[j1:j2])

    diff = "".join(diff)
    ###diff = clean(diff)
    ###
    out.reserve(len(header) + len(subject) + len(diff))
    print >> out, header,
    print >> out, diff.encode('utf-8')
    previous = text
Example #17
def diff_stat(old, new):
    result = [0, 0]  # [ADDED, REMOVED]

    def insert(i1, i2, j1, j2):
        result[ADDED] += j2 - j1

    def delete(i1, i2, j1, j2):
        result[REMOVED] += i2 - i1

    def update(i1, i2, j1, j2):
        result[REMOVED] += i2 - i1
        result[ADDED] += j2 - j1

    opcode_handler = {
        'insert': insert,
        'delete': delete,
        'replace': update,
        'equal': None,
    }

    sm = SequenceMatcher(None, old, new)

    for (tag, i1, i2, j1, j2) in sm.get_opcodes():
        f = opcode_handler[tag]
        if callable(f):
            f(i1, i2, j1, j2)

    return result
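diff_stat depends on module-level ADDED and REMOVED indices defined elsewhere in its project; assuming the values implied by the [ADDED, REMOVED] comment, a quick check:

ADDED, REMOVED = 0, 1  # assumed values, matching the "[ADDED, REMOVED]" comment above

# replace 'two'->'2' counts one added and one removed; insert 'four' adds one
print(diff_stat("one two three".split(), "one 2 three four".split()))  # [2, 1]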
Example #18
def versioning(lists):
    """
    Compute the lifetime of every element from an iterable of sequences. It 
    returns an iterable of :class:`Versioned` classes to indicate the lifetimes.
    
    The computation is backed by the built-in :mod:`difflib` module. As such,
    every element must be hashable. 
    """
    
    ci = chain.from_iterable
    
    sm = SequenceMatcher()
    oldVersions = [ [] ]
    
    for newName, (oldList, newList) in enumerate(_pairwise(lists, [])):
        sm.set_seqs(oldList, newList)
        
        newVersions = [ oldVersions[0] ]

        for op, oldStart, oldEnd, newStart, newEnd in sm.get_opcodes():
            if op == 'equal':
                for i in range(oldStart+1, oldEnd+1):
                    oldVersions[i][0].high = newName
                newVersions.extend(oldVersions[oldStart+1:oldEnd+1])

            if op == 'delete' or op == 'replace':
                newVersions[-1].extend(ci(oldVersions[oldStart+1:oldEnd+1]))
            
            if op == 'insert' or op == 'replace':
                newVersions.extend([Versioned(x, newName, newName)] for x in newList[newStart:newEnd])
        
        oldVersions = newVersions
    
    return ci(oldVersions)
Example #19
    def difftokens(self, old, new):
        oldwords = self.tokenizer.findall(old)
        newwords = self.tokenizer.findall(new)

        def isgarbage(string):
            return string.isspace()
        differ = SequenceMatcher(isgarbage, a=oldwords, b=newwords)

        words = []
        colors = []

        def append(tokens, color):
            for token in tokens:
                words.append(token)
                colors.append(color)

        for op, s1beg, s1end, s2beg, s2end in differ.get_opcodes():
            w1 = oldwords[s1beg:s1end]
            w2 = newwords[s2beg:s2end]

            if op == 'equal':
                append(w1, self.equalcolor)
            elif op == 'insert':
                append(w2, self.newcolor)
            elif op == 'replace':
                append(w1, self.oldcolor)
                append(w2, self.newcolor)
            elif op == 'delete':
                append(w1, self.oldcolor)
        return words, colors
Example #20
def get_music(a, b, key='C', mode='major'):
    midi_out = StringIO()

    scale = build_scale(key, mode, octaves=1)
    matcher = SequenceMatcher(None, a, b)

    tone = key.lower()
    melodies = [tone]
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        next_note = None
        if tag == 'replace':
            next_note = 'r'
        elif tag == 'equal':
            next_note = tone
        elif tag == 'delete':
            tone = tone_down(tone, scale)
            next_note = tone
        elif tag == 'insert':
            tone = tone_up(tone, scale)
            next_note = tone
        melodies += [next_note] * ((i2 - i1) or 1)
    s = SMF([parse(" ".join(melodies))])

    s.write(midi_out)
    return midi_out
Example #21
def _modified_regions(old, new):
    m = SequenceMatcher(a=old, b=new, autojunk=False)

    regions = Regions()
    for tag, i1, i2, j1, j2 in m.get_opcodes():
        if tag != "equal":
            regions.extend(Region(i1, i2))
    return regions
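_modified_regions depends on the project's Region/Regions helpers; the same opcode filter expressed with plain tuples (a sketch, not the original API):

from difflib import SequenceMatcher

def modified_spans(old, new):
    # (start, end) spans of `old` touched by any non-'equal' opcode
    m = SequenceMatcher(a=old, b=new, autojunk=False)
    return [(i1, i2) for tag, i1, i2, j1, j2 in m.get_opcodes() if tag != 'equal']

print(modified_spans("abcdef", "abXdef"))  # [(2, 3)]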
Example #22
def row(report_file, id, next, word, sentences, corrections):
    corrs = ''
    words = ''
    first = True
    for cor in corrections:
        word_layout = []
        word_index = -1
        correction_layout = []
        correction_index = -1
        s = SequenceMatcher(None, word, cor)
        for tag, i1, i2, j1, j2 in s.get_opcodes():
#            print ("%7s word[%d:%d] (%s) cor[%d:%d] (%s)" %(tag, i1, i2, word[i1:i2], j1, j2, cor[j1:j2]))
            if tag == 'delete':
                if word_index == -1:
                    word_layout.append(word[:i1])
                word_layout.append('<span style="background-color:#ff8888;">%s</span>' %word[i1:i2])
                word_index = i2
            elif tag == 'replace':
                word_layout.append('<span style="background-color:#ffcc88;">%s</span>' %word[i1:i2])
                word_index = i2
                correction_layout.append('<span style="background-color:#88ffcc;">%s</span>' %cor[j1:j2])
                correction_index = j2
            elif tag == 'equal':
                word_layout.append(word[i1:i2])
                word_index = i2
                correction_layout.append(cor[j1:j2])
                correction_index = j2
            elif tag == 'insert':
                correction_layout.append('<span style="background-color:#88ff88;">%s</span>' %cor[j1:j2])
                correction_index = j2
        if word_index == -1:
            word_layout = word
        else:
            word_layout = ''.join(word_layout)
        if correction_index == -1:
            correction_layout = cor
        else:
            correction_layout = ''.join(correction_layout)
        if first:
            first = False
            words += word_layout
            corrs += correction_layout
        else:
            words += '<br>' + word_layout
            corrs += '<br>' + correction_layout
    if next == 'X':
        next = '<strong>X</strong>'
#    if link:
#        if first:
#            report_file.write('<tr><td><a target="_blank" href="http://data.opentaal.org/opentaalbank/spellingcontrole/next_version/bewerkWoordenlijst.php?wordfilter=^%s%%24">%s</a>&nbsp;</td><td>%s</td><td><a target="_blank" href="http://data.opentaal.org/opentaalbank/woorddetails.php?word=%s">i</a>&nbsp;</td><td>%s</td><td><a target="_blank" href="http://data.opentaal.org/opentaalbank/woorddetails.php?word=%s">i</a>&nbsp;</td><td>&nbsp;%1.2f</td></tr>\n' %(word, next, word_layout, word, correction_layout, correction, s.ratio()))
#        else:
#            report_file.write(u'<tr><td>&nbsp;</td><td>%s</td><td>↲&nbsp;</td><td>%s</td><td><a target="_blank" href="http://data.opentaal.org/opentaalbank/woorddetails.php?word=%s">i</a>&nbsp;</td><td>&nbsp;%1.2f</td></tr>\n' %(word_layout, correction_layout, correction, s.ratio()))
#    else:
#        if first:
#            report_file.write('<tr><td>%s&nbsp;</td><td>%s</td><td><a target="_blank" href="http://data.opentaal.org/opentaalbank/woorddetails.php?word=%s">i</a>&nbsp;</td><td>%s</td></tr>\n' %(next, word_layout, word, correction_layout))
#        else:
#            report_file.write('<tr><td>&nbsp;</td><td>%s</td><td></td><td>%s</td></tr>\n' %(word_layout, correction_layout))
    report_file.write('<tr><td>%s</td><td>%s</td><td><a target="_blank" href="http://data.opentaal.org/opentaalbank/woorddetails.php?word=%s">%s</a></td><td>%s</td><td>%s</td></tr>\n' %(id, next, word, sentences, words, corrs))
Example #23
    def grade_student_output( self, student_out, reference_out, ignore_spaces = False ):

        differences = SequenceMatcher( None, student_out, reference_out )

        html = []
        html.append( '<pre>' )

        space_mismatch = 0

        for op, ob, oe, nb, ne in differences.get_opcodes():

            so = student_out[ob:oe].replace( "&", "&amp;" ).replace( "<", "&lt;" ).replace( ">", "&gt;" ).replace( "\n", "&para;<br>" )
            ro = reference_out[nb:ne].replace( "&", "&amp;" ).replace( "<", "&lt;" ).replace( ">", "&gt;" ).replace( "\n", "&para;<br>" )

            if op == 'insert':
                if ( ro.strip() or not ignore_spaces ):
                    html.append( '<ins>{}</ins>'.format( ro ) )
                else:
                    space_mismatch += len( ro )
            elif op == 'delete':
                if ( so.strip() or not ignore_spaces ):
                    html.append( '<del>{}</del>'.format( so ) )
                else:
                    html.append( '<span>{}</span>'.format( so ) )
                    space_mismatch += len( so )
            elif op == 'replace':
                html.append( '<del>{}</del>'.format( so ) )
                html.append( '<ins>{}</ins>'.format( ro ) )
            elif op == 'equal':
                html.append( '<span>{}</span>'.format( so ) )

        html.append( '</pre>' )

        # Compute the matching ratio between the
        # student output and the reference output

        # Default matching ratio
        # This is calculated by:
        #    2.0*M / T
        #        T is the total number of elements in both sequences
        #        M is the number of matches
        matching_ratio = differences.ratio()

        # Total lengths of student and reference outputs
        comb_len = len( student_out ) + len( reference_out )

        # Reverse engineer the formula used in ratio() to
        # get the default matching elements
        matching = matching_ratio * comb_len

        if ignore_spaces:

            # Add the elements mismatched due to spaces
            # to the count of matching elements
            matching += space_mismatch

        return matching, comb_len, "".join( html )
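The 2.0*M / T formula for ratio() referenced in the comments above is easy to verify directly:

from difflib import SequenceMatcher

d = SequenceMatcher(None, "abcd", "bcde")
# M = 3 matching elements ("bcd"), T = 8 elements in total, so 2.0*M/T = 0.75
print(d.ratio())  # 0.75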
Example #24
def write_html_diff(f, modelA, modelB, fileA, fileB, aisA, aisB, afsA, afsB, sh):
    if fileA is not None:
        srcA = modelA.get_file_content(fileA)
    else:
        srcA = ''
    if fileB is not None:
        srcB = modelB.get_file_content(fileB)
    else:
        srcB = ''
    # For now, just work line-by-line
    s = SequenceMatcher(None,
                        srcA.splitlines(), srcB.splitlines())
    htmlA = sh.highlight_file(fileA, modelA).splitlines()
    htmlB = sh.highlight_file(fileB, modelB).splitlines()

    f.write('<table>\n')
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        def get_td(idx, html, ais, afs):
            if idx is not None:
                linenotes = ''
                for ai in ais:
                    if ai.line == idx + 1:
                        linenotes += make_issue_note(ai)
                for af in afs:
                    if af.line == idx + 1:
                        linenotes += make_failure_note(af)
                return '<td width="50%%"><pre>%s</pre>%s</td>' % (html[idx], linenotes)
            else:
                return '<td width="50%%"></td>'

        def add_line(class_, idxA, idxB):
            f.write('<tr class="%s">%s%s</tr>\n'
                    % (class_,
                       get_td(idxA, htmlA, aisA, afsA),
                       get_td(idxB, htmlB, aisB, afsB)))

        if tag == 'replace':
            # There's no guarantee that they have equal lengths,
            # so we can't directly use zip on the ranges.
            # Instead, calculate which has the longer range of lines
            # and keep iterating, filling the other with blank lines
            maxlen = max(i2 - i1, j2 - j1)
            for i, j in zip(range(i1, i1 + maxlen),
                            range(j1, j1 + maxlen)):
                add_line('replace',
                         i if i < i2 else None,
                         j if j < j2 else None)
        elif tag == 'delete':
            for i in range(i1, i2):
                add_line('delete', i, None)
        elif tag == 'insert':
            for j in range(j1, j2):
                add_line('insert', None, j)
        elif tag == 'equal':
            for i, j in zip(range(i1, i2), range(j1, j2)):
                add_line('equal', i, j)
    f.write('</table>\n')
Example #25
class Diff(object):
    def __init__(self, old, new, **kwargs):
        self.old = old.splitlines()
        self.new = new.splitlines()
        self.surrounding = kwargs.get("surrounding", 2)
        self.blank_lines = kwargs.get("blank_lines", False)
        self.case_changes = kwargs.get("case_changes", False)
        self.whitespace = kwargs.get("whitespace", False)

        junk = self.whitespace and IS_CHARACTER_JUNK or None
        self.diff = SequenceMatcher(junk, self.old, self.new)

    def do_diff(self):
        if self.surrounding > -1:
            return self.__partial_diff()
        else:
            return self.__full_diff()

    def __partial_diff(self):
        return map(self.parse_section, self.diff.get_grouped_opcodes(self.surrounding))

    def __full_diff(self):
        return [self.parse_section(self.diff.get_opcodes())]

    def parse_section(self, section):
        lines = []
        for change_type, start_old, end_old, start_new, end_new in section:
            if change_type == "replace":
                lines.extend(self.handle_replace(xrange(start_old, end_old), xrange(start_new, end_new)))
            elif change_type == "insert":
                lines.extend(self.handle_insert(xrange(start_new, end_new)))
            elif change_type == "delete":
                lines.extend(self.handle_delete(xrange(start_old, end_old)))
            elif change_type == "equal":
                lines.extend(self.handle_equal(xrange(start_old, end_old), xrange(start_new, end_new)))
        return lines

    def handle_replace(self, deleted_lines, inserted_lines):
        raise NotImplementedError()

    def handle_insert(self, inserted_lines):
        length = len(inserted_lines) - 1
        for index, line in enumerate(inserted_lines):
            change_type = ("insert", (index is 0 and "start" or "") + (index is length and " end" or ""))
            yield change_type, "", line + 1, self.new[line]

    def handle_delete(self, deleted_lines):
        length = len(deleted_lines) - 1
        for index, line in enumerate(deleted_lines):
            change_type = ("delete", (index is 0 and "start" or "") + (index is length and " end" or ""))
            yield change_type, line + 1, "", self.old[line]

    def handle_equal(self, old_lines, new_lines):
        for index, line in enumerate(old_lines):
            change_type = ("equal", index is 0 and "start" or "end")
            new_line = new_lines[index]
            yield change_type, line + 1, new_line + 1, self.old[line]
Example #26
 def adapt_selections(cls, sender, **kwargs):
     s = SequenceMatcher(None, kwargs["old_text"], kwargs["new_text"])
     opcodes = s.get_opcodes()
     content_type = ContentType.objects.get_for_model(sender.__class__)
     annotations = Annotation.objects.filter(object_id=sender.id,
         content_type=content_type)
     for annotation in annotations:
         annotation.adapt_selection(opcodes)
         annotation.save()
Example #27
 def parse(self):
     if self.parsed:
         return
     sm = SequenceMatcher(a=self.a_text, b=self.b_text)
     for tag, ia1, ia2, ib1, ib2 in sm.get_opcodes():
         sub_a = self.a_text[ia1:ia2]
         sub_b = self.b_text[ib1:ib2]
         self.compare(ia1, ib1, sub_a, sub_b)
     self.parsed = True
Example #28
    def get_opcodes(self):
        sequence_matcher = SequenceMatcher(None, self.a, self.b)

        for tag, i1, i2, j1, j2 in sequence_matcher.get_opcodes():
            if tag == 'replace':
                oldlines = self.a[i1:i2]
                newlines = self.b[j1:j2]

                i = i_start = 0
                j = j_start = 0

                while i < len(oldlines) and j < len(newlines):
                    new_tag = None
                    new_i, new_j = i, j

                    if oldlines[i] == "" and newlines[j] == "":
                        new_tag = "equal"
                        new_i += 1
                        new_j += 1
                    elif oldlines[i] == "":
                        new_tag = "insert"
                        new_j += 1
                    elif newlines[j] == "":
                        new_tag = "delete"
                        new_i += 1
                    else:
                        new_tag = "replace"
                        new_i += 1
                        new_j += 1

                    if new_tag != tag:
                        if i > i_start or j > j_start:
                            yield (tag, i1 + i_start, i1 + i,
                                   j1 + j_start, j1 + j)

                        tag = new_tag
                        i_start, j_start = i, j

                    i, j = new_i, new_j

                yield tag, i1 + i_start, i1 + i, j1 + j_start, j1 + j
                i_start = i
                j_start = j

                if i2 > i1 + i_start or j2 > j1 + j_start:
                    tag = None

                    if len(oldlines) > len(newlines):
                        tag = "delete"
                    elif len(oldlines) < len(newlines):
                        tag = "insert"

                    if tag is not None:
                        yield tag, i1 + i_start, i2, j1 + j_start, j2
            else:
                yield tag, i1, i2, j1, j2
Example #29
    def __init__(self,
                 old_obj, new_obj,
                 previous_path=(), previous_new_path=(),
                 patch_extractors=[],
                 key_limits=KeyLimit(),
                 find_moved_patches=False,
                 moved_patches_similarity=0.8):

        super(ListPatchExtractor, self).__init__(old_obj, new_obj,
                                                 previous_path,
                                                 previous_new_path,
                                                 patch_extractors,
                                                 key_limits,
                                                 moved_patches_similarity)

        sequence = SequenceMatcher(None, self.make_hashable(old_obj),
                                   self.make_hashable(new_obj))

        group = 0

        for _tuple in sequence.get_opcodes():
            if _tuple[0] == 'insert':
                for i, new_path in enumerate(range(_tuple[3], _tuple[4])):
                    if not self._try_patch_extractors_for_ungrouping(_tuple[1]+i):
                        self._add_patch('add', _tuple[1]+i, None, new_obj[new_path], group)

            elif _tuple[0] == 'replace':
                old_range = range(_tuple[1], _tuple[2])
                new_range = range(_tuple[3], _tuple[4])

                for old_path, new_path in izip_longest(old_range, new_range):
                    if old_path is not None and new_path is not None:
                        if not self._try_patch_extractors(old_path, new_path):
                            self._add_patch('change', old_path,
                                            old_obj[old_path],
                                            new_obj[new_path],
                                            group)
                        
                        last_old_path = old_path
                    elif new_path is not None:
                        if not self._try_patch_extractors_for_ungrouping(last_old_path+1):
                            self._add_patch('add', last_old_path+1, None,
                                            new_obj[new_path], group)
                        last_old_path += 1
                    elif old_path is not None:
                        self._add_patch('remove', last_old_path+1, old_obj[old_path], None, group)

            elif _tuple[0] == 'delete':
                path = _tuple[1]
                for removal_key in range(_tuple[1], _tuple[2]):
                    self._add_patch('remove', path, old_obj[removal_key], None, group)

            group += 1

        if find_moved_patches:
            self.patches = self._find_moved_parts()
Example #30
def merge_rows(acc: Sequence[Row], new: Sequence[Row]) -> Iterator[Row]:
    sequence_matcher = SequenceMatcher(None, acc, new, False)
    for tag, _i1, _i2, j1, j2 in sequence_matcher.get_opcodes():
        if tag in ('insert', 'replace'):
            yield from new[j1:j2]
    yield from acc
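A quick sketch of the resulting order (plain strings stand in for Row here): rows inserted or replaced in new come first, followed by all of acc unchanged:

acc = ['a', 'b', 'c']
new = ['a', 'x', 'c', 'd']
print(list(merge_rows(acc, new)))  # ['x', 'd', 'a', 'b', 'c']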
Example #31
    group.add_argument('-c', '--compare', action='store_true',
        help='HTML comparison of tokenized diff to char diffs')
    data = parser.parse_args()

    lexer = pygments.lexers.get_lexer_by_name(data.lexername)

    a = data.file1.read()
    b = data.file2.read()

    data.unidiff = not data.verbose and not data.delta and not data.compare

    if data.verbose:
        lexa = list(pygments.lex(a, lexer))
        lexb = list(pygments.lex(b, lexer))
        sm = SequenceMatcher(None, lexa, lexb)
        for op, a1, a2, b1, b2 in sm.get_opcodes():
            if op == 'equal':
                for item in lexa[a1:a2]:
                    data.out.write("  %s: %s\n" % item)
            elif op == 'replace':
                data.out.write("~~~\n")
                for item in lexa[a1:a2]:
                    data.out.write("- %s: %s\n" % item)
                for item in lexb[b1:b2]:
                    data.out.write("+ %s: %s\n" % item)
                data.out.write("~~~\n")
            elif op == 'insert':
                for item in lexb[b1:b2]:
                    data.out.write("+ %s: %s\n" % item)
            elif op == 'delete':
                for item in lexa[a1:a2]:
                    data.out.write("- %s: %s\n" % item)
Example #32
    def __alignMatrixDimension(self, cm, thisSeq, castSeq, axis=0):
        """
        Correct one dimension of contactMatrix by inserting and deleting
        columns, so that it can be later compared to contact matrices based
        on slightly different sequences.
        
        @param cm: contact matrix, 2D matrix of residue contacts
                   receptor x ligand sequence
        @type  cm: array
        @param thisSeq: AA sequence of this dimension of the contactMatrix
        @type  thisSeq: string
        @param castSeq: AA sequence of this dimension in the other contact
        @type  castSeq: string
        @param axis: which dimension to adapt (0=receptor, 1=ligand)
        @type  axis: 1|0
        
        @return: contact matrix with residue contacts compatible to refSeq.
        @rtype: 2D array
        """
        # compare the two sequences
        seqdiff = SequenceMatcher(None, thisSeq, castSeq)
        seqDiff = seqdiff.get_opcodes()
        ## print seqDiff

        # decide which dimension to work on
        if not axis:
            cm = N0.transpose(cm)

        seqCount = 0  # keep track of sequence length changes

        for i in range(len(seqDiff)):

            # remove the column corresponding to the deletion in the
            # docked sequence
            if str(seqDiff[i][0]) == 'delete':

                # separate matrix into before and after deletion
                matrixSeg1 = cm[:, :seqDiff[i][1] + seqCount]
                matrixSeg2 = cm[:, seqDiff[i][2] + seqCount:]
                # concatenate part
                cm = N0.concatenate((matrixSeg1, matrixSeg2), 1)
                seqCount = seqCount + seqDiff[i][1] - seqDiff[i][2]

            # insert zeros in the columns where there is an insertion in the
            # docked sequence
            if str(seqDiff[i][0]) == 'insert':

                # create a matrix to be inserted
                insertZeros = seqDiff[i][4] - seqDiff[i][3]
                insertColumns = N0.array([[0] * insertZeros] * N0.size(cm, 0))
                # separate matrix into before and after insertion
                matrixSeg1 = cm[:, :seqDiff[i][1] + seqCount]
                matrixSeg2 = cm[:, seqDiff[i][2] + seqCount:]
                # concatenate parts with the zero matrix
                cm = N0.concatenate((matrixSeg1, insertColumns, matrixSeg2), 1)
                seqCount = seqCount + seqDiff[i][4] - seqDiff[i][3]


        if not axis:
            return N0.transpose(cm)
        return cm
Example #33
 def diff(cls, v0, v1):
      # SequenceMatcher's first positional argument is isjunk, so the
      # two sequences must come after an explicit None
      differ = SequenceMatcher(None, v0, v1)
     return differ.get_opcodes()
Example #34
 def dist_from_request_addnl_fields(case_addnl_fields):
     sm = SequenceMatcher(None, request_addnl_fields, case_addnl_fields)
     opcodes = sm.get_opcodes()
     return sum(
         max(i2 - i1, j2 - j1) for op_type, i1, i2, j1, j2 in opcodes
         if op_type != 'equal')
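The same distance for two plain sequences, with the closed-over request_addnl_fields made an explicit parameter for the demo:

from difflib import SequenceMatcher

def opcode_distance(a, b):
    sm = SequenceMatcher(None, a, b)
    return sum(max(i2 - i1, j2 - j1)
               for op_type, i1, i2, j1, j2 in sm.get_opcodes()
               if op_type != 'equal')

print(opcode_distance("kitten", "sitting"))  # 3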
Example #35
def align_dfs(da_unroll_df,
              ms_df,
              filenum,
              speaker,
              true_speaker,
              skip_nonverbal=True):
    list_row = []
    ms_toks_df = ms_df[ms_df['word'] != '[silence]']
    ms_toks_df.loc[:, 'word_norm'] = ms_toks_df.word.apply(norm_ms)
    if skip_nonverbal:
        ms_toks_df = ms_toks_df[ms_toks_df['word_norm'] != nonverbal_token]
        da_unroll_df = da_unroll_df[
            da_unroll_df['da_token'] != nonverbal_token]
        da_unroll_df = da_unroll_df[da_unroll_df['da_token'] != '...']
        da_unroll_df = da_unroll_df.reset_index()
    ms_toks_df = split_ms_toks(ms_toks_df)
    ms_toks_df = ms_toks_df.reset_index()
    ms_toks_df.loc[:, 'ms_tok_id'] = range(len(ms_toks_df))
    ms_toks_df.loc[:, 'su_id'] = ms_toks_df.utt_id.apply(
        lambda x: int(x.split('-')[-1]))
    da_unroll_df.loc[:, 'da_tok_id'] = range(len(da_unroll_df))
    ms_toks_df = ms_toks_df.set_index('ms_tok_id')
    da_toks_df = da_unroll_df.set_index('da_tok_id')
    # .get_opcodes returns ops to turn a into b
    ms_side = ms_toks_df.word_norm.tolist()
    ms_side = [x.lower() for x in ms_side]
    da_side = da_toks_df.da_token.tolist()
    #print(filenum, speaker, len(ms_side), len(da_side))
    sseq = SequenceMatcher(None, ms_side, da_side)
    for info in sseq.get_opcodes():
        tag, i1, i2, j1, j2 = info
        start_i = max(0, min(i1, len(ms_side) - 1))
        end_i = min(max(i1, i2 - 1), len(ms_side) - 1)
        start_j = max(0, min(j1, len(da_side) - 1))
        end_j = min(max(j1, j2 - 1), len(da_side) - 1)
        ms_part = ms_toks_df.loc[start_i:end_i]
        da_part = da_toks_df.loc[start_j:end_j]
        #if i2-i1 != j2-j1:
        #print(info, ms_side[i1:i2], da_side[j1:j2])
        #print(start_i, end_i, start_j, end_j)
        prev_ms_turn = ms_toks_df.loc[max(i1 - 1, 0)].su_id
        next_ms_turn = ms_toks_df.loc[min(i2, len(ms_side) - 1)].su_id
        prev_da_turn = da_toks_df.loc[max(j1 - 1, 0)].turn_id
        next_da_turn = da_toks_df.loc[min(j2, len(da_side) - 1)].turn_id
        if i2 - i1 == j2 - j1:
            # This is either equal or replace the same number of tokens
            # both cases transfer start/end times one by one
            for ms_idx, da_idx in zip(range(i1, i2), range(j1, j2)):
                da_token = da_part.loc[da_idx].da_token
                sent_id = da_part.loc[da_idx].sent_id
                turn_id = da_part.loc[da_idx].turn_id
                da_label = da_part.loc[da_idx].da_label
                start_time = ms_part.loc[ms_idx].start_time
                end_time = ms_part.loc[ms_idx].end_time
                ms_token = ms_part.loc[ms_idx].word_norm
                if not ms_token:
                    print(ms_part)
                list_row.append({
                    'filenum': filenum,
                    'da_speaker': speaker,
                    'true_speaker': true_speaker,
                    'ms_token': ms_token,
                    'da_token': da_token,
                    'sent_id': sent_id,
                    'turn_id': turn_id,
                    'da_label': da_label,
                    'start_time': start_time,
                    'end_time': end_time
                })
        elif tag == 'delete':
            # j1 = j2; da_part should be empty but the way .loc works
            # has da_toks.loc[j1]
            da_token = "<MISSED>"
            da_idx = start_j
            sent_id = da_part.loc[da_idx].sent_id
            turn_id = da_part.loc[da_idx].turn_id
            da_label = da_part.loc[da_idx].da_label
            for ms_idx in range(i1, i2):
                start_time = ms_part.loc[ms_idx].start_time
                end_time = ms_part.loc[ms_idx].end_time
                ms_token = ms_part.loc[ms_idx].word_norm
                if not ms_token:
                    print(ms_part)
                list_row.append({
                    'filenum': filenum,
                    'da_speaker': speaker,
                    'true_speaker': true_speaker,
                    'ms_token': ms_token,
                    'da_token': da_token,
                    'sent_id': sent_id,
                    'turn_id': turn_id,
                    'da_label': da_label,
                    'start_time': start_time,
                    'end_time': end_time
                })
        elif tag == "insert":
            # i1 = i2; ms_part should be empty but the way .loc works
            # has ms_toks.loc[i1]
            if next_ms_turn == ms_part.loc[end_i].su_id and \
                    next_da_turn == da_part.loc[end_j].turn_id:
                ms_idx = end_i
            else:
                ms_idx = max(i1 - 1, 0)
            ms_token = "<INSERTED>"
            start_time = ms_toks_df.loc[ms_idx].start_time
            end_time = ms_toks_df.loc[ms_idx].end_time
            for da_idx in range(j1, j2):
                da_token = da_part.loc[da_idx].da_token
                sent_id = da_part.loc[da_idx].sent_id
                turn_id = da_part.loc[da_idx].turn_id
                da_label = da_part.loc[da_idx].da_label
                if not ms_token:
                    print(ms_part)
                list_row.append({
                    'filenum': filenum,
                    'da_speaker': speaker,
                    'true_speaker': true_speaker,
                    'ms_token': ms_token,
                    'da_token': da_token,
                    'sent_id': sent_id,
                    'turn_id': turn_id,
                    'da_label': da_label,
                    'start_time': start_time,
                    'end_time': end_time
                })
        else:  # tag == "replace"
            len_toks = min(len(ms_part), len(da_part))
            range_ms = range(i1, i2)
            range_da = range(j1, j2)
            if prev_ms_turn == ms_part.loc[start_i].su_id \
                    and prev_da_turn == da_part.loc[start_j].turn_id:
                #print("front:", info, ms_side[i1:i2], da_side[j1:j2])
                for idx in range(len_toks):
                    ms_idx = range_ms[idx]
                    da_idx = range_da[idx]
                    da_token = da_part.loc[da_idx].da_token
                    sent_id = da_part.loc[da_idx].sent_id
                    turn_id = da_part.loc[da_idx].turn_id
                    da_label = da_part.loc[da_idx].da_label
                    start_time = ms_part.loc[ms_idx].start_time
                    end_time = ms_part.loc[ms_idx].end_time
                    ms_token = ms_part.loc[ms_idx].word_norm
                    if not ms_token:
                        print(ms_part)
                    list_row.append({
                        'filenum': filenum,
                        'da_speaker': speaker,
                        'true_speaker': true_speaker,
                        'ms_token': ms_token,
                        'da_token': da_token,
                        'sent_id': sent_id,
                        'turn_id': turn_id,
                        'da_label': da_label,
                        'start_time': start_time,
                        'end_time': end_time
                    })
                if len_toks < len(range_da):
                    while idx < len(range_da) - 1:
                        idx += 1
                        da_idx = range_da[idx]
                        da_token = da_part.loc[da_idx].da_token
                        sent_id = da_part.loc[da_idx].sent_id
                        turn_id = da_part.loc[da_idx].turn_id
                        da_label = da_part.loc[da_idx].da_label
                        ms_token = "<INSERTED>"
                        # start/end time of ms stays the same
                        if not ms_token:
                            print(ms_part)
                        list_row.append({
                            'filenum': filenum,
                            'da_speaker': speaker,
                            'true_speaker': true_speaker,
                            'ms_token': ms_token,
                            'da_token': da_token,
                            'sent_id': sent_id,
                            'turn_id': turn_id,
                            'da_label': da_label,
                            'start_time': start_time,
                            'end_time': end_time
                        })
                else:  # len_toks == len(range_da); range_ms may be longer
                    while idx < len(range_ms) - 1:
                        idx += 1
                        ms_idx = range_ms[idx]
                        da_token = "<MISSED>"
                        start_time = ms_part.loc[ms_idx].start_time
                        end_time = ms_part.loc[ms_idx].end_time
                        ms_token = ms_part.loc[ms_idx].word_norm
                        if not ms_token:
                            print(ms_part)
                        list_row.append({
                            'filenum': filenum,
                            'da_speaker': speaker,
                            'true_speaker': true_speaker,
                            'ms_token': ms_token,
                            'da_token': da_token,
                            'sent_id': sent_id,
                            'turn_id': turn_id,
                            'da_label': da_label,
                            'start_time': start_time,
                            'end_time': end_time
                        })
            else:
                #transfer token times from the back
                #print("back:", info, ms_side[i1:i2], da_side[j1:j2])
                temp_list = []
                for idx in range(len_toks):
                    ms_idx = i2 - idx - 1
                    da_idx = j2 - idx - 1
                    da_token = da_part.loc[da_idx].da_token
                    sent_id = da_part.loc[da_idx].sent_id
                    turn_id = da_part.loc[da_idx].turn_id
                    da_label = da_part.loc[da_idx].da_label
                    start_time = ms_part.loc[ms_idx].start_time
                    end_time = ms_part.loc[ms_idx].end_time
                    ms_token = ms_part.loc[ms_idx].word_norm
                    if not ms_token:
                        print(ms_part)
                    temp_list.append({
                        'filenum': filenum,
                        'da_speaker': speaker,
                        'true_speaker': true_speaker,
                        'ms_token': ms_token,
                        'da_token': da_token,
                        'sent_id': sent_id,
                        'turn_id': turn_id,
                        'da_label': da_label,
                        'start_time': start_time,
                        'end_time': end_time
                    })
                if len_toks < len(range_da):
                    while idx < len(range_da) - 1:
                        idx += 1
                        da_idx = range_da[len(range_da) - idx - 1]
                        da_token = da_part.loc[da_idx].da_token
                        sent_id = da_part.loc[da_idx].sent_id
                        turn_id = da_part.loc[da_idx].turn_id
                        da_label = da_part.loc[da_idx].da_label
                        ms_token = "<INSERTED>"
                        # start/end time of ms stays the same
                        if not ms_token:
                            print(ms_part)
                        temp_list.append({
                            'filenum': filenum,
                            'da_speaker': speaker,
                            'true_speaker': true_speaker,
                            'ms_token': ms_token,
                            'da_token': da_token,
                            'sent_id': sent_id,
                            'turn_id': turn_id,
                            'da_label': da_label,
                            'start_time': start_time,
                            'end_time': end_time
                        })
                else:  # len_toks == len(range_da); range_ms may be longer
                    while idx < len(range_ms) - 1:
                        idx += 1
                        ms_idx = range_ms[len(range_ms) - idx - 1]
                        da_token = "<MISSED>"
                        start_time = ms_part.loc[ms_idx].start_time
                        end_time = ms_part.loc[ms_idx].end_time
                        ms_token = ms_part.loc[ms_idx].word_norm
                        if not ms_token:
                            print(ms_part)
                        temp_list.append({
                            'filenum': filenum,
                            'da_speaker': speaker,
                            'true_speaker': true_speaker,
                            'ms_token': ms_token,
                            'da_token': da_token,
                            'sent_id': sent_id,
                            'turn_id': turn_id,
                            'da_label': da_label,
                            'start_time': start_time,
                            'end_time': end_time
                        })
                list_row += temp_list[::-1]

    ret_df = pd.DataFrame(list_row)
    return ret_df
Example #36
    def match_text(self,
                   r: TextSingleLinePredictionResult,
                   debug=False) -> MatchResult:
        # 1. Match prediction and gt strings
        # 2. Detect insertions/deletions/replacements/equal parts
        # 3. Assign predicted chars according to results of 2. to original syllables
        # 4. Compute the average x pos
        # 5. return

        pred = [(t, pos) for t, pos in r.text if t not in ' -']
        syls = r.line.operation.text_line.sentence.syllables
        assert (len(syls) > 0)

        # remove all "noisy" chars: ligatures/whitespace, ... for better match results
        def clean_text(t) -> str:
            # ß -> s only, not ss
            t = t.replace('ß', 's')
            t = unidecode.unidecode(t)
            return t.replace(' ', '').replace('-', '').lower()

        # Match the two sequences best possible
        gt = clean_text("".join([s.text for s in syls]))
        pred_txt = clean_text("".join([t for t, pos in pred]))
        sm = SequenceMatcher(a=pred_txt, b=gt, autojunk=False, isjunk=False)
        if debug:
            pt = PrettyTable(list(range(len(sm.get_opcodes()))))
            pt.add_row([
                gt[gt_start:gt_end]
                for _, _, _, gt_start, gt_end in sm.get_opcodes()
            ])
            pt.add_row([
                pred_txt[pred_start:pred_end] for _, pred_start, pred_end,
                gt_start, gt_end in sm.get_opcodes()
            ])
            pt.add_row([
                opcode for opcode, pred_start, pred_end, gt_start, gt_end in
                sm.get_opcodes()
            ])
            print(pt)

        matches = []
        for opcode, pred_start, pred_end, gt_start, gt_end in sm.get_opcodes():
            for i in range(gt_start, gt_end):
                if opcode == 'equal':
                    matches.append([pred[pred_start + i - gt_start]])
                elif opcode == 'insert':
                    matches.append([])
                elif opcode == 'delete':
                    # ignore (additional letter)
                    # maybe add to left or right
                    pass
                elif opcode == 'replace':
                    rel = (i - gt_start) / (gt_end - gt_start)
                    rel_ = (i + 1 - gt_start) / (gt_end - gt_start)
                    j = int(rel * (pred_end - pred_start) + pred_start)
                    j_ = int(rel_ * (pred_end - pred_start) + pred_start)
                    matches.append(list(pred[j:j_]))

        if debug:
            pt = PrettyTable(list(range(len(gt))))
            pt.add_row(list(gt))
            pt.add_row(matches)
            print(pt)

        pos = 0
        out_matches = []
        for syl in syls:
            m = sum(matches[pos:pos + len(syl.text)], [])
            if len(m) == 0:
                x = -1
            else:
                x = np.mean([p for _, p in m])
            out_matches.append({'s': syl, 'x': x})
            pos += len(syl.text)

        # interpolate syllables without any match
        ix = np.array([(i, match['x']) for i, match in enumerate(out_matches)
                       if match['x'] >= 0])
        x_pos = np.interp(range(len(out_matches)), ix[:, 0], ix[:, 1])

        return MatchResult(
            syllables=[
                SyllableMatchResult(
                    xpos=x,
                    syllable=match['s'],
                ) for match, x in zip(out_matches, x_pos)
            ],
            text_line=r.line.operation.text_line,
            music_line=r.line.operation.page.closest_music_line_to_text_line(
                r.line.operation.text_line),
        )
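
For 'replace' opcodes, match_text distributes the predicted characters over
the ground-truth span proportionally, so every predicted character in the
span is assigned to exactly one ground-truth position. A standalone
illustration of that arithmetic (the span boundaries are made up):

# 'replace' opcode: gt[10:14] was recognised as pred[20:26]
gt_start, gt_end = 10, 14
pred_start, pred_end = 20, 26
for i in range(gt_start, gt_end):
    rel = (i - gt_start) / (gt_end - gt_start)
    rel_ = (i + 1 - gt_start) / (gt_end - gt_start)
    j = int(rel * (pred_end - pred_start) + pred_start)
    j_ = int(rel_ * (pred_end - pred_start) + pred_start)
    print(i, '->', list(range(j, j_)))
# 10 -> [20]
# 11 -> [21, 22]
# 12 -> [23]
# 13 -> [24, 25]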
Exemplo n.º 37
0
def diff(err, cor):
    matcher = SequenceMatcher(None, err, cor)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        er = err[i1:i2]
        co = cor[j1:j2]
        yield " ".join(er), " ".join(co), tag
Exemplo n.º 38
0
def merge(l, r):
    s = SequenceMatcher(None, l, r)
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == 'equal':
            yield l[i1:i2]
        elif tag == 'delete':
            yield '(' + l[i1:i2] + '?)'
        elif tag == 'insert':
            yield '(' + r[j1:j2] + '?)'
        elif tag == 'replace':
            yield '(' + l[i1:i2] + '|' + r[j1:j2] + ')'


merged = merge(a.split()[0], b.split()[0])
print(''.join(''.join(x) for x in merged))

s = SequenceMatcher(None, a, b)
for tag, i1, i2, j1, j2 in s.get_opcodes():
    print('{:7}   a[{}:{}] --> b[{}:{}] {!r:>8} --> {!r}'.format(
        tag, i1, i2, j1, j2, a[i1:i2], b[j1:j2]))
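
The strings a and b are never defined in this fragment; with the sample pair
from the difflib documentation, the opcodes come out as follows:

a = 'qabxcd'
b = 'abycdf'
# SequenceMatcher(None, a, b).get_opcodes() returns:
#   ('delete',  0, 1, 0, 0)   # 'q'  --> ''
#   ('equal',   1, 3, 1, 3)   # 'ab' --> 'ab'
#   ('replace', 3, 4, 3, 4)   # 'x'  --> 'y'
#   ('equal',   4, 6, 4, 6)   # 'cd' --> 'cd'
#   ('insert',  6, 6, 6, 7)   # ''   --> 'f'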


with open('/home/ivan/Загрузки/taxi/urls', 'r', encoding='utf-8') as urls, \
     open('/home/ivan/Загрузки/taxi/pattern', 'w') as patterns:
    for url in urls:
        url = url.split()
        if not url:
            continue
        parsed_uri = urlparse(url[0])
        strin = ''
        try:
            strin = parsed_uri.netloc[parsed_uri.netloc.index('www.') + 4:]
        except ValueError:
            pass  # no 'www.' prefix in the host; keep strin empty
Exemplo n.º 39
0
    def fix_string(self, verbosity=0):
        """Obtain the changes to a path as a string.

        We use the file_mask to do a safe merge, avoiding any templated
        sections. First we need to detect where there have been changes
        between the fixed and templated versions.

        We use difflib.SequenceMatcher.get_opcodes
        See: https://docs.python.org/3.7/library/difflib.html#difflib.SequenceMatcher.get_opcodes
        It returns a list of tuples (tag, i1, i2, j1, j2), where tag is one
        of 'equal', 'replace', 'delete' or 'insert'.

        """
        verbosity_logger("Persisting file masks: {0}".format(self.file_mask),
                         verbosity=verbosity)
        # Compare Templated with Raw
        diff_templ = SequenceMatcher(autojunk=False,
                                     a=self.file_mask[0],
                                     b=self.file_mask[1])
        diff_templ_codes = diff_templ.get_opcodes()
        verbosity_logger("Templater diff codes: {0}".format(diff_templ_codes),
                         verbosity=verbosity)

        # Compare Fixed with Templated
        diff_fix = SequenceMatcher(autojunk=False,
                                   a=self.file_mask[1],
                                   b=self.file_mask[2])
        # diff_fix = SequenceMatcher(autojunk=None, a=self.file_mask[1][0], b=self.file_mask[2][0])
        diff_fix_codes = diff_fix.get_opcodes()
        verbosity_logger("Fixing diff codes: {0}".format(diff_fix_codes),
                         verbosity=verbosity)

        # If diff_templ isn't the same then we should just keep the template. If there *was*
        # a fix in that space, then we should raise an issue
        # If it is the same, then we can apply fixes as expected.
        write_buff = ''
        fixed_block = None
        templ_block = None
        # index in raw, templ and fix
        idx = (0, 0, 0)
        loop_idx = 0
        while True:
            loop_idx += 1
            verbosity_logger("{0:04d}: Write Loop: idx:{1}, buff:{2!r}".format(
                loop_idx, idx, write_buff),
                             verbosity=verbosity)

            if templ_block is None:
                if diff_templ_codes:
                    templ_block = diff_templ_codes.pop(0)
                # We've exhausted the template. Have we exhausted the fixes?
                elif fixed_block is None:
                    # Yes - excellent. DONE
                    break
                else:
                    raise NotImplementedError(
                        "Fix Block left over! DOn't know how to handle this! aeflf8wh"
                    )
            if fixed_block is None:
                if diff_fix_codes:
                    fixed_block = diff_fix_codes.pop(0)
                else:
                    raise NotImplementedError(
                        "Unexpectedly depleted the fixes. Panic!")
            verbosity_logger("{0:04d}: Blocks: template:{1}, fix:{2}".format(
                loop_idx, templ_block, fixed_block),
                             verbosity=verbosity)

            if templ_block[0] == 'equal':
                if fixed_block[0] == 'equal':
                    # No templating, no fixes, go with middle and advance indexes
                    # Find out how far we can advance (we use the middle version because it's common)
                    if templ_block[4] == fixed_block[2]:
                        buff = self.file_mask[1][idx[1]:fixed_block[2]]
                        # consume both blocks
                        fixed_block = None
                        templ_block = None
                    elif templ_block[4] > fixed_block[2]:
                        buff = self.file_mask[1][idx[1]:fixed_block[2]]
                        # consume fixed block
                        fixed_block = None
                    elif templ_block[4] < fixed_block[2]:
                        buff = self.file_mask[1][idx[1]:templ_block[4]]
                        # consume templ block
                        templ_block = None
                    idx = (idx[0] + len(buff), idx[1] + len(buff),
                           idx[2] + len(buff))
                    write_buff += buff
                    continue
                elif fixed_block[0] == 'replace':
                    # Consider how to apply fixes.
                    # Can we implement the fix while staying in the equal segment?
                    if fixed_block[2] <= templ_block[4]:
                        # Yes! Write from the fixed version.
                        write_buff += self.file_mask[2][idx[2]:fixed_block[4]]
                        idx = (idx[0] + (fixed_block[2] - fixed_block[1]),
                               fixed_block[2], fixed_block[4])
                        # Consume the fixed block because we've written the whole thing.
                        fixed_block = None
                        continue
                    else:
                        raise NotImplementedError("DEF")
                elif fixed_block[0] == 'delete':
                    # We're deleting items, nothing to write but we can consume some
                    # blocks and advance some indexes.
                    idx = (idx[0] + (fixed_block[2] - fixed_block[1]),
                           fixed_block[2], fixed_block[4])
                    fixed_block = None
                elif fixed_block[0] == 'insert':
                    # We're inserting items. Write from the fix block, but only that index moves.
                    write_buff += self.file_mask[2][idx[2]:fixed_block[4]]
                    idx = (idx[0], idx[1], fixed_block[4])
                    fixed_block = None
                else:
                    raise ValueError((
                        "Unexpected opcode {0} for fix block! Please report this "
                        "issue on github with the query and rules you're trying to "
                        "fix.").format(fixed_block[0]))
            elif templ_block[0] == 'replace':
                # We're in a templated section - we should write the templated version.
                # we should consume the whole replace block and then deal with where
                # we end up.
                buff = self.file_mask[0][idx[0]:templ_block[2]]
                new_templ_idx = templ_block[4]
                while True:
                    if fixed_block[2] > new_templ_idx >= fixed_block[1]:
                        # this block contains the end point
                        break
                    else:
                        if fixed_block[0] != 'equal':
                            print("WARNING: Skipping edit block: {0}".format(
                                fixed_block))
                        # consume this block and fetch the next fix block,
                        # mirroring the depletion handling above (leaving
                        # fixed_block as None would crash the next iteration)
                        if diff_fix_codes:
                            fixed_block = diff_fix_codes.pop(0)
                        else:
                            raise NotImplementedError(
                                "Unexpectedly depleted the fixes. Panic!")
                # Are we exactly on a join?
                if new_templ_idx == fixed_block[1]:
                    # GREAT - this makes things easy because we have an equality point already
                    idx = (templ_block[2], new_templ_idx, fixed_block[3])
                else:
                    if fixed_block[0] == 'equal':
                        # If it's in an equal block, we can use the same offset from the end.
                        idx = (templ_block[2], new_templ_idx, fixed_block[3] +
                               (new_templ_idx - fixed_block[1]))
                    else:
                        # TODO: We're trying to move through a templated section, but end up
                        # in a fixed section. We've lost track of indexes.
                        # We might need to panic if this happens...
                        print("UMMMMMM!")
                        print(new_templ_idx)
                        print(fixed_block)
                        raise NotImplementedError("ABC")
                write_buff += buff
                # consume template block
                templ_block = None
            elif templ_block[0] == 'delete':
                # A 'delete' block means the templater has deleted some
                # characters. This is just a quirk of the differ.
                # In reality this means we just write these characters
                # and don't worry about advancing the other indexes.
                buff = self.file_mask[0][idx[0]:templ_block[2]]
                # consume templ block
                templ_block = None
                idx = (idx[0] + len(buff), idx[1], idx[2])
                write_buff += buff
            else:
                raise ValueError((
                    "Unexpected opcode {0} for template block! Please report this "
                    "issue on github with the query and rules you're trying to "
                    "fix.").format(templ_block[0]))

        return write_buff
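
fix_string relies on a structural guarantee of get_opcodes(): the blocks tile
both sequences contiguously, each one starting where the previous one ended,
which is what allows the two opcode lists to be consumed as queues while the
raw/templated/fixed indexes advance together. A tiny standalone check of that
property (the sample strings are invented):

from difflib import SequenceMatcher

raw = 'SELECT a,b FROM  tbl'
templated = 'SELECT a, b FROM tbl'
prev_i2, prev_j2 = 0, 0
for tag, i1, i2, j1, j2 in SequenceMatcher(
        autojunk=False, a=raw, b=templated).get_opcodes():
    assert (i1, j1) == (prev_i2, prev_j2)  # blocks tile both strings
    prev_i2, prev_j2 = i2, j2
assert (prev_i2, prev_j2) == (len(raw), len(templated))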
Exemplo n.º 40
0
    def test_diff(self):
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        seq1 = "ab ab2 abc3 abcd abc4".split()
        seq2 = "ab ab2 abc3 abc4 abc adb".split()
        diff = SequenceMatcher(a=seq1, b=seq2)
        nb = 0
        for opcode in diff.get_opcodes():
            fLOG(opcode)
            nb += 1
        self.assertEqual(nb, 4)

        h = 20
        size = 500, 500
        white = 255, 255, 255

        if is_travis_or_appveyor() == "travis":
            warnings.warn("pygame is not available")
            return
        pygame, screen, fonts = get_pygame_screen_font(h, size)

        from src.ensae_teaching_cs.helpers.pygame_helper import wait_event

        bars = [random.randint(10, 500) / 500.0 for s in seq2]
        screen.fill(white)
        build_diff_image(pygame,
                         screen,
                         h=h,
                         maxw=size[1],
                         seq1=seq1,
                         seq2=seq2,
                         diff=diff,
                         fonts=fonts,
                         bars=bars)
        pygame.display.flip()
        temp = get_temp_folder(__file__, "temp_video_diff")

        for i in range(0, 21):
            screen.fill(white)
            build_diff_image(pygame,
                             screen,
                             h=h,
                             maxw=size[0],
                             seq1=seq1,
                             seq2=seq2,
                             diff=diff,
                             fonts=fonts,
                             bars=bars,
                             progress=i / 20.0,
                             prev_bars=None)
            pygame.time.wait(60)
            pygame.display.flip()
            pygame.image.save(screen, os.path.join(temp, "diff%d.png" % i))

        if __name__ == "__main__":

            from src.ensae_teaching_cs.helpers.video_helper import make_video
            png = [
                os.path.join(temp, _) for _ in os.listdir(temp) if ".png" in _
            ]
            out = os.path.join(temp, "diff.avi")
            make_video(png, out, size=(350, 250), format="XVID", fps=5)

            wait_event(pygame)
Exemplo n.º 41
0
class EmojiMatcher():
    '''A class to find Emoji which best match a query string'''
    def __init__(self,
                 languages=('en_US', ),
                 unicode_data=True,
                 cldr_data=True,
                 quick=True):
        '''
        Initialize the emoji matcher

        :param languages: A list of languages to use for matching emoji
        :type languages: List or tuple of strings
        :param unicode_data: Whether to load the UnicodeData.txt file as well
        :type unicode_data: Boolean
        :param cldr_data: Whether to load data from CLDR as well
        :type cldr_data: Boolean
        :param quick: Whether to do a quicker but slightly less precise match.
                      Quick matching is about 4 times faster and usually
                      good enough.
        :type quick: Boolean
        '''
        self._languages = languages
        self._quick = quick
        self._enchant_dicts = []
        if IMPORT_ENCHANT_SUCCESSFUL:
            for language in self._languages:
                if enchant.dict_exists(language):
                    self._enchant_dicts.append(enchant.Dict(language))
        # From the documentation
        # (https://docs.python.org/3.6/library/difflib.html):
        # “SequenceMatcher computes and caches detailed information
        # about the second sequence, so if you want to compare one
        # sequence against many sequences, use set_seq2() to set the
        # commonly used sequence once and call set_seq1() repeatedly,
        # once for each of the other sequences.”
        self._matcher = SequenceMatcher(isjunk=None,
                                        a='',
                                        b='',
                                        autojunk=False)
        self._match_cache = {}
        self._string1 = ''
        self._seq1 = ''
        self._len1 = 0
        self._string2 = ''
        self._string2_number_of_words = 0
        self._string2_word_list = []
        self._seq2 = ''
        self._len2 = 0
        self._emoji_dict = {}
        self._candidate_cache = {}
        # The three data sources are loaded in this order on purpose.
        # The data from Unicode is loaded first to put the official
        # names first into the list of names to display the official
        # names in the candidates, if possible.  The second best names
        # are the long names of emojione.
        if unicode_data:
            self._load_unicode_data()
        self._load_emojione_data()
        if cldr_data:
            for language in _expand_languages(self._languages):
                self._load_cldr_annotation_data(language)

    def get_languages(self):
        '''Returns a copy of the list of languages of this EmojiMatcher

        Useful to check whether an already available EmojiMatcher instance
        can be used or whether one needs a new instance because one needs
        a different list of languages.

        Note that the order of that list is important, a matcher which
        supports the same languages but in a different order might
        return different results.

        :rtype: A list of strings

        Examples:

        >>> m = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])
        >>> m.get_languages()
        ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP']

        '''
        # Use list() to make a copy instead of self._languages[:] because
        # the latter might return the default tuple ('en_US',) instead
        # of a list ['en_US'] which makes comparison with another list
        # more inconvenient:
        return list(self._languages)

    def _add_to_emoji_dict(self, emoji_dict_key, values_key, values):
        '''Adds data to the emoji_dict if not already there'''
        if emoji_dict_key not in self._emoji_dict:
            self._emoji_dict[emoji_dict_key] = {values_key: values}
        else:
            if values_key not in self._emoji_dict[emoji_dict_key]:
                self._emoji_dict[emoji_dict_key][values_key] = values
            else:
                for value in values:
                    if (value not in self._emoji_dict[emoji_dict_key]
                        [values_key]):
                        self._emoji_dict[emoji_dict_key][values_key] += [value]

    def _load_unicode_data(self):
        '''Loads emoji names from UnicodeData.txt'''
        dirnames = (DATADIR, '/usr/share/unicode/ucd')
        basenames = ('UnicodeData.txt', )
        (path,
         open_function) = _find_path_and_open_function(dirnames, basenames)
        if not path:
            sys.stderr.write(
                '_load_unicode_data(): could not find "%s" in "%s"\n' %
                (basenames, dirnames))
            return
        with open_function(path, mode='rt') as unicode_data_file:
            for line in unicode_data_file.readlines():
                if not line.strip():
                    continue
                codepoint_string, name, category = line.split(';')[:3]
                codepoint_integer = int(codepoint_string, 16)
                emoji_string = chr(codepoint_integer)
                if (category not in VALID_CATEGORIES
                        and emoji_string not in VALID_CHARACTERS):
                    continue
                self._add_to_emoji_dict((emoji_string, 'en'), 'names',
                                        [name.lower()])
                self._add_to_emoji_dict((emoji_string, 'en'), 'categories',
                                        [category.lower()])

    def _load_emojione_data(self):
        '''
        Loads emoji names, aliases, keywords, and categories from
        the emojione.json file.
        '''
        dirnames = (DATADIR, '/usr/lib/node_modules/emojione/')
        basenames = ('emojione.json', 'emoji.json')
        (path,
         open_function) = _find_path_and_open_function(dirnames, basenames)
        if not path:
            sys.stderr.write(
                '_load_emojione_data(): could not find "%s" in "%s"\n' %
                (basenames, dirnames))
            return
        with open_function(path, mode='rt') as emoji_one_file:
            emojione = json.load(emoji_one_file)
        for dummy_emojione_key, emojione_value in emojione.items():
            codepoints = emojione_value['unicode']
            # ZWJ emojis are in the 'unicode_alternates' field:
            if ('unicode_alternates' in emojione_value
                    and '200d' in emojione_value['unicode_alternates']):
                codepoints = emojione_value['unicode_alternates']

            emoji_string = ''.join([
                chr(int(codepoint, 16)) for codepoint in codepoints.split('-')
            ])

            # emojione has names like “kiss (woman,woman)”, “couple
            # (man,man)” “family (man,man,girl,boy)”, “cocos (keeling)
            # islands”, “ceuta, melilla” …. The parentheses and commas
            # disturb the matching because my matching assumes that
            # words are separated only by spaces. And they also match
            # too much for ASCII-smiley query strings like “:-)”. But
            # they are nicer for display. Therefore, if a name
            # contains such characters, keep both the original name
            # (for display) and the name with these characters removed.
            display_name = emojione_value['name'].lower()
            match_name = re.sub(r' ?[(,)] ?', r' ', display_name).strip(' ')
            names = [display_name]
            shortname = emojione_value['shortname'].replace('_',
                                                            ' ').strip(':')
            aliases = [
                x.replace('_', ' ').strip(':')
                for x in emojione_value['aliases']
            ]
            ascii_aliases = emojione_value['aliases_ascii']
            if match_name not in names:
                names += [match_name]
            if shortname not in names:
                names += [shortname]
            for alias in aliases + ascii_aliases:
                if alias not in names:
                    names += [alias]

            categories = [emojione_value['category']]
            # EmojiOne has duplicate entries in the keywords.  The
            # keywords also have random order (maybe because of the
            # way json.load(file) works?), sort them to get
            # reproducible output in the test cases (if the order
            # changes, which keyword matches last may change, that
            # does not change the score but it may have an effect on
            # the additional information added to the display string
            # because of a keyword match).
            keywords = sorted(list(set(emojione_value['keywords'])))

            self._add_to_emoji_dict((emoji_string, 'en'), 'names', names)
            self._add_to_emoji_dict((emoji_string, 'en'), 'categories',
                                    categories)
            self._add_to_emoji_dict((emoji_string, 'en'), 'keywords', keywords)

    def _load_cldr_annotation_data(self, language):
        '''
        Loads translations of emoji names and keywords.

        Translations are loaded from the annotation data from CLDR.
        '''
        dirnames = (DATADIR,
                    '/local/mfabian/src/cldr-svn/trunk/common/annotations')
        basenames = (language + '.xml', )
        (path,
         open_function) = _find_path_and_open_function(dirnames, basenames)
        if not path:
            return
        # change language to the language of the file which was really
        # found (For example, it could be that 'es_ES' was requested,
        # but only the fallback 'es' was really found):
        language = os.path.basename(path).replace('.gz',
                                                  '').replace('.xml', '')
        with open_function(path, mode='rt') as cldr_annotation_file:
            pattern = re.compile(r'.*<annotation cp="(?P<emojistring>[^"]+)"' +
                                 r'\s*(?P<tts>type="tts"){0,1}' + r'[^>]*>' +
                                 r'(?P<content>.+)' + r'</annotation>.*')
            for line in cldr_annotation_file.readlines():
                match = pattern.match(line)
                if match:
                    emoji_string = match.group('emojistring')
                    if match.group('tts'):
                        self._add_to_emoji_dict((emoji_string, language),
                                                'names',
                                                [match.group('content')])
                    else:
                        self._add_to_emoji_dict(
                            (emoji_string, language), 'keywords', [
                                x.strip()
                                for x in match.group('content').split('|')
                            ])

    def _set_seq1(self, string):
        '''Sequence 1 is a label from the emoji data'''
        string = string.lower()
        self._string1 = string
        if not self._quick:
            # only needed when using SequenceMatcher()
            string = ' ' + string + ' '
            self._seq1 = string
            self._len1 = len(string)
            self._matcher.set_seq1(string)

    def _set_seq2(self, string):
        '''Sequence 2 is the query string, i.e. the user input'''
        string = string.lower()
        self._string2 = string
        # Split the input string into a list of words:
        word_list = []
        original_words = string.split(sep=None)
        self._string2_number_of_words = len(original_words)
        for word in original_words:
            word_list += [word]
            # If a word in the input string is not correctly spelled
            # in any of the enabled dictionaries, add spell checking
            # suggestions to the list (don’t do that if it is spelled
            # correctly in at least one dictionary):
            if len(word) > 3 and IMPORT_ENCHANT_SUCCESSFUL:
                spelled_correctly = False
                for dic in self._enchant_dicts:
                    if dic.check(word) or dic.check(word.title()):
                        spelled_correctly = True
                if not spelled_correctly:  # incorrect in *all* dictionaries
                    wlist = []
                    for dic in self._enchant_dicts:
                        # don’t use spellchecking suggestions shorter than
                        # 3 characters and lower case everything
                        wlist += [
                            x.lower() for x in dic.suggest(word) if len(x) > 2
                        ]
                    # remove possible duplicates from spellchecking
                    word_list += set(wlist)
        # Keep duplicates coming from the query string.
        # Sort longest words first.
        self._string2_word_list = sorted(word_list, key=lambda x: -len(x))
        if not self._quick:
            # only needed when using SequenceMatcher()
            string = ' ' + string + ' '
            self._seq2 = string
            self._len2 = len(string)
            self._matcher.set_seq2(string)
            self._match_cache = {}

    def _match(self, label, debug=False):
        '''Matches a label from the emoji data against the query string.

        The query string must have been already set with
        self._set_seq2(query_string) before calling self._match().

        '''
        self._set_seq1(label)
        total_score = 0
        if debug:
            print('string1 = “%s” string2 = “%s” string2_word_list = “%s”' %
                  (self._string1, self._string2, self._string2_word_list))
        if (self._string1, self._string2) in self._match_cache:
            # Many keywords are of course shared by many emoji,
            # therefore the query string is often matched against
            # labels already matched previously. Caching previous
            # matches speeds it up quite a bit.
            total_score = self._match_cache[(self._string1, self._string2)]
            if debug:
                print('Cached, total_score = %s' % total_score)
            return total_score
        # Does the complete query string match exactly?
        if self._string1 == self._string2:
            if debug:
                print('Exact match, total_score += 1000')
            total_score += 1000
        # Does a word in the query string match exactly?
        for word in set(self._string2_word_list):
            # use set() here to avoid making an exact match stronger
            # just because a word happens to be twice in the input.
            if word == self._string1:
                if self._string2_number_of_words == 1:
                    total_score += 300
                    if debug:
                        print('Spell check exact match, word = “%s”, ' % word +
                              'total_score += 300')
                else:
                    total_score += 200
                    if debug:
                        print('Exact match from word_list, word = “%s”, ' %
                              word + 'total_score += 200')
        # Does a word in the query string match the beginning of a word in
        # the label?
        tmp = self._string1
        for word in self._string2_word_list:
            match = re.search(r'\b' + re.escape(word), tmp)
            if match:
                match_value = 100 + match.end() - match.start()
                if match.start() == 0:
                    match_value += 20
                total_score += match_value
                tmp = tmp[:match.start()] + tmp[match.end():]
                if debug:
                    print('Substring match from word_list, word = “%s”, ' %
                          word + 'total_score += %s' % match_value)
        # Does a word in the query string match the label if spaces in
        # the label are ignored?
        tmp = self._string1.replace(' ', '')
        for word in self._string2_word_list:
            match = re.search(re.escape(word), tmp)
            if match:
                match_value = 20 + match.end() - match.start()
                if match.start() == 0:
                    match_value += 20
                total_score += match_value
                tmp = tmp[:match.start()] + tmp[match.end():]
                if debug:
                    print(
                        'Space insensitive substring match from word_list, ' +
                        'word = “%s”, ' % word +
                        'total_score += %s' % match_value)
        if self._quick:
            self._match_cache[(self._string1, self._string2)] = total_score
            return total_score
        # The following code using SequenceMatcher() might increase
        # the total_score by up to 500 approximately. It improves
        # the matching a little bit but it is very slow.
        if debug:
            print('seq1 = “%s” seq2 = “%s”' % (self._seq1, self._seq2))
        for tag, i1, i2, j1, j2 in self._matcher.get_opcodes():
            score = 0
            if tag in ('replace', 'delete', 'insert'):
                pass
            if tag == 'equal':
                match_length = i2 - i1
                if match_length > 1:
                    score += match_length
                    # favor word boundaries
                    if self._seq1[i1] == ' ':
                        if i1 == 0 and j1 == 0:
                            score += 4 * match_length
                        elif i1 == 0 or j1 == 0:
                            score += 2 * match_length
                        else:
                            score += match_length
                    if i1 > 0 and j1 > 0 and self._seq1[i1 - 1] == ' ':
                        score += match_length
                    if self._seq1[i2 - 1] == ' ':
                        if i2 == self._len1 and j2 == self._len2:
                            score += 4 * match_length
                        elif i2 == self._len1 or j2 == self._len2:
                            score += 2 * match_length
                        else:
                            score += match_length
            total_score += score
            if debug:
                print('{:7} a[{:2}:{:2}] --> b[{:2}:{:2}]'.format(
                    tag, i1, i2, j1, j2) + '{:3} {:3} {!r} --> {!r}'.format(
                        score, total_score, self._seq1[i1:i2],
                        self._seq2[j1:j2]))
        self._match_cache[(self._string1, self._string2)] = total_score
        return total_score

    def candidates(self, query_string, match_limit=20, debug=tuple()):
        '''
        Find a list of emoji which best match a query string.

        :param query_string: A search string
        :type query_string: string
        :param match_limit: Limit the number of matches to this amount
        :type match_limit: integer
        :param debug: List or tuple of emojis to print debug information
                      about the matching to stdout.
        :type debug: List of strings
        :rtype: A list of tuples of the form (<emoji>, <name>, <score>),
                i.e. a list like this:
                [('🎂', 'birthday cake', 3106), ...]

        Examples:

        >>> mq = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])

        >>> mq.candidates('😺', match_limit = 3)
        [('😺', "smiling cat face with open mouth ['animal', 'cat', 'face', 'happy', 'mouth', 'open', 'people', 'smile', 'so']", 9), ('😸', "grinning cat face with smiling eyes ['animal', 'cat', 'face', 'happy', 'people', 'smile', 'so']", 7), ('😃', "smiling face with open mouth ['face', 'happy', 'mouth', 'open', 'people', 'smile', 'so']", 7)]

        >>> mq.candidates('ant')[0][:2]
        ('🐜', 'ant')

        >>> mq.candidates('ameise')[0][:2]
        ('🐜', 'Ameise')

        >>> mq.candidates('Ameise')[0][:2]
        ('🐜', 'Ameise')

        >>> mq.candidates('formica')[0][:2]
        ('🐜', 'formica')

        >>> mq.candidates('hormiga')[0][:2]
        ('🐜', 'hormiga')

        >>> mq.candidates('cacca')[0][:2]
        ('💩', 'cacca')

        >>> mq.candidates('orso')[0][:2]
        ('🐻', 'faccina orso')

        >>> mq.candidates('lupo')[0][:2]
        ('🐺', 'faccina lupo')

        >>> mq.candidates('gatto')[0][:2]
        ('🐈', 'gatto')

        >>> mq.candidates('gatto sorride')[0][:2]
        ('😺', 'gatto che sorride')

        Any white space and '_' can be used to separate keywords in the
        query string:

        >>> mq.candidates('gatto_	 sorride')[0][:2]
        ('😺', 'gatto che sorride')

        >>> mq.candidates('nerd glasses')[0][:2]
        ('🤓', 'nerd face')

        >>> mq.candidates('smiling face eye sun glasses')[0][:2]
        ('😎', 'smiling face with sunglasses')

        >>> mq.candidates('halo')[0][:2]
        ('😇', 'smiling face with halo')

        >>> mq.candidates('factory')[0][:2]
        ('🏭', 'factory')

        >>> mq.candidates('man tone5')[0][:2]
        ('👨🏿', 'man tone 5 “man tone5”')

        >>> mq.candidates('mantone5')[0][:2]
        ('👨🏿', 'man tone 5')

        >>> mq.candidates('tone')[0][:2]
        ('👎🏻', 'thumbs down sign tone 1 “thumbdown tone1”')

        >>> mq.candidates('tone1')[0][:2]
        ('🏻', 'emoji modifier fitzpatrick type-1-2 “light skin tone”')

        >>> mq.candidates('tone5')[0][:2]
        ('🏿', 'emoji modifier fitzpatrick type-6 “dark skin tone”')

        >>> mq.candidates('a')[0][:2]
        ('🅰', 'negative squared latin capital letter a “A button (blood type)”')

        >>> mq.candidates('squared a')[0][:2]
        ('🅰', 'negative squared latin capital letter a “A button (blood type)”')

        >>> mq.candidates('squared capital a')[0][:2]
        ('🅰', 'negative squared latin capital letter a “A button (blood type)”')

        >>> mq.candidates('c')[0][:2]
        ('©', 'Copyright')

        >>> mq.candidates('us')[0][:2]
        ('🇺🇸', 'united states “us”')

        >>> mq.candidates('flag us')[0][:2]
        ('🇺🇸', 'united states “us” [flag]')

        >>> mq.candidates('united states')[0][:2]
        ('🇺🇸', 'united states')

        >>> mq.candidates('united')[0][:2]
        ('🇺🇸', 'united states')

        >>> mq.candidates('united minor')[0][:2]
        ('🇺🇲', 'united states minor outlying islands')

        >>> mq.candidates('united arab')[0][:2]
        ('🇦🇪', 'the united arab emirates')

        >>> mq.candidates('mm')[0][:2]
        ('🇲🇲', 'myanmar “mm”')

        >>> mq.candidates('flag mm')[0][:2]
        ('🇲🇲', 'myanmar “mm” [flag]')

        >>> mq.candidates('myanmar')[0][:2]
        ('🇲🇲', 'myanmar')

        >>> mq.candidates('sj')[0][:2]
        ('🇸🇯', 'svalbard and jan mayen “sj”')

        >>> mq.candidates('flag sj')[0][:2]
        ('🇸🇯', 'svalbard and jan mayen “sj” [flag]')

        >>> mq.candidates('svalbard')[0][:2]
        ('🇸🇯', 'svalbard and jan mayen')

        >>> mq.candidates('jan mayen')[0][:2]
        ('🇸🇯', 'svalbard and jan mayen')

        >>> mq.candidates('mayen')[0][:2]
        ('🇸🇯', 'svalbard and jan mayen')

        >>> mq.candidates(':-)')[0][:2]
        ('🙂', 'slightly smiling face “:-)”')

        >>> mq.candidates('family')[0][:2]
        ('👪', 'family')

        >>> mq.candidates('man')[0][:2]
        ('👨', 'man')

        >>> mq.candidates('woman')[0][:2]
        ('👩', 'woman')

        >>> mq.candidates('girl')[0][:2]
        ('👧', 'girl')

        >>> mq.candidates('boy')[0][:2]
        ('👦', 'boy')

        >>> mq.candidates('family man')[0][:2]
        ('👨\u200d👨\u200d👦\u200d👦', 'family (man,man,boy,boy) “family mmbb”')

        >>> mq.candidates('man man girl boy')[0][:2]
        ('👨\u200d👨\u200d👧\u200d👦', 'family (man,man,girl,boy) “family man man girl boy”')

        >>> mq.candidates('mmgb')[0][:2]
        ('👨\u200d👨\u200d👧\u200d👦', 'family (man,man,girl,boy) “family mmgb”')

        >>> mq.candidates('manmangirlboy')[0][:2]
        ('👨\u200d👨\u200d👧\u200d👦', 'family (man,man,girl,boy)')

        >>> mq.candidates('bird')[0][:2]
        ('🐦', 'bird')

        >>> mq.candidates('bir')[0][:2]
        ('🎂', 'birthday cake')

        >>> mq.candidates('birth')[0][:2]
        ('🎂', 'birthday cake')

        >>> mq.candidates('camera')[0][:2]
        ('📷', 'camera')

        >>> mq.candidates('symbol')[0][:2]
        ('🔣', 'input symbol for symbols “input symbols”')

        >>> mq.candidates('atomsymbol')[0][:2]
        ('⚛', 'atom symbol')

        >>> mq.candidates('peacesymbol')[0][:2]
        ('☮', 'peace symbol')

        >>> mq.candidates('peace symbol')[0][:2]
        ('☮', 'peace symbol')

        >>> mq.candidates('animal')[0][:2]
        ('🐜', 'ant [animal]')

        >>> mq.candidates('dromedary animal')[0][:2]
        ('🐪', 'dromedary camel')

        >>> mq.candidates('camel')[0][:2]
        ('🐫', 'bactrian camel “two-hump camel”')

        >>> mq.candidates('people')[0][:2]
        ('👯', 'woman with bunny ears “people with bunny ears partying”')

        >>> mq.candidates('nature')[0][:2]
        ('🌼', 'blossom {nature}')

        >>> mq.candidates('thankyou')[0][:2]
        ('🍻', 'clinking beer mugs [thank you]')

        >>> mq.candidates('travel')[0][:2]
        ('🚡', 'aerial tramway {travel}')

        >>> mq.candidates('ferry')[0][:2]
        ('⛴', 'ferry')

        >>> mq.candidates('ferry travel')[0][:2]
        ('⛴', 'ferry {travel}')

        >>> mq.candidates('ferry travel boat')[0][:2]
        ('⛴', 'ferry {travel}')

        >>> mq.candidates('boat')[0][:2]
        ('🛥', 'motor boat')

        >>> mq.candidates('anchor')[0][:2]
        ('⚓', 'anchor')

        >>> mq.candidates('anchor boat')[0][:2]
        ('⚓', 'anchor [boat]')

        >>> mq.candidates('buterfly')[0][:2]
        ('\U0001f98b', 'butterfly')

        >>> mq.candidates('badminton')[0][:2]
        ('🏸', 'badminton racquet and shuttlecock')

        >>> mq.candidates('badmynton')[0][:2]
        ('🏸', 'badminton racquet and shuttlecock')

        >>> mq.candidates('padminton')[0][:2]
        ('🏸', 'badminton racquet and shuttlecock')

        >>> mq.candidates('fery')[0][:2]
        ('⛴', 'ferry')

        >>> mq.candidates('euro sign')[0][:2]
        ('€', 'euro sign')

        >>> mq = EmojiMatcher(languages = ['fr_FR'])
        >>> mq.candidates('chat')[0][:2]
        ('🐈', 'chat')

        >>> mq.candidates('réflexion')[0][:2]
        ('🤔', 'visage en pleine réflexion')

        >>> mq.candidates('🤔', match_limit = 3)
        [('🤔', "visage en pleine réflexion ['réflexion', 'visage']", 2), ('💆\u200d♀', "femme qui se fait masser le visage ['visage']", 1), ('💆\u200d♂', "homme qui se fait masser le visage ['visage']", 1)]

        >>> mq = EmojiMatcher(languages = ['fr_FR'])
        >>> mq.candidates('2019')
        [('’', 'U+2019 RIGHT SINGLE QUOTATION MARK', 200)]

        >>> mq.candidates('41')
        [('A', 'U+41 LATIN CAPITAL LETTER A', 200)]

        >>> mq.candidates('2a')
        [('*', 'U+2A ASTERISK', 200)]

        This does not work because unicodedata.name(char) fails
        for control characters:

        >>> mq.candidates('1b')
        []
        '''
        # Replace any sequence of white space characters and '_' in
        # the query string with a single ' ':
        query_string = re.sub(r'[_\s]+', ' ', query_string)
        if ((query_string, match_limit) in self._candidate_cache
                and not debug):
            return self._candidate_cache[(query_string, match_limit)]
        if (query_string, 'en') in self._emoji_dict:
            # the query_string is itself an emoji, match similar ones:
            candidates = self.similar(query_string, match_limit=match_limit)
            self._candidate_cache[(query_string, match_limit)] = candidates
            return candidates
        self._set_seq2(query_string)
        candidates = []
        for emoji_key, emoji_value in self._emoji_dict.items():
            if emoji_key[0] in debug:
                debug_match = True
                print('===================================')
                print('Debug match for “%s”' % emoji_key[0])
                print('===================================')
            else:
                debug_match = False

            total_score = 0
            good_match_score = 200
            name_good_match = ''
            category_good_match = ''
            keyword_good_match = ''
            if 'names' in emoji_value:
                for name in emoji_value['names']:
                    score = 2 * self._match(name, debug=debug_match)
                    if score >= good_match_score:
                        name_good_match = name
                    total_score += score
            if 'categories' in emoji_value:
                for category in emoji_value['categories']:
                    score = self._match(category, debug=debug_match)
                    if score >= good_match_score:
                        category_good_match = category
                    total_score += score
            if 'keywords' in emoji_value:
                for keyword in emoji_value['keywords']:
                    score = self._match(keyword, debug=debug_match)
                    if score >= good_match_score:
                        keyword_good_match = keyword
                    total_score += score

            if total_score > 0:
                if 'names' in emoji_value:
                    display_name = emoji_value['names'][0]
                else:
                    display_name = self.name(emoji_key[0])
                if (len(emoji_key[0]) == 1 and unicodedata.category(
                        emoji_key[0]) in ('Cc', 'Cf', 'Zs')):
                    # Add the code point to the display name of
                    # “invisible” characters:
                    display_name = ('U+%X' % ord(emoji_key[0]) + ' ' +
                                    display_name)
                # If the match was good because of something other
                # than the main name, show what matched in the
                # display name to make the user understand why
                # this emoji matched:
                if name_good_match not in display_name:
                    display_name += ' “' + name_good_match + '”'
                if category_good_match not in display_name:
                    display_name += ' {' + category_good_match + '}'
                if keyword_good_match not in display_name:
                    display_name += ' [' + keyword_good_match + ']'
                candidates.append((emoji_key[0], display_name, total_score))

        try:
            codepoint = int(query_string, 16)
            if codepoint >= 0x0 and codepoint <= 0x1FFFFF:
                char = chr(codepoint)
                candidates.append((char, 'U+' + query_string.upper() + ' ' +
                                   unicodedata.name(char), 200))
        except ValueError:
            pass

        sorted_candidates = sorted(candidates,
                                   key=lambda x:
                                   (-x[2], -len(x[0]), x[1]))[:match_limit]

        self._candidate_cache[(query_string, match_limit)] = sorted_candidates
        return sorted_candidates

    def name(self, emoji_string):
        '''Find a name of an emoji.

        Returns a name of the emoji in the first language given
        for which a name can be found.

        :param emoji_string: The string of Unicode characters which are
                             used to encode the emoji
        :type emoji_string: A string
        :rtype: string

        Examples:

        >>> matcher = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])

        >>> matcher.name('🏸')
        'badminton racquet and shuttlecock'

        >>> matcher.name('🖥')
        'desktop computer'

        >>> matcher = EmojiMatcher(languages=['es_MX', 'es_ES', 'it_IT', 'ja_JP'])
        >>> matcher.name('🖥')
        'computadora de escritorio'

        >>> matcher = EmojiMatcher(languages=['es_ES', 'es_MX', 'it_IT', 'ja_JP'])
        >>> matcher.name('🖥')
        'ordenador de sobremesa'

        >>> matcher = EmojiMatcher(languages=['de_DE', 'es_ES', 'es_MX', 'it_IT', 'ja_JP'])
        >>> matcher.name('🖥')
        'Computer'

        >>> matcher = EmojiMatcher(languages=['it_IT', 'es_ES', 'es_MX', 'ja_JP'])
        >>> matcher.name('🖥')
        'desktop PC'

        >>> matcher = EmojiMatcher(languages=['fr_FR'])
        >>> matcher.name('🖥')
        'ordinateur de bureau'

        >>> matcher.name('🤔')
        'visage en pleine réflexion'

        >>> matcher = EmojiMatcher(languages=['de_DE'])
        >>> matcher.name('🤔')
        'Nachdenkender Smiley'

        >>> matcher.name('⚽')
        'Fußball'

        >>> matcher = EmojiMatcher(languages=['de_CH'])
        >>> matcher.name('🤔')
        'Nachdenkender Smiley'

        >>> matcher.name('⚽')
        'Fussball'

        >>> matcher.name('a')
        ''
        '''
        for language in _expand_languages(self._languages):
            if ((emoji_string, language) in self._emoji_dict
                    and 'names' in self._emoji_dict[(emoji_string, language)]):
                return self._emoji_dict[(emoji_string, language)]['names'][0]
        return ''

    def similar(self, emoji_string, match_limit=1000):
        '''Find similar emojis

        “Similar” means they share categories or keywords.

        :param emoji_string: The string of Unicode characters which are
                             used to encode the emoji
        :type emoji_string: A string
        :rtype: A list of tuples of the form (<emoji>, <name>, <score>),
                i.e. a list like this:

                [('🐫', "cammello ['animale', 'gobba']", 2), ...]

                The name includes the list of categories or keywords
                which matched, the score is the number of categories
                or keywords matched.

                The list is sorted by preferred language, then score,
                then name.

        Examples:

        >>> matcher = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])

        >>> matcher.similar('this is not an emoji', match_limit = 5)
        []

        >>> matcher.similar('☺', match_limit = 5)
        [('☺', "white smiling face ['face', 'happy', 'outlined', 'people', 'relaxed', 'smile', 'smiley', 'so']", 8), ('😋', "face savouring delicious food ['face', 'happy', 'people', 'smile', 'smiley', 'so']", 6), ('😁', "grinning face with smiling eyes ['face', 'happy', 'people', 'smile', 'smiley', 'so']", 6), ('🙂', "slightly smiling face ['face', 'happy', 'people', 'smile', 'smiley', 'so']", 6), ('😍', "smiling face with heart-shaped eyes ['face', 'happy', 'people', 'smile', 'smiley', 'so']", 6)]

        >>> matcher = EmojiMatcher(languages = ['it_IT', 'en_US', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])
        >>> matcher.similar('☺', match_limit = 5)
        [('☺', "faccina sorridente ['contorno faccina sorridente', 'emozionarsi', 'faccina', 'sorridente']", 4), ('😺', "gatto che sorride ['faccina', 'sorridente']", 2), ('👽', "alieno ['faccina']", 1), ('👼', "angioletto ['faccina']", 1), ('🤑', "avidità di denaro ['faccina']", 1)]

        >>> matcher = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])
        >>> matcher.similar('🐫', match_limit = 5)
        [('🐫', "bactrian camel ['animal', 'bactrian', 'camel', 'hump', 'hump day', 'nature', 'so', 'wildlife']", 8), ('🐪', "dromedary camel ['animal', 'hump', 'nature', 'so', 'wildlife']", 5), ('🐻', "bear face ['animal', 'nature', 'so', 'wildlife']", 4), ('🐦', "bird ['animal', 'nature', 'so', 'wildlife']", 4), ('🐡', "blowfish ['animal', 'nature', 'so', 'wildlife']", 4)]

        >>> matcher = EmojiMatcher(languages = [ 'it_IT', 'en_US','es_MX', 'es_ES', 'de_DE', 'ja_JP'])
        >>> matcher.similar('🐫', match_limit = 5)
        [('🐫', "cammello ['animale', 'gobba']", 2), ('🐪', "dromedario ['animale', 'gobba']", 2), ('🐀', "Ratto ['animale']", 1), ('🐁', "Topo ['animale']", 1), ('\U0001f986', "anatra ['animale']", 1)]

        >>> matcher = EmojiMatcher(languages = ['de_DE', 'it_IT', 'en_US','es_MX', 'es_ES', 'ja_JP'])
        >>> matcher.similar('🐫', match_limit = 5)
        [('🐫', "Kamel ['Tier', 'zweihöckrig']", 2), ('🐒', "Affe ['Tier']", 1), ('🐵', "Affengesicht ['Tier']", 1), ('🐜', "Ameise ['Tier']", 1), ('🐝', "Biene ['Tier']", 1)]

        >>> matcher = EmojiMatcher(languages = ['es_MX', 'it_IT', 'de_DE', 'en_US', 'es_ES', 'ja_JP'])
        >>> matcher.similar('🐫', match_limit = 5)
        [('🐫', "camello ['animal', 'joroba']", 2), ('🐪', "dromedario ['animal', 'joroba']", 2), ('🐝', "abeja ['animal']", 1), ('🐋', "ballena ['animal']", 1), ('🐳', "ballena soplando un chorro de agua ['animal']", 1)]

        >>> matcher = EmojiMatcher(languages = ['es_ES',  'it_IT', 'es_MX', 'de_DE', 'en_US', 'ja_JP'])
        >>> matcher.similar('🐫', match_limit = 5)
        [('🐫', "camello ['bactriano', 'camello', 'desierto', 'jorobas']", 4), ('🐪', "dromedario ['camello', 'desierto']", 2), ('🏜', "desierto ['desierto']", 1), ('🐫', "cammello ['animale', 'gobba']", 2), ('🐪', "dromedario ['animale', 'gobba']", 2)]

        >>> matcher = EmojiMatcher(languages = ['es_ES',  'it_IT', 'es_MX', 'de_DE', 'en_US', 'ja_JP'])
        >>> matcher.similar('€', match_limit = 10)
        [('؋', "afghani sign ['sc']", 1), ('֏', "armenian dram sign ['sc']", 1), ('₳', "austral sign ['sc']", 1), ('৻', "bengali ganda mark ['sc']", 1), ('৲', "bengali rupee mark ['sc']", 1), ('৳', "bengali rupee sign ['sc']", 1), ('₵', "cedi sign ['sc']", 1), ('¢', "cent sign ['sc']", 1), ('₡', "colon sign ['sc']", 1), ('₢', "cruzeiro sign ['sc']", 1)]
        '''
        candidate_scores = {}
        expanded_languages = _expand_languages(self._languages)
        for language in expanded_languages:
            emoji_key = (emoji_string, language)
            if emoji_key not in self._emoji_dict:
                continue
            original_labels_for_language = set()
            label_keys = ('categories', 'keywords')
            for label_key in label_keys:
                if label_key in self._emoji_dict[emoji_key]:
                    for label in self._emoji_dict[emoji_key][label_key]:
                        original_labels_for_language.add(label)
            for similar_key in self._emoji_dict:
                if similar_key[1] != language:
                    continue
                similar_string = similar_key[0]
                if 'names' in self._emoji_dict[similar_key]:
                    similar_name = self._emoji_dict[similar_key]['names'][0]
                else:
                    similar_name = self.name(similar_string)
                for label_key in label_keys:
                    if label_key in self._emoji_dict[similar_key]:
                        for label in self._emoji_dict[similar_key][label_key]:
                            if label in original_labels_for_language:
                                scores_key = (similar_string, language,
                                              similar_name)
                                if scores_key in candidate_scores:
                                    candidate_scores[scores_key].add(label)
                                else:
                                    candidate_scores[scores_key] = set([label])
        candidates = []
        for x in sorted(
                candidate_scores.items(),
                key=lambda x: (
                    expanded_languages.index(x[0][1]),  # language index
                    -len(x[1]),  # number of matching labels
                    -len(x[0][0]),  # length of emoji string
                    x[0][2],  # emoji name
                ))[:match_limit]:
            candidates.append(
                (x[0][0], x[0][2] + ' ' + repr(sorted(x[1])), len(x[1])))
        return candidates

    def debug_loading_data(self):
        '''To debug whether the data has been loaded correctly'''
        count = 0
        for key, value in sorted(self._emoji_dict.items()):
            print("key=%s value=%s" % (key, sorted(value.items())))
            count += 1
        print('count=%s' % count)
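
The matcher-reuse advice quoted in __init__ (set the query once with
set_seq2(), then call set_seq1() once per label) looks like this in
isolation; the labels below are made-up stand-ins for emoji names:

from difflib import SequenceMatcher

labels = ['birthday cake', 'bird', 'camera']
query = 'bir'
matcher = SequenceMatcher(isjunk=None, a='', b='', autojunk=False)
matcher.set_seq2(query)      # cached and reused for every comparison
for label in labels:
    matcher.set_seq1(label)  # only sequence 1 changes per label
    print(label, round(matcher.ratio(), 2))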
Exemplo n.º 42
0
    def _auxRefreshTree(self, tree_index):
        """
        Auxiliary function for refreshTree that recursively refreshes the
        tree nodes.

        If the underlying Python object has been changed, we don't want to
        delete the old tree model and create a new one from scratch because
        this loses all information about which nodes are fetched and expanded.
        Instead, the old tree model is updated. Using difflib from the
        standard library, it is determined per parent node which child nodes
        should be added or removed. This is done based on the node names only,
        not on the node contents (the underlying Python objects). Testing the
        underlying nodes for equality is potentially slow. It is faster to
        let the refreshNode function emit the dataChanged signal for all cells.
        """
        tree_item = self.treeItem(tree_index)
        logger.debug("_auxRefreshTree({}): {}{}".format(
            tree_index, tree_item.obj_path,
            "*" if tree_item.children_fetched else ""))

        if tree_item.children_fetched:

            old_items = tree_item.child_items
            new_items = self._fetchObjectChildren(tree_item.obj,
                                                  tree_item.obj_path)

            old_item_names = [(item.obj_name, item.is_attribute)
                              for item in old_items]
            new_item_names = [(item.obj_name, item.is_attribute)
                              for item in new_items]
            seqMatcher = SequenceMatcher(isjunk=None,
                                         a=old_item_names,
                                         b=new_item_names,
                                         autojunk=False)
            opcodes = seqMatcher.get_opcodes()

            logger.debug("(reversed) "
                         "opcodes: {}".format(list(reversed(opcodes))))

            for tag, i1, i2, j1, j2 in reversed(opcodes):

                # the "1 or" forces logging of *all* opcodes (a debug
                # switch); drop it to log only the non-equal blocks
                if 1 or tag != 'equal':
                    logger.debug(
                        "  {:7s}, a[{}:{}] ({}), b[{}:{}] ({})".format(
                            tag, i1, i2, old_item_names[i1:i2], j1, j2,
                            new_item_names[j1:j2]))

                if tag == 'equal':
                    # Only when node names are equal is _auxRefreshTree
                    # called recursively.
                    assert i2 - i1 == j2 - j1, ("equal sanity "
                                                "check failed "
                                                "{} != {}".format(
                                                    i2 - i1, j2 - j1))
                    for old_row, new_row in zip(range(i1, i2), range(j1, j2)):
                        old_items[old_row].obj = new_items[new_row].obj
                        child_index = self.index(old_row, 0, parent=tree_index)
                        self._auxRefreshTree(child_index)

                elif tag == 'replace':
                    # Explicitly remove the old item and insert the new.
                    # The old item may have child nodes whose indices must be
                    # removed by Qt, otherwise it crashes.
                    assert i2 - i1 == j2 - j1, ("replace sanity "
                                                "check failed "
                                                "{} != {}").format(
                                                    i2 - i1, j2 - j1)

                    # row number of first that will be removed
                    first = i1
                    # row number of last that will be removed
                    last = i2 - 1
                    logger.debug("     calling "
                                 "beginRemoveRows({}, {}, {})".format(
                                     tree_index, first, last))
                    self.beginRemoveRows(tree_index, first, last)
                    del tree_item.child_items[i1:i2]
                    self.endRemoveRows()

                    # row number of first element after insertion
                    first = i1
                    # row number of last element after insertion
                    last = i1 + j2 - j1 - 1
                    logger.debug("     calling "
                                 "beginInsertRows({}, {}, {})".format(
                                     tree_index, first, last))
                    self.beginInsertRows(tree_index, first, last)
                    tree_item.insert_children(i1, new_items[j1:j2])
                    self.endInsertRows()

                elif tag == 'delete':
                    assert j1 == j2, ("delete"
                                      " sanity check "
                                      "failed. {} != {}".format(j1, j2))
                    # row number of first that will be removed
                    first = i1
                    # row number of last that will be removed
                    last = i2 - 1
                    logger.debug("     calling "
                                 "beginRemoveRows"
                                 "({}, {}, {})".format(tree_index, first,
                                                       last))
                    self.beginRemoveRows(tree_index, first, last)
                    del tree_item.child_items[i1:i2]
                    self.endRemoveRows()

                elif tag == 'insert':
                    assert i1 == i2, ("insert "
                                      "sanity check "
                                      "failed. {} != {}".format(i1, i2))
                    # row number of first element after insertion
                    first = i1
                    # row number of last element after insertion
                    last = i1 + j2 - j1 - 1
                    logger.debug("     "
                                 "calling beginInsertRows"
                                 "({}, {}, {})".format(tree_index, first,
                                                       last))
                    self.beginInsertRows(tree_index, first, last)
                    tree_item.insert_children(i1, new_items[j1:j2])
                    self.endInsertRows()
                else:
                    raise ValueError("Invalid tag: {}".format(tag))
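
The pattern above is easier to see in isolation: diff two name lists and walk the opcodes in reverse, so that the row indices of ranges not yet processed stay valid while the list is mutated. A minimal standalone sketch (the names are made up):

from difflib import SequenceMatcher

old_names = ['a', 'b', 'c', 'd']
new_names = ['a', 'x', 'c', 'd', 'e']
sm = SequenceMatcher(isjunk=None, a=old_names, b=new_names, autojunk=False)
for tag, i1, i2, j1, j2 in reversed(sm.get_opcodes()):
    # Mutating old_names here would only invalidate indices >= i1,
    # which the reversed iteration has already consumed.
    print(tag, old_names[i1:i2], '->', new_names[j1:j2])
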
Exemplo n.º 43
0
def make_changes(nlp,
                 source_sentence,
                 target_sentences=[],
                 min_count=2,
                 debug=False):
    source_tokens = get_tokens(nlp(str(source_sentence)))

    target_docs_tokens = [
        get_tokens(nlp(str(sent))) for sent in target_sentences
    ]
    all_actions = []

    for i in range(len(target_sentences)):

        target_tokens = target_docs_tokens[i]

        matcher = SequenceMatcher(None, source_tokens, target_tokens)

        raw_diffs = list(matcher.get_opcodes())

        for diff in raw_diffs:
            if diff[0] == 'replace':
                # "source_start_token", "source_end_token", "target_part"
                all_actions.append(('replace', diff[1], diff[2],
                                    "".join(target_tokens[diff[3]:diff[4]])))
            if diff[0] == 'delete':
                # "source_start_token", "source_end_token"
                all_actions.append(('delete', diff[1], diff[2]))
            if diff[0] == 'insert':
                # "source_start_token", "target_part"
                all_actions.append(('insert', diff[1],
                                    "".join(target_tokens[diff[3]:diff[4]])))

    good_actions = [
        k for k, v in Counter(all_actions).items() if v >= min_count
    ]
    good_actions.sort(key=lambda x: x[1])  # sort by second field - start token

    if debug:
        print("All actions", all_actions)
        print("Good actions", good_actions)

    if len(good_actions) > 0:

        final_text = ""
        current_start = 0
        previous_end = 0

        for action in good_actions:
            current_start = action[1]
            final_text += "".join(source_tokens[previous_end:current_start])
            if action[0] == 'replace':
                final_text += action[3]
                previous_end = action[2]
            if action[0] == 'delete':
                previous_end = action[2]
            if action[0] == 'insert':
                final_text += action[2]
                previous_end = action[1]

        final_text += "".join(source_tokens[previous_end:])
        return final_text

    else:
        return ''.join(source_tokens)
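
A hedged usage sketch of make_changes, assuming a loaded spaCy pipeline and a get_tokens helper that returns whitespace-preserving token strings (e.g. token.text_with_ws), so that ''.join can reconstruct the sentence:

# Hypothetical helper matching the assumption above; the project's real
# get_tokens may differ.
def get_tokens(doc):
    return [token.text_with_ws for token in doc]

# nlp = spacy.load("en_core_web_sm")  # assumption: any loaded spaCy pipeline
corrected = make_changes(
    nlp,
    "He go to school yesterday.",
    target_sentences=["He went to school yesterday.",
                      "He went to school yesterday."],
    min_count=2)
print(corrected)  # expected: "He went to school yesterday."
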
Exemplo n.º 44
0
def main():
    parser = argparse.ArgumentParser(
        "Plots the tuning characteristics of a model in "
        "classifying the mutation status of the genes in a given cohort.")

    parser.add_argument('cohort', type=str, help="a TCGA cohort")
    parser.add_argument('classif', help='a mutation classifier')

    args = parser.parse_args()
    os.makedirs(os.path.join(plot_dir), exist_ok=True)
    out_path = Path(os.path.join(base_dir, 'output', args.cohort))

    out_dirs = [
        out_dir.parent for out_dir in out_path.glob(
            "*/{}/**/out__task-0.p".format(args.classif))
        if (len(tuple(out_dir.parent.glob("out__*.p"))) > 0 and (
            len(tuple(out_dir.parent.glob("out__*.p"))) == len(
                tuple(out_dir.parent.glob("slurm/fit-*.txt")))))
    ]

    out_paths = [
        str(out_dir).split("/output/{}/".format(args.cohort))[1].split('/')
        for out_dir in out_dirs
    ]

    for genes, mut_levels in set(
        (out_path[0], out_path[3]) for out_path in out_paths):
        use_data = [(i, out_path) for i, out_path in enumerate(out_paths)
                    if out_path[0] == genes and out_path[3] == mut_levels]

        if len(use_data) > 1:
            use_samps = np.argmin(
                [int(x[1][2].split('samps_')[-1]) for x in use_data])

            # delete in descending index order so earlier deletions do not
            # shift the positions of later entries
            for x in sorted(use_data[:use_samps] + use_data[(use_samps + 1):],
                            key=lambda d: -d[0]):
                del out_dirs[x[0]]

    tune_list = [load_infer_tuning(str(out_dir)) for out_dir in out_dirs]
    mut_clf = set(clf for _, clf in tune_list)
    if len(mut_clf) != 1:
        raise ValueError("Each subvariant isolation experiment must be run "
                         "with exactly one classifier!")

    mut_clf = tuple(mut_clf)[0]
    out_modules = [
        str(out_dir).split("/output/{}/".format(args.cohort))[1].split('/')[0]
        for out_dir in out_dirs
    ]

    use_lvls = ['Gene']
    mut_lvls = [
        tuple(
            str(out_dir).split("/output/{}/".format(
                args.cohort))[1].split('/')[3].split('__'))
        for out_dir in out_dirs
    ]
    lvl_set = list(set(mut_lvls))

    seq_match = SequenceMatcher(a=lvl_set[0], b=lvl_set[1])
    for (op, start1, end1, start2, end2) in seq_match.get_opcodes():

        if op == 'equal' or op == 'delete':
            use_lvls += lvl_set[0][start1:end1]

        elif op == 'insert':
            use_lvls += lvl_set[1][start2:end2]

        elif op == 'replace':
            use_lvls += lvl_set[0][start1:end1]
            use_lvls += lvl_set[1][start2:end2]

    out_genes = reduce(
        or_, [set(out_module.split('_')) for out_module in out_modules])

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=list(set(out_genes)),
                           mut_levels=use_lvls,
                           expr_source='Firehose',
                           expr_dir=firehose_dir,
                           var_source='mc3',
                           copy_source='Firehose',
                           annot_file=annot_file,
                           syn=syn,
                           cv_prop=1.0)

    iso_list = [load_infer_output(str(out_dir)) for out_dir in out_dirs]
    info_lists = [
        compare_scores(
            iso_df,
            cdata,
            get_similarities=False,
            all_mtype=reduce(or_, [
                MuType({
                    ('Gene', out_gene):
                    cdata.train_mut[out_gene].allkey(['Scale', 'Copy'] +
                                                     list(out_lvl))
                }) for out_gene in out_modl.split('_')
            ]))
        for iso_df, out_modl, out_lvl in zip(iso_list, out_modules, mut_lvls)
    ]

    tune_list = [lists[0] for lists in tune_list]
    auc_list = [lists[1] for lists in info_lists]
    size_list = [lists[2] for lists in info_lists]

    out_lists = [tune_list, auc_list, size_list, mut_clf, args]
    for i in range(3):
        out_lists[i] = pd.concat(out_lists[i]).sort_index()

    plot_tuning_auc(*out_lists)
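
The opcode walk above merges two mutation-level tuples into a single ordered list containing the levels of both. A standalone trace with hypothetical level names:

from difflib import SequenceMatcher

lvl_a = ('Form', 'Exon', 'Location')
lvl_b = ('Form', 'Protein')
use_lvls = []
for op, s1, e1, s2, e2 in SequenceMatcher(a=lvl_a, b=lvl_b).get_opcodes():
    if op == 'equal' or op == 'delete':
        use_lvls += lvl_a[s1:e1]
    elif op == 'insert':
        use_lvls += lvl_b[s2:e2]
    elif op == 'replace':
        use_lvls += lvl_a[s1:e1] + lvl_b[s2:e2]
print(use_lvls)  # -> ['Form', 'Exon', 'Location', 'Protein']
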
Exemplo n.º 45
0
def compute_new_comment_positions(old_content, old_format, new_content,
                                  new_format, commentList):

    # cf. TextVersion.get_content
    previousVersionContent = pandoc_convert(old_content, old_format, 'html')
    newVersionContent = pandoc_convert(new_content, new_format, 'html')

    _, previous_char_list, span_starts_previous = spannify(
        previousVersionContent, False)
    _, new_char_list, span_starts_new = spannify(newVersionContent, False)

    sm = SequenceMatcher(None, previous_char_list, new_char_list)

    opcodes = sm.get_opcodes()
    to_remove_comments_ids = set()

    # limit to real comments (not replies) and those that have scope
    commentList = [
        c for c in commentList
        if not c.is_reply() and not c.is_scope_removed()
    ]

    for comment in commentList:
        try:
            comment.initial_start_offset = span_starts_previous[
                comment.start_wrapper] + comment.start_offset
            comment.initial_end_offset = span_starts_previous[
                comment.end_wrapper] + comment.end_offset
        except KeyError:
            logging.error(
                'Key error (wrapper out of bounds of span_starts_previous)')
            continue

        comment.computed_start_offset = comment.initial_start_offset
        comment.computed_end_offset = comment.initial_end_offset

        #        comment.computed_start_wrapper = None
        #        comment.computed_end_wrapper = None

        comment.valid = True
    for tag, i1, i2, j1, j2 in opcodes:
        #print tag, i1, i2, j1, j2

        for i in xrange(len(commentList)):
            if tag != 'equal':
                comment = commentList[i]
                if not comment.valid:
                    continue

                if comment.initial_start_offset >= i2:
                    # the edit ends before the comment starts:
                    # shift both offsets by the net length change
                    delta = ((j2 - j1) - (i2 - i1))
                    comment.computed_start_offset += delta
                    comment.computed_end_offset += delta

                elif comment.initial_end_offset > i1:
                    comment.valid = False

        #    id, initial_start, initial_end, computed_start, computed_end, valid = self.computationResults[i]

    for cc in commentList:
        if cc.valid:
            for id in xrange(len(span_starts_new.keys())):
                start = span_starts_new.get(id, 0)
                end = span_starts_new.get(id + 1, sys.maxint)

                # adjust start
                if cc.computed_start_offset >= start and cc.computed_start_offset < end:
                    cc.start_wrapper = id
                    cc.start_offset = cc.computed_start_offset - start

                # adjust end
                if cc.computed_end_offset >= start and cc.computed_end_offset < end:
                    cc.end_wrapper = id
                    cc.end_offset = cc.computed_end_offset - start

    # returns to_modify, to_remove
    return [c for c in commentList if c.valid], \
           [c for c in commentList if not c.valid]
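
The offset bookkeeping above follows one rule: an edit that replaces a[i1:i2] with b[j1:j2] shifts every position at or after i2 by the net length change (j2 - j1) - (i2 - i1), while a comment overlapping the edited span is invalidated. A standalone illustration of the shift:

from difflib import SequenceMatcher

old = "hello cruel world"
new = "hello world"
pos = old.index("world")  # a position to track in the old text (12)
for tag, i1, i2, j1, j2 in SequenceMatcher(None, old, new).get_opcodes():
    if tag != 'equal' and pos >= i2:
        pos += (j2 - j1) - (i2 - i1)
print(new[pos:pos + 5])  # -> 'world'
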
Exemplo n.º 46
0
def make_dict(result_dic, raw_array, tagged_array):
    for raw_sent, tagged_sent in zip(raw_array, tagged_array):
        if not len(raw_sent) == len(tagged_sent):
            continue
        for raw_word, tag_word in zip(raw_sent, tagged_sent):

            tag_morph = re.split(r"(?<=/[A-Z]{2})\+|(?<=/[A-Z]{3})\+",
                                 tag_word)  # lookbehind must be fixed-width, so use an alternation
            merge_morph = "".join([remove_tag(morph) for morph in tag_morph])

            SM = SequenceMatcher(None, raw_word, merge_morph)
            opcodes = SM.get_opcodes()
            fraction, pyochung_list, dic_list, postag_list = [], [], [], []
            for morph_tag in tag_morph:
                morph, tag = nltk.str2tuple(morph_tag)
                for syl in morph:
                    fraction.append([syl, tag])
                fraction[-1][0] = fraction[-1][0] + '+'
                fraction[-1][1] = fraction[-1][1] + '+'  ## append '+' after the syllable
            fraction[-1][0] = fraction[-1][0][0]
            fraction[-1][1] = fraction[-1][1][:-1]

            if contain_equal(opcodes):  ## everything is equal

                for morph in tag_morph:
                    pyochung, postag = nltk.str2tuple(morph)

                    pyochung_list.append(pyochung)
                    dic_list.append(pyochung)
                    postag_list.append(postag)
                    continue

            elif not include_delete(opcodes):  ## the delete case differs, so it is handled separately
                pyo_temp, dic_temp, postag_temp = [], [], []
                for prev, curr, nxt in previous_and_next(
                        opcodes):  # handle insert and replace

                    i1, i2, j1, j2 = curr[1], curr[2], curr[3], curr[4]
                    pyo_temp.append(raw_word[i1:i2])
                    dic_temp.append("".join([w[0] for w in fraction[j1:j2]]))
                    postag_temp.append("/".join(
                        [w[1] for w in fraction[j1:j2]]))

                    if curr[0] == "replace":
                        if prev != None:
                            pyo_temp, dic_temp, postag_temp = mor_replace(
                                pyo_temp, dic_temp, postag_temp, tag_morph,
                                fraction)
                            if nxt != None:  # replace,insert 이런경우도 잇을까?
                                if nxt[0] == "insert":
                                    print("리플레이스 인설트")
                                    print(raw_word)
                                    print(merge_morph)
                                    print(opcodes)
                        else:
                            mor_freplace(pyo_temp, dic_temp, postag_temp,
                                         tag_morph)

                    elif curr[0] == "insert":
                        if prev is None:  ## hit this case: 이어 이--05/NP이/VCP/어/EC -- not sure what this is
                            ##                            print(raw_word)
                            ##                            print(merge_morph)
                            ##                            print(tag_word)
                            ##                            print(opcodes)
                            continue
                        pyo_temp, dic_temp, postag_temp = mor_insert(
                            pyo_temp, dic_temp, postag_temp, tag_morph,
                            fraction)
                postag_temp = del_dup(postag_temp)
                pyochung_list.extend(pyo_temp)
                dic_list.extend(dic_temp)
                postag_list.extend(postag_temp)

            else:  ## finally, the dreaded delete case...

                mat_blocks = make_del_block(fraction, raw_word, merge_morph)

                ## re-merge chunks whose tags ended up duplicated after joining; using
                ## delete looked easy, but removing an item immediately shifts every
                ## index in the list, so this is more hard-coded than expected...
                merge_block = find_mergeblock(mat_blocks)

                merge_block_list = find_mergeblocklist(merge_block, mat_blocks)

                del_result_list = make_del_list(merge_block_list, mat_blocks)

                for block in del_result_list:
                    pyochung_list.append(block[0])
                    dic_list.append(block[1][0])
                    postag_list.extend(block[1][1])
                ## up to here everything comes out, but the meaning of '+' needs to be
                ## pinned down before another pass; it is also too hard-coded and needs rethinking

            ## adding the entries to the dictionary is not hard; revisit on Monday

            for pyo, di, pos in zip(pyochung_list, dic_list, postag_list):
                count_dict(result_dic, str(pyo), [di, pos])
Exemplo n.º 47
0
            for t in self.checkPrintf(refSpecs, l10nValue):
                yield t
            return

    def checkPrintf(self, refSpecs, l10nValue):
        try:
            l10nSpecs = self.getPrintfSpecs(l10nValue)
        except PrintfException, e:
            yield ('error', e.pos, e.msg, 'printf')
            return
        if refSpecs != l10nSpecs:
            sm = SequenceMatcher()
            sm.set_seqs(refSpecs, l10nSpecs)
            msgs = []
            warn = None
            for action, ls, le, rs, re in sm.get_opcodes():
                if action == 'equal':
                    continue
                if action == 'delete':
                    # missing argument in l10n
                    if le == len(refSpecs):
                        # trailing specs missing, that's just a warning
                        warn = ', '.join('trailing argument %d `%s` missing' %
                                         (i + 1, refSpecs[i])
                                         for i in xrange(ls, le))
                    else:
                        for i in xrange(ls, le):
                            msgs.append('argument %d `%s` missing' %
                                        (i + 1, refSpecs[i]))
                    continue
                if action == 'insert':
Exemplo n.º 48
0
def transfer_casing_for_similar_text(text_w_casing, text_wo_casing):
    """Transferring the casing from one text to another - for similar
    (not matching) text

    1. It will use `difflib`'s `SequenceMatcher` to identify the
       different type of changes needed to turn `text_w_casing` into
       `text_wo_casing`
    2. For each type of change:

       - for inserted sections:

         - it will transfer the casing from the prior character
         - if there is no character before, or the character before is a\
           space, then it will transfer the casing from the following\
           character

       - for deleted sections: no case transfer is required
       - for equal sections: just swap out the text with the original,\
         the one with the casings, as otherwise the two are the same
       - replaced sections: transfer the casing using\
         :meth:`transfer_casing_for_matching_text` if the two have the\
         same length, otherwise transfer character-by-character and\
         carry the last casing over to any additional characters.

    Parameters
    ----------
    text_w_casing : str
        Text with varied casing
    text_wo_casing : str
        Text that is in lowercase only

    Returns
    -------
    text_wo_casing : str
        Returned unchanged if `text_wo_casing` is empty
    c : str
        Text with the content of `text_wo_casing` but the casing of
        `text_w_casing`

    Raises
    ------
    ValueError
        If `text_w_casing` is empty
    """
    if not text_wo_casing:
        return text_wo_casing

    if not text_w_casing:
        raise ValueError("We need 'text_w_casing' to know what "
                         "casing to transfer!")

    _sm = SequenceMatcher(None, text_w_casing.lower(), text_wo_casing)

    # we will collect the case_text:
    c = ''

    # get the operation codes describing the differences between the
    # two strings and handle them based on the per operation code rules
    for tag, i1, i2, j1, j2 in _sm.get_opcodes():
        # Print the operation codes from the SequenceMatcher:
        # print('{:7}   a[{}:{}] --> b[{}:{}] {!r:>8} --> {!r}'
        #       .format(tag, i1, i2, j1, j2,
        #               text_w_casing[i1:i2],
        #               text_wo_casing[j1:j2]))

        # inserted character(s)
        if tag == 'insert':
            # if this is the first character and so there is no
            # character on the left of this or the left of it a space
            # then take the casing from the following character
            if i1 == 0 or text_w_casing[i1 - 1] == ' ':
                if text_w_casing[i1] and text_w_casing[i1].isupper():
                    c += text_wo_casing[j1:j2].upper()
                else:
                    c += text_wo_casing[j1:j2].lower()
            else:
                # otherwise just take the casing from the prior
                # character
                if text_w_casing[i1 - 1].isupper():
                    c += text_wo_casing[j1:j2].upper()
                else:
                    c += text_wo_casing[j1:j2].lower()

        elif tag == 'delete':
            # for deleted characters we don't need to do anything
            pass

        elif tag == 'equal':
            # for 'equal' we just transfer the text from the
            # text_w_casing, as anyhow they are equal (without the
            # casing)
            c += text_w_casing[i1:i2]

        elif tag == 'replace':
            _w_casing = text_w_casing[i1:i2]
            _wo_casing = text_wo_casing[j1:j2]

            # if they are the same length, the transfer is easy
            if len(_w_casing) == len(_wo_casing):
                c += transfer_casing_for_matching_text(
                    text_w_casing=_w_casing, text_wo_casing=_wo_casing)
            else:
                # if the replaced has a different length, then we
                # transfer the casing character-by-character and using
                # the last casing to continue if we run out of the
                # sequence
                _last = 'lower'
                for w, wo in zip_longest(_w_casing, _wo_casing):
                    if w and wo:
                        if w.isupper():
                            c += wo.upper()
                            _last = 'upper'
                        else:
                            c += wo.lower()
                            _last = 'lower'
                    elif not w and wo:
                        # once we ran out of 'w', we will carry over
                        # the last casing to any additional 'wo'
                        # characters
                        c += wo.upper() if _last == 'upper' else wo.lower()
    return c
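
A quick sanity check of the casing transfer on a pair that differs only by an insertion (hypothetical inputs):

print(transfer_casing_for_similar_text("Apple Pie", "apple pies"))
# The 'equal' opcode copies "Apple Pie" verbatim, and the inserted "s"
# takes the lowercase casing of the prior character: -> "Apple Pies"
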
Exemplo n.º 49
0
)

bT = (
    TreeNode('first'),
    TreeNode('second', (
        TreeNode('second.first', (TreeNode('second.first.first'), )),
        TreeNode('second.second1'),
        TreeNode('second.second'),
    )),
    TreeNode('second1', (TreeNode(2), )),
    TreeNode('third', (TreeNode(2), )),
    TreeNode('fourth1'),
)

sm = SequenceMatcher(None, aT, bT)
top = sm.get_opcodes()
pprint(top)
print('---')

# Use a pseudo root with different root nodes.
pprint(
    TreeMatcher(hashableNodeImpl, TreeNode('a', aT),
                TreeNode('b', bT)).get_opcodes(), sys.stdout, 2, 40, None)

# Use a pseudo root with equal root nodes.
pprint(
    TreeMatcher(hashableNodeImpl, TreeNode(None, aT),
                TreeNode(None, bT)).get_opcodes(), sys.stdout, 2, 40, None)

# To generate a diff tree:
#
Exemplo n.º 50
0
class Changes(object):
    whitespaces = re.compile(r'\s+', re.UNICODE)
    first_tag = re.compile(r'^(#\w+)', re.UNICODE)

    def __init__(self, orig, changed):
        # type: (str, str) -> None
        # moved, must be thread safe
        self.differ = SequenceMatcher()
        self.stripped_tag = ''
        self.has_lines = self.check_bullet_list(orig)
        self.orig = self.split(orig)
        # self.changed = self.split(self.has_lines and changed or changed.replace('\n', ' ⏎<br/> '))
        self.changed = self.split(self.strip_tag(changed, orig))
        self.change_groups = list()
        self.do_compare()

    def strip_tag(self, changed, orig):
        # type: (unicode, unicode) -> unicode
        orig_match = self.first_tag.search(orig)

        def sub_tag(match):
            tag = match.group(1)
            if not orig_match or tag != orig_match.group(0):
                self.stripped_tag = tag + ' '
                return ''
            return tag

        return self.first_tag.sub(sub_tag, changed).strip()

    @staticmethod
    def check_bullet_list(text):
        # type: (unicode) -> bool
        bullets = '•*→-‐‑‒–—―‣'
        lines = text.splitlines()
        if len(lines) > 1:
            return any(line.strip()[0] in bullets for line in lines)

    @property
    def joiner(self):
        return self.has_lines and '\n' or ' '

    def split(self, txt):
        return self.has_lines and txt.splitlines() or self.whitespaces.split(
            txt.replace('\n', ' <br/> '))

    def join(self, groups, brief, no_deleted):
        if no_deleted:
            groups = filter(lambda g: g.state != 'delete', groups)
        return self.joiner.join(
            [cg.get_html(self.joiner, brief=brief) for cg in groups])

    def add_change_group(self, state, astart, aend, bstart, bend):
        if state == 'insert':
            parts = self.changed[bstart:bend]
        else:  # 'delete' or 'equal'
            parts = self.orig[astart:aend]
        self.change_groups.append(ChangeGroup(state, parts))

    def do_compare(self):
        self.differ.set_seqs(self.orig, self.changed)
        for opcode in self.differ.get_opcodes():
            state, positions = opcode[0], opcode[1:]
            if state == 'replace':
                self.add_change_group('delete', *positions)
                self.add_change_group('insert', *positions)
            else:
                self.add_change_group(*opcode)
        if self.change_groups:
            self.change_groups[0].first = True
            self.change_groups[-1].last = True

    def get_html(self, brief=False, no_deleted=False):
        return tags2links(self.stripped_tag) + self.join(
            self.change_groups, brief, no_deleted)
Exemplo n.º 51
0
def case_transfer_similar(cased_text: str, uncased_text: str) -> str:
    """Transfers the casing from one text to another - for similar (not matching)
    text.

    Use `difflib.SequenceMatcher` to identify the different type of changes
    needed to turn `cased_text` into `uncased_text`.

    - For inserted sections: transfer the casing from the prior character. If no
      character before or the character before is the space, transfer the casing
      from the following character.
    - For deleted sections: no case transfer is required.
    - For equal sections: swap out the text with the original, the cased one, as
      otherwise the two are the same.
    - For replaced sections: transfer the casing using
      :meth:`case_transfer_matching` if the two have the same length, otherwise
      transfer character-by-character and carry the last casing over to any
      additional characters.

    Args:
        cased_text: Text with varied casing.
        uncased_text: Text in lowercase.

    Returns:
        Text with the content of `uncased_text` but the casing of `cased_text`.

    Raises:
        ValueError: If `cased_text` is empty.
    """
    if not uncased_text:
        return uncased_text

    if not cased_text:
        raise ValueError("'cased_text' cannot be empty")

    matcher = SequenceMatcher(a=cased_text.lower(), b=uncased_text)
    result = ""

    for tag, ia1, ia2, ib1, ib2 in matcher.get_opcodes():
        if tag == "delete":
            continue
        if tag == "insert":
            # For the first character, or when a space is on the left, take the
            # casing from the following character; else take it from the prior character
            ia_ref = ia1 if ia1 == 0 or cased_text[ia1 - 1] == " " else ia1 - 1
            if cased_text[ia_ref].isupper():
                result += uncased_text[ib1:ib2].upper()
            else:
                result += uncased_text[ib1:ib2].lower()
        elif tag == "equal":
            # Transfer the text from the cased_text, as anyhow they are equal
            # (without the casing)
            result += cased_text[ia1:ia2]
        else:
            cased_seq = cased_text[ia1:ia2]
            uncased_seq = uncased_text[ib1:ib2]

            if len(cased_seq) == len(uncased_seq):
                result += case_transfer_matching(cased_seq, uncased_seq)
            else:
                # transfer the casing character-by-character and using the last
                # casing to continue if we run out of the sequence
                for cased, uncased in zip(cased_seq, uncased_seq):
                    result += uncased.upper() if cased.isupper() else uncased.lower()
                # Apply casing from the last character of cased_seq to the rest
                # of the uncased_seq
                if len(cased_seq) < len(uncased_seq):
                    upper = cased_seq[-1].isupper()
                    idx = len(cased_seq)
                    result += "".join(
                        map(str.upper if upper else str.lower,
                            uncased_seq[idx:]))
    return result
Exemplo n.º 52
0
Arquivo: sed.py Projeto: maubot/sed
 def highlight_edits(cls, new_text: str, old_text: str, highlight: bool) -> str:
     if not highlight:
         return escape(new_text)
     matcher = SequenceMatcher(a=old_text, b=new_text)
     return "".join(cls.op_to_str(tag, old_text[old_start:old_end], new_text[new_start:new_end])
                    for tag, old_start, old_end, new_start, new_end in matcher.get_opcodes())
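
op_to_str is defined elsewhere in the module; a minimal sketch of what such a helper might look like, written here as a plain function rather than the classmethod the code above calls (hypothetical; the real maubot/sed implementation may differ):

from html import escape

def op_to_str(tag: str, old: str, new: str) -> str:
    # Render one opcode as HTML: deletions struck through, insertions marked.
    if tag == "equal":
        return escape(old)
    if tag == "delete":
        return "<del>" + escape(old) + "</del>"
    if tag == "insert":
        return "<ins>" + escape(new) + "</ins>"
    # "replace": show the removed text followed by the added text.
    return "<del>" + escape(old) + "</del><ins>" + escape(new) + "</ins>"
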
Exemplo n.º 53
0
    def _aux_refresh_tree(self, tree_index: core.ModelIndex):
        """Auxiliary function for refresh_tree that recursively refreshes the tree nodes.

        If the underlying Python object has been changed, we don't want to delete the old
        tree model and create a new one from scratch because this loses all information
        about which nodes are fetched and expanded. Instead the old tree model is updated.
        Using the difflib from the standard library it is determined for a parent node
        which child nodes should be added or removed. This is done based on the node names
        only, not on the node contents (the underlying Python objects). Testing the
        underlying nodes for equality is potentially slow. It is faster to let the
        refreshNode function emit the dataChanged signal for all cells.
        """
        tree_item = self.tree_item(tree_index)
        logger.debug(
            "_aux_refresh_tree(%s): %s%s",
            tree_index,
            tree_item.obj_path,
            "*" if tree_item.children_fetched else "",
        )

        if not tree_item.children_fetched:
            return None

        old_items = tree_item.child_items
        new_items = self._fetch_object_children(tree_item.obj,
                                                tree_item.obj_path)

        old_item_names = [(item.obj_name, item.is_attribute)
                          for item in old_items]
        new_item_names = [(item.obj_name, item.is_attribute)
                          for item in new_items]
        seqMatcher = SequenceMatcher(isjunk=None,
                                     a=old_item_names,
                                     b=new_item_names,
                                     autojunk=False)
        opcodes = seqMatcher.get_opcodes()

        logger.debug("(reversed) opcodes: %s", list(reversed(opcodes)))

        for tag, i1, i2, j1, j2 in reversed(opcodes):

            if 1 or tag != "equal":
                logger.debug("  {:7s}, a[{}:{}] ({}), b[{}:{}] ({})".format(
                    tag, i1, i2, old_item_names[i1:i2], j1, j2,
                    new_item_names[j1:j2]))

            if tag == "equal":
                # Only when node names are equal is _aux_refresh_tree called recursively.
                assert (
                    i2 - i1 == j2 -
                    j1), f"equal sanity check failed {i2 - i1} != {j2 - j1}"
                for old_row, new_row in zip(range(i1, i2), range(j1, j2)):
                    old_items[old_row].obj = new_items[new_row].obj
                    child_index = self.index(old_row, 0, parent=tree_index)
                    self._aux_refresh_tree(child_index)

            elif tag == "replace":
                # Explicitly remove the old item and insert the new. The old item may have
                # child nodes which indices must be removed by Qt, otherwise it crashes.
                assert (
                    i2 - i1 == j2 -
                    j1), f"replace sanity check failed {i2 - i1} != {j2 - j1}"

                first = i1  # row number of first that will be removed
                last = i2 - 1  # row number of last that will be removed
                with self.remove_rows(first, last, tree_index):
                    del tree_item.child_items[i1:i2]

                first = i1  # row number of first element after insertion
                last = i1 + j2 - j1 - 1  # row number of last element after insertion
                with self.insert_rows(first, last, tree_index):
                    tree_item.insert_children(i1, new_items[j1:j2])

            elif tag == "delete":
                assert j1 == j2, f"delete sanity check failed. {j1} != {j2}"
                first = i1  # row number of first that will be removed
                last = i2 - 1  # row number of last that will be removed
                with self.remove_rows(first, last, tree_index):
                    del tree_item.child_items[i1:i2]

            elif tag == "insert":
                assert i1 == i2, f"insert sanity check failed. {i1} != {i2}"
                first = i1
                last = i1 + j2 - j1 - 1
                with self.insert_rows(first, last, tree_index):
                    tree_item.insert_children(i1, new_items[j1:j2])
            else:
                raise ValueError(f"Invalid tag: {tag}")
Exemplo n.º 54
0
Arquivo: diff.py Projeto: peterbe/elmo
    def diffLines(self, path, action):
        lines = []
        try:
            p = getParser(path)
        except UserWarning:
            return None
        if action == 'added':
            a_entities = []
            a_map = {}
        else:
            realpath = (action == 'moved' and self.moved[path]
                        or action == 'copied' and self.copied[path] or path)
            data = self.ctx1.filectx(realpath).data()
            data = self._universal_newlines(data)
            try:
                p.readContents(data)
                a_entities, a_map = p.parse()
            except Exception:
                # consider doing something like:
                # logging.warn('Unable to parse %s', path, exc_info=True)
                return None

        if action == 'removed':
            c_entities, c_map = [], {}
        else:
            data = self.ctx2.filectx(path).data()
            data = self._universal_newlines(data)
            try:
                p.readContents(data)
                c_entities, c_map = p.parse()
            except Exception:
                # consider doing something like:
                # logging.warn('Unable to parse %s', path, exc_info=True)
                return None
        a_list = sorted(a_map.keys())
        c_list = sorted(c_map.keys())
        ar = AddRemove()
        ar.set_left(a_list)
        ar.set_right(c_list)
        for action, item_or_pair in ar:
            if action == 'delete':
                lines.append({
                    'class': 'removed',
                    'oldval': [{'value': a_entities[a_map[item_or_pair]].val}],
                    'newval': '',
                    'entity': item_or_pair
                })
            elif action == 'add':
                lines.append({
                    'class': 'added',
                    'oldval': '',
                    'newval': [{'value': c_entities[c_map[item_or_pair]].val}],
                    'entity': item_or_pair
                })
            else:
                oldval = a_entities[a_map[item_or_pair[0]]].val
                newval = c_entities[c_map[item_or_pair[1]]].val
                if oldval == newval:
                    continue
                sm = SequenceMatcher(None, oldval, newval)
                oldhtml = []
                newhtml = []
                for op, o1, o2, n1, n2 in sm.get_opcodes():
                    if o1 != o2:
                        oldhtml.append({'class': op, 'value': oldval[o1:o2]})
                    if n1 != n2:
                        newhtml.append({'class': op, 'value': newval[n1:n2]})
                lines.append({
                    'class': 'changed',
                    'oldval': oldhtml,
                    'newval': newhtml,
                    'entity': item_or_pair[0]
                })
        return lines
Exemplo n.º 55
0
class MacrogenDiff:
    def __init__(self, a: Union[Path, str], b: Union[Path, str]):
        self.a = DiffSide(a)
        self.b = DiffSide(b)
        self.matcher = SequenceMatcher(a=self.a.order, b=self.b.order)
        self.title = f"{self.a.title} : {self.b.title}"
        self.filename = f"order-{self.a.title}.{self.b.title}"

    def spearman(self):
        df = pd.DataFrame(
            dict(a=self.a.info.details.position,
                 b=self.b.info.details.position)).dropna()
        return spearmanr(df)

    def refinfo(self, ref: Reference, left_side: DiffSide,
                right_side: DiffSide):
        try:
            left, right = left_side.info.details.loc[
                ref], right_side.info.details.loc[ref]
            return [
                attrdiff(attr, left, right)
                for attr in ('max_before_date', 'min_after_date', 'rank')
            ]
        except KeyError as e:
            return ['', '', '']  # ['KeyError', e]

    def diff_order_table(self) -> HtmlTable:
        table = (HtmlTable()
                 .column('nicht vor', attrs={'class': 'right'})
                 .column('nicht nach', attrs={'class': 'right'})
                 .column('Rang', attrs={'class': 'right'})
                 .column(self.a.title, nodeformatter(self.a.title + '/'),
                         attrs={'class': 'right border-right'})
                 .column(self.b.title, nodeformatter(self.b.title + '/'))
                 .column('nicht vor')
                 .column('nicht nach')
                 .column('Rang'))
        for op, i1, i2, j1, j2 in self.matcher.get_opcodes():
            if op == "replace":
                for ref_a, ref_b in zip_longest(self.a.order[i1:i2],
                                                self.b.order[j1:j2]):
                    table.row(self.refinfo(ref_a, self.a, self.b) +
                              [ref_a or '', ref_b or ''] +
                              self.refinfo(ref_b, self.a, self.b),
                              class_='replace')
            elif op == "delete":
                for ref_a in self.a.order[i1:i2]:
                    table.row(self.refinfo(ref_a, self.a, self.b) +
                              [ref_a, '', '', '', ''],
                              class_='delete')
            elif op == "insert":
                for ref_b in self.b.order[j1:j2]:
                    table.row(['', '', '', '', ref_b] +
                              self.refinfo(ref_b, self.a, self.b),
                              class_='insert')
            elif op == "equal":
                table.row(SingleItem(
                    f'{i2 - i1} gleiche Referenzen ({_fmt_node(self.a.order[i1])} … {_fmt_node(self.a.order[i2 - 1])})'
                ),
                          class_='equal pure-center ignore')
        return table

    def conflict_diffs(self):
        def unsplit(node):
            if isinstance(node, SplitReference):
                return node.reference
            else:
                return node

        c_a = {(unsplit(u), unsplit(v))
               for u, v, k, attr in self.a.info.conflicts}
        c_b = {(unsplit(u), unsplit(v))
               for u, v, k, attr in self.b.info.conflicts}
        only_a = c_a - c_b
        only_b = c_b - c_a
        return only_a, only_b

    def conflict_diffs_html(self):
        result = ""
        for side, conflicts in zip([self.a, self.b], self.conflict_diffs()):
            result += f'<h2>{len(conflicts)} Konflikte nur in {side.title}</h2>'
            table = AssertionTable(prefix=side.title + '/')
            for u, v in conflicts:
                for w, x, k, attr in side.info.find_conflicts(u, v):
                    table.edge(w, x, attr)
            result += table.format_table()
        return result
Exemplo n.º 56
0
def check_dfs(da_unroll_df,
              ms_df,
              filenum,
              speaker,
              true_speaker,
              skip_nonverbal=True):
    ms_toks_df = ms_df[ms_df['word'] != '[silence]']
    ms_toks_df.loc[:, 'word_norm'] = ms_toks_df.word.apply(norm_ms)
    if skip_nonverbal:
        ms_toks_df = ms_toks_df[ms_toks_df['word_norm'] != nonverbal_token]
        da_unroll_df = da_unroll_df[
            da_unroll_df['da_token'] != nonverbal_token]
        da_unroll_df = da_unroll_df[da_unroll_df['da_token'] != '...']
        da_unroll_df = da_unroll_df.reset_index()
    ms_toks_df = split_ms_toks(ms_toks_df)
    ms_toks_df = ms_toks_df.reset_index()
    ms_toks_df.loc[:, 'ms_tok_id'] = range(len(ms_toks_df))
    ms_toks_df.loc[:, 'su_id'] = ms_toks_df.utt_id.apply(
        lambda x: int(x.split('-')[-1]))
    da_unroll_df.loc[:, 'da_tok_id'] = range(len(da_unroll_df))
    ms_toks_df = ms_toks_df.set_index('ms_tok_id')
    da_toks_df = da_unroll_df.set_index('da_tok_id')
    # .get_opcodes returns ops to turn a into b
    ms_side = ms_toks_df.word_norm.tolist()
    ms_side = [x.lower() for x in ms_side if x != nonverbal_token]
    da_side = da_toks_df.da_token.tolist()
    sseq = SequenceMatcher(None, ms_side, da_side)
    for info in sseq.get_opcodes():
        tag, i1, i2, j1, j2 = info
        # checking for switched sides:
        # if tag != "equal" and (i2-i1>=10 or j2-j1>=10):
        if tag != "equal":
            #print(tag, i1, i2, j1, j2, len(ms_side), len(da_side))
            start_i = max(0, min(i1, len(ms_side) - 1))
            end_i = max(i1, min(i2 - 1, len(ms_side) - 1))
            start_j = max(0, min(j1, len(da_side) - 1))
            end_j = max(j1, min(j2 - 1, len(da_side) - 1))
            #print(start_i, end_i, start_j, end_j)
            ms_part = ms_toks_df.loc[start_i:end_i]
            da_part = da_toks_df.loc[start_j:end_j]
            prev_ms_turn = ms_toks_df.loc[max(i1 - 1, 0)].su_id
            next_ms_turn = ms_toks_df.loc[min(i2, len(ms_side) - 1)].su_id
            prev_da_turn = da_toks_df.loc[max(j1 - 1, 0)].turn_id
            next_da_turn = da_toks_df.loc[min(j2, len(da_side) - 1)].turn_id
            ms_turns = ms_part['su_id'].values
            da_turns = da_part['turn_id'].values

            start_time = ms_part.start_time.values[0]
            end_time = ms_part.end_time.values[-1]
            time_span = end_time - start_time
            #print("{}\t{}\t{}\t{}\t{}\t{}\t{:6.4f}\t{}\t{}\t{}\t{}".format(\
            #        filenum, speaker, true_speaker, tag, i2-i1, j2-j1, \
            #        time_span, len(ms_turns), len(da_turns), \
            #        ms_side[i1:i2], da_side[j1:j2]))

            # only look at problem cases
            if (len(ms_side[i1:i2]) > 1 and len(da_side[j1:j2]) > 1 and
                (j2 - j1) != (i2 - i1)) and len(set(ms_turns)) != len(
                    set(da_turns)) and len(set(ms_turns)) != 1:
                if i2 - i1 >= j2 - j1: continue
                if len(set(da_turns)) == 1:
                    da_turn = da_turns[0]
                    ms_turn_counts = Counter(ms_turns)
                    ms_turn, turn_count = ms_turn_counts.most_common(1)[0]
                    if turn_count >= len(da_turns): continue
                print("{}\t{}\t{}\t{}\t{}\t{}\t{:6.4f}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(\
                            filenum, speaker, true_speaker, tag, i2-i1, j2-j1, \
                            time_span, prev_ms_turn, ms_turns, next_ms_turn, \
                            prev_da_turn, da_turns, next_da_turn, \
                            ms_side[i1:i2], da_side[j1:j2]))
Exemplo n.º 57
0
def get_revisions(rcstart=None):
    payload = {
        'action': 'query',
        'list': 'recentchanges',
        'rcshow': 'anon',
        'rcnamespace': '0',
        'rcprop': 'title|ids|sizes|timestamp|user',
        'rcstart': rcstart,
        'rcend': ts.strftime('%Y%m%d%H%M%S'),
        'format': 'json'
    }
    decoded_json = session.get('https://' + wiki + '.wikia.com/api.php',
                               params=payload,
                               headers=headers).json()

    for revision in reversed(decoded_json['query']['recentchanges']):
        payload = {'action': 'raw', 'oldid': revision['revid']}
        body_new = session.get('https://' + wiki + '.wikia.com/wiki/' +
                               revision['title'],
                               params=payload,
                               headers=headers).text.splitlines()

        payload = {'action': 'raw', 'oldid': revision['old_revid']}
        body_old = session.get('https://' + wiki + '.wikia.com/wiki/' +
                               revision['title'],
                               params=payload,
                               headers=headers).text.splitlines()

        # diff both pages ('d' is assumed to be a difflib.Differ created
        # at module level)
        result = list(d.compare(body_old, body_new))

        # fetch first difference
        line_left = ''
        line_right = ''
        done = False
        for line in result:
            if not done and line.startswith('-'):
                line_left = line[2:]
                done = True
        done = False
        for line in result:
            if not done and line.startswith('+'):
                line_right = line[2:]
                done = True

        sm = SequenceMatcher(None, line_left, line_right)
        output = []
        for opcode, a0, a1, b0, b1 in sm.get_opcodes():
            if opcode == 'equal':
                output.append(sm.a[a0:a1])
            elif opcode == 'insert':
                output.append("*" + sm.b[b0:b1] + "*")
            elif opcode == 'delete':
                output.append("~" + sm.a[a0:a1] + "~")
            elif opcode == 'replace':
                output.append("*" + sm.b[b0:b1] + "*")
        text = ''.join(output)

        message = revision['title'] + " *(" + str(
            revision['newlen'] - revision['oldlen']
        ) + ")* — " + text + " — <https://" + wiki + ".wikia.com/wiki/Special:Contributions/" + revision[
            'user'] + "|" + revision['user'] + ">"
        slack_client.api_call("chat.postMessage",
                              channel="recent-changes",
                              text=message,
                              as_user=True)
        #print(message)

    if 'query-continue' in decoded_json:
        return get_revisions(
            decoded_json['query-continue']['recentchanges']['rcstart'])
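
The opcode loop above renders the changed line with Slack-style markers: insertions and replacements wrapped in '*', deletions in '~'. A standalone trace of that rendering:

from difflib import SequenceMatcher

sm = SequenceMatcher(None, 'the old cat', 'the cat')
output = []
for opcode, a0, a1, b0, b1 in sm.get_opcodes():
    if opcode == 'equal':
        output.append(sm.a[a0:a1])
    elif opcode == 'insert':
        output.append("*" + sm.b[b0:b1] + "*")
    elif opcode == 'delete':
        output.append("~" + sm.a[a0:a1] + "~")
    elif opcode == 'replace':
        output.append("*" + sm.b[b0:b1] + "*")
print(''.join(output))  # -> 'the ~old ~cat'
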
Exemplo n.º 58
0
def compare_output(s1, s2):
    """ Compare stdout strings s1 and s2.
        s1 is from readelf, s2 from elftools readelf.py
        Return pair success, errmsg. If comparison succeeds, success is True
        and errmsg is empty. Otherwise success is False and errmsg holds a
        description of the mismatch.

        Note: this function contains some rather horrible hacks to ignore
        differences which are not important for the verification of pyelftools.
        This is due to some intricacies of binutils's readelf which pyelftools
        doesn't currently implement, or silly inconsistencies in the output of
        readelf, which I was reluctant to replicate.
        Read the documentation for more details.
    """
    def prepare_lines(s):
        return [line for line in s.lower().splitlines() if line.strip() != '']

    def filter_readelf_lines(lines):
        filter_out = False
        for line in lines:
            if 'of the .eh_frame section' in line:
                filter_out = True
            elif 'of the .debug_frame section' in line:
                filter_out = False
            if not filter_out:
                if not line.startswith('unknown: length'):
                    yield line

    lines1 = prepare_lines(s1)
    lines2 = prepare_lines(s2)

    lines1 = list(filter_readelf_lines(lines1))

    flag_after_symtable = False

    if len(lines1) != len(lines2):
        return False, 'Number of lines different: %s vs %s' % (len(lines1),
                                                               len(lines2))

    for i in range(len(lines1)):
        if 'symbol table' in lines1[i]:
            flag_after_symtable = True

        # Compare ignoring whitespace
        lines1_parts = lines1[i].split()
        lines2_parts = lines2[i].split()
        if ''.join(lines1_parts) != ''.join(lines2_parts):
            ok = False
            sm = SequenceMatcher()
            sm.set_seqs(lines1[i], lines2[i])
            changes = sm.get_opcodes()
            if flag_after_symtable:
                # Detect readelf's adding @ with lib and version after
                # symbol name.
                if (len(changes) == 2 and changes[1][0] == 'delete'
                        and lines1[i][changes[1][1]] == '@'):
                    ok = True
            elif 'at_const_value' in lines1[i]:
                # On 32-bit machines, readelf doesn't correctly represent
                # some boundary LEB128 numbers
                num2 = int(lines2_parts[-1])
                if num2 <= -2**31 and '32' in platform.architecture()[0]:
                    ok = True
            elif 'os/abi' in lines1[i]:
                if 'unix - gnu' in lines1[i] and 'unix - linux' in lines2[i]:
                    ok = True
            else:
                for s in ('t (tls)', 'l (large)'):
                    if s in lines1[i] or s in lines2[i]:
                        ok = True
                        break
            if not ok:
                errmsg = 'Mismatch on line #%s:\n>>%s<<\n>>%s<<\n' % (
                    i, lines1[i], lines2[i])
                return False, errmsg
    return True, ''
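
The symbol-table special case above accepts a line pair whose only difference is readelf appending "@lib@version" after a symbol name, which shows up as exactly two opcodes with the second a 'delete' starting at an '@'. A standalone check of that pattern:

from difflib import SequenceMatcher

sm = SequenceMatcher()
sm.set_seqs('printf@glibc_2.2.5', 'printf')
changes = sm.get_opcodes()
print(changes)
# -> [('equal', 0, 6, 0, 6), ('delete', 6, 18, 6, 6)]
# len(changes) == 2, changes[1][0] == 'delete', and the deleted text
# starts at '@' -- exactly the condition tested above.
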
Exemplo n.º 59
0
    
(note: "first...last" excludes the last chatacter
   eg: the last character is at "last"-1)"""
  print
  
if __name__ == "__main__":
  
  if len(argv) != 3:
    help()
    exit(1)
    
  else:
    file_a, file_b = argv[1:]
    
    a = open(file_a).read()
    b = open(file_b).read()
    
    s = SequenceMatcher(None, a, b)
    
    result = []
    
    for type, a1, a2, b1, b2 in s.get_opcodes():
      if type != 'equal':
        result.append( { 
          'type': type,
          'a': {'start': a1, 'end': a2}, #'text': a[a1:a2]},
          'b': {'start': b1, 'end': b2}, #'text': b[b1:b2]},
        } )
    
    print json.dumps(result)
  
Exemplo n.º 60
-1
def color_diff_strings(a, b):
  sm = SequenceMatcher(isjunk=lambda x: False, a=a, b=b, autojunk=False)

  outa = ''
  outb = ''

  for opcode, a1, a2, b1, b2 in sm.get_opcodes():
    if opcode == 'replace':
      outa += DELETETEXT(a[a1:a2])
      outb += ADDTEXT(b[b1:b2])

    elif opcode == 'delete':
      outa += DELETETEXT(a[a1:a2])
      outb += FILLTEXT(' ' * (a2 - a1))

    elif opcode == 'insert':
      outa += FILLTEXT(' ' * (b2 - b1))
      outb += ADDTEXT(b[b1:b2])

    elif opcode == 'equal':
      outa += a[a1:a2]
      outb += b[b1:b2]

  return (outa, outb)
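
DELETETEXT, ADDTEXT and FILLTEXT come from the surrounding project; a minimal ANSI-escape sketch with the same (hypothetical) names could look like this:

# Hypothetical helpers; the original project's versions may differ.
def DELETETEXT(s):
  return '\033[41m' + s + '\033[0m'  # red background for removed text

def ADDTEXT(s):
  return '\033[42m' + s + '\033[0m'  # green background for added text

def FILLTEXT(s):
  return '\033[2m' + s + '\033[0m'   # dim padding keeping both lines aligned

outa, outb = color_diff_strings('kitten', 'sitting')
print(outa)
print(outb)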