Exemplo n.º 1
0
def get_lshtein_shortlist(srcfile, tgtfiles, listsize=3):
    """Return the targets whose stripped HTML is closest to the source's.

    Closeness is ``lshtein.distance`` between ``strip_text(srcfile['html'])``
    and ``strip_text(t['html'])`` for every target ``t``.

    Args:
        srcfile: Mapping with an ``'html'`` entry.
        tgtfiles: Iterable of mappings, each with an ``'html'`` entry. It is
            not modified.
        listsize: Maximum number of targets to return.

    Returns:
        A new list holding at most ``listsize`` targets, ordered by ascending
        distance. Targets with equal distance keep their relative order from
        ``tgtfiles``. The list is shorter than ``listsize`` when fewer targets
        are available (instead of raising ``IndexError``).
    """
    # The source text is the same for every comparison, so strip it once.
    src_text = strip_text(srcfile['html'])

    # ``sorted`` evaluates the key once per element and is stable, so this
    # yields the same ranking as repeatedly extracting the first minimum while
    # computing each distance only once.
    ranked = sorted(
        tgtfiles,
        key=lambda t: lshtein.distance(src_text, strip_text(t['html'])))

    # Clamp so a negative listsize yields an empty list rather than a
    # "drop the last items" slice.
    return ranked[:max(listsize, 0)]
Exemplo n.º 2
0
def get_lshtein_shortlist(srcfile, tgtfiles, listsize=3):
    """Shortlist the targets with the smallest edit distance to the source.

    The distance used is ``lshtein.distance`` applied to
    ``strip_text(srcfile['html'])`` and ``strip_text(t['html'])`` for each
    target ``t``.

    Args:
        srcfile: Mapping with an ``'html'`` entry.
        tgtfiles: Iterable of mappings, each with an ``'html'`` entry. It is
            left untouched.
        listsize: Upper bound on the number of targets returned.

    Returns:
        A new list of up to ``listsize`` targets in ascending order of
        distance; ties keep the order they had in ``tgtfiles``. When there
        are fewer targets than ``listsize`` the list is simply shorter
        (previously this raised ``IndexError``).
    """
    # Loop-invariant: strip the source HTML a single time.
    src_text = strip_text(srcfile['html'])

    def distance_to_source(target):
        return lshtein.distance(src_text, strip_text(target['html']))

    # A stable sort with a once-per-element key reproduces the
    # "pick the first minimum, remove it, repeat" ranking without
    # recomputing distances on every pass.
    ranked = sorted(tgtfiles, key=distance_to_source)

    # A negative listsize must give an empty result, not a tail-trimmed one.
    return ranked[:max(listsize, 0)]
Exemplo n.º 3
0
def get_match(srcfile, slist, hweight, nweight, uweight, c, v):
    """Choose the best candidate in ``slist`` for ``srcfile`` and link it.

    Every candidate ``t`` gets a confidence that is the weighted sum of three
    relative similarities to the source:

    * HTML: ``(len(text) - distance) / len(text)`` on the stripped HTML,
    * numbers: share of the source's numbers that also occur in ``t``,
    * URL: ``(len(url) - distance) / len(url)`` on the URLs.

    A similarity whose denominator would be zero (empty stripped text, no
    numbers in the source, empty URL) counts as ``0.0`` instead of raising
    ``ZeroDivisionError``.

    Args:
        srcfile: Mapping with ``'html'``, ``'url'`` and ``'conf'`` entries.
        slist: Sequence of candidate mappings with ``'html'``, ``'url'``,
            ``'filename'`` and ``'match'`` entries.
        hweight: Weight of the HTML similarity.
        nweight: Weight of the number similarity.
        uweight: Weight of the URL similarity.
        c: Minimum confidence required to accept a match.
        v: When truthy, print the per-candidate scores.

    Returns:
        ``(match, confidence)`` when a candidate is accepted; in that case
        ``match['match']`` has been set to ``srcfile``. ``None`` when
        ``slist`` is empty, when the best confidence is below ``c``, or when
        the best candidate keeps its existing match.
    """
    # Guard first: indexing slist[0] on an empty list would raise IndexError.
    if not slist:
        return None

    def ratio(numerator, denominator):
        # Relative similarity; defined as 0.0 when there is nothing to
        # compare against.
        if not denominator:
            return 0.0
        return float(numerator) / denominator

    # Source-side values do not change per candidate, so compute them once.
    src_text = strip_text(srcfile['html'])
    src_numbers = set(get_numbers(srcfile['html']))
    src_url = srcfile['url']

    match = slist[0]
    bconf = 0.0
    for t in slist:
        html = lshtein.distance(src_text, strip_text(t['html']))
        html_rel = ratio(len(src_text) - html, len(src_text))

        shared_numbers = src_numbers & set(get_numbers(t['html']))
        num_rel = ratio(len(shared_numbers), len(src_numbers))

        url = lshtein.distance(src_url, t['url'])
        url_rel = ratio(len(src_url) - url, len(src_url))

        conf = url_rel * uweight + num_rel * nweight + html_rel * hweight
        if v:
            # Single-argument print calls emit the same text under Python 2
            # and Python 3 (the doubled space mirrors the original
            # comma-separated print statements).
            print("TGTFILE:  %s" % (t['filename'],))
            print("url:  %s" % (url_rel,))
            print("num:  %s" % (num_rel,))
            print("html:  %s" % (html_rel,))
            print("conf: %s" % (conf,))

        if conf > bconf:
            match = t
            bconf = conf

    if bconf < c:  # below threshold
        return None

    previous = match['match']
    if previous is not None:
        # NOTE(review): this compares srcfile['conf'] with the new confidence;
        # presumably the confidence of the existing pairing was meant --
        # confirm against the caller before changing.
        if srcfile['conf'] < bconf:  # better match: displace the old source
            previous['match'] = None
            previous['conf'] = 0.0
            match['match'] = srcfile
            return (match, bconf)
        return None  # srcfile not matched

    match['match'] = srcfile
    return (match, bconf)
Exemplo n.º 4
0
def get_match(srcfile, slist, hweight, nweight, uweight, c, v):
    """Find the highest-confidence candidate for ``srcfile`` and link it.

    The confidence of a candidate ``t`` is
    ``url_rel * uweight + num_rel * nweight + html_rel * hweight`` where

    * ``html_rel`` is ``(len(text) - distance) / len(text)`` on the stripped
      HTML of source and candidate,
    * ``num_rel`` is the fraction of the source's numbers also found in ``t``,
    * ``url_rel`` is ``(len(url) - distance) / len(url)`` on the two URLs.

    Any of these is taken as ``0.0`` when its denominator is zero (empty
    stripped text, no numbers in the source, empty URL) rather than raising
    ``ZeroDivisionError``.

    Args:
        srcfile: Mapping with ``'html'``, ``'url'`` and ``'conf'`` entries.
        slist: Sequence of candidate mappings with ``'html'``, ``'url'``,
            ``'filename'`` and ``'match'`` entries.
        hweight: Weight of the HTML similarity.
        nweight: Weight of the number similarity.
        uweight: Weight of the URL similarity.
        c: Confidence threshold below which no match is reported.
        v: When truthy, print the scores of every candidate.

    Returns:
        A ``(match, confidence)`` tuple when a candidate is accepted, with
        ``match['match']`` set to ``srcfile``. ``None`` when ``slist`` is
        empty, the best confidence is below ``c``, or the best candidate
        keeps the match it already had.
    """
    # Check for emptiness before touching slist[0]; the original indexed
    # first and raised IndexError on an empty shortlist.
    if not slist:
        return None

    def ratio(numerator, denominator):
        # Relative similarity, 0.0 when the denominator is zero.
        if not denominator:
            return 0.0
        return float(numerator) / denominator

    # Hoisted out of the loop: these depend on the source only.
    src_text = strip_text(srcfile['html'])
    src_numbers = set(get_numbers(srcfile['html']))
    src_url = srcfile['url']

    match = slist[0]
    bconf = 0.0
    for t in slist:
        html = lshtein.distance(src_text, strip_text(t['html']))
        html_rel = ratio(len(src_text) - html, len(src_text))

        shared_numbers = src_numbers & set(get_numbers(t['html']))
        num_rel = ratio(len(shared_numbers), len(src_numbers))

        url = lshtein.distance(src_url, t['url'])
        url_rel = ratio(len(src_url) - url, len(src_url))

        conf = url_rel * uweight + num_rel * nweight + html_rel * hweight
        if v:
            # One pre-formatted argument per print keeps the output identical
            # on Python 2 and Python 3 (the doubled space reproduces what the
            # comma-separated print statements emitted).
            print("TGTFILE:  %s" % (t['filename'],))
            print("url:  %s" % (url_rel,))
            print("num:  %s" % (num_rel,))
            print("html:  %s" % (html_rel,))
            print("conf: %s" % (conf,))

        if conf > bconf:
            match = t
            bconf = conf

    if bconf < c:  # below threshold
        return None

    previous = match['match']
    if previous is not None:
        # NOTE(review): srcfile['conf'] is compared here, not the confidence
        # of the pairing being displaced -- looks unintended; verify with the
        # caller before changing.
        if srcfile['conf'] < bconf:  # better match: release the old source
            previous['match'] = None
            previous['conf'] = 0.0
            match['match'] = srcfile
            return (match, bconf)
        return None  # srcfile not matched

    match['match'] = srcfile
    return (match, bconf)
Exemplo n.º 5
0
 def test_basic_distance(self):
     """Check ``lshtein.distance`` against a handful of known values."""
     # (first string, second string, expected distance)
     cases = (
         ("word", "word", 0),
         ("word", "", 4),
         ("", "word", 4),
         ("word", "word 2", 2),
         ("words", "word", 1),
         ("word", "woord", 1),
     )
     for first, second, expected in cases:
         assert lshtein.distance(first, second) == expected
 def test_basic_distance(self):
     """Check ``lshtein.distance`` against a handful of known values."""
     # NOTE(review): the comparer instance is constructed but never used; the
     # assertions below go through the module-level function. Presumably a
     # leftover -- confirm whether the instance's own method should be tested.
     lshtein.LevenshteinComparer()

     # (first string, second string, expected distance)
     cases = (
         ("word", "word", 0),
         ("word", "", 4),
         ("", "word", 4),
         ("word", "word 2", 2),
         ("words", "word", 1),
         ("word", "woord", 1),
     )
     for first, second, expected in cases:
         assert lshtein.distance(first, second) == expected