示例#1
0
def fuzzymatch(string1):
    """Return every entity whose label fuzzy-matches ``string1``.

    Each ``Entity`` yielded by ``Session.query(Entity)`` is compared with
    ``string1`` by running the PHP call ``fuzzy_match($string1, $string2, 2)``
    after requiring ``fuzzymatch.php``.  An entity is kept when the score
    printed by PHP is at least 0.5; the raw PHP output is stored on
    ``entity.matchvalue`` before the entity is appended to the result.

    Note: fuzzymatch.php must be in the PHP include path, e.g. /usr/lib/php/.

    :param string1: text to look for; it is UTF-8 encoded before being
        embedded in the PHP snippet.
    :returns: list of the matching ``Entity`` objects.
    :raises ValueError: if the PHP output does not start with a number.
    """
    # TODO: put in a cron job that runs every half hour for new entries?

    def php_string(text):
        # Render ``text`` as a UTF-8 encoded PHP double-quoted literal.
        # Without escaping, a '"' in the text ends the literal early and a
        # '$' starts PHP variable interpolation.
        encoded = text.encode('utf8')
        # Backslash goes first so the escapes added for '"' and '$' are not
        # themselves doubled.
        for special in ('\\', '"', '$'):
            encoded = encoded.replace(special, '\\' + special)
        return '"' + encoded + '"'

    # The search text is the same for every entity, so build its part of
    # the PHP snippet once.
    needle_code = '$string1 = utf8_decode(' + php_string(string1) + ');'

    matches = []
    for entity in Session.query(Entity):
        # NOTE(review): a PHP object is still created per entity, as in the
        # original; hoist it only if PHP objects are known to be reusable.
        php = PHP("require 'fuzzymatch.php';")

        code = needle_code
        code = code + '$string2 = utf8_decode(' + php_string(entity.label) + ');'
        code = code + "print fuzzy_match($string1, $string2, 2);"

        verdict = php.get_raw(code)

        # fuzzymatchall in this file parses the same call's output as
        # "strength,edits"; reading only the leading field accepts both a
        # bare number and that comma-separated form.
        if float(verdict.split(',')[0]) >= .5:
            entity.matchvalue = verdict
            matches.append(entity)

    return matches
示例#2
0
def fuzzymatch(string1):
    """Collect the entities whose label is a fuzzy match for ``string1``.

    For every ``Entity`` from ``Session.query(Entity)`` a small PHP snippet
    is generated that requires ``fuzzymatch.php`` and prints
    ``fuzzy_match($string1, $string2, 2)``, where ``$string2`` is the
    entity label.  Entities scoring 0.5 or higher are returned, each with
    the raw PHP output stored in ``entity.matchvalue``.

    Note: fuzzymatch.php must be in the PHP include path, e.g. /usr/lib/php/.

    :param string1: text to match against the entity labels (UTF-8 encoded
        before it is handed to PHP).
    :returns: list of ``Entity`` objects that matched.
    :raises ValueError: if PHP's output does not begin with a number.
    """
    # TODO: put in a cron job that runs every half hour for new entries?

    def quote_for_php(text):
        # Produce a PHP double-quoted literal from ``text``.  The three
        # characters PHP treats specially there are escaped: backslash
        # first (so later escapes are not doubled), then '"' which would
        # close the literal, then '$' which would start interpolation.
        raw = text.encode('utf8')
        raw = raw.replace('\\', '\\\\')
        raw = raw.replace('"', '\\"')
        raw = raw.replace('$', '\\$')
        return '"' + raw + '"'

    # Loop-invariant: the search string's assignment never changes.
    first_assignment = '$string1 = utf8_decode(' + quote_for_php(string1) + ');'

    matches = []

    for entity in Session.query(Entity):
        # NOTE(review): one PHP object per entity is kept from the original;
        # whether the object can be reused is not visible from here.
        php = PHP("require 'fuzzymatch.php';")

        code = ''.join([
            first_assignment,
            '$string2 = utf8_decode(', quote_for_php(entity.label), ');',
            'print fuzzy_match($string1, $string2, 2);',
        ])
        verdict = php.get_raw(code)

        # Only the field before the first comma is treated as the score:
        # fuzzymatchall in this file reads the same PHP output as
        # "strength,edits".  Output without a comma parses as before.
        score = float(verdict.split(',')[0])
        if score >= .5:
            entity.matchvalue = verdict
            matches.append(entity)

    return matches
示例#3
0
def fuzzymatchtest(string1, string2):
    """Return the raw output of PHP ``fuzzy_match`` for two strings.

    Builds a PHP snippet that assigns both arguments to single-quoted PHP
    strings and prints ``fuzzy_match($string1, $string2, 2)``, then runs it
    through a ``PHP`` object that requires ``fuzzymatch.php``.

    Note: fuzzymatch.php must be in the PHP include path, e.g. /usr/lib/php/.

    :param string1: first string to compare.
    :param string2: second string to compare.
    :returns: whatever ``php.get_raw`` returns for the generated snippet.
    """
    def php_literal(text):
        # Unicode input is converted to Latin-1 bytes (unencodable
        # characters are replaced).  Doing this for both arguments avoids
        # mixing unicode with non-ASCII byte strings, which raises
        # UnicodeDecodeError on concatenation in Python 2.  Byte strings
        # are passed through unchanged.
        if isinstance(text, unicode):
            text = text.encode('latin-1', 'replace')
        # In a PHP single-quoted literal only backslash and the single
        # quote need escaping; backslash first so escapes are not doubled.
        text = text.replace('\\', '\\\\').replace("'", "\\'")
        return "'" + text + "'"

    php = PHP("require 'fuzzymatch.php';")

    code = "$string1 = " + php_literal(string1) + ";"
    code = code + "$string2 = " + php_literal(string2) + ";"
    code = code + "print fuzzy_match($string1, $string2, 2);"

    return php.get_raw(code)
示例#4
0
def fuzzymatchtest(string1, string2):
    """Compare two strings with PHP ``fuzzy_match`` and return its output.

    Both arguments are embedded as single-quoted PHP strings in a snippet
    that prints ``fuzzy_match($string1, $string2, 2)``; the snippet is run
    by a ``PHP`` object created with ``require 'fuzzymatch.php';``.

    Note: fuzzymatch.php must be in the PHP include path, e.g. /usr/lib/php/.

    :param string1: first string to compare.
    :param string2: second string to compare.
    :returns: the raw result of ``php.get_raw`` for the snippet.
    """
    def single_quoted(value):
        # Encode unicode to Latin-1 (with replacement) so the snippet is
        # made of byte strings only; otherwise a unicode first argument
        # combined with a non-ASCII second argument fails with
        # UnicodeDecodeError in Python 2.  Byte strings are left alone.
        if isinstance(value, unicode):
            value = value.encode('latin-1', 'replace')
        # Escape the two characters that are special inside PHP single
        # quotes.  The backslash is handled first so that the backslash
        # added in front of each quote is not escaped again.
        for special in ('\\', "'"):
            value = value.replace(special, '\\' + special)
        return "'" + value + "'"

    snippet = ''.join([
        "$string1 = ", single_quoted(string1), ";",
        "$string2 = ", single_quoted(string2), ";",
        "print fuzzy_match($string1, $string2, 2);",
    ])

    php = PHP("require 'fuzzymatch.php';")
    return php.get_raw(snippet)
示例#5
0
def fuzzymatchall(SEPEntrieslist):
    """Rebuild the fuzzy-match records for the given SEP entries.

    All existing ``Fuzzymatch`` rows are deleted first so that old matches
    do not accumulate.  Then, for each entry, its title is compared with
    the label of every ``Entity`` whose ``typeID`` is neither 2 nor 4
    (journals and nodes, according to the original comment) by running the
    PHP call ``fuzzy_match($string1, $string2, 2)``.  The PHP output is
    split on commas; when the first field is at least 0.20 a ``Fuzzymatch``
    is created with that field as ``strength`` and the second field as
    ``edits``, and appended to ``SEPEntry.fmatches``.  The session is
    flushed and committed after each entry.

    TODO (from the original): only update entries that do not already
    have fuzzy matches.

    :param SEPEntrieslist: iterable of entry objects providing ``title``,
        ``sep_dir`` and ``fmatches`` (described in the original comment as
        the output of ``addlist()``).
    :raises ValueError: if the first field of the PHP output is not a
        number.
    :raises IndexError: if a qualifying PHP output has no second
        comma-separated field.
    """
    def php_string(text):
        # Render ``text`` as a UTF-8 encoded PHP double-quoted literal.
        # Without escaping, a '"' in a title or label ends the literal
        # early and a '$' starts PHP variable interpolation.
        encoded = text.encode('utf8')
        # Backslash first so the escapes added afterwards are not doubled.
        for special in ('\\', '"', '$'):
            encoded = encoded.replace(special, '\\' + special)
        return '"' + encoded + '"'

    # Clear out the fuzzymatch table, otherwise old matches accumulate.
    Session.query(Fuzzymatch).delete()
    Session.flush()
    Session.commit()

    for SEPEntry in SEPEntrieslist:
        # Parenthesised so the line is valid with or without the print
        # statement; the output is unchanged.
        print("working on " + SEPEntry.title.encode('utf-8') + "\n")

        # The title is the same for every entity, so build its part of
        # the PHP snippet once per entry.
        title_code = '$string1 = utf8_decode(' + php_string(SEPEntry.title) + ');'

        # Exclude journals and nodes from fuzzy matching.
        entities = Session.query(Entity)
        entities = entities.filter(Entity.typeID != 2)
        entities = entities.filter(Entity.typeID != 4)

        for entity in entities:
            # NOTE(review): the original first built
            # PHP("set_include_path('/usr/lib/php/');") and immediately
            # overwrote it, so the include path was never set.  The dead
            # object is dropped here and the effective prefix is kept;
            # confirm whether set_include_path was meant to be applied.
            php = PHP("require 'fuzzymatch.php';")

            code = title_code
            code = code + '$string2 = utf8_decode(' + php_string(entity.label) + ');'
            code = code + "print fuzzy_match($string1, $string2, 2);"

            # The output is read as "strength,edits".
            verdict = php.get_raw(code).split(',')

            if float(verdict[0]) >= .20:
                fmatch = Fuzzymatch(entity.ID)
                fmatch.sep_dir = SEPEntry.sep_dir
                fmatch.strength = verdict[0]
                fmatch.edits = verdict[1]

                SEPEntry.fmatches.append(fmatch)

        Session.flush()
        Session.commit()
示例#6
0
def fuzzymatchall(SEPEntrieslist):
    """Recompute and store fuzzy matches for each entry in the list.

    The ``Fuzzymatch`` table is emptied up front so stale matches do not
    pile up.  Every entry title is then scored against the label of each
    ``Entity`` with ``typeID`` other than 2 and 4 (journals and nodes, per
    the original comment) using the PHP call
    ``fuzzy_match($string1, $string2, 2)``.  PHP's output is split on
    commas: a first field of 0.20 or more produces a ``Fuzzymatch`` whose
    ``strength`` is that field and whose ``edits`` is the second field,
    appended to ``SEPEntry.fmatches``.  Changes are flushed and committed
    once per entry.

    TODO (from the original): only update entries that do not already
    have fuzzy matches.

    :param SEPEntrieslist: iterable of entry objects with ``title``,
        ``sep_dir`` and ``fmatches`` attributes (the original comment says
        these come from ``addlist()``).
    :raises ValueError: if the leading field of the PHP output is not
        numeric.
    :raises IndexError: if a qualifying PHP output lacks a second
        comma-separated field.
    """
    def quote_for_php(text):
        # Produce a PHP double-quoted literal from ``text``.  Backslash is
        # escaped first (so later escapes are not doubled), then '"' which
        # would close the literal, then '$' which would trigger variable
        # interpolation.
        raw = text.encode('utf8')
        raw = raw.replace('\\', '\\\\')
        raw = raw.replace('"', '\\"')
        raw = raw.replace('$', '\\$')
        return '"' + raw + '"'

    # Clear out the fuzzymatch table, otherwise old matches accumulate.
    delquery = Session.query(Fuzzymatch)
    delquery.delete()
    Session.flush()
    Session.commit()

    for SEPEntry in SEPEntrieslist:
        # Parentheses keep this line valid with or without the print
        # statement while printing exactly the same text.
        print("working on " + SEPEntry.title.encode('utf-8') + "\n")

        # Loop-invariant for the inner loop: the title assignment.
        first_assignment = ''.join(
            ['$string1 = utf8_decode(', quote_for_php(SEPEntry.title), ');'])

        # Exclude journals and nodes from fuzzy matching.
        entities = (Session.query(Entity)
                    .filter(Entity.typeID != 2)
                    .filter(Entity.typeID != 4))

        for entity in entities:
            # NOTE(review): the original created
            # PHP("set_include_path('/usr/lib/php/');") and then replaced
            # it on the next line, so that prefix never ran.  Only the
            # effective object is built here; confirm whether the include
            # path was supposed to be set as well.
            php = PHP("require 'fuzzymatch.php';")

            code = ''.join([
                first_assignment,
                '$string2 = utf8_decode(', quote_for_php(entity.label), ');',
                'print fuzzy_match($string1, $string2, 2);',
            ])

            # Output is read as "strength,edits".
            fields = php.get_raw(code).split(',')
            if float(fields[0]) < .20:
                continue

            fmatch = Fuzzymatch(entity.ID)
            fmatch.sep_dir = SEPEntry.sep_dir
            fmatch.strength = fields[0]
            fmatch.edits = fields[1]
            SEPEntry.fmatches.append(fmatch)

        Session.flush()
        Session.commit()