示例#1
0
def getTagMatchFunc(matchtype, tags=None, layerkey=None):
    """Returns a function that takes in (queryterm, tag) and returns a score.
    The different matchtypes are:
           'exact': term exactly matches a tag
         'hasword': term is contained within a tag (respecting words)
        'contains': non-empty term is contained within a tag, or the tag is
                    contained within the term
       'unordered': term matches tag (in any order); scores <= 0.3 become 0
          'starts': term starts tag
            'ends': term ends tag
            'typo': term is misspelled version of a tag; scores <= 0.7 become 0
            'path': term is close to another (by path similarity in wordnet);
                    scores <= 0.3 become 0
         'synonym': term is a synonym of a tag (not implemented: always 0)
        'constant': returns 1
    You can optionally pass in a 'layerkey' as a unique id. This can be used for caches, etc.
    Neither 'tags' nor 'layerkey' is read by this function itself.

    Raises NotImplementedError if matchtype is not one of the names above.
    """
    if matchtype == 'constant':
        ret = lambda term, tag: 1
    elif matchtype == 'exact':
        ret = lambda term, tag: 1 if term == tag else 0
    elif matchtype == 'hasword':
        ret = hasWord
    elif matchtype == 'contains':
        # Containment is checked in both directions; an empty term never matches.
        ret = lambda term, tag: 1 if term and (term in tag or tag in term) else 0
    elif matchtype == 'unordered':
        # Imported only here so the other match types work without nkpylib.
        from nkpylib.nknlp import matchstrings

        def ret(term, tag):
            matches, s = matchstrings(term, tag)
            return s if s > 0.3 else 0

    elif matchtype == 'starts':
        ret = lambda term, tag: 1 if tag.startswith(term) else 0
    elif matchtype == 'ends':
        ret = lambda term, tag: 1 if tag.endswith(term) else 0
    elif matchtype == 'typo':
        def ret(term, tag):
            # Compute the similarity once instead of once for the test and
            # again for the returned value.
            score = strsim(term, tag)
            return score if score > 0.7 else 0

    elif matchtype == 'path':
        def ret(term, tag):
            # Same single-evaluation pattern as 'typo'.
            score = wordnetsim(term, tag)
            return score if score > 0.3 else 0

    elif matchtype == 'synonym':
        # TODO: synonym matching is not implemented yet, so nothing matches.
        ret = lambda term, tag: 0
    else:
        raise NotImplementedError('unknown matchtype: %r' % (matchtype,))
    return ret
示例#2
0
def getTagMatchFunc(matchtype, tags=None, layerkey=None):
    """Returns a function that takes in (queryterm, tag) and returns a score.
    The different matchtypes are:
           'exact': term exactly matches a tag
         'hasword': term is contained within a tag (respecting words)
        'contains': non-empty term is contained within a tag, or the tag is
                    contained within the term
       'unordered': term matches tag (in any order); scores <= 0.3 become 0
          'starts': term starts tag
            'ends': term ends tag
            'typo': term is misspelled version of a tag; scores <= 0.7 become 0
            'path': term is close to another (by path similarity in wordnet);
                    scores <= 0.3 become 0
         'synonym': term is a synonym of a tag (not implemented: always 0)
        'constant': returns 1
    You can optionally pass in a 'layerkey' as a unique id. This can be used for caches, etc.
    Neither 'tags' nor 'layerkey' is read by this function itself.

    Raises NotImplementedError if matchtype is not one of the names above.
    """

    def thresholded(scorer, cutoff):
        """Wraps scorer so that scores at or below cutoff are reported as 0."""

        def match(term, tag):
            # The underlying scorer runs exactly once per call.
            score = scorer(term, tag)
            return score if score > cutoff else 0

        return match

    if matchtype == 'constant':
        return lambda term, tag: 1
    if matchtype == 'exact':
        return lambda term, tag: 1 if term == tag else 0
    if matchtype == 'hasword':
        return hasWord
    if matchtype == 'contains':
        # Containment is checked in both directions; an empty term never matches.
        return lambda term, tag: 1 if term and (term in tag or tag in term) else 0
    if matchtype == 'unordered':
        # Imported only here so the other match types work without nkpylib.
        from nkpylib.nknlp import matchstrings
        # matchstrings returns (matches, score); only the score is used.
        return thresholded(lambda term, tag: matchstrings(term, tag)[1], 0.3)
    if matchtype == 'starts':
        return lambda term, tag: 1 if tag.startswith(term) else 0
    if matchtype == 'ends':
        return lambda term, tag: 1 if tag.endswith(term) else 0
    if matchtype == 'typo':
        # The lambdas keep strsim/wordnetsim lookups deferred until match time.
        return thresholded(lambda term, tag: strsim(term, tag), 0.7)
    if matchtype == 'path':
        return thresholded(lambda term, tag: wordnetsim(term, tag), 0.3)
    if matchtype == 'synonym':
        # TODO: synonym matching is not implemented yet, so nothing matches.
        return lambda term, tag: 0
    raise NotImplementedError('unknown matchtype: %r' % (matchtype,))
示例#3
0
def matchstrings(a, b):
    """Matches strings in a very loose way.

    Each string is passed through utf(), stripped and lowercased, split on
    runs of non-word characters, and filtered to drop empty pieces and
    anything found in STOP_WORDS (defined outside this function). Words are
    then paired greedily: the pair with the highest strsim() score is taken
    first, both of its words are excluded from later pairs, and this repeats
    until no remaining pair has a score above 0.

    Returns (matches, score), where matches is a list of pairs of matching
    words from a and b, and score is the mean strsim() score of those pairs
    as a plain float (higher is better). If either string yields no words,
    or no pair scores above 0, the result is ([], 0.0).
    """
    import re
    from nkutils import utf, strsim
    import numpy as np
    # NOTE(review): this changes numpy's process-wide print options on every
    # call; kept so existing behavior is unchanged, but it looks like a
    # debugging leftover -- confirm and remove.
    np.set_printoptions(precision=2, linewidth=200, suppress=1)

    def norm(s):
        """Normalizes a string"""
        return utf(s).strip().lower()

    def split(s):
        """Splits a string into components, quite aggressively"""
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        els = [norm(e) for e in re.split(r'\W+', s)]
        return [e for e in els if e and e not in STOP_WORDS]

    els1 = split(norm(a))
    els2 = split(norm(b))
    matches = []
    best = 0.0
    if not els1 or not els2:
        return (matches, best)
    # m[i, j] holds the similarity between word i of a and word j of b.
    m = np.zeros((len(els1), len(els2)))
    for i, e1 in enumerate(els1):
        for j, e2 in enumerate(els2):
            m[i, j] = strsim(e1, e2)
    while True:
        # Scalar indices (not 1-element arrays) keep the score a plain float.
        i, j = np.unravel_index(np.argmax(m), m.shape)
        s = float(m[i, j])
        if s <= 0:
            break
        matches.append((els1[i], els2[j]))
        best += s
        # Knock out the used row and column so neither word is paired again.
        m[i, :] = -1
        m[:, j] = -1
    if matches:
        best /= float(len(matches))
    return (matches, best)
示例#4
0
def matchstrings(a, b):
    """Matches strings in a very loose way.

    Both inputs are normalized (utf(), strip, lowercase), broken into words
    on runs of non-word characters, and cleaned of empty pieces and entries
    of STOP_WORDS (a name defined outside this function). A similarity
    matrix is filled with strsim() for every word pair, and pairs are then
    chosen greedily from the highest score down, using each word at most
    once, until the best remaining score is 0 or less.

    Returns (matches, score), where matches is a list of pairs of matching
    words from a and b, and score is the average strsim() score over the
    chosen pairs as a plain float (higher is better). When either input has
    no usable words, or nothing scores above 0, returns ([], 0.0).
    """
    import re
    from nkutils import utf, strsim
    import numpy as np
    # NOTE(review): mutates numpy's global print options each call; retained
    # for backward compatibility, but presumably only served the removed
    # debug prints -- confirm before deleting.
    np.set_printoptions(precision=2, linewidth=200, suppress=1)

    def norm(s):
        """Normalizes a string"""
        return utf(s).strip().lower()

    def split(s):
        """Splits a string into components, quite aggressively"""
        # r'\W+' avoids the invalid-escape warning of the non-raw literal.
        pieces = (norm(piece) for piece in re.split(r'\W+', s))
        return [piece for piece in pieces if piece and piece not in STOP_WORDS]

    words_a = split(norm(a))
    words_b = split(norm(b))
    matches = []
    total = 0.0
    if not words_a or not words_b:
        return (matches, total)

    # sims[row, col]: similarity of words_a[row] to words_b[col].
    sims = np.zeros((len(words_a), len(words_b)))
    for row, word_a in enumerate(words_a):
        for col, word_b in enumerate(words_b):
            sims[row, col] = strsim(word_a, word_b)

    while True:
        # Passing a scalar to unravel_index yields scalar indices, so the
        # accumulated score stays a float rather than a 1-element array.
        row, col = np.unravel_index(np.argmax(sims), sims.shape)
        score = float(sims[row, col])
        if score <= 0:
            break
        matches.append((words_a[row], words_b[col]))
        total += score
        # Mark both words as consumed.
        sims[row, :] = -1
        sims[:, col] = -1

    if matches:
        total /= float(len(matches))
    return (matches, total)