Пример #1
0
 def ret(term, tag):
     """Return the thresholded second result of ``matchstrings(term, tag)``.

     ``matchstrings`` is expected to return a pair; only its second value
     (presumably a similarity score -- confirm against matchstrings) is used.
     That value is returned unchanged when it is strictly greater than 0.3,
     otherwise 0 is returned.
     """
     _ignored, score = matchstrings(term, tag)
     if score > 0.3:
         return score
     return 0
Пример #2
0
 def ret(term, tag):
     """Return the second value of ``matchstrings(term, tag)`` if above 0.3.

     The first value of the returned pair is discarded.  The second value
     (presumably a match score -- confirm against matchstrings) is passed
     through when it exceeds 0.3; anything at or below that cutoff yields 0.
     """
     _first, second = matchstrings(term, tag)
     above_cutoff = second > 0.3
     if not above_cutoff:
         return 0
     return second
Пример #3
0
        - string.lower().strip()
        - if string is delimiter, then return None (forced n-gram break)
        - remove parens, braces, brackets, colons, commas, pipes, etc.
        - remove ...
        - if string contains . inside (but not at end), then force delimiter (assumes this is a website)
    """
    if not s: return ''
    s = s.lower().strip()
    if not isinstance(s, unicode):
        s = unicode(s, 'utf-8', 'ignore')
    for dlm in u'/ @ - | [ ] { } ; : vs. vs at versus « » › â –'.split():
        #log(u'|%s| (%s) vs |%s| (%s)' % (s, type(s), dlm, type(dlm)))
        if s == dlm: return None
    s = u''.join(c for c in s if c not in u"'|()[]{},;`\"«»›–â")
    s = s.replace('...', '')
    if '.' in s and not s.endswith('.'): return None
    return s


def inany(el, seq):
    """Return the first item of ``seq`` that contains ``el``, else None.

    Containment is tested with the ``in`` operator against each item in
    iteration order, so for strings this is a substring check and for
    lists/sets a membership check.
    """
    containing = (candidate for candidate in seq if el in candidate)
    # next() with a default gives None when no item contains el.
    return next(containing, None)


if __name__ == '__main__':
    # Command-line entry point: run matchstrings() on the first two
    # command-line arguments and print whatever it returns.
    if len(sys.argv) < 3:
        # Unpacking a too-short slice would otherwise abort with an opaque
        # "values to unpack" ValueError; give a usage hint instead.
        sys.stderr.write('usage: %s STRING1 STRING2\n' % sys.argv[0])
        sys.exit(2)
    a, b = sys.argv[1:3]
    # The parenthesised form is accepted both as the Python 2 print
    # statement and as the Python 3 print function, and prints the same
    # text for a single value.
    print(matchstrings(a, b))
Пример #4
0
        - string.lower().strip()
        - if string is delimiter, then return None (forced n-gram break)
        - remove parens, braces, brackets, colons, commas, pipes, etc.
        - remove ...
        - if string contains . inside (but not at end), then force delimiter (assumes this is a website)
    """
    if not s: return ''
    s = s.lower().strip()
    if not isinstance(s, unicode):
        s = unicode(s, 'utf-8', 'ignore')
    for dlm in u'/ @ - | [ ] { } ; : vs. vs at versus « » › â –'.split():
        #log(u'|%s| (%s) vs |%s| (%s)' % (s, type(s), dlm, type(dlm)))
        if s == dlm: return None
    s = u''.join(c for c in s if c not in u"'|()[]{},;`\"«»›–â")
    s = s.replace('...','')
    if '.' in s and not s.endswith('.'): return None
    return s

def inany(el, seq):
    """Find the first element of ``seq`` that ``el`` is contained in.

    Each element is checked in order with ``el in element``; the first one
    for which that holds is returned.  If none matches (or ``seq`` is
    empty), the result is None.
    """
    found = None
    for candidate in seq:
        if el not in candidate:
            continue
        found = candidate
        break
    return found



if __name__ == '__main__':
    # Command-line entry point: run matchstrings() on the first two
    # command-line arguments and print whatever it returns.
    if len(sys.argv) < 3:
        # Unpacking a too-short slice would otherwise abort with an opaque
        # "values to unpack" ValueError; give a usage hint instead.
        sys.stderr.write('usage: %s STRING1 STRING2\n' % sys.argv[0])
        sys.exit(2)
    a, b = sys.argv[1:3]
    # The parenthesised form is accepted both as the Python 2 print
    # statement and as the Python 3 print function, and prints the same
    # text for a single value.
    print(matchstrings(a, b))