def ret(term, tag):
    """Return the second value from matchstrings(term, tag), floored at 0.3.

    matchstrings() yields a pair; only its second element is used here.
    That value is returned unchanged when it is strictly greater than 0.3,
    otherwise 0 is returned.
    """
    _, score = matchstrings(term, tag)
    if score > 0.3:
        return score
    return 0
- string.lower().strip() - if string is delimiter, then return None (forced n-gram break) - remove parens, braces, brackets, colons, commas, pipes, etc. - remove ... - if string contains . inside (but not at end), then force delimiter (assumes this is a website) """ if not s: return '' s = s.lower().strip() if not isinstance(s, unicode): s = unicode(s, 'utf-8', 'ignore') for dlm in u'/ @ - | [ ] { } ; : vs. vs at versus « » › â –'.split(): #log(u'|%s| (%s) vs |%s| (%s)' % (s, type(s), dlm, type(dlm))) if s == dlm: return None s = u''.join(c for c in s if c not in u"'|()[]{},;`\"«»›–â") s = s.replace('...', '') if '.' in s and not s.endswith('.'): return None return s def inany(el, seq): """Returns the first sequence element that el is part of, else None""" for item in seq: if el in item: return item return None if __name__ == '__main__': a, b = sys.argv[1:3] print matchstrings(a, b)
- string.lower().strip() - if string is delimiter, then return None (forced n-gram break) - remove parens, braces, brackets, colons, commas, pipes, etc. - remove ... - if string contains . inside (but not at end), then force delimiter (assumes this is a website) """ if not s: return '' s = s.lower().strip() if not isinstance(s, unicode): s = unicode(s, 'utf-8', 'ignore') for dlm in u'/ @ - | [ ] { } ; : vs. vs at versus « » › â –'.split(): #log(u'|%s| (%s) vs |%s| (%s)' % (s, type(s), dlm, type(dlm))) if s == dlm: return None s = u''.join(c for c in s if c not in u"'|()[]{},;`\"«»›–â") s = s.replace('...','') if '.' in s and not s.endswith('.'): return None return s def inany(el, seq): """Returns the first sequence element that el is part of, else None""" for item in seq: if el in item: return item return None if __name__ == '__main__': a, b = sys.argv[1:3] print matchstrings(a, b)