def _get_candidates(dataset):
    """Generate (normalized candidate, value id) pairs for the dataset.

    Every stored value contributes its own normalized text; when the
    dataset has link matching enabled, each static link's key is yielded
    as an additional candidate for the same value id.
    """
    eager = dataset.match_links
    for value in Value.all(dataset, eager_links=eager):
        yield normalize(value.value, dataset), value.id
        if eager:
            for link in value.links_static:
                yield normalize(link.key, dataset), value.id
def _get_candidates(dataset):
    """Generate (normalized candidate, entity id) pairs for the dataset.

    Each entity contributes its normalized name; when alias matching is
    enabled, every static alias name is yielded as an extra candidate
    mapping back to the same entity id.
    """
    eager = dataset.match_aliases
    for entity in Entity.all(dataset, eager_aliases=eager):
        yield normalize(entity.name, dataset), entity.id
        if eager:
            for alias in entity.aliases_static:
                yield normalize(alias.name, dataset), entity.id
def get_candidates(dataset):
    """Generate (normalized candidate, value) pairs for the dataset.

    A value's own normalized text is always yielded. Link-derived
    candidates are only yielded the first time that normalized form is
    seen (de-duplicated against every candidate emitted so far).
    """
    seen = set()
    for value in Value.all(dataset, eager_links=dataset.match_links):
        primary = normalize(value.value, dataset)
        seen.add(primary)
        yield primary, value
        if not dataset.match_links:
            continue
        for link in value.links_static:
            alias = normalize(link.key, dataset)
            if alias in seen:
                continue
            seen.add(alias)
            yield alias, value
def prefix_search(prefix, dataset):
    """Return (candidate, entity_id) pairs whose normalized candidate
    starts with the normalized prefix.

    At most one match is kept per entity id — the first candidate that
    matches wins, in candidate-generation order.
    """
    needle = normalize(prefix, dataset)
    seen = set()
    matches = []
    for candidate, entity_id in get_candidates(dataset):
        if not candidate.startswith(needle):
            continue
        if entity_id in seen:
            continue
        seen.add(entity_id)
        matches.append((candidate, entity_id))
    return matches
def _match(text, dataset, query=None):
    """Score every candidate in the dataset against *text*.

    Returns a list of (candidate, value, score) tuples sorted by
    descending score, keeping only the best-scoring row per value.
    An optional *query* substring (matched case-insensitively) filters
    which candidates are scored at all.
    """
    query = '' if query is None else query.strip().lower()
    text_normalized = normalize(text, dataset)
    # Fall back to levenshtein when the dataset names an unknown algorithm.
    func = ALGORITHMS.get(dataset.algorithm, levenshtein)
    matches = []
    for candidate, value in get_candidates(dataset):
        if len(query) and query not in candidate.lower():
            continue
        score = func(text_normalized, candidate)
        matches.append((candidate, value, score))
    # Index-based key: tuple-unpacking lambdas (``lambda (c,v,s): s``) were
    # removed in Python 3 (PEP 3113); this form works on both 2 and 3.
    matches = sorted(matches, key=lambda m: m[2], reverse=True)
    # Keep only the first (highest-scoring, thanks to the sort) row per
    # value. A set gives O(1) membership tests instead of the original
    # list's O(n) scans, matching the sibling ``match`` implementation.
    values = set()
    matches_uniq = []
    for candidate, value, score in matches:
        if value in values:
            continue
        values.add(value)
        matches_uniq.append((candidate, value, score))
    return matches_uniq
def match(text, dataset, query=None):
    """Score every candidate in the dataset against *text*, with timing.

    Returns a list of (candidate, value, score) tuples sorted by
    descending score, keeping only the best-scoring row per value.
    An optional *query* substring (matched case-insensitively) filters
    which candidates are scored. Logs how long matching took.
    """
    query = '' if query is None else query.strip().lower()
    text_normalized = normalize(text, dataset)
    candidates = get_candidates(dataset)
    matches = []
    begin = time.time()
    # Fall back to levenshtein when the dataset names an unknown algorithm.
    func = ALGORITHMS.get(dataset.algorithm, levenshtein)
    for candidate, value in candidates:
        if len(query) and query not in candidate.lower():
            continue
        score = func(text_normalized, candidate)
        matches.append((candidate, value, score))
    # Index-based key: tuple-unpacking lambdas (``lambda (c,v,s): s``) were
    # removed in Python 3 (PEP 3113); this form works on both 2 and 3.
    matches = sorted(matches, key=lambda m: m[2], reverse=True)
    # Keep only the first (highest-scoring) row per value.
    values = set()
    matches_uniq = []
    for candidate, value, score in matches:
        if value in values:
            continue
        values.add(value)
        matches_uniq.append((candidate, value, score))
    duration = time.time() - begin
    log.info("Matching %s candidates took: %sms", len(matches_uniq), duration*1000)
    return matches_uniq
def match(text, dataset, query=None):
    """Score every candidate in the dataset against *text*, with timing.

    Returns a list of (candidate, entity_id, score) tuples sorted by
    descending score, keeping only the best-scoring row per entity.
    An optional *query* substring (matched case-insensitively) filters
    which candidates are scored. Logs how long matching took.
    """
    # BUG FIX: the query must be lower-cased as well — it is compared
    # against ``candidate.lower()`` below, so a query containing any
    # uppercase character could never match. The sibling implementations
    # in this module already do ``.strip().lower()``.
    query = '' if query is None else query.strip().lower()
    text_normalized = normalize(text, dataset)
    candidates = get_candidates(dataset)
    matches = []
    begin = time.time()
    # Fall back to levenshtein when the dataset names an unknown algorithm.
    func = ALGORITHMS.get(dataset.algorithm, levenshtein)
    for candidate, entity_id in candidates:
        if len(query) and query not in candidate.lower():
            continue
        score = func(text_normalized, candidate)
        matches.append((candidate, entity_id, score))
    # Index-based key: tuple-unpacking lambdas (``lambda (c, e, s): s``) were
    # removed in Python 3 (PEP 3113); this form works on both 2 and 3.
    matches = sorted(matches, key=lambda m: m[2], reverse=True)
    # Keep only the first (highest-scoring) row per entity id.
    entities = set()
    matches_uniq = []
    for candidate, entity_id, score in matches:
        if entity_id in entities:
            continue
        entities.add(entity_id)
        matches_uniq.append((candidate, entity_id, score))
    duration = time.time() - begin
    log.info("Matching %s candidates took: %sms", len(matches_uniq), duration * 1000)
    return matches_uniq