예제 #1
0
def prefix_search(prefix, dataset):
    """Return one prefix-matching candidate per entity.

    The prefix is passed through ``normalize(prefix, dataset)`` and every
    ``(candidate, entity_id)`` pair produced by ``get_candidates(dataset)``
    is tested with ``candidate.startswith(...)``.

    Args:
        prefix: Raw prefix to look for.
        dataset: Dataset handed to ``normalize`` and ``get_candidates``.

    Returns:
        A list of ``(candidate, entity_id)`` tuples in candidate order,
        containing only the first matching candidate for each entity id.
    """
    needle = normalize(prefix, dataset)
    seen_ids = set()
    hits = []
    for name, ident in get_candidates(dataset):
        if not name.startswith(needle):
            continue
        # An earlier candidate already represents this entity.
        if ident in seen_ids:
            continue
        seen_ids.add(ident)
        hits.append((name, ident))
    return hits
예제 #2
0
def prefix_search(prefix, dataset):
    """Collect candidates that start with the normalized prefix.

    Args:
        prefix: Raw prefix; it is run through ``normalize(prefix, dataset)``
            before being compared.
        dataset: Dataset whose candidates come from
            ``get_candidates(dataset)`` as ``(candidate, entity_id)`` pairs.

    Returns:
        A list of ``(candidate, entity_id)`` tuples. Each entity id appears
        at most once, represented by its first matching candidate, and the
        order of the candidate source is kept.
    """
    wanted = normalize(prefix, dataset)
    pool = get_candidates(dataset)
    claimed = set()
    results = []
    for pair in pool:
        label, owner = pair
        # Keep only the first matching candidate seen for each entity.
        if label.startswith(wanted) and owner not in claimed:
            claimed.add(owner)
            results.append((label, owner))
    return results
예제 #3
0
def match(text, dataset, query=None):
    """Score the dataset's candidates against ``text``, best hit per entity.

    ``text`` is passed through ``normalize(text, dataset)`` and scored
    against every ``(candidate, entity_id)`` pair from
    ``get_candidates(dataset)``. The scoring function is looked up in
    ``ALGORITHMS`` by ``dataset.algorithm``, falling back to
    ``levenshtein``.

    Args:
        text: Raw text to match.
        dataset: Dataset providing the candidates and the ``algorithm`` name.
        query: Optional substring filter. When non-empty after stripping,
            only candidates whose lower-cased form contains the lower-cased
            query are scored.

    Returns:
        A list of ``(candidate, entity_id, score)`` tuples sorted by score,
        highest first, holding only the top-scoring candidate per entity id.
    """
    # The filter is compared against candidate.lower(), so lower-case it too;
    # otherwise any query containing an uppercase letter rejects everything.
    query = '' if query is None else query.strip().lower()
    text_normalized = normalize(text, dataset)
    candidates = get_candidates(dataset)
    begin = time.time()
    func = ALGORITHMS.get(dataset.algorithm, levenshtein)
    matches = []
    for candidate, entity_id in candidates:
        if query and query not in candidate.lower():
            continue
        score = func(text_normalized, candidate)
        matches.append((candidate, entity_id, score))
    # Index into the tuple rather than unpacking it in the lambda signature:
    # tuple parameters are a SyntaxError on Python 3. The sort is stable, so
    # equal scores keep their candidate order.
    matches.sort(key=lambda m: m[2], reverse=True)
    entities = set()
    matches_uniq = []
    for candidate, entity_id, score in matches:
        # Already sorted best-first, so the first hit per entity is its best.
        if entity_id in entities:
            continue
        entities.add(entity_id)
        matches_uniq.append((candidate, entity_id, score))
    duration = time.time() - begin
    log.info("Matching %s candidates took: %sms",
            len(matches_uniq), duration*1000)
    return matches_uniq
예제 #4
0
def match(text, dataset, query=None):
    """Rank dataset candidates by similarity to ``text``, one per entity.

    The text is passed through ``normalize(text, dataset)``. Each
    ``(candidate, entity_id)`` pair from ``get_candidates(dataset)`` is
    scored with the function registered in ``ALGORITHMS`` under
    ``dataset.algorithm`` (``levenshtein`` when there is no such entry).

    Args:
        text: Raw text to match.
        dataset: Dataset supplying the candidates and the ``algorithm`` name.
        query: Optional substring filter, matched case-insensitively against
            each candidate. ``None`` or blank disables filtering.

    Returns:
        A list of ``(candidate, entity_id, score)`` tuples in descending
        score order, keeping only the highest-scoring candidate for each
        entity id.
    """
    # Candidates are lower-cased before the containment test, so the query
    # must be lower-cased as well or mixed-case queries can never match.
    query = '' if query is None else query.strip().lower()
    text_normalized = normalize(text, dataset)
    candidates = get_candidates(dataset)
    matches = []
    begin = time.time()
    func = ALGORITHMS.get(dataset.algorithm, levenshtein)
    for candidate, entity_id in candidates:
        if query and query not in candidate.lower():
            continue
        score = func(text_normalized, candidate)
        matches.append((candidate, entity_id, score))
    # Tuple-unpacking lambda parameters were removed in Python 3, so index
    # the score instead. sorted() is stable: ties keep candidate order.
    matches = sorted(matches, key=lambda item: item[2], reverse=True)
    entities = set()
    matches_uniq = []
    for c, e, s in matches:
        # The list is best-first; later hits for the same entity are worse.
        if e in entities:
            continue
        entities.add(e)
        matches_uniq.append((c, e, s))
    duration = time.time() - begin
    log.info("Matching %s candidates took: %sms", len(matches_uniq),
             duration * 1000)
    return matches_uniq