Exemplo n.º 1
0
def remove_false_positives(qstring, candidate_strings, candidate_string_attrs=None):
    """Verify candidate strings against the query string and time the work.

    If ``candidate_string_attrs`` is truthy and there are more than
    ``CAND_STRINGS_THRESHOLD`` candidates, the candidates are first pruned
    with ``ed_property_is_satisfied``. Every remaining candidate is then
    checked with ``strings_are_within_distance_K`` using
    ``K=ED_THRESHOLD + 1``.

    Args:
        qstring: The query string.
        candidate_strings: The candidate strings to verify.
        candidate_string_attrs: Optional mapping from each candidate string
            to an ``(elements, length)`` pair, used only for pre-filtering.

    Returns:
        A ``(approximate_matches, elapsed)`` tuple. ``approximate_matches``
        is the list of accepted candidates in input order; ``elapsed`` is
        the difference between two ``time()`` readings, each scaled by
        1,000,000 and rounded (microseconds if ``time`` is ``time.time``
        -- confirm against the file's imports).
    """
    def _timestamp():
        # Same scaling and rounding for both the start and end readings.
        return int(round(time() * 1000000))

    began = _timestamp()
    query_len = len(qstring)

    # Pre-filtering is only attempted when attributes were supplied and the
    # candidate set is larger than the configured threshold.
    if candidate_string_attrs and len(candidate_strings) > CAND_STRINGS_THRESHOLD:
        query_elements = get_string_elements(qstring)
        survivors = []
        for candidate in candidate_strings:
            cand_elements, cand_len = candidate_string_attrs[candidate]
            same_length = query_len == cand_len
            if ed_property_is_satisfied(query_elements, cand_elements, same_length):
                survivors.append(candidate)
        candidate_strings = survivors

    # Final verification: keep only candidates the distance check accepts.
    approximate_matches = [
        candidate
        for candidate in candidate_strings
        if strings_are_within_distance_K(
            qstring, candidate, query_len, len(candidate), K=ED_THRESHOLD + 1
        )
    ]

    return approximate_matches, _timestamp() - began
Exemplo n.º 2
0
    def _create_dense_index(strings):
        """Build a dense index over ``strings`` and register it.

        Each string is keyed by its position in ``strings`` and stored as a
        ``(string, string_elements, length)`` tuple, where ``string_elements``
        is the result of ``get_string_elements(string)``. The finished
        mapping is handed to ``set_dense_index``.

        Args:
            strings: Iterable of strings to index.
        """
        dense_index = {
            position: (string, get_string_elements(string), len(string))
            for position, string in enumerate(strings)
        }

        set_dense_index(dense_index)
        if VERBOSITY:
            # Parenthesised form prints the same text on Python 2 and 3;
            # the bare print statement is a SyntaxError on Python 3.
            print('Created dense index')