Exemplo n.º 1
0
def get_candidate_string_ids(qstring):
    """Collect the ids of indexed strings that are candidates for `qstring`.

    The query is split into q-grams of length QGRAM_LENGTH. For every
    candidate string length within ED_THRESHOLD of the query length, the
    T-occurrence solver is asked for matching string ids, and all non-empty
    answers are concatenated in order of increasing length.

    Returns an empty list when the query yields no q-grams. The result is
    asserted to contain no repeated id.
    """
    query_qgrams = get_qgrams_from_string(qstring, QGRAM_LENGTH)
    if not query_qgrams:
        return []

    query_len = len(qstring)
    shortest = query_len - ED_THRESHOLD
    longest = query_len + ED_THRESHOLD

    collected = []
    for candidate_len in range(shortest, longest + 1):
        matches = solve_T_occurence_problem(query_len, candidate_len,
                                            query_qgrams)
        if matches:
            collected.extend(matches)

    # Sanity check: no id may be reported for more than one length.
    assert len(set(collected)) == len(collected)

    return collected
Exemplo n.º 2
0
    def _create_inverted_index(strings):
        """Build a length-partitioned q-gram inverted index and publish it.

        The index is a two-level dict::

            {string length: {qgram: set of string ids}}

        where a string's id is its position in `strings`. The finished index
        is handed to `set_inverted_index`; nothing is returned.

        strings -- iterable of strings to index; each must support `len()`
                   and be accepted by `get_qgrams_from_string`.
        """
        inverted_index = dict()

        for string_id, string in enumerate(strings):
            # The per-length bucket is created even when the string yields
            # no q-grams, so every seen length has an entry in the index.
            qgram_to_ids = inverted_index.setdefault(len(string), dict())

            for qgram in get_qgrams_from_string(string, QGRAM_LENGTH):
                qgram_to_ids.setdefault(qgram, set()).add(string_id)

        set_inverted_index(inverted_index)
        if VERBOSITY:
            # Parenthesised single-argument form prints the same text on
            # both Python 2 and Python 3.
            print('Created inverted index')