Example No. 1
def deindex(db, key, doc, tokens, **kwargs):
    housenumbers = doc.get('housenumbers', {})
    for hn, data in housenumbers.items():
        for token in tokens:
            k = '|'.join(['didx', hn, token])
            # Do other documents still share this housenumber and token?
            commons = db.zinterstore(k, [keys.token_key(hn),
                                         keys.token_key(token)])
            db.delete(k)
            if not commons:
                db.srem(pair_key(hn), token)
                db.srem(pair_key(token), hn)
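
These deindexers all rely on a small set of Redis key helpers (keys.token_key, pair_key, and later edge_ngram_key). A minimal sketch of what such helpers could look like, assuming two-character prefixes as implied by the k[2:] slicing in the fuzzy examples further down; the exact prefixes are an assumption, not addok's confirmed layout:

# Hypothetical key helpers, for illustration only. The prefix choices are
# assumptions; the examples only require a two-character prefix on token keys.
def token_key(token):
    return 'w|{}'.format(token)    # sorted set of documents containing token

def pair_key(token):
    return 'p|{}'.format(token)    # set of tokens co-occurring with token

def edge_ngram_key(ngram):
    return 'n|{}'.format(ngram)    # set of tokens starting with this n-gram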
Example No. 2
def deindex(db, key, doc, tokens, **kwargs):
    tokens = list(set(tokens))  # Unique values.
    for i, token in enumerate(tokens):
        for token2 in tokens[i:]:
            if token != token2:
                tmp_key = '|'.join(['didx', token, token2])
                # Do we have other documents that share token and token2?
                commons = db.zinterstore(tmp_key, [keys.token_key(token),
                                                   keys.token_key(token2)])
                db.delete(tmp_key)
                if not commons:
                    db.srem(pair_key(token), token2)
                    db.srem(pair_key(token2), token)
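
For context, a hedged sketch of the indexing side that this deindexer undoes: every unique pair of tokens from a document is cross-registered in the other token's pair set. This is an illustration of the technique (reusing the hypothetical pair_key helper sketched after Example No. 1), not addok's actual indexer:

def pairs_indexer_sketch(db, tokens):
    # Illustrative inverse of the deindexer above: register each unique
    # token pair so that pair_key(token) lists its co-occurring tokens.
    els = list(set(tokens))
    for i, el in enumerate(els):
        for el2 in els[i + 1:]:
            db.sadd(pair_key(el), el2)
            db.sadd(pair_key(el2), el)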
Example No. 3
def housenumbers_pairs_deindexer(db, key, doc, tokens, **kwargs):
    for field, value in doc.items():
        field = field.decode()
        if not field.startswith('h|'):
            continue
        number, lat, lon, *extra = value.decode().split('|')
        hn = field[2:]
        for token in tokens:
            k = '|'.join(['didx', hn, token])
            commons = db.zinterstore(k, [keys.token_key(hn),
                                         keys.token_key(token)])
            db.delete(k)
            if not commons:
                db.srem(pair_key(hn), token)
                db.srem(pair_key(token), hn)
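
A hedged illustration of the document hash shape this variant expects. The field and value layout ('h|' prefix; '|'-separated number, latitude, longitude) is inferred from the parsing above, and the concrete values are invented:

# Hypothetical Redis hash for one indexed document, as implied by the parsing
# in the deindexer above.
doc = {
    b'name': b'rue des lilas',
    b'h|24': b'24|48.8566|2.3522',        # housenumber 24 at lat|lon
    b'h|24bis': b'24bis|48.8567|2.3523',
}

for field, value in doc.items():
    field = field.decode()
    if not field.startswith('h|'):
        continue
    number, lat, lon, *extra = value.decode().split('|')
    print(field[2:], number, lat, lon)   # e.g. '24 24 48.8566 2.3522'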
Example No. 4
def pairs_deindexer(db, key, doc, tokens, **kwargs):
    els = list(set(tokens))  # Unique values.
    loop = 0
    for el in els:
        for el2 in els[loop:]:
            if el != el2:
                key = '|'.join(['didx', el, el2])
                # Do we have other documents that share el and el2?
                commons = db.zinterstore(key, [keys.token_key(el),
                                               keys.token_key(el2)])
                db.delete(key)
                if not commons:
                    db.srem(pair_key(el), el2)
                    db.srem(pair_key(el2), el)
        loop += 1
Example No. 5
def deindex(db, key, doc, tokens, **kwargs):
    els = list(set(tokens))  # Unique values.
    loop = 0
    for el in els:
        for el2 in els[loop:]:
            if el != el2:
                key = '|'.join(['didx', el, el2])
                # Do we have other documents that share el and el2?
                commons = db.zinterstore(key, [keys.token_key(el),
                                               keys.token_key(el2)])
                db.delete(key)
                if not commons:
                    db.srem(pair_key(el), el2)
                    db.srem(pair_key(el2), el)
        loop += 1
Example No. 6
def housenumbers_pairs_deindexer(db, key, doc, tokens, **kwargs):
    for field, value in doc.items():
        field = field.decode()
        if not field.startswith('h|'):
            continue
        number, lat, lon, *extra = value.decode().split('|')
        hn = field[2:]
        for token in tokens:
            k = '|'.join(['didx', hn, token])
            commons = db.zinterstore(
                k,
                [keys.token_key(hn), keys.token_key(token)])
            db.delete(k)
            if not commons:
                db.srem(pair_key(hn), token)
                db.srem(pair_key(token), hn)
Example No. 7
def try_fuzzy(helper, tokens, include_common=True):
    if not helper.bucket_dry or not tokens:
        return
    helper.debug('Fuzzy on. Trying with %s.', tokens)
    tokens.sort(key=lambda t: len(t), reverse=True)
    allkeys = helper.keys[:]
    if include_common:
        # As we are in fuzzy, try to narrow as much as possible by adding
        # unused common tokens.
        allkeys.extend(
            [t.db_key for t in helper.common if t.db_key not in helper.keys])
    for try_one in tokens:
        if helper.bucket_full:
            break
        keys = allkeys[:]
        if try_one.db_key in keys:
            keys.remove(try_one.db_key)
        if try_one.isdigit():
            continue
        helper.debug('Going fuzzy with %s and %s', try_one, keys)
        neighbors = make_fuzzy(try_one, max=helper.fuzzy)
        if len(keys):
            # Only retain tokens that have been seen in the index at least
            # once with the other tokens.
            DB.sadd(helper.pid, *neighbors)
            interkeys = [pair_key(k[2:]) for k in keys]
            interkeys.append(helper.pid)
            fuzzy_words = DB.sinter(interkeys)
            DB.delete(helper.pid)
            # Keep the priority we gave in building fuzzy terms (inversion
            # first, then substitution, etc.).
            fuzzy_words = [w.decode() for w in fuzzy_words]
            fuzzy_words.sort(key=lambda x: neighbors.index(x))
        else:
            # The token we are considering is alone.
            fuzzy_words = []
            for neighbor in neighbors:
                key = dbkeys.token_key(neighbor)
                count = DB.zcard(key)
                if count:
                    fuzzy_words.append(neighbor)
        if fuzzy_words:
            helper.debug('Found fuzzy candidates %s', fuzzy_words)
            fuzzy_keys = [dbkeys.token_key(w) for w in fuzzy_words]
            for key in fuzzy_keys:
                if helper.bucket_dry:
                    helper.add_to_bucket(keys + [key])
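
make_fuzzy itself is not shown in these examples. A rough, illustrative sketch of what such a neighbor generator could do, keeping the priority order that the comments above rely on (transpositions first, then substitutions, then additions); addok's real implementation, its alphabet and its max parameter differ in detail:

def make_fuzzy_sketch(word, alphabet='abcdefghijklmnopqrstuvwxyz'):
    # Edit-distance-1 neighbors, ordered so that neighbors.index() can be
    # used to rank candidates. Illustrative approximation only.
    neighbors = []
    for i in range(len(word) - 1):   # transpositions ("inversions") first
        neighbors.append(word[:i] + word[i + 1] + word[i] + word[i + 2:])
    for i in range(len(word)):       # then substitutions
        for c in alphabet:
            if c != word[i]:
                neighbors.append(word[:i] + c + word[i + 1:])
    for i in range(len(word) + 1):   # then additions
        for c in alphabet:
            neighbors.append(word[:i] + c + word[i:])
    seen = set()                     # deduplicate, keeping the first occurrence
    return [n for n in neighbors
            if n != word and not (n in seen or seen.add(n))]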
Example No. 8
def try_fuzzy(helper, tokens, include_common=True):
    if not helper.bucket_dry or not tokens:
        return
    helper.debug('Fuzzy on. Trying with %s.', tokens)
    tokens.sort(key=lambda t: len(t), reverse=True)
    allkeys = helper.keys[:]
    if include_common:
        # As we are in fuzzy, try to narrow as much as possible by adding
        # unused common tokens.
        common = [t for t in helper.common if t.db_key not in helper.keys]
        allkeys.extend([t.db_key for t in common])
    for try_one in tokens:
        if helper.bucket_full:
            break
        keys = allkeys[:]
        if try_one.db_key in keys:
            keys.remove(try_one.db_key)
        if try_one.isdigit():
            continue
        helper.debug('Going fuzzy with %s', try_one)
        neighbors = make_fuzzy(try_one, max=helper.fuzzy)
        if len(keys):
            # Only retain tokens that have been seen in the index at least
            # once with the other tokens.
            DB.sadd(helper.query, *neighbors)
            interkeys = [pair_key(k[2:]) for k in keys]
            interkeys.append(helper.query)
            fuzzy_words = DB.sinter(interkeys)
            DB.delete(helper.query)
            # Keep the priority we gave in building fuzzy terms (inversion
            # first, then substitution, etc.).
            fuzzy_words = [w.decode() for w in fuzzy_words]
            fuzzy_words.sort(key=lambda x: neighbors.index(x))
        else:
            # The token we are considering is alone.
            fuzzy_words = []
            for neighbor in neighbors:
                key = dbkeys.token_key(neighbor)
                count = DB.zcard(key)
                if count:
                    fuzzy_words.append(neighbor)
        helper.debug('Found fuzzy candidates %s', fuzzy_words)
        fuzzy_keys = [dbkeys.token_key(w) for w in fuzzy_words]
        for key in fuzzy_keys:
            if helper.bucket_dry:
                helper.add_to_bucket(keys + [key])
Example No. 9
def do_fuzzyindex(self, word):
    """Compute fuzzy extensions of word that exist in index.
    FUZZYINDEX lilas"""
    word = list(preprocess_query(word))[0]
    token = Token(word)
    neighbors = make_fuzzy(token)
    neighbors = [(n, DB.zcard(dbkeys.token_key(n))) for n in neighbors]
    neighbors.sort(key=lambda n: n[1], reverse=True)
    for token, freq in neighbors:
        if freq == 0:
            break
        print(white(token), blue(freq))
Example No. 10
def do_fuzzyindex(self, word):
    """Compute fuzzy extensions of word that exist in index.
    FUZZYINDEX lilas"""
    word = list(preprocess_query(word))[0]
    token = Token(word)
    token.make_fuzzy()
    neighbors = [(n, DB.zcard(dbkeys.token_key(n))) for n in token.neighbors]
    neighbors.sort(key=lambda n: n[1], reverse=True)
    for token, freq in neighbors:
        if freq == 0:
            break
        print(white(token), blue(freq))
Example No. 11
def autocomplete(helper, tokens, skip_commons=False, use_geohash=False):
    helper.debug('Autocompleting %s', helper.last_token)
    keys = [t.db_key for t in tokens if not t.is_last]
    pair_keys = [pair_key(t) for t in tokens if not t.is_last]
    key = edge_ngram_key(helper.last_token)
    autocomplete_tokens = DB.sinter(pair_keys + [key])
    helper.debug('Found tokens to autocomplete %s', autocomplete_tokens)
    for token in autocomplete_tokens:
        key = dbkeys.token_key(token.decode())
        if skip_commons\
           and token_key_frequency(key) > config.COMMON_THRESHOLD:
            helper.debug('Skip common token to autocomplete %s', key)
            continue
        if not helper.bucket_overflow or helper.last_token in helper.not_found:
            helper.debug('Trying to extend bucket. Autocomplete %s', key)
            extra_keys = [key]
            if use_geohash and helper.geohash_key:
                extra_keys.append(helper.geohash_key)
            helper.add_to_bucket(keys + extra_keys)
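
The edge_ngram_key(helper.last_token) lookup assumes that, at indexing time, every prefix of each token was registered under its n-gram key. A minimal sketch of that indexing side; the minimum prefix length and the helper names are assumptions:

def compute_edge_ngrams_sketch(token, min_length=3):
    # Prefixes of the token, shortest first, excluding the full token itself.
    return [token[:i] for i in range(min_length, len(token))]

def index_edge_ngrams_sketch(db, token):
    # Register the token under each of its prefixes so that a partial last
    # token can be autocompleted via edge_ngram_key, as queried above.
    for ngram in compute_edge_ngrams_sketch(token):
        db.sadd(edge_ngram_key(ngram), token)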
Example No. 12
def autocomplete(helper, tokens, skip_commons=False, use_geohash=False):
    helper.debug('Autocompleting %s', helper.last_token)
    # helper.last_token.autocomplete()
    keys = [t.db_key for t in tokens if not t.is_last]
    pair_keys = [pair_key(t) for t in tokens if not t.is_last]
    key = edge_ngram_key(helper.last_token)
    autocomplete_tokens = DB.sinter(pair_keys + [key])
    helper.debug('Found tokens to autocomplete %s', autocomplete_tokens)
    for token in autocomplete_tokens:
        key = dbkeys.token_key(token.decode())
        if skip_commons\
           and token_key_frequency(key) > config.COMMON_THRESHOLD:
            helper.debug('Skip common token to autocomplete %s', key)
            continue
        if not helper.bucket_overflow or helper.last_token in helper.not_found:
            helper.debug('Trying to extend bucket. Autocomplete %s', key)
            extra_keys = [key]
            if use_geohash and helper.geohash_key:
                extra_keys.append(helper.geohash_key)
            helper.add_to_bucket(keys + extra_keys)
Example No. 13
def deindex(db, key, doc, tokens, **kwargs):
    if config.INDEX_EDGE_NGRAMS:
        for token in tokens:
            tkey = dbkeys.token_key(token)
            if not DB.exists(tkey):
                deindex_edge_ngrams(token)
Example No. 14
def key(self):
    if not hasattr(self, '_key'):
        self._key = keys.token_key(self)
    return self._key
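
The hand-rolled _key attribute above is the classic lazy-caching pattern; on Python 3.8+ the same behaviour can be written with functools.cached_property. A sketch under the assumption that Token is a str subclass and that token keys use the 'w|' prefix guessed earlier:

from functools import cached_property

class Token(str):
    @cached_property
    def key(self):
        # Computed once on first access, then cached on the instance,
        # mirroring the hasattr/_key check above. The prefix is an assumption.
        return 'w|{}'.format(self)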
Example No. 16
def edge_ngram_deindexer(db, key, doc, tokens, **kwargs):
    if config.INDEX_EDGE_NGRAMS:
        for token in tokens:
            tkey = dbkeys.token_key(token)
            if not DB.exists(tkey):
                deindex_edge_ngrams(token)
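
deindex_edge_ngrams itself is not shown either. Under the same assumptions as the indexing sketch after Example No. 11, it would simply remove the token from each of its prefix sets once no document references the token any more:

def deindex_edge_ngrams_sketch(db, token):
    # Inverse of index_edge_ngrams_sketch: drop the token from every edge
    # n-gram set it was registered in.
    for ngram in compute_edge_ngrams_sketch(token):
        db.srem(edge_ngram_key(ngram), token)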