Example #1
def compute_name_frequencies():
    """Compute a numeric distribution of name frequencies."""
    # Count how often each name part (i.e. token) shows up across
    # the whole of the dataset or a sample.
    pipe = kv.pipeline(transaction=False)
    pipe.delete(TOKEN_KEY)
    names_count = 0
    for idx, token in enumerate(iter_tokens()):
        pipe.hincrby(TOKEN_KEY, token, 1)
        names_count += 1
        if idx > 0 and idx % 10000 == 0:
            pipe.execute()
            pipe = kv.pipeline(transaction=False)
    pipe.execute()
    log.info("Names: %d", names_count)

    total = 0
    distinct = 0
    max_count = 0
    for _, count in kv.hscan_iter(TOKEN_KEY):
        count = int(count)
        # Leave out one-off names; they skew the distribution and
        # aren't useful in any way.
        if count == 1:
            continue
        distinct += 1
        total += count
        max_count = max(count, max_count)

    log.info("Total: %d, distinct: %d, max: %d", total, distinct, max_count)
    pipe.set(MAX_KEY, max_count)
    pipe.set(TOTAL_KEY, total)
    pipe.execute()
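
The helpers used above (kv, iter_tokens() and the key constants) come from the surrounding module and are not shown. Purely for orientation, a token iterator of the following shape would satisfy the loop; the entity source and the whitespace tokenization are assumptions, not the project's actual implementation:

# Hypothetical sketch -- not the project's real iter_tokens().
def iter_tokens():
    # iter_entity_names() is an assumed helper yielding raw names.
    for name in iter_entity_names():
        # Yield each whitespace-separated part of the name as a token.
        for token in name.lower().split():
            yield token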
Example #2
def compute_name_frequencies():
    """Compute a numeric distribution of name frequencies."""
    # Count how often each name part (i.e. token) shows up across
    # the whole of the dataset or a sample.
    # Counting this in local memory would be very memory-intense, so
    # it is sent out to redis instead. A hash is used as the
    # structure; it has to be fully reset before each run (hence the
    # delete below) and iterated again afterwards.
    pipe = kv.pipeline(transaction=False)
    pipe.delete(TOKEN_KEY)
    names_count = 0
    for idx, token in enumerate(iter_tokens()):
        pipe.hincrby(TOKEN_KEY, token, 1)
        names_count += 1
        if idx > 0 and idx % 10000 == 0:
            pipe.execute()
            pipe = kv.pipeline(transaction=False)
    pipe.execute()
    log.info("Names: %d, unique: %d", names_count, kv.hlen(TOKEN_KEY))

    # Next, count how often each count occurs, i.e. make a histogram
    # of name frequency.
    counts = {}
    max_count = 0
    for _, count in kv.hscan_iter(TOKEN_KEY):
        count = int(count)
        # Leave out one-offs because they skew and aren't really
        # useful in any way.
        if count == 1:
            continue
        counts[count] = counts.get(count, 0) + 1
        # Keep track of the maximum count.
        max_count = max(count, max_count)

    log.info("Counts: %d, max: %d", len(counts), max_count)
    total = 0
    pipe = kv.pipeline(transaction=False)
    pipe.delete(DIST_KEY)
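    # Walk the counts from the highest down to 2, accumulating as we
    # go: after this loop, DIST_KEY maps each frequency n to the
    # number of distinct tokens that occur at least n times.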
    for idx in range(max_count, 1, -1):
        total += counts.get(idx, 0)
        pipe.hset(DIST_KEY, idx, total)
        if idx > 0 and idx % 10000 == 0:
            pipe.execute()
            pipe = kv.pipeline(transaction=False)
    log.info("Total: %d", total)
    pipe.set(TOTAL_KEY, total)
    pipe.execute()
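
To make the cumulative distribution concrete, here is a minimal, self-contained sketch of the second phase using plain dicts in place of the redis hash; the token counts are made-up data for illustration:

# Made-up token counts standing in for the TOKEN_KEY hash.
token_counts = {"smith": 5, "jones": 3, "ltd": 3, "acme": 1}

counts = {}
max_count = 0
for count in token_counts.values():
    if count == 1:  # skip one-offs, as above
        continue
    counts[count] = counts.get(count, 0) + 1
    max_count = max(count, max_count)

dist = {}
total = 0
for idx in range(max_count, 1, -1):
    total += counts.get(idx, 0)
    dist[idx] = total

# dist == {5: 1, 4: 1, 3: 3, 2: 3}: three distinct tokens occur at
# least twice, and one occurs at least five times.
print(dist, total)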
Example #3
def load_places():
    """Load the geonames data into redis, mapping names to countries."""
    # Skip if the data has already been loaded, or when testing.
    if kv.get(PLACE_KEY) or settings.TESTING:
        return
    total = 0
    pipe = kv.pipeline(transaction=False)
    log.info("Loading geonames...")
    with io.open(settings.GEONAMES_DATA, 'r', encoding='utf-8') as fh:
        for row in csv.reader(fh, delimiter='\t'):
            # In the geonames dump, row[8] is the ISO country code.
            country = row[8].lower().strip()
            if not country:
                continue
            # row[1] is the primary name, row[2] the ASCII name and
            # row[3] the comma-separated alternate names.
            names = set(row[3].split(','))
            names.add(row[1])
            names.add(row[2])
            for name in names:
                name = tag_key(name)
                if name is not None:
                    total += 1
                    pipe.lpush(place_key(name), country)
    pipe.set(PLACE_KEY, total)
    pipe.execute()
    log.info("Loaded %s geonames.", total)
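
The place_key() and tag_key() helpers are defined elsewhere in the module. As a sketch only, a lookup against the lists built above could look like the following; the key prefix is an assumption, not the project's actual scheme:

# Hypothetical sketch -- the real place_key() lives in this module.
def place_key(name):
    return 'place:%s' % name  # key prefix is an assumption

# Fetch every country code recorded for a given (already keyed) name.
countries = kv.lrange(place_key('berlin'), 0, -1)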
Example #4
File: util.py Project: pudo/aleph
def load_places():
    """Load the geonames data into redis, mapping names to countries."""
    # Skip if the data has already been loaded, or when testing.
    if kv.get(PLACE_KEY) or settings.TESTING:
        return
    total = 0
    pipe = kv.pipeline(transaction=False)
    log.debug("Loading geonames...")
    with io.open(settings.GEONAMES_DATA, 'r', encoding='utf-8') as fh:
        for row in csv.reader(fh, delimiter='\t'):
            # In the geonames dump, row[8] is the ISO country code.
            country = row[8].lower().strip()
            if not country:
                continue
            # row[1] is the primary name, row[2] the ASCII name and
            # row[3] the comma-separated alternate names.
            names = set(row[3].split(','))
            names.add(row[1])
            names.add(row[2])
            for name in names:
                name = normalize_label(name)
                if name is not None:
                    total += 1
                    pipe.lpush(place_key(name), country)
    pipe.set(PLACE_KEY, total)
    pipe.execute()
    log.debug("Loaded %s geonames.", total)
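
This variant differs from Example #3 only in calling normalize_label() instead of tag_key() and in logging at debug level. normalize_label() is not shown; a minimal stand-in that matches how the loop uses it (returning None for names that should be skipped) might look like this, with the actual normalization rules being an assumption:

# Hypothetical stand-in -- the real normalize_label() has its own rules.
def normalize_label(name):
    name = name.strip().lower()
    # Returning None tells the loop above to skip this name entirely.
    if len(name) < 3:
        return None
    return name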