Example #1
def aggregate_counts(batches, counts, start_at, aggregate_over=None):
    """Pair each batch with its count (defaulting to zero) and return a
    list of (created_on, price) tuples."""

    counts_by_batch = dict((c.batch_id, c) for c in counts)

    batch_counts = []
    for batch in batches:
        if batch.id in counts_by_batch:
            batch_counts.append((batch, counts_by_batch[batch.id].number))
        else:
            batch_counts.append((batch, 0))
    
    values = [(batch.created_on, compute_price(number, batch.total_messages))
              for (batch, number) in batch_counts]

    return values
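A minimal sketch of how aggregate_counts might be called, assuming the Batch, Count, and Word models and the compute_price helper seen in these examples; the query and the variable names below are illustrative assumptions, not part of the original code:

# hypothetical driver: build (created_on, price) points for one word over the last day
word = Word.objects.get(content="example")
since = datetime.datetime.now() - datetime.timedelta(days=1)
batches = Batch.objects.filter(created_on__gte=since).order_by("created_on")
counts = Count.objects.filter(batch__in=batches, word=word)
points = aggregate_counts(batches, counts, start_at=since)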
Example #2
def process_batch(file_name):
    "Lock the file and process it."

    timestamp = int(file_name.split(".")[0])

    log_msg("--- processing %s at %s ---" % (file_name, datetime.datetime.now().replace(microsecond=0)))
    pexpect.run("%s %s %s" % ("mv" if REMOVE_FROM_BATCH_QUEUE else "cp", os.path.join(BATCH_DIR, file_name), LOCK_DIR))

    batch = open(os.path.join(LOCK_DIR, file_name))

    frequencies = {}

    total_lines = 0
    num_lines = 0
    num_words = 0

    t0 = time.time()
    for line in batch:
        try:
            data_object = loads(line)
        except ValueError:
            # skip lines that aren't valid JSON
            continue

        total_lines += 1

        if RESTRICT_TIMEZONES:
            try:
                if data_object["user"]["time_zone"] not in ALLOWED_TIMEZONES:
                    continue
            except KeyError:
                continue

        try:
            text = data_object["text"]
        except KeyError:
            # not a status object
            continue

        num_lines += 1

        words = process_text(text)
        num_words += len(words)
        update_frequencies(frequencies, words)

    batch.close()

    # remove the locked file; no need to keep it around
    os.remove(os.path.join(LOCK_DIR, file_name))

    total_words = len(frequencies)
    total_count = sum(frequencies.values())
    t1 = time.time()
    log_msg(
        "%d lines (of %d), %d words, %d usages counted: %.01fs."
        % (num_lines, total_lines, total_words, total_count, (t1 - t0))
    )

    if total_words == 0:
        log_msg("skipping new batch...no content.")
        return

    # create new counts and measures
    new_batch = Batch.objects.create(total_messages=num_lines, created_on=datetime.datetime.fromtimestamp(timestamp))

    # note: num_words is reused here as a snapshot of the Word table size
    # before the bulk insert, so the delta can be logged below
    num_words = Word.objects.all().count()
    num_counts = Count.objects.all().count()

    # Bulk insert all words, ignoring errors
    for content in frequencies:
        Word.objects.bulk_insert(content=content, send_pre_save=False)

    Word.objects.bulk_insert_commit(send_post_save=False, recover_pks=False)

    # Fetch the ids for new words so we can match them to counts
    words = Word.objects.filter(content__in=frequencies.keys())
    word_ids = dict((word.content, word.id) for word in words)

    t2 = time.time()
    new_num_words = Word.objects.all().count()
    log_msg("%d new words created: %.01fs." % ((new_num_words - num_words), (t2 - t1)))

    # And bulk insert the new counts
    for content, number in frequencies.items():
        if content not in word_ids:
            # for some reason, the word wasn't successfully created by the
            # bulk insert
            new_word = Word.objects.create(content=content)
            word_ids[content] = new_word.id
            log_msg("*** failsafe create for '%s'." % content)

        Count.objects.bulk_insert(word=word_ids[content], batch=new_batch, number=number, send_pre_save=False)

    Count.objects.bulk_insert_commit(send_post_save=False, recover_pks=False)

    new_num_counts = Count.objects.all().count()

    t3 = time.time()

    log_msg("%d new counts created: %.01fs." % ((new_num_counts - num_counts), (t3 - t2)))

    # update round
    if Round.objects.count() > 0:
        t0 = time.time()

        maturations = []

        latest_round = Round.objects.latest()
        holdings = Holding.objects.filter(user_round__is_active=True).exclude(quantity=0).select_related("word")
        for holding in holdings:
            number = frequencies.get(holding.word.content, 0)
            price = compute_price(number, new_batch.total_messages)
            holding.current_value = price * holding.quantity
            holding.save()

        user_rounds = UserRound.objects.filter(is_active=True)
        # aggregate into current value
        for user_round in user_rounds:
            user_round.update_current_value()

        t1 = time.time()

        log_msg(
            "%d holdings and %d user rounds updated: %.01fs." % (Holding.objects.count(), len(user_rounds), (t1 - t0))
        )

    # activate new measure
    new_batch.active = True
    new_batch.save()

    # update rounds.
    try:
        last_round = Round.objects.latest()
        last_number = last_round.number
        last_ends = last_round.ends_on
    except Round.DoesNotExist:
        last_round = None
        last_number = 0
        last_ends = datetime.datetime.now()

    if datetime.datetime.now() >= last_ends:

        if last_round:
            # tweet win
            if settings.TWEET_WIN:
                try:
                    winning_userround = (last_round.userround_set.order_by("-current_value"))[0]
                except IndexError:
                    # no winner
                    pass
                else:
                    message = "Congrats to @%s for winning %s with $%s! #pretweeting" % (
                        winning_userround.user.username,
                        last_round.name,
                        priceformat.currency(winning_userround.current_value),
                    )

                    access_token = settings.TWITTER_OAUTH_TOKEN
                    token = oauth.OAuthToken.from_string(access_token)
                    try:
                        post_tweet(CONSUMER, CONNECTION, token, message)
                    except TwitterError as e:
                        log_msg(e)

            # create frozen copies of all user rounds
            for user_round in last_round.userround_set.filter(is_active=True, is_frozen=False):
                user_round.copy_frozen()

        # create next round
        new_number = last_number + 1
        new_name = "Round %d" % new_number
        new_round = Round(number=new_number, name=new_name)
        new_round.started_on = datetime.datetime.now()
        new_round.ends_on = datetime.datetime.now() + settings.ROUND_LENGTH
        new_round.save()
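A rough sketch of how process_batch might be driven, assuming batch files arrive in BATCH_DIR with a "<timestamp>.<ext>" name (only the leading timestamp is implied by the split(".") above; the polling loop itself is a hypothetical addition, reusing os and BATCH_DIR from this module):

# hypothetical queue runner over the batch directory
def run_batch_queue():
    for file_name in sorted(os.listdir(BATCH_DIR)):
        if not file_name.split(".")[0].isdigit():
            continue  # ignore anything without a timestamp prefix
        process_batch(file_name)

if __name__ == "__main__":
    run_batch_queue()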