def aggregate_counts(batches, counts, start_at, aggregate_over=None):
    """Pair each batch with its priced word-count.

    For every batch in ``batches``, look up the matching ``Count`` row in
    ``counts`` (matched on ``Count.batch_id == Batch.id``); batches with no
    count contribute a count of 0. Each batch is priced via
    ``compute_price(number, batch.total_messages)``.

    Args:
        batches: iterable of Batch-like objects with ``id``, ``created_on``
            and ``total_messages`` attributes.
        counts: iterable of Count-like objects with ``batch_id`` and
            ``number`` attributes.
        start_at: unused in this implementation — presumably a window start;
            kept for interface compatibility (TODO confirm with callers).
        aggregate_over: unused in this implementation; kept for interface
            compatibility (TODO confirm with callers).

    Returns:
        list of ``(created_on, price)`` tuples, one per batch, in the same
        order as ``batches``.
    """
    # Index counts by batch id once so each batch is a single O(1) lookup
    # (the original did an `in` test followed by a second indexing lookup).
    counts_by_batch = dict((c.batch_id, c) for c in counts)
    values = []
    for batch in batches:
        count = counts_by_batch.get(batch.id)
        # Missing count row means the batch recorded zero occurrences.
        number = count.number if count is not None else 0
        values.append((batch.created_on, compute_price(number, batch.total_messages)))
    return values
def process_batch(file_name):
    """Lock the file and process it.

    Moves (or copies) the named batch file from BATCH_DIR into LOCK_DIR,
    parses it line-by-line as JSON status objects, tallies word frequencies,
    bulk-inserts Word and Count rows for a new Batch, refreshes active
    holdings/user-round values, and rolls the Round over when the current
    one has expired.

    The file name is expected to start with a unix timestamp followed by a
    dot (e.g. "1234567890.json") — that timestamp becomes the new Batch's
    created_on.
    """
    timestamp = int(file_name.split(".")[0])
    log_msg("--- processing %s at %s ---" % (file_name, datetime.datetime.now().replace(microsecond=0)))
    # "Lock" the file by moving (or copying, if we keep the queue copy) it
    # into LOCK_DIR via a shell command. NOTE(review): pexpect.run with an
    # interpolated filename is shell-injection-prone if file_name is not
    # trusted — confirm the queue only ever contains controlled names.
    pexpect.run("%s %s %s" % ("mv" if REMOVE_FROM_BATCH_QUEUE else "cp", os.path.join(BATCH_DIR, file_name), LOCK_DIR))
    batch = open(os.path.join(LOCK_DIR, file_name))
    frequencies = {}    # word content -> occurrence count for this batch
    total_lines = 0     # valid JSON lines seen (before timezone/text filters)
    num_lines = 0       # lines that survived filtering and were counted
    num_words = 0       # running token count (later reused for Word row count)
    t0 = time.time()
    while True:
        line = batch.readline()
        if not line:
            break
        try:
            data_object = loads(line)
        except ValueError:
            # if it's invalid json, just pass it
            continue
        total_lines += 1
        if RESTRICT_TIMEZONES:
            # Drop statuses outside the allowed timezones; a missing
            # user/time_zone key is treated the same as a disallowed zone.
            try:
                if data_object["user"]["time_zone"] not in ALLOWED_TIMEZONES:
                    continue
            except KeyError:
                continue
        try:
            text = data_object["text"]
        except KeyError:
            # not a status object
            continue
        num_lines += 1
        words = process_text(text)
        num_words += len(words)
        update_frequencies(frequencies, words)
    batch.close()
    # kill the file. no need to keep them
    os.remove(os.path.join(LOCK_DIR, file_name))
    total_words = len(frequencies.keys())
    total_count = sum(frequencies.values())
    t1 = time.time()
    log_msg("%d lines (of %d), %d words, %d usages counted: %.01fs."
            % (num_lines, total_lines, total_words, total_count, (t1 - t0)))
    if total_words == 0:
        log_msg("skipping new batch...no content.")
        return
    # create new counts and measures
    new_batch = Batch.objects.create(total_messages=num_lines, created_on=datetime.datetime.fromtimestamp(timestamp))
    # NOTE(review): num_words is rebound here from "tokens in this file" to
    # "Word rows in the DB" — two unrelated meanings sharing one name.
    num_words = Word.objects.all().count()
    num_counts = Count.objects.all().count()
    # Bulk insert all words, ignoring errors
    for content, number in frequencies.iteritems():
        Word.objects.bulk_insert(content=content, send_pre_save=False)
    Word.objects.bulk_insert_commit(send_post_save=False, recover_pks=False)
    # Fetch the ids for new words so we can match them to counts
    words = Word.objects.filter(content__in=frequencies.keys())
    word_ids = dict([(word.content, word.id) for word in words])
    t2 = time.time()
    new_num_words = Word.objects.all().count()
    log_msg("%d new words created: %.01fs." % ((new_num_words - num_words), (t2 - t1)))
    # And bulk insert the new counts
    for content, number in frequencies.iteritems():
        if content not in word_ids:
            # for some reason, the word wasn't successfully created by the
            # bulk insert
            new_word = Word.objects.create(content=content)
            word_ids[content] = new_word.id
            log_msg("*** failsafe create for '%s'." % content)
        Count.objects.bulk_insert(word=word_ids[content], batch=new_batch, number=number, send_pre_save=False)
    Count.objects.bulk_insert_commit(send_post_save=False, recover_pks=False)
    new_num_counts = Count.objects.all().count()
    t3 = time.time()
    log_msg("%d new counts created: %.01fs."
            % ((new_num_counts - num_counts), (t3 - t2)))
    # update round
    if Round.objects.count() > 0:
        t0 = time.time()
        maturations = []  # NOTE(review): never used below — dead local?
        latest_round = Round.objects.latest()
        # Re-price every non-empty holding in active user rounds against the
        # frequencies observed in this batch (absent word => count 0).
        holdings = Holding.objects.filter(user_round__is_active=True).exclude(quantity=0).select_related("word")
        for holding in holdings:
            try:
                number = frequencies[holding.word.content]
            except KeyError:
                number = 0
            price = compute_price(number, new_batch.total_messages)
            holding.current_value = price * holding.quantity
            holding.save()
        user_rounds = UserRound.objects.filter(is_active=True)
        # aggregate into current value
        for user_round in user_rounds:
            user_round.update_current_value()
        t1 = time.time()
        log_msg("%d holdings and %d user rounds updated: %.01fs."
                % (Holding.objects.count(), len(user_rounds), (t1 - t0)))
    # activate new measure
    new_batch.active = True
    new_batch.save()
    # update rounds.
    try:
        last_round = Round.objects.latest()
        last_number = last_round.number
        last_ends = last_round.ends_on
    except Round.DoesNotExist:
        # No round yet: fall through with "expired now" so the first round
        # gets created below.
        last_round = None
        last_number = 0
        last_ends = datetime.datetime.now()
    if datetime.datetime.now() >= last_ends:
        if last_round:
            # tweet win
            if settings.TWEET_WIN:
                try:
                    winning_userround = (last_round.userround_set.order_by("-current_value"))[0]
                except IndexError:
                    # no winner
                    pass
                else:
                    message = "Congrats to @%s for winning %s with $%s! #pretweeting" % (
                        winning_userround.user.username,
                        last_round.name,
                        priceformat.currency(winning_userround.current_value),
                    )
                    access_token = settings.TWITTER_OAUTH_TOKEN
                    token = oauth.OAuthToken.from_string(access_token)
                    try:
                        post_tweet(CONSUMER, CONNECTION, token, message)
                    except TwitterError, e:
                        # Best-effort tweet: log the failure and carry on.
                        log_msg(e)
            # create frozen copies of all user rounds
            for user_round in last_round.userround_set.filter(is_active=True, is_frozen=False):
                user_round.copy_frozen()
        # create next round
        new_number = last_number + 1
        new_name = "Round %d" % new_number
        new_round = Round(number=new_number, name=new_name)
        new_round.started_on = datetime.datetime.now()
        new_round.ends_on = datetime.datetime.now() + settings.ROUND_LENGTH
        new_round.save()