def migrate(metrics=None):
  """ Migrate _BACKLOG_DAYS worth of previously-collected tweets to the Taurus
  non-metric-data RabbitMQ exchange.

  :param metrics: optional sequence of metric names; if specified (not None),
    the migration will be limited to tweets corresponding to the given metric
    names.
  """
  startingTimestamp = datetime.utcnow() - timedelta(days=_BACKLOG_DAYS)

  sqlEngine = collectorsdb.engineFactory()

  # Determine the number of tweet samples in the desired range along with the
  # bounding sequence numbers; roughly:
  #   select count(*), min(seq), max(seq) from twitter_tweet_samples
  #   where agg_ts >= startingTimestamp
  @collectorsdb.retryOnTransientErrors
  def queryMigrationRange():
    sel = (
      sql.select(
        [sql.func.count(),
         sql.func.min(schema.twitterTweetSamples.c.seq),
         sql.func.max(schema.twitterTweetSamples.c.seq)])
      .where(schema.twitterTweetSamples.c.agg_ts >= startingTimestamp))

    if metrics is not None:
      sel = sel.where(schema.twitterTweetSamples.c.metric.in_(metrics))

    return sqlEngine.execute(sel).first()

  totalNumItems, minSeq, maxSeq = queryMigrationRange()

  if totalNumItems == 0:
    g_log.info("Nothing forwarded: no tweet samples found since %s UTC",
               startingTimestamp.isoformat())
    return

  if metrics is None:
    g_log.info("Starting migration of tweets from %s UTC; totalNumItems=%d; "
               "minSeq=%s, maxSeq=%s",
               startingTimestamp.isoformat(), totalNumItems, minSeq, maxSeq)
  else:
    g_log.info("Starting migration of tweets from %s UTC; totalNumItems=%d; "
               "minSeq=%s, maxSeq=%s; metrics=%s",
               startingTimestamp.isoformat(), totalNumItems, minSeq, maxSeq,
               metrics)

  # Retrieve and publish batches
  totalNumPublished = 0
  totalNumBatches = 0
  batchMinSeq = minSeq

  with MessageBusConnector() as messageBus:
    while True:
      batchEndSeq, batch = TweetForwarder.queryNonMetricTweetBatch(
        sqlEngine=sqlEngine,
        minSeq=batchMinSeq,
        maxItems=200,
        maxSeq=maxSeq,
        metrics=metrics)

      if batchEndSeq is None:
        break

      TweetForwarder.publishNonMetricTweetBatch(messageBus=messageBus,
                                                batch=batch)

      totalNumPublished += len(batch)
      totalNumBatches += 1

      g_log.debug("Published numItems=%d; batchMinSeq=%s; batchEndSeq=%s "
                  "(%d of %d: %s%%)",
                  len(batch), batchMinSeq, batchEndSeq, totalNumPublished,
                  totalNumItems,
                  int(float(totalNumPublished) / totalNumItems * 100))

      if (totalNumBatches % 250) == 0 or totalNumPublished == totalNumItems:
        # Progress report
        g_log.info("Published %d of %d: %s%%",
                   totalNumPublished, totalNumItems,
                   int(float(totalNumPublished) / totalNumItems * 100))

      # Prepare for next query
      batchMinSeq = batchEndSeq + 1

  g_log.info("Done publishing! publishedBatches=%d, publishedItems=%d, "
             "expectedItems=%d; minSeq=%s, maxSeq=%s",
             totalNumBatches, totalNumPublished, totalNumItems, minSeq, maxSeq)
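# Example usage (illustrative only): limiting the migration to specific
# Tweet-volume metrics. The metric names below are hypothetical placeholders,
# not values taken from the collectors database:
#
#   migrate(metrics=["TWITTER.TWEET.HANDLE.AAPL.VOLUME",
#                    "TWITTER.TWEET.HANDLE.IBM.VOLUME"])
#
# Calling migrate() with no argument (metrics=None) forwards every tweet
# sample collected within the _BACKLOG_DAYS window.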
def main():
  """ NOTE: main also serves as entry point for "console script" generated by
  setup
  """
  logging_support.LoggingSupport.initTool()

  _parseArgs()

  # The migration logic itself lives in migrate() above; running it without a
  # metrics filter forwards all tweet samples collected within the
  # _BACKLOG_DAYS window.
  migrate()
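# The docstring above notes that main() also serves as a console-script entry
# point generated by setup. A minimal sketch of how that wiring typically looks
# in setup.py (the script name and module path below are assumptions, not taken
# from this repository):
#
#   entry_points={
#     "console_scripts": [
#       "taurus-migrate-tweets = "
#       "taurus.metric_collectors.twitterdirect.migrate_tweets:main",
#     ],
#   }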