import datetime as dt
import logging

from funcy import flatten, keep, lfilter, lkeep, lpluck, merge_with, silent
from pymongo import UpdateOne

# Project-local helpers assumed to be in scope (their modules are not shown
# here): Indexer, parse_operation, is_recent, thread_multi, update_account,
# update_account_ops_quick, get_comment, log_exceptions, time_delta,
# find_latest_item.

log = logging.getLogger(__name__)


def post_processing(mongo, batch_size=100, max_workers=50):
    indexer = Indexer(mongo)
    start_block = indexer.get_checkpoint('post_processing')

    query = {
        "block_num": {
            "$gt": start_block,
            "$lte": start_block + batch_size,
        }
    }
    projection = {
        '_id': 0,
        'body': 0,
        'json_metadata': 0,
    }
    results = list(mongo.Operations.find(query, projection=projection))
    batches = map(parse_operation, results)  # lazy; evaluated by merge_with below

    # handle an edge case when we are too close to the head,
    # and the batch contains no work to do
    if not results and is_recent(start_block, days=1):
        return

    # squash for duplicates
    def custom_merge(*args):
        return list(set(keep(flatten(args))))

    batch_items = merge_with(custom_merge, *batches)

    # only process accounts if the blocks are recent;
    # scrape_all_users should take care of stale updates
    if is_recent(start_block, days=10):
        accounts = set(batch_items.get('accounts_light', []) +
                       batch_items.get('accounts', []))
        list(thread_multi(
            fn=update_account,
            fn_args=[mongo, None],
            dep_args=list(accounts),
            fn_kwargs=dict(load_extras=False),
            max_workers=max_workers,
            re_raise_errors=False,
        ))
        list(thread_multi(
            fn=update_account_ops_quick,
            fn_args=[mongo, None],
            dep_args=list(accounts),
            fn_kwargs=None,
            max_workers=max_workers,
            re_raise_errors=False,
        ))

    # fall back to the end of the batch when it yielded no operations,
    # so a gap larger than batch_size cannot stall the checkpoint
    index = silent(max)(lpluck('block_num', results)) or (start_block + batch_size)
    indexer.set_checkpoint('post_processing', index)

    log.info("Checkpoint: %s - %s accounts (+%s full)" % (
        index,
        len(batch_items.get('accounts_light', [])),
        len(batch_items.get('accounts', [])),
    ))
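
# `thread_multi` is a project-local helper whose implementation is not shown
# in this file. From the call sites above, it is assumed to fan `dep_args`
# out over a thread pool, substituting each item for the `None` placeholder
# in `fn_args`. A minimal sketch under those assumptions (the name, the
# placeholder convention, and the flag handling are illustrative, not the
# actual helper; notably, the real helper appears to stream results when
# `yield_results=True`, while this sketch simply returns a list):
def _thread_multi_sketch(fn, fn_args, dep_args, fn_kwargs=None,
                         max_workers=10, yield_results=True,
                         re_raise_errors=True):
    from concurrent.futures import ThreadPoolExecutor, as_completed

    kwargs = fn_kwargs or {}
    splice_at = fn_args.index(None)  # where each dep_arg gets spliced in
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(fn, *fn_args[:splice_at], dep,
                            *fn_args[splice_at + 1:], **kwargs)
            for dep in dep_args
        ]
        for future in as_completed(futures):
            try:
                result = future.result()
            except Exception:
                if re_raise_errors:
                    raise
                log.exception('thread_multi worker failed')
                continue
            if yield_results:
                results.append(result)
    return results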

def batch_update_async(mongo, batch_items: dict,
                       use_multi_threading=True, num_threads=10):
    # `mongo`, `use_multi_threading` and `num_threads` are taken as
    # parameters so the function is self-contained; their defaults here
    # are assumptions.
    # todo: break this batch into posts and account updates

    # if we're lagging by a large margin, don't bother updating accounts
    lag = time_delta(find_latest_item(mongo, 'Posts', 'created'))
    if lag > 1000:
        return

    # two passes over the touched accounts: a light pass without extras,
    # then a full pass with extras loaded
    for key, load_extras in (('accounts_light', False), ('accounts', True)):
        account_names = batch_items[key]
        if use_multi_threading:
            with log_exceptions():
                thread_multi(
                    fn=update_account,
                    fn_args=[mongo, None],
                    dep_args=account_names,
                    fn_kwargs=dict(load_extras=load_extras),
                    max_workers=num_threads,
                )
                thread_multi(
                    fn=update_account_ops_quick,
                    fn_args=[mongo, None],
                    dep_args=account_names,
                    fn_kwargs=None,
                    max_workers=num_threads,
                )
        else:
            for account_name in account_names:
                with log_exceptions():
                    update_account(mongo, account_name, load_extras=load_extras)
                    update_account_ops_quick(mongo, account_name)
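
# `log_exceptions` is likewise project-local. Judging by its use as a `with`
# block around fallible updates, it is assumed to log and swallow exceptions
# so one bad account cannot abort the batch; a sketch of that assumed
# behavior:
from contextlib import contextmanager


@contextmanager
def _log_exceptions_sketch():
    try:
        yield
    except Exception:
        log.exception('batch step failed')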

def scrape_comments(mongo, batch_size=250, max_workers=50):
    """Parse operations and post-process for comment/post extraction."""
    indexer = Indexer(mongo)
    start_block = indexer.get_checkpoint('comments')

    query = {
        "type": "comment",
        "block_num": {
            "$gt": start_block,
            "$lte": start_block + batch_size,
        }
    }
    projection = {
        '_id': 0,
        'block_num': 1,
        'author': 1,
        'permlink': 1,
    }
    results = list(mongo.Operations.find(query, projection=projection))
    identifiers = {f"{x['author']}/{x['permlink']}" for x in results}

    # handle an edge case when we are too close to the head,
    # and the batch contains no work to do
    if not results and is_recent(start_block, days=1):
        return

    # get Post.export() results in parallel
    raw_comments = thread_multi(
        fn=get_comment,
        fn_args=[None],
        dep_args=list(identifiers),
        max_workers=max_workers,
        yield_results=True,
    )
    raw_comments = lkeep(raw_comments)

    # split into root posts and comments
    posts = lfilter(lambda x: x['depth'] == 0, raw_comments)
    comments = lfilter(lambda x: x['depth'] > 0, raw_comments)

    # upsert each batch into Mongo in a single unordered bulk write
    log_output = ''
    if posts:
        r = mongo.Posts.bulk_write(
            [UpdateOne({'identifier': x['identifier']},
                       {'$set': {**x, 'updatedAt': dt.datetime.utcnow()}},
                       upsert=True)
             for x in posts],
            ordered=False,
        )
        log_output += \
            f'(Posts: {r.upserted_count} upserted, {r.modified_count} modified) '
    if comments:
        r = mongo.Comments.bulk_write(
            [UpdateOne({'identifier': x['identifier']},
                       {'$set': {**x, 'updatedAt': dt.datetime.utcnow()}},
                       upsert=True)
             for x in comments],
            ordered=False,
        )
        log_output += \
            f'(Comments: {r.upserted_count} upserted, {r.modified_count} modified) '

    # We are only querying {type: 'comment'} blocks, and sometimes
    # the gaps are larger than the batch_size.
    index = silent(max)(lpluck('block_num', results)) or (start_block + batch_size)
    indexer.set_checkpoint('comments', index)

    log.info(f'Checkpoint: {index} {log_output}')
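
# A hypothetical driver showing how these workers might be scheduled; the
# database name, the poll interval, and the use of a bare pymongo Database
# are illustrative assumptions (the functions above only rely on
# attribute-style collection access such as `mongo.Operations`, which a
# pymongo Database provides).
if __name__ == '__main__':
    import time

    from pymongo import MongoClient

    mongo = MongoClient()['SteemData']
    while True:
        scrape_comments(mongo)
        post_processing(mongo)
        time.sleep(1)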