def fix_missing_files():
    """ Finds replays set as "ARCHIVED" that are missing a corresponding file stored in S3. Re-adds them to the GC queue. """
    _error = "MISSING_S3_FILE"

    all_s3_replay_ids = [replay_file.key[8:-8] for replay_file in dotabank_bucket.list()
                         if replay_file.key[:8] == "replays/"]

    archived_replays_no_file = Replay.query.filter(
        Replay.state == 'ARCHIVED',
        Replay.id.notin_(all_s3_replay_ids)
    ).all()

    for replay in archived_replays_no_file:
        if not should_fix_be_attempted(replay.id, _error):
            # Tag as "DOWNLOAD_ERROR" because we can't fix this - the problem is entirely in Valve's (or their partners') domain.
            replay.state = "DOWNLOAD_ERROR"
            replay.local_uri = None
            replay.dl_done_time = None
            db.session.add(replay)
            db.session.commit()
            continue

        print("Replay {} is \"ARCHIVED\" but does not have a file stored on S3. Re-adding to GC queue.".format(replay.id))

        replay.state = "WAITING_DOWNLOAD"  # Switch state back to WAITING_DOWNLOAD.
        Replay.add_dl_job(replay)
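# NOTE: should_fix_be_attempted() is called throughout this module but defined
# elsewhere. Below is a minimal sketch of its likely shape, inferred from the
# call sites above: the signature matches how it is called, but the internals,
# the ReplayAutoFix constructor arguments, and the serialization of `extra`
# are assumptions, not the real implementation.
def should_fix_be_attempted(replay_id, error, extra=None):
    """ Log an auto-fix attempt, returning False once this replay has hit the
    MAX_REPLAY_FIX_ATTEMPTS cap and True while attempts remain. """
    attempts = ReplayAutoFix.query.filter(
        ReplayAutoFix.replay_id == replay_id
    ).count()
    if attempts >= app.config.get('MAX_REPLAY_FIX_ATTEMPTS'):
        return False

    # Record this attempt so repeated failures eventually exhaust the cap.
    db.session.add(ReplayAutoFix(replay_id=replay_id, error=error,
                                 extra=unicode(extra)))  # Assumed columns.
    db.session.commit()
    return True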
def small_replay_exodus(self):
    small_replay_files = {replay_file.key[8:-8]: replay_file.size
                          for replay_file in dotabank_bucket.list()
                          if replay_file.key[:8] == "replays/" and replay_file.size < (1024 * 1024)}

    small_replays = Replay.query.filter(Replay.id.in_(small_replay_files.keys())).all()

    replays_removed = []  # IDs of removed replays
    for replay in small_replays:
        # Save local URI so we can remove the file from S3 after we've changed the database.
        local_uri = replay.local_uri

        # Clean up metadata associated with an archived replay.
        replay.dl_done_time = None
        replay.local_uri = None
        replay.state = "WAITING_DOWNLOAD"

        # Save new state to database
        db.session.add(replay)
        db.session.commit()

        # Remove bad file from S3.
        dotabank_bucket.delete_key(local_uri or "replays/{}.dem.bz2".format(replay.id))

        # Add a new download job
        Replay.add_dl_job(replay)

        # Note that we've done things to this replay.
        replays_removed.append(replay.id)

    return jsonify(
        success=True,
        replays_removed=replays_removed
    )
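# Illustrative response shape for the endpoint above (IDs are made up):
#   {"success": true, "replays_removed": [123456789, 987654321]}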
def fix_small_replays():
    """ Finds replays with a tiny filesize and re-adds them to the GC queue (we probably downloaded an error page). """
    _error = "SMALL_REPLAY"

    # FIXME: This step will take longer and longer the more replays we store. It would be more efficient to store
    # the filesize in our local database after a file has been archived, and then directly query the database
    # (see the sketch below).
    small_replay_files = {replay_file.key[8:-8]: replay_file.size
                          for replay_file in dotabank_bucket.list()
                          if replay_file.key[:8] == "replays/" and replay_file.size < (1024 * 1024)}

    small_replays = db.session.query(Replay, db.func.count(ReplayAutoFix.id)).filter(
        Replay.state == "ARCHIVED",  # Ignore non-archived files (they shouldn't be in S3 if they aren't archived, but vOv)
        Replay.id.in_(small_replay_files.keys()),  # Check the replays that the S3 call above has flagged as small
        ReplayAutoFix.replay_id == Replay.id
    ).group_by(
        ReplayAutoFix.replay_id
    ).having(
        db.func.count(ReplayAutoFix.id) < app.config.get('MAX_REPLAY_FIX_ATTEMPTS')  # Ignore replays that have exceeded max fix attempts
    ).all()

    for replay, fix_attempts in small_replays:
        if not should_fix_be_attempted(replay.id, _error, extra={
            'file_size': small_replay_files[unicode(replay.id)]
        }):
            continue

        print("Replay {} has a small file stored on S3 ({} bytes). Re-adding to DL queue.".format(
            replay.id,
            small_replay_files[unicode(replay.id)]
        ))

        replay.state = "WAITING_GC"  # Switch state back to WAITING_GC.
        Replay.add_dl_job(replay)
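# Sketch of the approach the FIXME above suggests: record the archived file's
# size on the Replay row at archive time, then query the database instead of
# listing the whole bucket on every run. `file_size` is a hypothetical column
# here - it does not exist on the Replay model as written.
def find_small_replays_via_db():
    """ Assumes a (hypothetical) Replay.file_size column populated when a
    replay is archived. """
    return Replay.query.filter(
        Replay.state == "ARCHIVED",
        Replay.file_size < (1024 * 1024)  # Anything under 1 MiB is suspect.
    ).all()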
def index(self):
    """ Renders a list of replays which are atypical.

    human_players_discrepancy: Replays where their human_player property doesn't match the count of ReplayPlayer
    objects we have in our database.

    replay_available_download_error: Replays which are available to download, but that our download script failed
    to retrieve.

    replay_waiting_download_over24hrs: Replays which have been waiting to be downloaded for over 24 hrs.
    """
    human_players_discrepancy = [x for x in db.engine.execute(
        text("""
            SELECT
                r.id,
                r.human_players,
                count(rp.id) as player_count
            FROM {replay_table} r
            LEFT JOIN {player_table} rp ON rp.replay_id = r.id
            WHERE rp.id is NULL or rp.account_id is not NULL  # Exclude bots from count (though there's the chance we have duplicate entries for bots? fack)
            GROUP BY r.id
        """.format(
            replay_table=Replay.__tablename__,
            player_table=ReplayPlayer.__tablename__)
        )
    ) if x.player_count != x.human_players]

    replay_available_download_error = Replay.query.filter(
        Replay.replay_state == "REPLAY_AVAILABLE",
        Replay.state == "DOWNLOAD_ERROR"
    ).all()

    replay_waiting_download_over24hrs = Replay.query.filter(
        Replay.state == "WAITING_DOWNLOAD",
        Replay.gc_done_time <= (datetime.utcnow() - timedelta(hours=24))  # Over 24 hrs ago
    ).all()

    small_replay_files = {replay_file.key[8:-8]: replay_file.size
                          for replay_file in dotabank_bucket.list()
                          if replay_file.key[:8] == "replays/" and replay_file.size < (1024 * 1024)}

    small_replays = Replay.query.filter(Replay.id.in_(small_replay_files.keys())).all()

    all_s3_replay_ids = [replay_file.key[8:-8] for replay_file in dotabank_bucket.list()
                         if replay_file.key[:8] == "replays/"]

    archived_replays_no_file = Replay.query.filter(Replay.state == 'ARCHIVED',
                                                   Replay.id.notin_(all_s3_replay_ids)).all()

    return self.render(
        'admin/atypical_replays.html',
        human_players_discrepancy=human_players_discrepancy,
        replay_available_download_error=replay_available_download_error,
        replay_waiting_download_over24hrs=replay_waiting_download_over24hrs,
        small_replays=small_replays,
        small_replay_files=small_replay_files,
        archived_replays_no_file=archived_replays_no_file
    )
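# A note on the LEFT JOIN above: count(rp.id) only counts rows where rp.id is
# non-NULL, so a replay with no ReplayPlayer rows at all still appears in the
# result (kept by the "rp.id is NULL" clause) with player_count = 0 rather
# than vanishing from the result set, and bot entries (account_id NULL) are
# excluded from the count.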