def restart_and_check(ips_per_shard):
    """
    Main restart and verification function after DBs have been restored.

    After an interactive confirmation, restarts the machines of every shard
    in parallel (one pool worker per shard), sleeps briefly and then checks
    that the nodes of every shard are making progress.

    Assumes `ips_per_shard` (a non-empty dict of shard -> IPs) has been
    verified.

    Returns None; returns early without restarting anything if the operator
    answers "no".

    Raises a RuntimeError if check fails.
    """
    assert isinstance(ips_per_shard, dict) and len(ips_per_shard) > 0
    shards = sorted(ips_per_shard.keys())

    with _interaction_lock:
        if interact(f"Restart shards: {shards}?", ["yes", "no"]) == "no":
            return

    print(
        f"{Typgpy.HEADER}Restarting all target machines on shards {shards} and "
        f"checking for progress... (ETA: ~90 seconds to complete){Typgpy.ENDC}"
    )

    # The pool is only needed for this call; the context manager releases its
    # worker threads instead of leaking them.
    with ThreadPool(processes=len(ips_per_shard)) as post_check_pool:
        restarts = []
        for shard, ips in ips_per_shard.items():
            log.debug(f"starting restart for shard {shard}; ips: {ips}")
            restarts.append(post_check_pool.apply_async(restart_all, (ips, )))
        for task in restarts:
            task.get()  # blocks until done; re-raises any worker exception
        log.debug(f"finished restarting shards {shards}")

        sleep_b4_progress_check = 15
        log.debug(
            f"sleeping {sleep_b4_progress_check} seconds before checking if all nodes are making progress"
        )
        time.sleep(sleep_b4_progress_check)

        checks = []
        for shard, ips in ips_per_shard.items():
            log.debug(
                f"starting node progress verification for shard {shard}; ips: {ips}"
            )
            checks.append(
                post_check_pool.apply_async(verify_all_progressed, (ips, )))
        # Wait for every shard before judging, so no verification is still
        # running when the pool is torn down.
        outcomes = [task.get() for task in checks]

    if not all(outcomes):
        raise RuntimeError("not all nodes restarted, check logs for details")
    log.debug("recovery succeeded!")
def cleanup_rclone(ips):
    """
    Cleans up the rclone config setup by this script.

    WARNING: this will delete the file at `rclone_config_path_on_machine`.

    Runs `_cleanup_rclone` for every IP in parallel. A result of None counts
    as success; anything else is treated as the failure reason. On failures
    the operator is asked whether to retry on the failed IPs only; answering
    "no" logs a warning and returns (best effort, nothing is raised).

    `ips` must be a non-empty list; it is not mutated.
    """
    assert isinstance(ips, list) and len(ips) > 0
    ips = ips.copy()  # make a copy to not mutate given ips

    # High process count is OK since threads just wait. The context manager
    # releases the worker threads on every exit path instead of leaking them.
    with ThreadPool(processes=100) as pool:
        while True:
            log.debug(f"cleaning up rclone on the following ips: {ips}")
            pending = [(pool.apply_async(_cleanup_rclone, (ip, )), ip)
                       for ip in ips]
            results = [(task.get(), ip) for task, ip in pending]
            failed_results = [el for el in results if el[0] is not None]
            if not failed_results:
                log.debug("successfully cleaned up rclone config!")
                return

            with _interaction_lock:
                print(
                    f"{Typgpy.FAIL}Some nodes failed to cleanup rclone config!{Typgpy.ENDC}"
                )
                ips.clear()  # the next round only targets the failed nodes
                for reason, ip in failed_results:
                    print(
                        f"{Typgpy.OKGREEN}{ip}{Typgpy.ENDC} failed because of: {reason}"
                    )
                    ips.append(ip)
                if interact("Retry on failed nodes?", ["yes", "no"]) == "no":
                    print(
                        f"{Typgpy.WARNING}Could not cleanup some rclone config, but proceeding anyways...{Typgpy.ENDC}"
                    )
                    log.warning(
                        "Could not cleanup some rclone config, but proceeding anyways."
                    )
                    return
def backup_existing_dbs(ips, shard):
    """
    Simply tar the existing db (locally) if needed in the future

    Runs `_backup_existing_dbs(ip, shard)` for every IP in parallel. A result
    of None counts as success; anything else is treated as the failure
    reason. On failures the operator is asked whether to retry on the failed
    IPs only; answering "no" logs a warning and returns (best effort, nothing
    is raised).

    `ips` must be a non-empty list (it is not mutated); `shard` must be an
    int.
    """
    assert isinstance(ips, list) and len(ips) > 0
    assert isinstance(shard, int)
    ips = ips.copy()  # make a copy to not mutate given ips

    # High process count is OK since threads just wait. The context manager
    # releases the worker threads on every exit path instead of leaking them.
    with ThreadPool(processes=100) as pool:
        while True:
            log.debug(f"backing up existing DBs on the following ips: {ips}")
            pending = [(pool.apply_async(_backup_existing_dbs, (ip, shard)), ip)
                       for ip in ips]
            results = [(task.get(), ip) for task, ip in pending]
            failed_results = [el for el in results if el[0] is not None]
            if not failed_results:
                log.debug("successfully backed up existing DBs!")
                return

            with _interaction_lock:
                print(
                    f"{Typgpy.FAIL}Some nodes failed to backup existing DBs!{Typgpy.ENDC}"
                )
                ips.clear()  # the next round only targets the failed nodes
                for reason, ip in failed_results:
                    print(
                        f"{Typgpy.OKGREEN}{ip}{Typgpy.ENDC} failed because of: {reason}"
                    )
                    ips.append(ip)
                if interact("Retry on failed nodes?", ["yes", "no"]) == "no":
                    print(
                        f"{Typgpy.WARNING}Could not backup some existing DBs, but proceeding anyways...{Typgpy.ENDC}"
                    )
                    log.warning(
                        "Could not backup some existing DBs, but proceeding anyways."
                    )
                    return
def stop_all(ips):
    """
    Send stop command to all nodes asynchronously.

    Runs `_stop(ip)` for every IP in parallel. A result of None counts as
    success; anything else is treated as the failure reason. On failures the
    operator is asked whether to retry on the failed IPs only; answering "no"
    logs a warning and returns (best effort, nothing is raised).

    `ips` must be a non-empty list; it is not mutated.
    """
    assert isinstance(ips, list) and len(ips) > 0
    ips = ips.copy()  # make a copy to not mutate given ips

    # High process count is OK since threads just wait. The context manager
    # releases the worker threads on every exit path instead of leaking them.
    with ThreadPool(processes=100) as pool:
        while True:
            log.debug(f"shutting down the following ips: {ips}")
            pending = [(pool.apply_async(_stop, (ip, )), ip) for ip in ips]
            results = [(task.get(), ip) for task, ip in pending]
            failed_results = [el for el in results if el[0] is not None]
            if not failed_results:
                log.debug("successfully stopped all given harmony processes!")
                return

            with _interaction_lock:
                print(
                    f"{Typgpy.FAIL}Some nodes failed to stop the harmony processes!{Typgpy.ENDC}"
                )
                ips.clear()  # the next round only targets the failed nodes
                for reason, ip in failed_results:
                    print(
                        f"{Typgpy.OKGREEN}{ip}{Typgpy.ENDC} failed because of: {reason}"
                    )
                    ips.append(ip)
                if interact("Retry on failed nodes?", ["yes", "no"]) == "no":
                    print(
                        f"{Typgpy.WARNING}Could not stop harmony process on some nodes, but proceeding anyways...{Typgpy.ENDC}"
                    )
                    log.warning(
                        "Could not stop harmony process on some nodes, but proceeding anyways."
                    )
                    return
def get_ips_per_shard(logs_dir):
    """
    Setup function to get the IPs per shard given the `logs_dir`.

    The operator picks one of two input modes:
    * read `shard<N>.txt` files from `logs_dir` and choose IPs interactively
      (delegated to `_get_ips_per_shard_from_file`), or
    * type the IPs of each shard as a CSV string; entries that do not match
      `ipv4_regex` are thrown away.

    Prints the final target IPs and returns a dict of shard (int) -> list of
    IPs. Shards that were skipped or ignored are absent from the dict.

    Raises RuntimeError if `logs_dir` holds more than one IP file for the
    same shard.
    """
    assert os.path.isdir(logs_dir)
    log.debug("Loading IPs from given directory...")
    ips_per_shard = {}
    input_choices = [
        f"Read IPs from log directory & choose interactively. (Log directory {logs_dir})",
        "Provide IPs interactively as a CSV string for each shard"
    ]
    with _interaction_lock:
        response = interact("How would you like to input network IPs?",
                            input_choices)

    if response == input_choices[0]:
        # Read IPs from log directory and choose interactively
        for file_name in os.listdir(logs_dir):
            # Full match with an escaped dot: names such as `shard0.txt.bak`
            # are skipped instead of crashing the int() conversion below.
            matched = re.fullmatch(r"shard([0-9]+)\.txt", file_name)
            if matched is None:
                continue
            shard = int(matched.group(1))
            if shard in ips_per_shard:
                raise RuntimeError(f"Multiple IP files for shard {shard}")
            shard_ips = _get_ips_per_shard_from_file(
                os.path.join(logs_dir, file_name), shard)
            if shard_ips is not None:
                ips_per_shard[shard] = shard_ips
    elif response == input_choices[1]:
        # Provide IPs interactively as a CSV string for each shard
        log.debug("Reading IPs from console interactively")
        shard = 0
        while True:
            choices = ["yes", "skip", "finish"]
            with _interaction_lock:
                action = interact(f"Enter IPs for shard {shard}?", choices)
            if action == choices[-1]:  # finished
                break
            if action == choices[1]:  # skip
                shard += 1
                continue
            if action == choices[0]:  # yes
                # Keep asking until the operator accepts or rejects a
                # non-empty list of valid IPs for this shard.
                while True:
                    ips_as_csv = input(
                        f"Enter IPs as a CSV string for shard {shard}\n> ")
                    log.debug(f"raw csv input for shard {shard}: {ips_as_csv}")
                    shard_ips = []
                    for ip in (entry.strip() for entry in ips_as_csv.split(",")):
                        if re.search(ipv4_regex, ip):
                            shard_ips.append(ip)
                        else:
                            log.debug(
                                f"throwing away '{ip}' as it is not a valid ipv4 address"
                            )
                    if not shard_ips:
                        log.debug(f"no valid ips to add to shard {shard}")
                        continue
                    confirm_choices = ["yes", "retry", "no"]
                    with _interaction_lock:
                        print(
                            f"\nShard {Typgpy.HEADER}{shard}{Typgpy.ENDC} "
                            f"{Typgpy.UNDERLINE}given & filtered{Typgpy.ENDC} ips ({len(shard_ips)})"
                        )
                        for i, ip in enumerate(shard_ips):
                            print(
                                f"{i + 1}.\t{Typgpy.OKGREEN}{ip}{Typgpy.ENDC}"
                            )
                        response = interact(
                            f"Add above ips for shard {Typgpy.HEADER}{shard}{Typgpy.ENDC}?",
                            confirm_choices)
                    if response == confirm_choices[0]:  # yes
                        log.debug(f"shard {shard} IPs: {shard_ips}")
                        ips_per_shard[shard] = shard_ips
                        break
                    if response == confirm_choices[-1]:  # no
                        break
                    # "retry": ask for the CSV string again
                shard += 1
                continue

    # Final print target IPs
    print()
    for shard in sorted(ips_per_shard.keys()):
        print(
            f"Shard {Typgpy.HEADER}{shard}{Typgpy.ENDC} {Typgpy.UNDERLINE}target{Typgpy.ENDC} IPs:"
        )
        print('-' * 16)
        for ip in sorted(ips_per_shard[shard]):
            print(ip)
        print()
    # Joining avoids chopping the prefix when no shard was added at all.
    final_report = "Added " + "; ".join(
        f"{len(v)} ips for shard {k}" for k, v in ips_per_shard.items())
    log.debug(final_report)
    return ips_per_shard
def _get_ips_per_shard_from_file(file_path, shard):
    """
    Internal function to interactively pick the IPs of `shard` out of a file.

    Assumes that `file_path` exists, that its basename follows
    ^shard[0-9]+.txt, and that the file contains IPs in new line separated
    format. Only lines matching `ipv4_regex` are kept.

    Returns a list of chosen IPs or None if shard is to be ignored.
    Raises RuntimeError if the file holds no valid IP.
    """
    # The shard encoded in the file name must agree with the requested shard
    base_name = os.path.basename(file_path)
    file_shard = int(base_name.replace("shard", "").replace(".txt", ""))
    assert file_shard == shard, f"file shard ({file_shard}) for {file_path} != {shard}"

    with open(file_path, 'r', encoding='utf-8') as f:
        log.debug(f"Reading IPs from file {file_path}")
        ips = []
        for line in f:
            if re.search(ipv4_regex, line):
                ips.append(line.strip())
    if not ips:
        raise RuntimeError(f"no VALID IP was loaded from file: '{file_path}'")
    log.debug(f"Candidate IPs for shard {shard}: {ips}")

    # Show the candidates and ask what to do with them
    choices = [
        "Add all above IPs", "Choose IPs from above to add (interactively)",
        "Ignore"
    ]
    add_all, choose_some, ignore = choices
    with _interaction_lock:
        print(f"\nShard {Typgpy.HEADER}{shard}{Typgpy.ENDC} ips ({len(ips)}):")
        for position, ip in enumerate(ips, start=1):
            print(f"{position}.\t{Typgpy.OKGREEN}{ip}{Typgpy.ENDC}")
        response = interact("", choices)

    if response == ignore:
        log.debug(f"ignoring IPs from shard {shard}")
        return None
    if response == add_all:
        log.debug(f"shard {shard} IPs: {ips}")
        return ips
    if response == choose_some:
        # One yes/no prompt per candidate; the lock is taken per prompt
        chosen_ips = []
        for ip in ips:
            prompt = f"Add {Typgpy.OKGREEN}{ip}{Typgpy.ENDC} for shard {Typgpy.HEADER}{shard}{Typgpy.ENDC}?"
            with _interaction_lock:
                answer = interact(prompt, ["yes", "no"])
                if answer == "yes":
                    chosen_ips.append(ip)
        if not chosen_ips:
            msg = f"chose 0 IPs for shard {shard}, ignoring shard"
            log.debug(msg)
            return None

        # Final confirmation of the hand-picked subset
        with _interaction_lock:
            print(
                f"\nShard {Typgpy.HEADER}{shard}{Typgpy.ENDC} "
                f"{Typgpy.UNDERLINE}chosen{Typgpy.ENDC} ips ({len(chosen_ips)})"
            )
            for position, ip in enumerate(chosen_ips, start=1):
                print(f"{position}.\t{Typgpy.OKGREEN}{ip}{Typgpy.ENDC}")
            confirmed = interact(
                f"Add above ips for shard {Typgpy.HEADER}{shard}{Typgpy.ENDC}?",
                ["yes", "no"])
            if confirmed != "yes":
                log.debug(f"ignoring IPs from shard {shard}")
                return None
            log.debug(f"shard {shard} IPs: {chosen_ips}")
            return chosen_ips
def recover(ips_per_shard, snapshot_per_shard, rclone_config_path):
    """
    Bulk of the work is handled here. Actions done interactively to ensure security.

    Shows the recovery plan and asks for confirmation. Then, for every shard
    in parallel (one pool worker per shard): stops the machines, backs up the
    existing DBs, sets up rclone and rsyncs the chosen snapshot DBs. Finally
    calls `restart_and_check` on all shards.

    Assumes `ips_per_shard` has been verified.
    Assumes `snapshot_per_shard` has beacon-chain snapshot path and that each shard's
    snapshot follow format: <rclone-config>:<bin>.
    Assumes `rclone_config_path` is a rclone config file.

    Returns None; returns early without touching any machine if the operator
    declines. Exceptions raised by the per-shard steps or by
    `restart_and_check` propagate to the caller.
    """
    assert isinstance(ips_per_shard, dict)
    assert isinstance(snapshot_per_shard, dict)
    assert beacon_chain_shard in snapshot_per_shard
    assert os.path.isfile(rclone_config_path)

    with _interaction_lock:
        print()
        for shard in sorted(ips_per_shard.keys()):
            print(
                f"{Typgpy.BOLD}Shard {Typgpy.HEADER}{shard}{Typgpy.ENDC}{Typgpy.BOLD} IPs: {Typgpy.ENDC}"
            )
            # 1-based numbering, consistent with the other IP listings of
            # this script.
            for i, ip in enumerate(ips_per_shard[shard], start=1):
                print(f"{i}.\t{Typgpy.OKGREEN}{ip}{Typgpy.ENDC}")
            print(
                f"{Typgpy.BOLD}Shard {Typgpy.HEADER}{shard}{Typgpy.ENDC}{Typgpy.BOLD} snapshot path: {Typgpy.ENDC}"
                f"{Typgpy.OKGREEN}{snapshot_per_shard[shard]}{Typgpy.ENDC}")
            print()
        print(f"{Typgpy.BOLD}Rclone config path (on this machine): "
              f"{Typgpy.OKGREEN}{rclone_config_path}{Typgpy.ENDC}")
        if interact("Start recovery?", ["yes", "no"]) == "no":
            log.warning("Abandoned recovery...")
            return

    def process(shard):
        """Run the stop / backup / rclone setup / rsync pipeline for one shard."""
        ips = ips_per_shard[shard]
        print(
            f"{Typgpy.OKBLUE}Stopping all machines for shard {Typgpy.HEADER}{shard}{Typgpy.ENDC}"
        )
        stop_all(ips)
        print(
            f"{Typgpy.OKGREEN}Successfully stopped all machines for shard {Typgpy.HEADER}{shard}{Typgpy.ENDC}"
        )
        print(
            f"{Typgpy.OKBLUE}Backing up (locally) existing DB for shard {Typgpy.HEADER}{shard}{Typgpy.ENDC}"
        )
        backup_existing_dbs(ips, shard)
        print(
            f"{Typgpy.OKGREEN}Successfully backed up existing DB for {Typgpy.HEADER}{shard}{Typgpy.ENDC}"
        )
        print(
            f"{Typgpy.OKBLUE}Setting up rclone on all machines for shard {Typgpy.HEADER}{shard}{Typgpy.ENDC}"
        )
        setup_rclone(ips, rclone_config_path)
        print(
            f"{Typgpy.OKGREEN}Successfully setup rclone on all machines for shard {Typgpy.HEADER}{shard}{Typgpy.ENDC}"
        )
        print(
            f"{Typgpy.OKBLUE}Rsyncing chosen snapshot DBs for shard {Typgpy.HEADER}{shard}{Typgpy.ENDC}"
        )
        rsync_snapshotted_dbs(ips, shard,
                              snapshot_per_shard[beacon_chain_shard],
                              snapshot_per_shard[shard])
        print(
            f"{Typgpy.OKGREEN}Successfully rsynced chosen snapshot DBs for shard {Typgpy.HEADER}{shard}{Typgpy.ENDC}"
        )

    # The context manager releases the worker threads instead of leaking them.
    with ThreadPool(len(ips_per_shard)) as pool:
        tasks = [pool.apply_async(process, (shard, )) for shard in ips_per_shard]
        for task in tasks:
            task.get()  # blocks until done; re-raises any worker exception

    restart_and_check(ips_per_shard)
    print(
        f"{Typgpy.OKGREEN}Successfully restarted all target machines{Typgpy.ENDC}"
    )
    log.debug("finished recovery successfully")
def rsync_snapshotted_dbs(ips, shard, beacon_snapshot_config_bin,
                          shard_snapshot_config_bin):
    """
    Removes the old DB(s) and rsyncs the snapshotted DB(s).

    Assumption is that nodes have rclone setup with appropriate credentials.

    Note that rsyncs are idempotent if syncing to same bin, therefore on failure, one can
    safely re-execute a rsync that was previously successful. Moreover, rsyncs should be efficient,
    in that it only transfers missing files, therefore rsyncs of previously successful rsync will
    have little cost (computationally & network-wise).

    Every IP gets a beacon-chain rsync and, when `shard` is not the
    beacon-chain shard, an additional rsync of the shard's own DB. A worker
    result of None counts as success; anything else is treated as the
    failure reason. On failures the operator is asked whether to retry on
    the failed IPs.

    Assumes the `beacon_snapshot_config_bin` & `shard_snapshot_config_bin` matches
    rclone config setup on machine and follow format: <rclone-config>:<bin>.

    Raises RuntimeError if unable to rsync snapshotted DBs.
    """
    assert isinstance(ips, list) and len(ips) > 0
    assert isinstance(shard, int)
    assert isinstance(beacon_snapshot_config_bin, str)
    assert isinstance(shard_snapshot_config_bin, str)
    ips = ips.copy()  # make a copy to not mutate given ips

    # High process count is OK since threads just wait. The context manager
    # releases the worker threads on every exit path instead of leaking them.
    with ThreadPool(processes=200) as pool:
        while True:
            log.debug(
                f"rsyncing snapshotted dbs for shard {shard} on the following ips: {ips}"
            )
            pending = []
            for ip in ips:
                pending.append((pool.apply_async(
                    _rsync_snapshotted_dbs,
                    (ip, beacon_chain_shard, beacon_snapshot_config_bin)), ip))
                if shard != beacon_chain_shard:
                    pending.append((pool.apply_async(
                        _rsync_snapshotted_dbs,
                        (ip, shard, shard_snapshot_config_bin)), ip))
            results = [(task.get(), ip) for task, ip in pending]
            failed_results = [el for el in results if el[0] is not None]
            if not failed_results:
                log.debug("successfully rsynced snapshotted DB(s)!")
                return

            with _interaction_lock:
                print(
                    f"{Typgpy.FAIL}Some nodes failed to rsync snapshotted DB(s)!{Typgpy.ENDC}"
                )
                ips.clear()  # the next round only targets the failed nodes
                for reason, ip in failed_results:
                    print(
                        f"{Typgpy.OKGREEN}{ip}{Typgpy.ENDC} failed because of: {reason}"
                    )
                    # An IP can fail both its beacon and its shard rsync;
                    # queue it once so a retry does not duplicate its work.
                    if ip not in ips:
                        ips.append(ip)
                if interact("Retry on failed nodes?", ["yes", "no"]) == "no":
                    raise RuntimeError("Could not rsync some snapshotted DB(s)")
def setup_rclone(ips, rclone_config_path):
    """
    Setup rclone on all `ips` with the config at the given `rclone_config_path`.

    The config is embedded in a temporary bash script which is handed, with
    the raw config, to `_setup_rclone` for every IP in parallel. A worker
    result of None counts as success; anything else is treated as the
    failure reason. On failures the operator is asked whether to retry on the
    failed IPs only; answering "no" logs a warning and returns (best effort,
    nothing is raised). The temporary script is always removed on exit.

    Assumes `rclone_config_path` is a rclone config file.
    `ips` must be a non-empty list; it is not mutated.
    """
    import tempfile  # local import: only needed by this function

    assert isinstance(ips, list) and len(ips) > 0
    assert os.path.isfile(rclone_config_path)
    ips = ips.copy()  # make a copy to not mutate given ips

    # setup/save rclone setup script for ssh cmd
    with open(rclone_config_path, 'r') as f:
        rclone_config_raw = f.read()
    # NOTE(review): the config is interpolated into a double-quoted shell
    # string; a config containing `"`, `$` or backticks would be altered by
    # the shell - confirm configs never contain such characters.
    bash_script_content = f"""#!/bin/bash
echo "{rclone_config_raw}" > {rclone_config_path_on_machine} && echo successfully installed config
"""
    # mkstemp gives a unique file readable by the current user only. The
    # script embeds the rclone config, and concurrent calls (one per shard)
    # must not share a path.
    fd, bash_script_path = tempfile.mkstemp(
        prefix="snapshot_recovery_rclone_setup_script_", suffix=".sh")
    try:
        with os.fdopen(fd, 'w') as f:
            f.write(bash_script_content)

        # High process count is OK since threads just wait. The context
        # manager releases the worker threads instead of leaking them.
        with ThreadPool(processes=100) as pool:
            while True:
                log.debug(f"setting up rclone on the following ips: {ips}")
                pending = [(pool.apply_async(
                    _setup_rclone,
                    (ip, bash_script_path, rclone_config_raw)), ip)
                           for ip in ips]
                results = [(task.get(), ip) for task, ip in pending]
                failed_results = [el for el in results if el[0] is not None]
                if not failed_results:
                    log.debug("successfully setup rclone!")
                    return

                with _interaction_lock:
                    print(
                        f"{Typgpy.FAIL}Some nodes failed to setup rclone!{Typgpy.ENDC}"
                    )
                    ips.clear()  # the next round only targets the failed nodes
                    for reason, ip in failed_results:
                        print(
                            f"{Typgpy.OKGREEN}{ip}{Typgpy.ENDC} failed because of: {reason}"
                        )
                        ips.append(ip)
                    if interact("Retry on failed nodes?",
                                ["yes", "no"]) == "no":
                        print(
                            f"{Typgpy.WARNING}Could not setup rclone on some machines, but proceeding anyways...{Typgpy.ENDC}"
                        )
                        log.warning(
                            "Could not setup rclone on some machines, but proceeding anyways."
                        )
                        return
    finally:
        os.remove(bash_script_path)
def verify_network(ips_per_shard, network):
    """
    Verify that nodes are for the given network. Requires interaction if failure.

    Runs `_verify(ip, network)` for every IP of every shard in parallel. A
    result of None counts as success; anything else is treated as the
    failure reason. If nodes fail, prompt to ignore or reboot the failed
    nodes and verify all nodes again.

    Assumes `ips_per_shard` has valid IPs.

    Returns None once all nodes pass or the operator chooses to ignore the
    failures.
    """
    assert isinstance(ips_per_shard, dict) and len(ips_per_shard) > 0
    assert isinstance(network, str)
    all_ips = []
    for lst in ips_per_shard.values():
        all_ips.extend(lst)
    log.debug(f"verifying network on the following IPs: {all_ips}")

    # Single simple RPC request, pool can be large. The context manager
    # releases the worker threads on every exit path instead of leaking them.
    with ThreadPool(processes=300) as pool:
        while True:
            # Verify nodes
            log.debug(f"verifying the following ips: {all_ips}")
            pending = [(pool.apply_async(_verify, (ip, network)), ip)
                       for ip in all_ips]
            results = [(task.get(), ip) for task, ip in pending]
            failed_results = [el for el in results if el[0] is not None]
            if not failed_results:
                log.debug("passed network verification")
                return

            # Prompt user on next course of action
            choices = ["Reboot nodes and try again", "Ignore"]
            with _interaction_lock:
                print(
                    f"{Typgpy.FAIL}Some nodes failed node verification checks!{Typgpy.ENDC}"
                )
                failed_ips = []
                for reason, ip in failed_results:
                    print(
                        f"{Typgpy.OKGREEN}{ip}{Typgpy.ENDC} failed because of: {reason}"
                    )
                    failed_ips.append(ip)
                response = interact("", choices)

            # Execute next course of action
            if response == choices[-1]:  # Ignore
                log.debug("ignoring errors on verify_network")
                return
            # Compare against the choice text itself: comparing to the list
            # literal `[0]` is never true and would skip the reboot.
            if response == choices[0]:  # Reboot nodes and try again
                log.debug("restarting nodes due to failure in verify_network")
                restart_all(failed_ips)
                log.debug("sleeping 10 seconds before checking all nodes again...")
                time.sleep(10)
                continue
def get_snapshot_per_shard(network, ips_per_shard, snapshot_config_bin):
    """
    Setup function that asks, shard by shard, which snapshot DB path (used by
    rclone) to recover from.

    The beacon-chain shard is always included, even when it has no target
    IPs. For each shard the operator either selects a snapshot interactively
    (via `select_snapshot_for_shard`) or types a path, which is checked with
    `aws_s3_ls` before being accepted.

    Assumes the `snapshot_config_bin` follow format: <rclone-config>:<bin>.
    Assumes that AWS CLI is setup on machine that is running this script.

    Returns a dict of shard -> snapshot path.
    Raises RuntimeError when no snapshot could be selected for a shard, or
    when a typed path cannot be listed and the operator declines to retry.
    """
    assert isinstance(network, str)
    assert isinstance(ips_per_shard, dict) and len(ips_per_shard.keys()) > 0
    assert isinstance(snapshot_config_bin, str)

    default_bin = f"{snapshot_config_bin.split(':')[1]}/{network}/"
    target_shards = set(ips_per_shard.keys()) | {beacon_chain_shard}
    chosen = {}

    for shard in sorted(target_shards):
        choices = [
            "Interactively select snapshot db, starting from latest",
            "Manually specify path (and optionally bin) for snapshot db"
        ]
        with _interaction_lock:
            response = interact(f"How to get snapshot db for shard {shard}?",
                                choices)

        if response == choices[0]:
            # Interactively select snapshot db, starting from latest
            snapshot = select_snapshot_for_shard(network, snapshot_config_bin,
                                                 shard)
            if snapshot is None:
                raise RuntimeError(
                    f"Could not find snapshot for shard {shard}! "
                    f"Check specified snapshot bin or ignore shard when loading IPs."
                )
            chosen[shard] = snapshot
        elif response == choices[1]:
            # Manually specify path (and optionally bin) for snapshot db.
            # NOTE(review): the prefill has no `<rclone-config>:` prefix,
            # unlike the interactively selected paths - confirm intended.
            while True:
                snapshot = input_prefill(
                    f"Enter snapshot path (for rclone) on shard {shard}\n> ",
                    prefill=default_bin)
                log.debug(f"chose DB: {snapshot} for shard {shard}")
                try:
                    aws_s3_ls(snapshot)
                except subprocess.CalledProcessError as e:
                    error_msg = f"Machine is unable to list s3 files at '{snapshot}'. Error: {e}"
                    print(error_msg)
                    log.error(traceback.format_exc())
                    log.error(error_msg)
                    with _interaction_lock:
                        if interact("Retry?", ["yes", "no"]) == "no":
                            raise RuntimeError(error_msg) from e
                else:
                    break  # the path could be listed, accept it
            # Drop one trailing slash, if any
            chosen[shard] = snapshot[:-1] if snapshot.endswith("/") else snapshot

    return chosen
def select_snapshot_for_shard(network, snapshot_config_bin, shard):
    """
    Interactively select the snapshot to ensure security.

    Walks the bucket level by level: DB type (chosen by the operator), then
    shard, then the DB itself. DBs are offered with the highest block height
    first, 10 at a time; choosing "Look for more DBs" doubles how many are
    shown.

    Assumes the `snapshot_config_bin` follow format: <rclone-config>:<bin>.
    Assumes that AWS CLI is setup on machine that is running this script.
    Assumes AWS s3 structure is:
        <bin>/<network>/<db-type>/<shard-id>/harmony_db_<shard-id>.<date>.<block_height>/

    Returns string of db bin for snapshot rclone following format: <rclone-config>:<bin>.
    Return None if no db bin could be selected.
    Raises RuntimeError if the selected DB type has no entry for `shard`.
    """
    assert isinstance(network, str)
    assert isinstance(snapshot_config_bin, str)
    assert isinstance(shard, int)

    def block_height(entry):
        """Block height parsed from the text after the last '.', or -1."""
        suffix = entry.split('.')[-1]
        try:
            return int(suffix)
        except (ValueError, KeyError):
            return -1

    # Get to desired bucket of snapshot DBs
    rclone_config, snapshot_bin = snapshot_config_bin.split(':')
    log.debug(f"fetching snapshot DB path from bin '{snapshot_bin}'")
    snapshot_bin = f"{snapshot_bin}/{network}/"

    db_types = aws_s3_ls(snapshot_bin)
    if not db_types:
        return None
    with _interaction_lock:
        selected_db_type = interact("Select recovery DB type", db_types)
    log.debug(f"selected {selected_db_type} db type")
    snapshot_bin = f"{snapshot_bin}{selected_db_type}/"

    available_shards = {int(s) for s in aws_s3_ls(snapshot_bin)}
    if shard not in available_shards:
        raise RuntimeError(f"snapshot db not found for shard {shard}")
    snapshot_bin = f"{snapshot_bin}{shard}/"

    # Keep only entries with a parsable block height, highest first
    dbs = [e for e in aws_s3_ls(snapshot_bin) if block_height(e) >= 0]
    dbs.sort(key=block_height, reverse=True)
    if not dbs:
        return None

    # Request db
    more_option = "Look for more DBs"
    presented_dbs_count = 10
    while True:
        prompt_db = dbs[:presented_dbs_count] + [more_option]
        prompt = f"Select DB for shard {shard}. Format: harmony_db_SHARD.Y-M-D-H-M-S.BLOCK)"
        with _interaction_lock:
            response = interact(prompt, prompt_db, sort=False)
        if response != prompt_db[-1]:
            rclone_snapshot_db_path = f"{rclone_config}:{snapshot_bin}{response}"
            log.debug(
                f"chosen snapshot rclone path: '{rclone_snapshot_db_path}' for shard {shard}"
            )
            return rclone_snapshot_db_path
        presented_dbs_count *= 2  # widen the window and ask again