def threadable_remote_cleanup_all(remote_machines):
    """
    <Purpose>
      Calls deploy_network.remote_runcleanup for each machine in
      remote_machines, logging before and after every call.

    <Arguments>
      remote_machines:
        list of (user, ip) tuples naming the machines to clean up. A single
        (user, ip) tuple is also accepted and treated as a one-element list.

    <Exceptions>
      None raised here. Nothing is caught either, so whatever
      deploy_network.remote_runcleanup or deploy_logging.log raise propagates.

    <Side Effects>
      Writes 'Cleanup/Setup' entries through deploy_logging.

    <Returns>
      None.
    """
    # Assume single element if it's not a list. isinstance (instead of an
    # exact type comparison) keeps list subclasses from being wrapped again.
    if not isinstance(remote_machines, list):
        remote_machines = [remote_machines]

    for machine_tuple in remote_machines:
        username = machine_tuple[0]
        # Convert once so the log messages and the remote call agree; a
        # non-string host used to break the string concatenation below.
        machine = str(machine_tuple[1])

        deploy_logging.log(
            'Cleanup/Setup',
            "Attempting to ssh and run cleaning scripts on " + machine)

        # Run the remote cleanup script
        deploy_network.remote_runcleanup(username, machine)

        deploy_logging.log(
            'Cleanup/Setup',
            " ssh and run cleanup scripts done on " + machine + ". Moving on.")
def threadable_remote_run_all(remote_machines):
    """
    <Purpose>
      Connects to each remote computer and executes the actual test scripts
      through deploy_network.remote_runscript. Intended to be used as a
      threadable run function.

    <Arguments>
      remote_machines:
        list of (user, ip) tuples that the tests will be run on. A single
        (user, ip) tuple is also accepted and treated as a one-element list.

    <Exceptions>
      None raised here. Nothing is caught either, so errors from
      deploy_network.remote_runscript or deploy_logging.log propagate.

    <Side Effects>
      Writes 'Info' entries through deploy_logging. Reads the module-level
      custom_script_name and hands it to remote_runscript.

    <Returns>
      None.
    """
    # Assume single element if it's not a list. isinstance (instead of an
    # exact type comparison) keeps list subclasses from being wrapped again.
    if not isinstance(remote_machines, list):
        remote_machines = [remote_machines]

    # For every single machine we're assigned...
    for machine_tuple in remote_machines:
        username = machine_tuple[0]
        # Convert once: the first log call used to concatenate the raw value
        # while the other calls used str(machine).
        machine = str(machine_tuple[1])

        deploy_logging.log(
            'Info', "Attempting to ssh and run scripts on " + machine + "...")

        # Run the files remotely
        deploy_network.remote_runscript(username, machine, custom_script_name)

        deploy_logging.log(
            'Info', "Running scripts on " + machine + " completed. Moving on.")
def threadable_remote_cleanup_all(remote_machines):
    """
    <Purpose>
      Calls deploy_network.remote_runcleanup for each machine in
      remote_machines, logging before and after every call.

    <Arguments>
      remote_machines:
        list of (user, ip) tuples naming the machines to clean up. A single
        (user, ip) tuple is also accepted and treated as a one-element list.

    <Exceptions>
      None raised here. Nothing is caught either, so whatever
      deploy_network.remote_runcleanup or deploy_logging.log raise propagates.

    <Side Effects>
      Writes 'Cleanup/Setup' entries through deploy_logging.

    <Returns>
      None.
    """
    # Assume single element if it's not a list. isinstance (instead of an
    # exact type comparison) keeps list subclasses from being wrapped again.
    if not isinstance(remote_machines, list):
        remote_machines = [remote_machines]

    for machine_tuple in remote_machines:
        username = machine_tuple[0]
        # Convert once so the log messages and the remote call agree; a
        # non-string host used to break the string concatenation below.
        machine = str(machine_tuple[1])

        deploy_logging.log(
            'Cleanup/Setup',
            "Attempting to ssh and run cleaning scripts on " + machine)

        # Run the remote cleanup script
        deploy_network.remote_runcleanup(username, machine)

        deploy_logging.log(
            'Cleanup/Setup',
            " ssh and run cleanup scripts done on " + machine + ". Moving on.")
def threadable_remote_upload_tar(remote_machines):
    """
    <Purpose>
      Uploads the deploy tar to each machine (via upload_tar) before anything
      is run there. Machines whose upload does not report return code '0' are
      registered as unreachable.

    <Arguments>
      remote_machines:
        list of (user, ip) tuples naming the machines to upload to. A single
        (user, ip) tuple is also accepted and treated as a one-element list.

    <Exceptions>
      None raised here. Nothing is caught either, so errors from upload_tar,
      deploy_logging or deploy_threading propagate.

    <Side Effects>
      Calls deploy_threading.add_unreachable_host((username, machine)) for
      every failed upload and writes log entries through deploy_logging.
      NOTE(review): earlier documentation says the thread_communications dict
      is temporarily locked; if so that happens inside add_unreachable_host,
      which is not visible here.

    <Returns>
      None.
    """
    # Assume single element if it's not a list. isinstance (instead of an
    # exact type comparison) keeps list subclasses from being wrapped again.
    if not isinstance(remote_machines, list):
        remote_machines = [remote_machines]

    for machine_tuple in remote_machines:
        # split up the tuple; convert the host once so that logging, the
        # upload and the unreachable-host record all use the same string
        username = machine_tuple[0]
        machine = str(machine_tuple[1])

        deploy_logging.log('Setup',
                           'Attemping tar file upload via scp on ' + machine)
        scp_errcode, scp_stdout, scp_stderr = upload_tar(username, machine)

        # only the formatted stderr is used in the messages below
        _, err = deploy_logging.format_stdout_and_err(scp_stdout, scp_stderr)

        # check the error codes (compared as strings)
        code = str(scp_errcode)
        if code == '0':
            deploy_logging.log('Setup',
                               ' scp file upload complete on ' + machine)
            continue

        if code == '1':
            deploy_logging.logerror(
                'Could not establish a connection with ' + machine +
                ' (' + err + ')')
        else:
            deploy_logging.logerror(
                'scp returned unknown error code ' + code + ' (' + err + ')')

        # any non-zero code marks the host as unreachable
        deploy_threading.add_unreachable_host((username, machine))
def threadable_cleanup_final(remote_machines):
    """
    <Purpose>
      Cleans the files created by this script from the remote machine by
      running a series of 'rm -rf' commands there in one remote shell call.

    <Arguments>
      remote_machines:
        a list containing a single tuple of (user, remotehost). A bare
        (user, remotehost) tuple is also accepted. If the list holds several
        tuples, each machine is cleaned in turn.

    <Exceptions>
      None raised here. Nothing is caught either, so errors from
      deploy_network.remote_shellexec or deploy_logging propagate.

    <Side Effects>
      Deletes files in the remote working directory and writes
      'Final cleanup' / 'Detailed cleanup' entries through deploy_logging.

    <Returns>
      None.
    """
    # Assume single element if it's not a list. isinstance (instead of an
    # exact type comparison) keeps list subclasses from being wrapped again.
    if not isinstance(remote_machines, list):
        remote_machines = [remote_machines]

    for machine_tuple in remote_machines:
        username = machine_tuple[0]
        # Convert once; the raw value used to be concatenated into strings
        # while str(machine) was passed to the remote call.
        machine = str(machine_tuple[1])

        deploy_logging.log('Final cleanup', 'Final cleanup of ' + machine)

        # the files/folders we need to delete on the remote side
        cmd_list = [
            'rm -rf runlocaltests.py',
            'rm -rf hashes.dict',
            'rm -rf ' + machine + '.deployrun.log',
            'rm -rf ' + machine + '.deployrun.err.log',
            'rm -rf testprocess.py',
            'rm -rf verifyfiles.mix',
            'rm -rf ' + machine + '.tgz',
            'rm -rf deploy.tar',
            'rm -rf cleanup_deploy.py',
        ]
        # TODO: cleanup custom scripts as well here?

        # merge the commands into a single string that we'll execute
        cmd_str = '; '.join(cmd_list)
        ssh_stdout, ssh_stderr, ssh_errcode = deploy_network.remote_shellexec(
            cmd_str, username, machine)

        deploy_logging.print_to_log('Detailed cleanup', ssh_stdout,
                                    ssh_stderr, ssh_errcode)
    return
def threadable_cleanup_final(remote_machines):
    """
    <Purpose>
      Cleans the files created by this script from the remote machine by
      running a series of 'rm -rf' commands there in one remote shell call.

    <Arguments>
      remote_machines:
        a list containing a single tuple of (user, remotehost). A bare
        (user, remotehost) tuple is also accepted. If the list holds several
        tuples, each machine is cleaned in turn.

    <Exceptions>
      None raised here. Nothing is caught either, so errors from
      deploy_network.remote_shellexec or deploy_logging propagate.

    <Side Effects>
      Deletes files in the remote working directory and writes
      'Final cleanup' / 'Detailed cleanup' entries through deploy_logging.

    <Returns>
      None.
    """
    # Assume single element if it's not a list. isinstance (instead of an
    # exact type comparison) keeps list subclasses from being wrapped again.
    if not isinstance(remote_machines, list):
        remote_machines = [remote_machines]

    for machine_tuple in remote_machines:
        username = machine_tuple[0]
        # Convert once; the raw value used to be concatenated into strings
        # while str(machine) was passed to the remote call.
        machine = str(machine_tuple[1])

        deploy_logging.log('Final cleanup', 'Final cleanup of ' + machine)

        # the files/folders we need to delete on the remote side
        cmd_list = [
            'rm -rf runlocaltests.py',
            'rm -rf hashes.dict',
            'rm -rf ' + machine + '.deployrun.log',
            'rm -rf ' + machine + '.deployrun.err.log',
            'rm -rf testprocess.py',
            'rm -rf verifyfiles.mix',
            'rm -rf ' + machine + '.tgz',
            'rm -rf deploy.tar',
            'rm -rf cleanup_deploy.py',
        ]
        # TODO: cleanup custom scripts as well here?

        # merge the commands into a single string that we'll execute
        cmd_str = '; '.join(cmd_list)
        ssh_stdout, ssh_stderr, ssh_errcode = deploy_network.remote_shellexec(
            cmd_str, username, machine)

        deploy_logging.print_to_log('Detailed cleanup', ssh_stdout,
                                    ssh_stderr, ssh_errcode)
    return
def get_remote_hosts_from_file(fname=custom_host_file, nolog=False):
    """
    <Purpose>
      Opens the remote-hosts file and reports whether it could be read.

      The documented file format is:

        !user:[username]
        [IPs]

      [username] is the username that will be used until a new username is
      specified in the same format. NOTE: Username is case sensitive.
      [IPs] are a list of IPs/hostnames (one per line) associated with that
      username.

    <Arguments>
      fname:
        Optional. Defaults to the module-level custom_host_file.
        NOTE(review): the argument is overwritten with the current value of
        custom_host_file below, so a caller-supplied name is ignored.
      nolog:
        Optional. Default is False. Documented as "nothing will be written
        to the logfile" when true.
        NOTE(review): not consulted by the code in this block.

    <Exceptions>
      None raised for a missing/unreadable file: the error is caught and
      logged.

    <Side Effects>
      Logs an 'Error' entry through deploy_logging when the file cannot be
      opened.

    <Returns>
      False when the host file cannot be opened.
      NOTE(review): the documented contract is a list of (username, ip)
      tuples on success, but no parsing code is present in this block; after
      a successful open it closes the file and returns None.
    """
    global custom_host_file
    # always read the current module-level setting (see note on fname above)
    fname = custom_host_file

    # IP file must be in the same dir as this script
    try:
        file_of_ips = open(fname, 'r')
    except Exception as e:
        deploy_logging.log(
            'Error',
            'Are you missing your list of remote hosts? (' + str(e) + ')')
        # Nothing was opened, so there is nothing to close; report the
        # failure the way the contract documents it.
        return False

    # The handle is only bound when open() succeeded, so close it here
    # instead of guarding a possibly-unbound name with a blanket except.
    file_of_ips.close()
def sleep_on_conn_refused(out, err, timesleft, remote_host):
    """
    <Purpose>
      Given the stdout/stderr of an ssh/scp call, checks whether the
      connection was refused and, if so, sleeps so the caller can retry.

      The sleep is 60 seconds divided by the number of tries left, so it
      grows as tries run out. With 3 retries:
        1st run: sleep 60/3 (20s)
        2nd run: sleep 60/2 (30s)
        3rd run: sleep 60/1 (60s)

    <Arguments>
      out:
        the stdout text of the ssh/scp call.
      err:
        the stderr text of the ssh/scp call.
      timesleft:
        how many times we have left to try and connect. Values of zero or
        less are treated as "last try" and sleep the full 60 seconds.
      remote_host:
        the host that was contacted. The sleep only applies to hosts whose
        name contains '128.' (treated as instructional machines).

    <Exceptions>
      None.

    <Side Effects>
      Logs a 'WARNING' entry through deploy_logging and blocks in
      time.sleep when a refused connection is detected.

    <Returns>
      Boolean. True if we did a sleep, False if we didn't.
    """
    # case-insensitive search of both streams for the refusal message
    out_refused = 'connection refused' in out.lower()
    err_refused = 'connection refused' in err.lower()

    instructional_machine = '128.' in remote_host
    if not instructional_machine or not (out_refused or err_refused):
        return False

    # sleep then try again
    deploy_logging.log(
        'WARNING',
        "Connection refused, forced sleeping to overcome " +
        "timeout (" + str(timesleft) + " timeouts left)")

    # Each time you sleep a little longer. Guard against timesleft <= 0,
    # which used to raise ZeroDivisionError or request a negative sleep.
    if timesleft > 0:
        delay = 60 / timesleft
    else:
        delay = 60
    time.sleep(delay)
    return True
def threadable_remote_upload_tar(remote_machines):
    """
    <Purpose>
      Uploads the deploy tar to each machine (via upload_tar) before anything
      is run there. Machines whose upload does not report return code '0' are
      registered as unreachable.

    <Arguments>
      remote_machines:
        list of (user, ip) tuples naming the machines to upload to. A single
        (user, ip) tuple is also accepted and treated as a one-element list.

    <Exceptions>
      None raised here. Nothing is caught either, so errors from upload_tar,
      deploy_logging or deploy_threading propagate.

    <Side Effects>
      Calls deploy_threading.add_unreachable_host((username, machine)) for
      every failed upload and writes log entries through deploy_logging.
      NOTE(review): earlier documentation says the thread_communications dict
      is temporarily locked; if so that happens inside add_unreachable_host,
      which is not visible here.

    <Returns>
      None.
    """
    # Assume single element if it's not a list. isinstance (instead of an
    # exact type comparison) keeps list subclasses from being wrapped again.
    if not isinstance(remote_machines, list):
        remote_machines = [remote_machines]

    for machine_tuple in remote_machines:
        # split up the tuple; convert the host once so that logging, the
        # upload and the unreachable-host record all use the same string
        username = machine_tuple[0]
        machine = str(machine_tuple[1])

        deploy_logging.log('Setup',
                           'Attemping tar file upload via scp on ' + machine)
        scp_errcode, scp_stdout, scp_stderr = upload_tar(username, machine)

        # only the formatted stderr is used in the messages below
        _, err = deploy_logging.format_stdout_and_err(scp_stdout, scp_stderr)

        # check the error codes (compared as strings)
        code = str(scp_errcode)
        if code == '0':
            deploy_logging.log('Setup',
                               ' scp file upload complete on ' + machine)
            continue

        if code == '1':
            deploy_logging.logerror(
                'Could not establish a connection with ' + machine +
                ' (' + err + ')')
        else:
            deploy_logging.logerror(
                'scp returned unknown error code ' + code + ' (' + err + ')')

        # any non-zero code marks the host as unreachable
        deploy_threading.add_unreachable_host((username, machine))
def get_remote_hosts_from_file(fname = custom_host_file, nolog = False):
    """
    <Purpose>
      Opens the remote-hosts file and reports whether it could be read.

      The documented file format is:

        !user:[username]
        [IPs]

      [username] is the username that will be used until a new username is
      specified in the same format. NOTE: Username is case sensitive.
      [IPs] are a list of IPs/hostnames (one per line) associated with that
      username.

    <Arguments>
      fname:
        Optional. Defaults to the module-level custom_host_file.
        NOTE(review): the argument is overwritten with the current value of
        custom_host_file below, so a caller-supplied name is ignored.
      nolog:
        Optional. Default is False. Documented as "nothing will be written
        to the logfile" when true.
        NOTE(review): not consulted by the code in this block.

    <Exceptions>
      None raised for a missing/unreadable file: the error is caught and
      logged.

    <Side Effects>
      Logs an 'Error' entry through deploy_logging when the file cannot be
      opened.

    <Returns>
      False when the host file cannot be opened.
      NOTE(review): the documented contract is a list of (username, ip)
      tuples on success, but no parsing code is present in this block; after
      a successful open it closes the file and returns None.
    """
    global custom_host_file
    # always read the current module-level setting (see note on fname above)
    fname = custom_host_file

    # IP file must be in the same dir as this script
    try:
        file_of_ips = open(fname, 'r')
    except Exception as e:
        deploy_logging.log(
            'Error',
            'Are you missing your list of remote hosts? (' + str(e) + ')')
        # Nothing was opened, so there is nothing to close; report the
        # failure the way the contract documents it.
        return False

    # The handle is only bound when open() succeeded, so close it here
    # instead of guarding a possibly-unbound name with a blanket except.
    file_of_ips.close()
def upload_tar(user, remote_host, tar_filename="deploy.tar"):
    """
    <Purpose>
      Uploads the tar to remote_host by logging in as user@remote_host, and
      logs whether the upload succeeded. Uses
      deploy_network.remote_upload_file to upload the actual file.

    <Arguments>
      user:
        the user to log in as on the remote machine.
      remote_host:
        the remote machine's IP to which we'll be uploading files.
      tar_filename:
        Optional. Default is deploy.tar. The tar file to upload to the
        remote host.

    <Exceptions>
      None raised here; errors from deploy_network.remote_upload_file
      propagate.

    <Side Effects>
      Writes one success or error entry through deploy_logging.

    <Returns>
      A tuple: (returncode, stdout, stderr), with returncode converted to a
      string.
    """
    # call helper to scp
    stdoutdata, stderrdata, returncode = deploy_network.remote_upload_file(
        tar_filename, user, remote_host)
    returncode = str(returncode)

    # Check the return code. It is compared as a string, the same way the
    # caller threadable_remote_upload_tar checks it, so that 0 and '0' are
    # both recognised as success.
    if returncode == '0':
        deploy_logging.log(remote_host,
                           'Successfully uploaded ' + str(tar_filename))
    else:
        # name the file that was actually uploaded rather than assuming the
        # default
        deploy_logging.logerror(
            str(remote_host) + ': Trouble uploading ' + str(tar_filename))

    return (returncode, stdoutdata, stderrdata)
def upload_tar(user, remote_host, tar_filename = "deploy.tar"):
    """
    <Purpose>
      Uploads the tar to remote_host by logging in as user@remote_host, and
      logs whether the upload succeeded. Uses
      deploy_network.remote_upload_file to upload the actual file.

    <Arguments>
      user:
        the user to log in as on the remote machine.
      remote_host:
        the remote machine's IP to which we'll be uploading files.
      tar_filename:
        Optional. Default is deploy.tar. The tar file to upload to the
        remote host.

    <Exceptions>
      None raised here; errors from deploy_network.remote_upload_file
      propagate.

    <Side Effects>
      Writes one success or error entry through deploy_logging.

    <Returns>
      A tuple: (returncode, stdout, stderr), with returncode converted to a
      string.
    """
    # call helper to scp
    stdoutdata, stderrdata, returncode = deploy_network.remote_upload_file(
        tar_filename, user, remote_host)
    returncode = str(returncode)

    # Check the return code. It is compared as a string, the same way the
    # caller threadable_remote_upload_tar checks it, so that 0 and '0' are
    # both recognised as success.
    if returncode == '0':
        deploy_logging.log(remote_host,
                           'Successfully uploaded ' + str(tar_filename))
    else:
        # name the file that was actually uploaded rather than assuming the
        # default
        deploy_logging.logerror(
            str(remote_host) + ': Trouble uploading ' + str(tar_filename))

    return (returncode, stdoutdata, stderrdata)
def threadable_remote_run_all(remote_machines):
    """
    <Purpose>
      Connects to each remote computer and executes the actual test scripts
      through deploy_network.remote_runscript. Intended to be used as a
      threadable run function.

    <Arguments>
      remote_machines:
        list of (user, ip) tuples that the tests will be run on. A single
        (user, ip) tuple is also accepted and treated as a one-element list.

    <Exceptions>
      None raised here. Nothing is caught either, so errors from
      deploy_network.remote_runscript or deploy_logging.log propagate.

    <Side Effects>
      Writes 'Info' entries through deploy_logging. Reads the module-level
      custom_script_name and hands it to remote_runscript.

    <Returns>
      None.
    """
    # Assume single element if it's not a list. isinstance (instead of an
    # exact type comparison) keeps list subclasses from being wrapped again.
    if not isinstance(remote_machines, list):
        remote_machines = [remote_machines]

    # For every single machine we're assigned...
    for machine_tuple in remote_machines:
        username = machine_tuple[0]
        # Convert once: the first log call used to concatenate the raw value
        # while the other calls used str(machine).
        machine = str(machine_tuple[1])

        deploy_logging.log(
            'Info', "Attempting to ssh and run scripts on " + machine + "...")

        # Run the files remotely
        deploy_network.remote_runscript(username, machine, custom_script_name)

        deploy_logging.log(
            'Info', "Running scripts on " + machine + " completed. Moving on.")
def prep_local_dirs(keep):
    """
    <Purpose>
      Prepares the local directories: removes the old ./deploy.logs/ folder,
      or archives it to deploy.logs.<n>.tgz when it should be kept, then
      makes sure ./deploy.logs/ and ./deploy.logs/temp/ exist.

    <Arguments>
      keep:
        Boolean. Do we keep the old log directory?

    <Exceptions>
      None.

    <Side Effects>
      Runs rm/mv/tar through shellexec2, creates directories, prints
      progress messages and writes 'Info' entries through deploy_logging.

    <Returns>
      None.
    """
    logs_root = "./deploy.logs/"

    if os.path.isdir(logs_root):
        if keep:
            print('Trying to move old log directory...')

            # find an 'integer'-suffixed name that is taken neither by a
            # directory nor by an existing .tgz archive
            suffix = 1
            while True:
                candidate = 'deploy.logs.' + str(suffix)
                taken = (os.path.isdir("./" + candidate + "/") or
                         os.path.isfile(candidate + '.tgz'))
                if not taken:
                    break
                time.sleep(.2)
                suffix += 1

            # got a folder index that doesn't exist
            archived = 'deploy.logs.' + str(suffix)
            shellexec2('mv ./deploy.logs/ ./' + archived)
            print('Moved old log directory successfully to ./' + archived)

            print('Tarring the directory...')
            shellexec2('tar -czf ' + archived + '.tgz ' + archived)

            print('Tar created, removing uncompressed files...')
            shellexec2('rm -rf ' + archived)
        else:
            # not keeping anything: delete the old directory
            shellexec2('rm -rf ./deploy.logs/')

    # set up logs directory if one doesn't exist (any more)
    if os.path.isdir(logs_root):
        deploy_logging.log('Info', "Logs directory found..")
    else:
        os.mkdir(logs_root)
        deploy_logging.log('Info', "Setting up logs directory..")
    deploy_logging.log('', deploy_logging.sep)

    # temp folder that'll be used later
    if not os.path.isdir('./deploy.logs/temp'):
        os.mkdir('./deploy.logs/temp/')
    return
def prep_local_dirs(keep):
    """
    <Purpose>
      Prepares the local directories: removes the old ./deploy.logs/ folder,
      or archives it to deploy.logs.<n>.tgz when it should be kept, then
      makes sure ./deploy.logs/ and ./deploy.logs/temp/ exist.

    <Arguments>
      keep:
        Boolean. Do we keep the old log directory?

    <Exceptions>
      None.

    <Side Effects>
      Runs rm/mv/tar through shellexec2, creates directories, prints
      progress messages and writes 'Info' entries through deploy_logging.

    <Returns>
      None.
    """
    logs_root = "./deploy.logs/"

    if os.path.isdir(logs_root):
        if keep:
            print('Trying to move old log directory...')

            # find an 'integer'-suffixed name that is taken neither by a
            # directory nor by an existing .tgz archive
            suffix = 1
            while True:
                candidate = 'deploy.logs.' + str(suffix)
                taken = (os.path.isdir("./" + candidate + "/") or
                         os.path.isfile(candidate + '.tgz'))
                if not taken:
                    break
                time.sleep(.2)
                suffix += 1

            # got a folder index that doesn't exist
            archived = 'deploy.logs.' + str(suffix)
            shellexec2('mv ./deploy.logs/ ./' + archived)
            print('Moved old log directory successfully to ./' + archived)

            print('Tarring the directory...')
            shellexec2('tar -czf ' + archived + '.tgz ' + archived)

            print('Tar created, removing uncompressed files...')
            shellexec2('rm -rf ' + archived)
        else:
            # not keeping anything: delete the old directory
            shellexec2('rm -rf ./deploy.logs/')

    # set up logs directory if one doesn't exist (any more)
    if os.path.isdir(logs_root):
        deploy_logging.log('Info', "Logs directory found..")
    else:
        os.mkdir(logs_root)
        deploy_logging.log('Info', "Setting up logs directory..")
    deploy_logging.log('', deploy_logging.sep)

    # temp folder that'll be used later
    if not os.path.isdir('./deploy.logs/temp'):
        os.mkdir('./deploy.logs/temp/')
    return
# call the deploy script that'll pick up from here.. deploy() print deploy_logging.sep # cleanup the local temp directory shellexec2('rm -rf ./deploy.logs/temp/') print 'Compacting...' # summarize the logfile before building the summary shellexec2('python log_maintenance.py dummyarg') print 'Building summary logfile..' deploy_logging.build_summary() print deploy_logging.sep deploy_logging.log('Finished', 'All finished.') # returns 1 if there was an error. elif returncode == 1: print 'Error in creating tar.. Aborting' else: # just so we catch all the conditions.. print 'CRITICAL ERROR! script returned with unexpected retcode ('+\ str(returncode)+')' def upload_tar(user, remote_host, tar_filename="deploy.tar"): """ <Purpose> This function will upload the tar to the remote_host via scp by logging in as user@remote_host, and log the return code as well as anything printed to stderr or stdout (which is expected to be empty).
def deploy(retry_attempts=0):
    """
    <Purpose>
      This function is the brains behind the deploy script. All the main
      calls originate from this function.

      -Gets list of remote hosts from a file
      -Calls connect_and_do_work on those hosts, and again on the
       instructional machines in thread_communications['machine_list'] when
       that entry exists
      -Optionally retries unreachable hosts
      -Kills any ssh/scp child processes that are still registered

    <Arguments>
      retry_attempts:
        Optional. Default is 0, i.e. unreachable hosts are NOT retried.
        Since this runs regularly as a service there is normally no need:
        most of these hosts time out anyway, so retrying just takes a lot
        longer than it should. Each retry raises
        deploy_network.default_connection_timeout by 25%.

    <Exceptions>
      None raised here. Returns early when no hosts were found and there
      are no instructional machines to process.

    <Side Effects>
      Resets thread_communications['unreachable_host'] and
      thread_communications['running_process_ids'], may change
      deploy_network.default_connection_timeout, prints status messages and
      sends signal 9 to every registered child process id.

    <Returns>
      None.
    """
    # Get list of hosts
    myhosts = get_remote_hosts_from_file()

    if not myhosts:
        # if we didn't find any hosts.. crap out!
        print("Didn't find any remote hosts file!")
        deploy_logging.logerror("Didn't find any remote hosts file!")
        # return if we don't have instructional machines to process
        if 'machine_list' not in deploy_threading.thread_communications:
            return
    elif 'machine_list' in deploy_threading.thread_communications:
        # we also have instructional machines: make sure we're not being
        # tricked - remove all instructional machines from the myhosts list
        machine_list = deploy_threading.thread_communications['machine_list']
        myhosts = list(set(myhosts) - set(machine_list))

    # initialize thread_communications dictionary to a list which will have
    # our unreachable hosts
    deploy_threading.thread_communications['unreachable_host'] = []

    # this will keep track of the proc id's that are launched on different
    # threads. These are ssh/scp processes. We keep track of these because
    # we want to make sure that when we exit deploy.py, we kill all of these
    # processes - they should be killed by that time unless there was some
    # kind of error.
    deploy_threading.thread_communications['running_process_ids'] = []

    # initial run
    connect_and_do_work(myhosts)

    # now do the same for the instructional machines if we have any.
    # NOTE(review): the extra argument 3 is passed through unchanged; its
    # meaning is defined by connect_and_do_work, which is not shown here.
    if 'machine_list' in deploy_threading.thread_communications:
        connect_and_do_work(
            deploy_threading.thread_communications['machine_list'], 3)

    # if we had unreachable hosts..
    if deploy_threading.has_unreachable_hosts():
        for attempt in range(retry_attempts):
            # increase timeout time by 25% each time
            deploy_network.default_connection_timeout = str(int(
                float(deploy_network.default_connection_timeout) * 1.25))

            # 1. use list of unreachable hosts list as our list to retry
            last_failed_hosts = \
                deploy_threading.thread_communications['unreachable_host']

            # 2. reset the unreachable hosts list
            deploy_threading.thread_communications['unreachable_host'] = []
            deploy_logging.log(
                "Notice",
                "Trying to connect to failed hosts (connection attempt #" +
                str(attempt + 2) + ")")
            connect_and_do_work(last_failed_hosts)

            # nothing left to retry
            if not deploy_threading.has_unreachable_hosts():
                break

    print("Checking that all child threads/processes are dead...")
    for each_tuple in \
            deploy_threading.thread_communications['running_process_ids']:
        # Reset per entry so the error message below can never report a pid
        # left over from a previous iteration (or an unbound name).
        procid = None
        try:
            # tuple is (pid, expiretime, remotehost, username)
            procid = int(each_tuple[0])
            os.kill(procid, 9)
        except OSError:
            # process is already gone (or cannot be signalled): best effort
            pass
        except Exception as e:
            if procid is None:
                # the pid itself could not be read; show the raw entry
                procid = each_tuple
            print("Something went wrong while trying to kill process " +
                  str(procid) + ", " + str(e))
# call the deploy script that'll pick up from here.. deploy() print deploy_logging.sep # cleanup the local temp directory shellexec2('rm -rf ./deploy.logs/temp/') print 'Compacting...' # summarize the logfile before building the summary shellexec2('python log_maintenance.py dummyarg') print 'Building summary logfile..' deploy_logging.build_summary() print deploy_logging.sep deploy_logging.log('Finished', 'All finished.') # returns 1 if there was an error. elif returncode == 1: print 'Error in creating tar.. Aborting' else: # just so we catch all the conditions.. print 'CRITICAL ERROR! script returned with unexpected retcode ('+\ str(returncode)+')' def upload_tar(user, remote_host, tar_filename = "deploy.tar"): """ <Purpose> This function will upload the tar to the remote_host via scp by logging in as user@remote_host, and log the return code as well as anything
def remote_get_log(user, remote_host):
    """
    <Purpose>
      Gets the remote logs (all tarred up) from remote_host, copies them to
      a local directory via remote_download_file and untars them into
      deploy.logs/[remote_host]/.

    <Arguments>
      user:
        the user to log in as
      remote_host:
        the IP of the host to get the logs from

    <Exceptions>
      None propagate from the download/untar steps: every Exception is
      caught, and it is only logged when deploy_main.verbosity == 2.

    <Side Effects>
      Creates ./deploy.logs/[remote_host]/, runs tar through
      deploy_helper.shellexec2, deletes the downloaded .tgz afterwards and
      writes 'Downloading logs' entries through deploy_logging.

    <Returns>
      No returns.
    """
    local_dir = './deploy.logs/' + remote_host
    local_tar = local_dir + '/' + remote_host + '.tgz'

    try:
        # set up dir that we'll move the remote .tar into
        if not os.path.isdir(local_dir):
            os.mkdir(local_dir)

        # download the tar file from remote host
        # NOTE(review): the return code is not inspected here; a failed
        # download only surfaces later, when the untar/remove steps fail.
        out, err, returncode = remote_download_file(
            remote_host + '.tgz', local_tar, user, remote_host)

        deploy_logging.log('Downloading logs',
                           'Logs downloaded from ' + remote_host)

        # now try to untar the files
        # if deploy_main.verbosity >= 1 then we'll be verbose
        if deploy_main.verbosity >= 1:
            tar_command = 'tar -xvvf ' + remote_host + '.tgz'
        else:
            tar_command = 'tar -xf ' + remote_host + '.tgz'

        # tar is picky about where it'll unzip to (CWD), so we'll just cd
        # there. The commands are chained with '&&' so tar never runs in the
        # wrong directory when the cd fails.
        command_string = ' && '.join(
            ['cd ./deploy.logs/' + remote_host + '/', tar_command])

        # execute string
        out, err, retvalue = deploy_helper.shellexec2(command_string)

        deploy_logging.log('Downloading logs',
                           'Logs from ' + remote_host + ' are ready')

        # we no longer need the tar file, just hogging up space
        os.remove(local_tar)

    except Exception as e:
        if deploy_main.verbosity == 2:
            # Only log if we error and need to narrow this down. otherwise,
            # it gets really spammy.
            deploy_logging.logerror(
                remote_host + ": Some kind of err in remote_get_log. (" +
                remote_host + ") , error:" + str(e) + ")")
def deploy(retry_attempts=0):
    """
    <Purpose>
      This function is the brains behind the deploy script. All the main
      calls originate from this function.

      -Gets list of remote hosts from a file
      -Calls connect_and_do_work on those hosts, and again on the
       instructional machines in thread_communications['machine_list'] when
       that entry exists
      -Optionally retries unreachable hosts
      -Kills any ssh/scp child processes that are still registered

    <Arguments>
      retry_attempts:
        Optional. Default is 0, i.e. unreachable hosts are NOT retried.
        Since this runs regularly as a service there is normally no need:
        most of these hosts time out anyway, so retrying just takes a lot
        longer than it should. Each retry raises
        deploy_network.default_connection_timeout by 25%.

    <Exceptions>
      None raised here. Returns early when no hosts were found and there
      are no instructional machines to process.

    <Side Effects>
      Resets thread_communications['unreachable_host'] and
      thread_communications['running_process_ids'], may change
      deploy_network.default_connection_timeout, prints status messages and
      sends signal 9 to every registered child process id.

    <Returns>
      None.
    """
    # Get list of hosts
    myhosts = get_remote_hosts_from_file()

    if not myhosts:
        # if we didn't find any hosts.. crap out!
        print("Didn't find any remote hosts file!")
        deploy_logging.logerror("Didn't find any remote hosts file!")
        # return if we don't have instructional machines to process
        if 'machine_list' not in deploy_threading.thread_communications:
            return
    elif 'machine_list' in deploy_threading.thread_communications:
        # we also have instructional machines: make sure we're not being
        # tricked - remove all instructional machines from the myhosts list
        machine_list = deploy_threading.thread_communications['machine_list']
        myhosts = list(set(myhosts) - set(machine_list))

    # initialize thread_communications dictionary to a list which will have
    # our unreachable hosts
    deploy_threading.thread_communications['unreachable_host'] = []

    # this will keep track of the proc id's that are launched on different
    # threads. These are ssh/scp processes. We keep track of these because
    # we want to make sure that when we exit deploy.py, we kill all of these
    # processes - they should be killed by that time unless there was some
    # kind of error.
    deploy_threading.thread_communications['running_process_ids'] = []

    # initial run
    connect_and_do_work(myhosts)

    # now do the same for the instructional machines if we have any.
    # NOTE(review): the extra argument 3 is passed through unchanged; its
    # meaning is defined by connect_and_do_work, which is not shown here.
    if 'machine_list' in deploy_threading.thread_communications:
        connect_and_do_work(
            deploy_threading.thread_communications['machine_list'], 3)

    # if we had unreachable hosts..
    if deploy_threading.has_unreachable_hosts():
        for attempt in range(retry_attempts):
            # increase timeout time by 25% each time
            deploy_network.default_connection_timeout = str(int(
                float(deploy_network.default_connection_timeout) * 1.25))

            # 1. use list of unreachable hosts list as our list to retry
            last_failed_hosts = \
                deploy_threading.thread_communications['unreachable_host']

            # 2. reset the unreachable hosts list
            deploy_threading.thread_communications['unreachable_host'] = []
            deploy_logging.log(
                "Notice",
                "Trying to connect to failed hosts (connection attempt #" +
                str(attempt + 2) + ")")
            connect_and_do_work(last_failed_hosts)

            # nothing left to retry
            if not deploy_threading.has_unreachable_hosts():
                break

    print("Checking that all child threads/processes are dead...")
    for each_tuple in \
            deploy_threading.thread_communications['running_process_ids']:
        # Reset per entry so the error message below can never report a pid
        # left over from a previous iteration (or an unbound name).
        procid = None
        try:
            # tuple is (pid, expiretime, remotehost, username)
            procid = int(each_tuple[0])
            os.kill(procid, 9)
        except OSError:
            # process is already gone (or cannot be signalled): best effort
            pass
        except Exception as e:
            if procid is None:
                # the pid itself could not be read; show the raw entry
                procid = each_tuple
            print("Something went wrong while trying to kill process " +
                  str(procid) + ", " + str(e))