def main(_):
    '''main function'''

    cluster_dir = os.path.join(FLAGS.expdir, 'cluster')

    #the chief of the job should write the cluster id
    if int(FLAGS.pid) == 0:
        with open(os.path.join(cluster_dir, '%s-cid' % FLAGS.job_name),
                  'w') as fid:
            fid.write(FLAGS.cid)

    #wait for the preceding cluster tasks to report
    machines = cluster.get_machines(cluster_dir)
    while len(machines[FLAGS.job_name]) < int(FLAGS.pid):
        machines = cluster.get_machines(cluster_dir)
        sleep(1)

    port = 1024
    ip = socket.gethostbyname(socket.gethostname())
    machine_file = '%s/%s-%d' % (cluster_dir, ip, port)

    #look for an available port
    while os.path.exists(machine_file) or not cluster.port_available(port):
        port += 1
        machine_file = '%s/%s-%d' % (cluster_dir, ip, port)

    #report that the machine is ready
    with open(machine_file, 'w') as fid:
        fid.write(FLAGS.job_name)

    #wait until the main process has given a go
    print('waiting for cluster to be ready...')
    while not os.path.exists(cluster_dir + '/ready'):
        sleep(1)
    print('cluster is ready')

    #start the training process
    train(clusterfile=cluster_dir + '/cluster',
          job_name=FLAGS.job_name,
          task_index=int(FLAGS.pid),
          ssh_command=FLAGS.ssh_command,
          expdir=FLAGS.expdir)

    #delete the file to notify that the process has finished
    os.remove(machine_file)
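#the FLAGS object read above is not defined in this snippet; below is a
#minimal sketch of the expected entry point, assuming TensorFlow 1.x style
#tf.app.flags (the flag names mirror the FLAGS attributes read above, but
#the defaults and help strings are assumptions, not the original
#definitions)
import tensorflow as tf

tf.app.flags.DEFINE_string('expdir', None, 'the experiments directory')
tf.app.flags.DEFINE_string('job_name', None, 'the job name: ps or worker')
tf.app.flags.DEFINE_string('pid', '0', 'the process index within the job')
tf.app.flags.DEFINE_string('cid', '0', 'the cluster id of the job')
tf.app.flags.DEFINE_string('ssh_command', 'None', 'the ssh command to use')
FLAGS = tf.app.flags.FLAGS

if __name__ == '__main__':
    #tf.app.run parses the flags and calls main
    tf.app.run()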
def main(expdir, recipe, mode, computing):
    '''main function'''

    if expdir is None:
        raise Exception('no expdir specified. Command usage: '
                        'nabu train --expdir=/path/to/expdir '
                        '--recipe=/path/to/recipe')

    if recipe is None:
        raise Exception('no recipe specified. Command usage: '
                        'nabu train --expdir=/path/to/expdir '
                        '--recipe=/path/to/recipe')

    if not os.path.isdir(recipe):
        raise Exception('cannot find recipe %s' % recipe)

    if mode not in ['non_distributed', 'single_machine', 'multi_machine']:
        raise Exception('unknown distributed mode: %s' % mode)

    if computing not in ['standard', 'condor']:
        raise Exception('unknown computing mode: %s' % computing)

    database_cfg_file = os.path.join(recipe, 'database.cfg')
    model_cfg_file = os.path.join(recipe, 'model.cfg')
    trainer_cfg_file = os.path.join(recipe, 'trainer.cfg')
    evaluator_cfg_file = os.path.join(recipe, 'validation_evaluator.cfg')

    if os.path.isdir(expdir):
        text = ''
        while text not in ('o', 'r'):
            text = raw_input('%s already exists, do you want to '
                             'resume training (r) or overwrite (o) '
                             '(respond with o or r): ' % expdir)
        if text == 'o':
            while text not in ('y', 'n'):
                text = raw_input('%s will be deleted, are you sure? '
                                 '(y or n): ' % expdir)
            if text == 'y':
                shutil.rmtree(expdir)
            else:
                return 0

    if not os.path.isdir(expdir):
        os.makedirs(expdir)
        os.makedirs(os.path.join(expdir, 'model'))

        #copy the configs to the expdir so they can be read there and the
        #experiment information is stored
        shutil.copyfile(database_cfg_file,
                        os.path.join(expdir, 'database.cfg'))
        shutil.copyfile(model_cfg_file, os.path.join(expdir, 'model.cfg'))
        shutil.copyfile(evaluator_cfg_file,
                        os.path.join(expdir, 'validation_evaluator.cfg'))
        shutil.copyfile(trainer_cfg_file,
                        os.path.join(expdir, 'trainer.cfg'))

    computing_cfg_file = 'config/computing/%s/%s.cfg' % (computing, mode)

    if os.path.isdir(os.path.join(expdir, 'processes')):
        shutil.rmtree(os.path.join(expdir, 'processes'))
    os.makedirs(os.path.join(expdir, 'processes'))

    if computing == 'standard':

        if mode == 'non_distributed':

            train(clusterfile=None,
                  job_name='local',
                  task_index=0,
                  ssh_command='None',
                  expdir=expdir)

        elif mode == 'single_machine':

            #read the computing config file
            parsed_computing_cfg = configparser.ConfigParser()
            parsed_computing_cfg.read(computing_cfg_file)
            computing_cfg = dict(parsed_computing_cfg.items('computing'))

            #create the cluster directory
            if os.path.isdir(os.path.join(expdir, 'cluster')):
                shutil.rmtree(os.path.join(expdir, 'cluster'))
            os.makedirs(os.path.join(expdir, 'cluster'))

            GPUs = computing_cfg['gpus'].split(' ')

            #create the cluster file
            with open(os.path.join(expdir, 'cluster', 'cluster'),
                      'w') as fid:
                port = 1024
                for _ in range(int(computing_cfg['numps'])):
                    while not cluster.port_available(port):
                        port += 1
                    fid.write('ps,localhost,%d,\n' % port)
                    port += 1
                for i in range(int(computing_cfg['numworkers'])):
                    while not cluster.port_available(port):
                        port += 1
                    fid.write('worker,localhost,%d,%s\n' % (port, GPUs[i]))
                    port += 1

            #start the training
            local_cluster.local_cluster(expdir)

        elif mode == 'multi_machine':

            #read the computing config file
            parsed_computing_cfg = configparser.ConfigParser()
            parsed_computing_cfg.read(computing_cfg_file)
            computing_cfg = dict(parsed_computing_cfg.items('computing'))

            #read the cluster file
            machines = dict()
            machines['worker'] = []
            machines['ps'] = []
            with open(computing_cfg['clusterfile']) as fid:
                for line in fid:
                    if line.strip():
                        split = line.strip().split(',')
                        hostip = socket.gethostbyname(split[1])
                        machines[split[0]].append(hostip)

            #create the cluster directory
            if os.path.isdir(os.path.join(expdir, 'cluster')):
                shutil.rmtree(os.path.join(expdir, 'cluster'))
            os.makedirs(os.path.join(expdir, 'cluster'))

            #run all the jobs
            processes = dict()
            processes['worker'] = []
            processes['ps'] = []
            for job in machines:
                task_index = 0
                for machine in machines[job]:
                    command = ('python -u nabu/scripts/train.py '
                               '--clusterfile=%s '
                               '--job_name=%s --task_index=%d '
                               '--ssh_command=%s '
                               '--expdir=%s') % (
                                   computing_cfg['clusterfile'], job,
                                   task_index,
                                   computing_cfg['ssh_command'], expdir)
                    processes[job].append(
                        run_remote.run_remote(command=command, host=machine))
                    task_index += 1

            #make sure the created processes are terminated at exit
            for job in processes:
                for process in processes[job]:
                    atexit.register(cond_term, process=process)

            #make sure all remotely created processes are terminated at exit
            atexit.register(kill_processes.kill_processes,
                            processdir=os.path.join(expdir, 'processes'))

            #wait for all worker processes to finish
            for process in processes['worker']:
                process.wait()

        else:
            raise Exception('unknown mode %s' % mode)

    elif computing == 'condor':

        if not os.path.isdir(os.path.join(expdir, 'outputs')):
            os.makedirs(os.path.join(expdir, 'outputs'))

        if mode == 'non_distributed':

            #read the computing config file
            parsed_computing_cfg = configparser.ConfigParser()
            parsed_computing_cfg.read(computing_cfg_file)
            computing_cfg = dict(parsed_computing_cfg.items('computing'))

            subprocess.call(['condor_submit',
                             'expdir=%s' % expdir,
                             'script=nabu/scripts/train.py',
                             'memory=%s' % computing_cfg['minmemory'],
                             'nabu/computing/condor/non_distributed.job'])

        elif mode == 'single_machine':

            #read the computing config file
            parsed_computing_cfg = configparser.ConfigParser()
            parsed_computing_cfg.read(computing_cfg_file)
            computing_cfg = dict(parsed_computing_cfg.items('computing'))

            if os.path.isdir(os.path.join(expdir, 'cluster')):
                shutil.rmtree(os.path.join(expdir, 'cluster'))
            os.makedirs(os.path.join(expdir, 'cluster'))

            #create the cluster file
            with open(os.path.join(expdir, 'cluster', 'cluster'),
                      'w') as fid:
                port = 1024
                for _ in range(int(computing_cfg['numps'])):
                    while not cluster.port_available(port):
                        port += 1
                    fid.write('ps,localhost,%d,\n' % port)
                    port += 1
                for i in range(int(computing_cfg['numworkers'])):
                    while not cluster.port_available(port):
                        port += 1
                    fid.write('worker,localhost,%d,%d\n' % (port, i))
                    port += 1

            #submit the job
            subprocess.call(['condor_submit',
                             'expdir=%s' % expdir,
                             'GPUs=%d' % int(computing_cfg['numworkers']),
                             'memory=%s' % computing_cfg['minmemory'],
                             'nabu/computing/condor/local.job'])

            print('job submitted, look in %s/outputs for the job outputs'
                  % expdir)

        elif mode == 'multi_machine':

            #read the computing config file
            parsed_computing_cfg = configparser.ConfigParser()
            parsed_computing_cfg.read(computing_cfg_file)
            computing_cfg = dict(parsed_computing_cfg.items('computing'))

            if os.path.isdir(os.path.join(expdir, 'cluster')):
                shutil.rmtree(os.path.join(expdir, 'cluster'))
            os.makedirs(os.path.join(expdir, 'cluster'))

            #submit the parameter server jobs
            subprocess.call(['condor_submit',
                             'expdir=%s' % expdir,
                             'numjobs=%s' % computing_cfg['numps'],
                             'ssh_command=%s' % computing_cfg['ssh_command'],
                             'nabu/computing/condor/ps.job'])

            #submit the worker jobs
            subprocess.call(['condor_submit',
                             'expdir=%s' % expdir,
                             'numjobs=%s' % computing_cfg['numworkers'],
                             'memory=%s' % computing_cfg['minmemory'],
                             'ssh_command=%s' % computing_cfg['ssh_command'],
                             'nabu/computing/condor/worker.job'])

            ready = False

            try:
                print('waiting for the machines to report...')
                numworkers = 0
                numps = 0
                while not ready:
                    #check the machines in the cluster
                    machines = cluster.get_machines(
                        os.path.join(expdir, 'cluster'))

                    if (len(machines['ps']) > numps
                            or len(machines['worker']) > numworkers):
                        numworkers = len(machines['worker'])
                        numps = len(machines['ps'])
                        print('parameter servers ready %d/%s'
                              % (len(machines['ps']),
                                 computing_cfg['numps']))
                        print('workers ready %d/%s'
                              % (len(machines['worker']),
                                 computing_cfg['numworkers']))
                        print('press Ctrl-C to run with the current '
                              'machines')

                    #check if the required amount of machines has reported
                    if (len(machines['worker'])
                            == int(computing_cfg['numworkers'])
                            and len(machines['ps'])
                            == int(computing_cfg['numps'])):
                        ready = True

                    sleep(1)

            except KeyboardInterrupt:
                #remove all jobs that are not running
                os.system('condor_rm -constraint \'JobStatus =!= 2\'')

            #check if enough machines are available
            if not machines['worker'] or not machines['ps']:

                #stop the ps jobs
                cidfile = os.path.join(expdir, 'cluster', 'ps-cid')
                if os.path.exists(cidfile):
                    with open(cidfile) as fid:
                        cid = fid.read()
                    subprocess.call(['condor_rm', cid])

                #stop the worker jobs
                cidfile = os.path.join(expdir, 'cluster', 'worker-cid')
                if os.path.exists(cidfile):
                    with open(cidfile) as fid:
                        cid = fid.read()
                    subprocess.call(['condor_rm', cid])

                raise Exception('at least one ps and one worker are needed')

            print('starting training with %s parameter servers and '
                  '%s workers'
                  % (len(machines['ps']), len(machines['worker'])))

            #create the cluster file
            with open(os.path.join(expdir, 'cluster', 'cluster'),
                      'w') as cfid:
                for job in machines:
                    if job == 'ps':
                        GPU = ''
                    else:
                        GPU = '0'
                    for machine in machines[job]:
                        cfid.write('%s,%s,%d,%s\n' % (job, machine[0],
                                                      machine[1], GPU))

            #notify the machines that the cluster is ready
            open(os.path.join(expdir, 'cluster', 'ready'), 'w').close()

            print('training has started, look in %s/outputs for the job '
                  'outputs' % expdir)

        else:
            raise Exception('unknown mode %s' % mode)

    else:
        raise Exception('unknown computing type %s' % computing)
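#cond_term is registered with atexit in the multi_machine branches above but
#is not defined in this snippet; a minimal sketch, assuming the helper should
#terminate a child process only if it is still running (the name is taken
#from the call sites, the body is an assumption)
def cond_term(process):
    '''conditionally terminate a process

    args:
        process: the process to terminate (a subprocess.Popen object)
    '''

    if process.poll() is None:
        process.terminate()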
def main(expdir, recipe, mode, computing, resume, duplicates):
    '''main function'''

    if expdir is None:
        raise Exception('no expdir specified. Command usage: '
                        'nabu train --expdir=/path/to/expdir '
                        '--recipe=/path/to/recipe')

    if recipe is None:
        raise Exception('no recipe specified. Command usage: '
                        'nabu train --expdir=/path/to/expdir '
                        '--recipe=/path/to/recipe')

    if not os.path.isdir(recipe):
        raise Exception('cannot find recipe %s' % recipe)

    if mode not in ['non_distributed', 'single_machine', 'multi_machine']:
        raise Exception('unknown distributed mode: %s' % mode)

    if computing not in ['standard', 'condor']:
        raise Exception('unknown computing mode: %s' % computing)

    duplicates = int(duplicates)

    database_cfg_file = os.path.join(recipe, 'database.conf')
    model_cfg_file = os.path.join(recipe, 'model.cfg')
    trainer_cfg_file = os.path.join(recipe, 'trainer.cfg')
    evaluator_cfg_file = os.path.join(recipe, 'validation_evaluator.cfg')

    #read the trainer config file
    parsed_trainer_cfg = configparser.ConfigParser()
    parsed_trainer_cfg.read(trainer_cfg_file)
    trainer_cfg = dict(parsed_trainer_cfg.items('trainer'))

    for dupl_ind in range(duplicates):

        if duplicates > 1:
            expdir_run = expdir + '_dupl%i' % dupl_ind
        else:
            expdir_run = expdir

        if os.path.isdir(os.path.join(expdir_run, 'processes')):
            shutil.rmtree(os.path.join(expdir_run, 'processes'))
        os.makedirs(os.path.join(expdir_run, 'processes'))

        if resume == 'True':
            if not os.path.isdir(expdir_run):
                raise Exception(
                    'cannot find %s, please set resume to False if you '
                    'want to start a new training process' % expdir_run)
        else:
            if os.path.isdir(os.path.join(expdir_run, 'logdir')):
                shutil.rmtree(os.path.join(expdir_run, 'logdir'))

            if not os.path.isdir(expdir_run):
                os.makedirs(expdir_run)

            if os.path.isdir(os.path.join(expdir_run, 'model')):
                shutil.rmtree(os.path.join(expdir_run, 'model'))
            os.makedirs(os.path.join(expdir_run, 'model'))

            if 'segment_lengths' in trainer_cfg:
                #create a separate directory for each training stage
                segment_lengths = trainer_cfg['segment_lengths'].split(' ')
                for seg_length in segment_lengths:
                    seg_expdir_run = os.path.join(expdir_run, seg_length)

                    if os.path.isdir(os.path.join(seg_expdir_run,
                                                  'logdir')):
                        shutil.rmtree(os.path.join(seg_expdir_run,
                                                   'logdir'))

                    if not os.path.isdir(seg_expdir_run):
                        os.makedirs(seg_expdir_run)

                    if os.path.isdir(os.path.join(seg_expdir_run,
                                                  'model')):
                        shutil.rmtree(os.path.join(seg_expdir_run,
                                                   'model'))
                    os.makedirs(os.path.join(seg_expdir_run, 'model'))

            #copy the configs to the expdir_run so they can be read there
            #and the experiment information is stored
            shutil.copyfile(database_cfg_file,
                            os.path.join(expdir_run, 'database.cfg'))
            shutil.copyfile(model_cfg_file,
                            os.path.join(expdir_run, 'model.cfg'))
            shutil.copyfile(evaluator_cfg_file,
                            os.path.join(expdir_run, 'evaluator.cfg'))
            shutil.copyfile(trainer_cfg_file,
                            os.path.join(expdir_run, 'trainer.cfg'))

            if 'segment_lengths' in trainer_cfg:
                #create designated database and trainer config files for
                #each training stage
                batch_size_perseg = trainer_cfg['batch_size'].split(' ')
                numbatches_to_aggregate_perseg = trainer_cfg[
                    'numbatches_to_aggregate'].split(' ')
                initial_learning_rate_perseg = trainer_cfg[
                    'initial_learning_rate'].split(' ')
                learning_rate_decay_perseg = trainer_cfg[
                    'learning_rate_decay'].split(' ')
                if len(learning_rate_decay_perseg) == 1:
                    learning_rate_decay_perseg = (
                        learning_rate_decay_perseg * len(segment_lengths))

                parsed_database_cfg = configparser.ConfigParser()
                parsed_database_cfg.read(database_cfg_file)

                for i, seg_length in enumerate(segment_lengths):
                    seg_expdir_run = os.path.join(expdir_run, seg_length)

                    segment_parsed_trainer_cfg = (
                        configparser.ConfigParser())
                    segment_parsed_trainer_cfg.read(trainer_cfg_file)
                    segment_parsed_trainer_cfg.set(
                        'trainer', 'batch_size', batch_size_perseg[i])
                    segment_parsed_trainer_cfg.set(
                        'trainer', 'numbatches_to_aggregate',
                        numbatches_to_aggregate_perseg[i])
                    segment_parsed_trainer_cfg.set(
                        'trainer', 'initial_learning_rate',
                        initial_learning_rate_perseg[i])
                    segment_parsed_trainer_cfg.set(
                        'trainer', 'learning_rate_decay',
                        learning_rate_decay_perseg[i])
                    with open(os.path.join(seg_expdir_run, 'trainer.cfg'),
                              'w') as fid:
                        segment_parsed_trainer_cfg.write(fid)

                    segment_parsed_database_cfg = (
                        configparser.ConfigParser())
                    segment_parsed_database_cfg.read(database_cfg_file)
                    for section in segment_parsed_database_cfg.sections():
                        if 'store_dir' in dict(
                                segment_parsed_database_cfg.items(
                                    section)).keys():
                            segment_parsed_database_cfg.set(
                                section, 'store_dir',
                                os.path.join(
                                    segment_parsed_database_cfg.get(
                                        section, 'store_dir'),
                                    seg_length))
                    with open(os.path.join(seg_expdir_run,
                                           'database.cfg'), 'w') as fid:
                        segment_parsed_database_cfg.write(fid)

        computing_cfg_file = 'config/computing/%s/%s.cfg' % (computing,
                                                             mode)

        if computing == 'standard':

            if mode == 'non_distributed':

                #manually select the GPU for this machine
                os.environ['CUDA_VISIBLE_DEVICES'] = '0'

                train(clusterfile=None,
                      job_name='local',
                      task_index=0,
                      ssh_command='None',
                      expdir=expdir_run)

            elif mode == 'single_machine':

                #read the computing config file
                parsed_computing_cfg = configparser.ConfigParser()
                parsed_computing_cfg.read(computing_cfg_file)
                computing_cfg = dict(
                    parsed_computing_cfg.items('computing'))

                #create the cluster directory
                if os.path.isdir(os.path.join(expdir_run, 'cluster')):
                    shutil.rmtree(os.path.join(expdir_run, 'cluster'))
                os.makedirs(os.path.join(expdir_run, 'cluster'))

                GPUs = computing_cfg['gpus'].split(' ')

                #create the cluster file
                with open(os.path.join(expdir_run, 'cluster', 'cluster'),
                          'w') as fid:
                    port = 1024
                    for _ in range(int(computing_cfg['numps'])):
                        while not cluster.port_available(port):
                            port += 1
                        fid.write('ps,localhost,%d,\n' % port)
                        port += 1
                    for i in range(int(computing_cfg['numworkers'])):
                        while not cluster.port_available(port):
                            port += 1
                        fid.write('worker,localhost,%d,%s\n'
                                  % (port, GPUs[i]))
                        port += 1

                #start the training
                local_cluster.local_cluster(expdir_run)

            elif mode == 'multi_machine':

                #read the computing config file
                parsed_computing_cfg = configparser.ConfigParser()
                parsed_computing_cfg.read(computing_cfg_file)
                computing_cfg = dict(
                    parsed_computing_cfg.items('computing'))

                #read the cluster file
                machines = dict()
                machines['worker'] = []
                machines['ps'] = []
                with open(computing_cfg['clusterfile']) as fid:
                    for line in fid:
                        if line.strip():
                            split = line.strip().split(',')
                            hostip = socket.gethostbyname(split[1])
                            machines[split[0]].append(hostip)

                #create the cluster directory
                if os.path.isdir(os.path.join(expdir_run, 'cluster')):
                    shutil.rmtree(os.path.join(expdir_run, 'cluster'))
                os.makedirs(os.path.join(expdir_run, 'cluster'))

                #run all the jobs
                processes = dict()
                processes['worker'] = []
                processes['ps'] = []
                for job in machines:
                    task_index = 0
                    for machine in machines[job]:
                        command = ('python -u nabu/scripts/train.py '
                                   '--clusterfile=%s '
                                   '--job_name=%s --task_index=%d '
                                   '--ssh_command=%s '
                                   '--expdir=%s') % (
                                       computing_cfg['clusterfile'], job,
                                       task_index,
                                       computing_cfg['ssh_command'],
                                       expdir_run)
                        processes[job].append(run_remote.run_remote(
                            command=command, host=machine))
                        task_index += 1

                #make sure the created processes are terminated at exit
                for job in processes:
                    for process in processes[job]:
                        atexit.register(cond_term, process=process)

                #make sure all remotely created processes are terminated
                #at exit
                atexit.register(kill_processes.kill_processes,
                                processdir=os.path.join(expdir_run,
                                                        'processes'))

                #wait for all worker processes to finish
                for process in processes['worker']:
                    process.wait()

            else:
                raise Exception('unknown mode %s' % mode)

        elif computing == 'condor':

            if not os.path.isdir(os.path.join(expdir_run, 'outputs')):
                os.makedirs(os.path.join(expdir_run, 'outputs'))

            if mode == 'non_distributed':

                #read the computing config file
                parsed_computing_cfg = configparser.ConfigParser()
                parsed_computing_cfg.read(computing_cfg_file)
                computing_cfg = dict(
                    parsed_computing_cfg.items('computing'))

                subprocess.call([
                    'condor_submit',
                    'expdir=%s' % expdir_run,
                    'script=nabu/scripts/train.py',
                    'memory=%s' % computing_cfg['minmemory'],
                    'nabu/computing/condor/non_distributed.job'])

            elif mode == 'single_machine':

                #read the computing config file
                parsed_computing_cfg = configparser.ConfigParser()
                parsed_computing_cfg.read(computing_cfg_file)
                computing_cfg = dict(
                    parsed_computing_cfg.items('computing'))

                if os.path.isdir(os.path.join(expdir_run, 'cluster')):
                    shutil.rmtree(os.path.join(expdir_run, 'cluster'))
                os.makedirs(os.path.join(expdir_run, 'cluster'))

                #create the cluster file
                with open(os.path.join(expdir_run, 'cluster', 'cluster'),
                          'w') as fid:
                    port = 1024
                    for _ in range(int(computing_cfg['numps'])):
                        while not cluster.port_available(port):
                            port += 1
                        fid.write('ps,localhost,%d,\n' % port)
                        port += 1
                    for i in range(int(computing_cfg['numworkers'])):
                        while not cluster.port_available(port):
                            port += 1
                        fid.write('worker,localhost,%d,%d\n' % (port, i))
                        port += 1

                #submit the job
                subprocess.call([
                    'condor_submit',
                    'expdir=%s' % expdir_run,
                    'GPUs=%d' % int(computing_cfg['numworkers']),
                    'memory=%s' % computing_cfg['minmemory'],
                    'nabu/computing/condor/local.job'])

                print('job submitted, look in %s/outputs for the job '
                      'outputs' % expdir_run)

            elif mode == 'multi_machine':

                #read the computing config file
                parsed_computing_cfg = configparser.ConfigParser()
                parsed_computing_cfg.read(computing_cfg_file)
                computing_cfg = dict(
                    parsed_computing_cfg.items('computing'))

                if os.path.isdir(os.path.join(expdir_run, 'cluster')):
                    shutil.rmtree(os.path.join(expdir_run, 'cluster'))
                os.makedirs(os.path.join(expdir_run, 'cluster'))

                #submit the parameter server jobs
                subprocess.call([
                    'condor_submit',
                    'expdir=%s' % expdir_run,
                    'numjobs=%s' % computing_cfg['numps'],
                    'ssh_command=%s' % computing_cfg['ssh_command'],
                    'nabu/computing/condor/ps.job'])

                #submit the worker jobs
                subprocess.call([
                    'condor_submit',
                    'expdir=%s' % expdir_run,
                    'numjobs=%s' % computing_cfg['numworkers'],
                    'memory=%s' % computing_cfg['minmemory'],
                    'ssh_command=%s' % computing_cfg['ssh_command'],
                    'nabu/computing/condor/worker.job'])

                ready = False

                try:
                    print('waiting for the machines to report...')
                    numworkers = 0
                    numps = 0
                    while not ready:
                        #check the machines in the cluster
                        machines = cluster.get_machines(
                            os.path.join(expdir_run, 'cluster'))

                        if (len(machines['ps']) > numps
                                or len(machines['worker']) > numworkers):
                            numworkers = len(machines['worker'])
                            numps = len(machines['ps'])
                            print('parameter servers ready %d/%s'
                                  % (len(machines['ps']),
                                     computing_cfg['numps']))
                            print('workers ready %d/%s'
                                  % (len(machines['worker']),
                                     computing_cfg['numworkers']))
                            print('press Ctrl-C to run with the current '
                                  'machines')

                        #check if the required amount of machines has
                        #reported
                        if (len(machines['worker'])
                                == int(computing_cfg['numworkers'])
                                and len(machines['ps'])
                                == int(computing_cfg['numps'])):
                            ready = True

                        sleep(1)

                except KeyboardInterrupt:
                    #remove all jobs that are not running
                    os.system(
                        'condor_rm -constraint \'JobStatus =!= 2\'')

                #check if enough machines are available
                if not machines['worker'] or not machines['ps']:

                    #stop the ps jobs
                    cidfile = os.path.join(expdir_run, 'cluster',
                                           'ps-cid')
                    if os.path.exists(cidfile):
                        with open(cidfile) as fid:
                            cid = fid.read()
                        subprocess.call(['condor_rm', cid])

                    #stop the worker jobs
                    cidfile = os.path.join(expdir_run, 'cluster',
                                           'worker-cid')
                    if os.path.exists(cidfile):
                        with open(cidfile) as fid:
                            cid = fid.read()
                        subprocess.call(['condor_rm', cid])

                    raise Exception('at least one ps and one worker are '
                                    'needed')

                print('starting training with %s parameter servers and '
                      '%s workers'
                      % (len(machines['ps']), len(machines['worker'])))

                #create the cluster file
                with open(os.path.join(expdir_run, 'cluster', 'cluster'),
                          'w') as cfid:
                    for job in machines:
                        if job == 'ps':
                            GPU = ''
                        else:
                            GPU = '0'
                        for machine in machines[job]:
                            cfid.write('%s,%s,%d,%s\n'
                                       % (job, machine[0], machine[1],
                                          GPU))

                #notify the machines that the cluster is ready
                open(os.path.join(expdir_run, 'cluster', 'ready'),
                     'w').close()

                print('training has started, look in %s/outputs for the '
                      'job outputs' % expdir_run)

            else:
                raise Exception('unknown mode %s' % mode)

        else:
            raise Exception('unknown computing type %s' % computing)
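#for illustration (not part of the original file): the cluster files that
#are written and read above contain one comma-separated line per task of
#the form <job name>,<host>,<port>,<GPU>, where the GPU field is left empty
#for parameter servers, e.g.
#
#   ps,localhost,1024,
#   worker,localhost,1025,0
#   worker,localhost,1026,1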