def main(_):
    '''main function'''

    cluster_dir = os.path.join(FLAGS.expdir, 'cluster')

    #the chief of the job should write the cluster id
    if int(FLAGS.pid) == 0:
        with open(os.path.join(cluster_dir,
                               '%s-cid' % FLAGS.job_name), 'w') as fid:
            fid.write(FLAGS.cid)

    #wait for the preceding cluster tasks to report
    machines = cluster.get_machines(cluster_dir)
    while len(machines[FLAGS.job_name]) < int(FLAGS.pid):
        machines = cluster.get_machines(cluster_dir)
        sleep(1)

    port = 1024
    machine_file = '%s/%s-%d' % (cluster_dir, socket.gethostname(), port)

    #look for an available port
    while os.path.exists(machine_file) or not cluster.port_available(port):
        port += 1
        machine_file = '%s/%s-%d' % (cluster_dir, socket.gethostname(), port)

    #report that the machine is ready
    with open(machine_file, 'w') as fid:
        fid.write(FLAGS.job_name)

    #wait until the main process has given the go-ahead
    print 'waiting for cluster to be ready...'
    while not os.path.exists(cluster_dir + '/ready'):
        sleep(1)
    print 'cluster is ready'

    #start the training process
    if FLAGS.type == 'asr':
        train_asr(clusterfile=cluster_dir + '/cluster',
                  job_name=FLAGS.job_name,
                  task_index=int(FLAGS.pid),
                  ssh_command=FLAGS.ssh_command,
                  expdir=FLAGS.expdir)
    else:
        train_lm(clusterfile=cluster_dir + '/cluster',
                 job_name=FLAGS.job_name,
                 task_index=int(FLAGS.pid),
                 ssh_command=FLAGS.ssh_command,
                 expdir=FLAGS.expdir)

    #delete the file to notify that the process has finished
    os.remove(machine_file)
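
#NOTE: a minimal sketch of the command line flags main() relies on, assuming
#the usual tf.app.flags pattern of this TensorFlow 1.x era. The flag names are
#the ones referenced above; the default values and help strings are
#hypothetical, not taken from the original file.
import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('expdir', 'expdir', 'the experiments directory')
tf.app.flags.DEFINE_string('job_name', 'worker', 'the job name: ps or worker')
tf.app.flags.DEFINE_string('pid', '0', 'the process index within the job')
tf.app.flags.DEFINE_string('cid', '0', 'the cluster id of the job')
tf.app.flags.DEFINE_string('type', 'asr', 'the type of training: asr or lm')
tf.app.flags.DEFINE_string('ssh_command', 'ssh', 'the command to use for ssh')

if __name__ == '__main__':
    #tf.app.run parses the flags and calls main
    tf.app.run()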
def create_server(clusterfile, job_name, task_index, expdir, ssh_command):
    '''creates the tensorflow cluster and server based on the clusterfile

    Args:
        clusterfile: the path to the clusterfile
        job_name: the name of the job
        task_index: the task index
        expdir: the experiments directory
        ssh_command: the command to use for ssh, if 'None' no tunnel will be
            created

    Returns:
        a tensorflow server'''

    if clusterfile is None:
        #no distributed training
        server = tf.train.Server.create_local_server()
    else:
        #read the cluster file
        machines = cluster.read_cluster(clusterfile)

        #build the cluster and create ssh tunnels to machines in the cluster
        port = 1024
        localmachine = machines[job_name][task_index][0]

        #report that this job is running
        open(os.path.join(expdir, 'processes',
                          '%s-%d' % (localmachine, os.getpid())),
             'w').close()

        #specify the GPU that should be used
        localGPU = machines[job_name][task_index][2]
        os.environ['CUDA_VISIBLE_DEVICES'] = localGPU

        #get a list of ports used on this machine
        localports = []
        for job in machines:
            for remote in machines[job]:
                if localmachine == remote[0] or remote[0] == 'localhost':
                    localports.append(remote[1])

        #check if this task is the first one in the cluster on this machine
        first = True
        if job_name == 'worker':
            for machine in machines['ps'] + machines['worker'][:task_index]:
                first = first and not machine[0] == localmachine
        else:
            for machine in machines['ps'][:task_index]:
                first = first and not machine[0] == localmachine

        #the first task on a machine will create the cluster for this machine
        machinecluster = os.path.join(expdir, 'cluster',
                                      '%s-cluster' % localmachine)
        readyfile = os.path.join(expdir, 'cluster', '%s-ready' % localmachine)

        if first:
            with open(machinecluster, 'w') as fid:
                for job in machines:
                    for remote in machines[job]:

                        #create an ssh tunnel if the local machine is not the
                        #same as the remote machine
                        if localmachine != remote[0] and ssh_command != 'None':

                            #look for an available port
                            while (port in localports
                                   or not cluster.port_available(port)):
                                port += 1

                            #create the ssh tunnel
                            p = subprocess.Popen(
                                [ssh_command, '-o', 'StrictHostKeyChecking=no',
                                 '-o', 'UserKnownHostsFile=/dev/null', '-L',
                                 '%d:127.0.0.1:%d' % (port, remote[1]), '-N',
                                 remote[0]])

                            #report that the ssh tunnel is running
                            open(os.path.join(
                                expdir, 'processes',
                                '%s-%d' % (localmachine, p.pid)),
                                 'w').close()

                            fid.write('%s,localhost,%s,%s\n'
                                      % (job, port, remote[2]))
                            port += 1

                            #give the machine some time to open the ssh tunnel
                            #before opening a new one
                            sleep(0.1)
                        else:
                            if localmachine == remote[0]:
                                host = 'localhost'
                            else:
                                host = remote[0]
                            fid.write('%s,%s,%s,%s\n'
                                      % (job, host, remote[1], remote[2]))

            #notify that the cluster is ready
            open(readyfile, 'w').close()

        #wait for the clusterfile to be ready
        while not os.path.exists(readyfile):
            sleep(1)

        #read the cluster file
        machines = cluster.read_cluster(machinecluster)

        clusterdict = dict()
        clusterdict['worker'] = []
        clusterdict['ps'] = []
        for job in machines:
            for remote in machines[job]:
                clusterdict[job].append('%s:%d' % (remote[0], remote[1]))

        #create the cluster
        tfcluster = tf.train.ClusterSpec(clusterdict)

        #create the server for this task
        server = tf.train.Server(tfcluster, job_name, task_index)

    return server
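
#NOTE: a rough sketch of the cluster helpers used above, assuming the
#clusterfile format that create_server and train.py write: one
#'job,host,port,gpu' line per task. The real helpers live in nabu's cluster
#module; these illustrative versions only show the assumed file format and
#port check.
import socket

def read_cluster(clusterfile):
    '''reads a clusterfile into lists of (host, port, gpu) tuples per job'''
    machines = {'worker': [], 'ps': []}
    with open(clusterfile) as fid:
        for line in fid:
            if line.strip():
                job, host, port, gpu = line.strip().split(',')
                machines[job].append((host, int(port), gpu))
    return machines

def port_available(port):
    '''checks whether a port can be bound on the local machine'''
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.bind(('localhost', port))
        return True
    except socket.error:
        return False
    finally:
        sock.close()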
def main(_):
    '''main function'''

    #pointers to the config files
    computing_cfg_file = 'config/computing/non_distributed.cfg'
    database_cfg_file = 'config/asr_databases/TIMIT.conf'
    if FLAGS.type == 'asr':
        feat_cfg_file = 'config/features/fbank.cfg'
        classifier_cfg_file = 'config/asr/LASACNN.cfg'
    trainer_cfg_file = 'config/trainer/cross_entropytrainer.cfg'
    decoder_cfg_file = 'config/decoder/BeamSearchDecoder.cfg'

    #read the computing config file
    parsed_computing_cfg = configparser.ConfigParser()
    parsed_computing_cfg.read(computing_cfg_file)
    computing_cfg = dict(parsed_computing_cfg.items('computing'))

    #read the trainer config file
    parsed_trainer_cfg = configparser.ConfigParser()
    parsed_trainer_cfg.read(trainer_cfg_file)
    trainer_cfg = dict(parsed_trainer_cfg.items('trainer'))

    if os.path.isdir(os.path.join(FLAGS.expdir, 'processes')):
        shutil.rmtree(os.path.join(FLAGS.expdir, 'processes'))
    os.makedirs(os.path.join(FLAGS.expdir, 'processes'))

    if trainer_cfg['resume_training'] == 'True':
        if not os.path.isdir(FLAGS.expdir):
            raise Exception(
                'cannot find %s, please set resume_training to False if you '
                'want to start a new training process' % FLAGS.expdir)
    else:
        if os.path.isdir(os.path.join(FLAGS.expdir, 'logdir')):
            shutil.rmtree(os.path.join(FLAGS.expdir, 'logdir'))
        if not os.path.isdir(FLAGS.expdir):
            os.makedirs(FLAGS.expdir)
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'model')):
            os.makedirs(os.path.join(FLAGS.expdir, 'model'))

        #copy the configs to the expdir so they can be read there and the
        #experiment information is stored
        shutil.copyfile(database_cfg_file,
                        os.path.join(FLAGS.expdir, 'database.cfg'))
        if FLAGS.type == 'asr':
            shutil.copyfile(feat_cfg_file,
                            os.path.join(FLAGS.expdir, 'model',
                                         'features.cfg'))
        shutil.copyfile(classifier_cfg_file,
                        os.path.join(FLAGS.expdir, 'model',
                                     '%s.cfg' % FLAGS.type))
        shutil.copyfile(computing_cfg_file,
                        os.path.join(FLAGS.expdir, 'computing.cfg'))
        shutil.copyfile(trainer_cfg_file,
                        os.path.join(FLAGS.expdir, 'trainer.cfg'))
        shutil.copyfile(decoder_cfg_file,
                        os.path.join(FLAGS.expdir, 'model', 'decoder.cfg'))

    if computing_cfg['distributed'] == 'condor_non-distributed':

        if not os.path.isdir(os.path.join(FLAGS.expdir, 'outputs')):
            os.makedirs(os.path.join(FLAGS.expdir, 'outputs'))

        subprocess.call(['condor_submit',
                         'expdir=%s' % FLAGS.expdir,
                         'memory=%s' % computing_cfg['minmemory'],
                         'type=%s' % FLAGS.type,
                         'nabu/distributed/condor/non_distributed.job'])

    elif computing_cfg['distributed'] == 'non-distributed':

        if FLAGS.type == 'asr':
            train_asr(clusterfile=None,
                      job_name='local',
                      task_index=0,
                      ssh_command='None',
                      expdir=FLAGS.expdir)
        else:
            train_lm(clusterfile=None,
                     job_name='local',
                     task_index=0,
                     ssh_command='None',
                     expdir=FLAGS.expdir)

    elif computing_cfg['distributed'] == 'local':

        #create the directories
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'outputs')):
            os.makedirs(os.path.join(FLAGS.expdir, 'outputs'))
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'cluster')):
            os.makedirs(os.path.join(FLAGS.expdir, 'cluster'))

        #create the cluster file
        with open(os.path.join(FLAGS.expdir, 'cluster', 'cluster'),
                  'w') as fid:
            port = 1024
            for _ in range(int(computing_cfg['numps'])):
                while not cluster.port_available(port):
                    port += 1
                fid.write('ps,localhost,%d,\n' % port)
                port += 1
            for i in range(int(computing_cfg['numworkers'])):
                while not cluster.port_available(port):
                    port += 1
                fid.write('worker,localhost,%d,%d\n' % (port, i))
                port += 1

        #start the training
        local_cluster.local_cluster(FLAGS.expdir, FLAGS.type)

    elif computing_cfg['distributed'] == 'static':

        #read the cluster file
        machines = dict()
        machines['worker'] = []
        machines['ps'] = []
        with open(computing_cfg['clusterfile']) as fid:
            for line in fid:
                if line.strip():
                    split = line.strip().split(',')
                    machines[split[0]].append(split[1])

        #create the outputs directory
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'outputs')):
            os.makedirs(os.path.join(FLAGS.expdir, 'outputs'))

        #run all the jobs
        processes = dict()
        processes['worker'] = []
        processes['ps'] = []
        for job in machines:
            task_index = 0
            for machine in machines[job]:
                command = ('python -u train_%s.py --clusterfile=%s '
                           '--job_name=%s --task_index=%d --ssh_command=%s '
                           '--expdir=%s') % (
                               FLAGS.type, computing_cfg['clusterfile'], job,
                               task_index, computing_cfg['ssh_command'],
                               FLAGS.expdir)
                processes[job].append(
                    run_remote.run_remote(command=command, host=machine))
                task_index += 1

        #make sure the created processes are terminated at exit
        for job in processes:
            for process in processes[job]:
                atexit.register(process.terminate)

        #make sure all remotely created processes are terminated at exit
        atexit.register(kill_processes.kill_processes,
                        processdir=os.path.join(FLAGS.expdir, 'processes'))

        #wait for all worker processes to finish
        for process in processes['worker']:
            process.wait()

    elif computing_cfg['distributed'] == 'condor':

        #create the directories
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'outputs')):
            os.makedirs(os.path.join(FLAGS.expdir, 'outputs'))
        if os.path.isdir(os.path.join(FLAGS.expdir, 'cluster')):
            shutil.rmtree(os.path.join(FLAGS.expdir, 'cluster'))
        os.makedirs(os.path.join(FLAGS.expdir, 'cluster'))

        #submit the parameter server jobs
        subprocess.call(['condor_submit',
                         'expdir=%s' % FLAGS.expdir,
                         'numjobs=%s' % computing_cfg['numps'],
                         'type=%s' % FLAGS.type,
                         'ssh_command=%s' % computing_cfg['ssh_command'],
                         'nabu/distributed/condor/ps.job'])

        #submit the worker jobs
        subprocess.call(['condor_submit',
                         'expdir=%s' % FLAGS.expdir,
                         'numjobs=%s' % computing_cfg['numworkers'],
                         'memory=%s' % computing_cfg['minmemory'],
                         'type=%s' % FLAGS.type,
                         'ssh_command=%s' % computing_cfg['ssh_command'],
                         'nabu/distributed/condor/worker.job'])

        ready = False

        try:
            print 'waiting for the machines to report...'
            numworkers = 0
            numps = 0
            while not ready:
                #check the machines in the cluster
                machines = cluster.get_machines(
                    os.path.join(FLAGS.expdir, 'cluster'))
                if (len(machines['ps']) > numps
                        or len(machines['worker']) > numworkers):
                    numworkers = len(machines['worker'])
                    numps = len(machines['ps'])
                    print ('parameter servers ready %d/%s' %
                           (len(machines['ps']), computing_cfg['numps']))
                    print ('workers ready %d/%s' %
                           (len(machines['worker']),
                            computing_cfg['numworkers']))
                    print 'press Ctrl-C to run with the current machines'

                #check if the required amount of machines has reported
                if (len(machines['worker'])
                        == int(computing_cfg['numworkers'])
                        and len(machines['ps'])
                        == int(computing_cfg['numps'])):
                    ready = True

                sleep(1)

        except KeyboardInterrupt:

            #remove all jobs that are not running
            os.system('condor_rm -constraint \'JobStatus =!= 2\'')

            #check if enough machines are available
            if not machines['worker'] or not machines['ps']:

                #stop the ps jobs
                cidfile = os.path.join(FLAGS.expdir, 'cluster', 'ps-cid')
                if os.path.exists(cidfile):
                    with open(cidfile) as fid:
                        cid = fid.read()
                    subprocess.call(['condor_rm', cid])

                #stop the worker jobs
                cidfile = os.path.join(FLAGS.expdir, 'cluster', 'worker-cid')
                if os.path.exists(cidfile):
                    with open(cidfile) as fid:
                        cid = fid.read()
                    subprocess.call(['condor_rm', cid])

                raise Exception('at least one ps and one worker needed')

        print ('starting training with %s parameter servers and %s workers' %
               (len(machines['ps']), len(machines['worker'])))

        #create the cluster file
        with open(os.path.join(FLAGS.expdir, 'cluster', 'cluster'),
                  'w') as cfid:
            for job in machines:
                if job == 'ps':
                    GPU = ''
                else:
                    GPU = '0'
                for machine in machines[job]:
                    cfid.write('%s,%s,%d,%s\n' % (job, machine[0],
                                                  machine[1], GPU))

        #notify the machines that the cluster is ready
        fid = open(FLAGS.expdir + '/cluster/ready', 'w')
        fid.close()

        print ('training has started, look in %s/outputs for the job outputs'
               % FLAGS.expdir)
        print 'waiting for worker jobs to finish'

        for machine in machines['worker']:
            machine_file = os.path.join(FLAGS.expdir, 'cluster',
                                        '%s-%d' % (machine[0], machine[1]))
            while os.path.exists(machine_file):
                sleep(1)

        #stop the ps jobs
        with open(os.path.join(FLAGS.expdir, 'cluster', 'ps-cid')) as fid:
            cid = fid.read()
        subprocess.call(['condor_rm', cid])

    elif computing_cfg['distributed'] == 'condor_local':

        #create the directories
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'outputs')):
            os.makedirs(os.path.join(FLAGS.expdir, 'outputs'))
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'cluster')):
            os.makedirs(os.path.join(FLAGS.expdir, 'cluster'))

        #create the cluster file
        with open(os.path.join(FLAGS.expdir, 'cluster', 'cluster'),
                  'w') as fid:
            port = 1024
            for _ in range(int(computing_cfg['numps'])):
                while not cluster.port_available(port):
                    port += 1
                fid.write('ps,localhost,%d,\n' % port)
                port += 1
            for i in range(int(computing_cfg['numworkers'])):
                while not cluster.port_available(port):
                    port += 1
                fid.write('worker,localhost,%d,%d\n' % (port, i))
                port += 1

        #submit the job
        subprocess.call(['condor_submit',
                         'expdir=%s' % FLAGS.expdir,
                         'GPUs=%d' % int(computing_cfg['numworkers']),
                         'memory=%s' % computing_cfg['minmemory'],
                         'type=%s' % FLAGS.type,
                         'nabu/distributed/condor/local.job'])

        print ('job submitted, look in %s/outputs for the job outputs'
               % FLAGS.expdir)

    else:
        raise Exception('Unknown distributed type in %s' % computing_cfg_file)
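
#NOTE: an illustrative computing config for the 'condor' mode, assuming an
#ini-style file with a [computing] section as read by main() above. The
#option names (distributed, numworkers, numps, minmemory, ssh_command) are the
#ones looked up in computing_cfg; the values are hypothetical. The 'static'
#mode additionally reads a 'clusterfile' option pointing to a cluster
#description file.
EXAMPLE_COMPUTING_CFG = '''
[computing]
distributed = condor
numworkers = 4
numps = 1
minmemory = 4096
ssh_command = ssh
'''

def write_example_computing_cfg(path):
    '''writes the example computing config to the given path'''
    with open(path, 'w') as fid:
        fid.write(EXAMPLE_COMPUTING_CFG)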