Example #1
def main(_):
    '''main function'''
    cluster_dir = os.path.join(FLAGS.expdir, 'cluster')

    #the chief of the job should write the cluster id
    if int(FLAGS.pid) == 0:
        with open(os.path.join(cluster_dir, '%s-cid' % FLAGS.job_name),
                  'w') as fid:
            fid.write(FLAGS.cid)

    #wait for the preceding cluster tasks to report
    machines = cluster.get_machines(cluster_dir)

    while len(machines[FLAGS.job_name]) < int(FLAGS.pid):
        machines = cluster.get_machines(cluster_dir)
        sleep(1)

    port = 1024
    machine_file = '%s/%s-%d' % (cluster_dir, socket.gethostname(), port)

    #look for an available port
    while os.path.exists(machine_file) or not cluster.port_available(port):

        port += 1
        machine_file = '%s/%s-%d' % (cluster_dir, socket.gethostname(), port)

    #report that the machine is ready
    with open(machine_file, 'w') as fid:
        fid.write(FLAGS.job_name)

    #wait until the main process has given the go-ahead
    print('waiting for cluster to be ready...')

    #wait for the ready file to be created
    while not os.path.exists(cluster_dir + '/ready'):
        sleep(1)

    print('cluster is ready')

    #start the training process
    if FLAGS.type == 'asr':
        train_asr(clusterfile=cluster_dir + '/cluster',
                  job_name=FLAGS.job_name,
                  task_index=int(FLAGS.pid),
                  ssh_command=FLAGS.ssh_command,
                  expdir=FLAGS.expdir)
    else:
        train_lm(clusterfile=cluster_dir + '/cluster',
                 job_name=FLAGS.job_name,
                 task_index=int(FLAGS.pid),
                 ssh_command=FLAGS.ssh_command,
                 expdir=FLAGS.expdir)

    #delete the file to notify that the process has finished
    os.remove(machine_file)
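
Example #1 relies on a cluster helper module that is not shown here. Below is a rough, hypothetical sketch of the two helpers it calls, port_available and get_machines, assuming report files named '<hostname>-<port>' that contain the job name; the toolkit's actual module may look different.

import os
import socket


def port_available(port):
    '''check whether a TCP port on this machine can still be bound (sketch)'''
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.bind(('', port))
        return True
    except socket.error:
        return False
    finally:
        sock.close()


def get_machines(cluster_dir):
    '''collect the machines that have reported in cluster_dir (sketch)

    Returns a dict mapping job names to lists of (hostname, port) tuples'''
    machines = {'ps': [], 'worker': []}
    for filename in os.listdir(cluster_dir):
        #only consider '<hostname>-<port>' report files
        if filename.endswith('-cid') or '-' not in filename:
            continue
        hostname, port = filename.rsplit('-', 1)
        if not port.isdigit():
            continue
        with open(os.path.join(cluster_dir, filename)) as fid:
            job_name = fid.read().strip()
        if job_name in machines:
            machines[job_name].append((hostname, int(port)))
    return machines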
Example #2
def create_server(clusterfile, job_name, task_index, expdir, ssh_command):
    '''creates the tensorflow cluster and server based on the clusterfile

    Args:
        clusterfile: the path to the clusterfile
        job_name: the name of the job
        task_index: the task index
        expdir: the experiments directory
        ssh_command: the command to use for ssh; if 'None', no tunnel will be
            created

    Returns:
        a TensorFlow server'''

    if clusterfile is None:
        #no distributed training
        server = tf.train.Server.create_local_server()
    else:
        #read the cluster file
        machines = cluster.read_cluster(clusterfile)

        #build the cluster and create ssh tunnels to machines in the cluster
        port = 1024
        localmachine = machines[job_name][task_index][0]

        #report that this job is running
        open(
            os.path.join(expdir, 'processes',
                         '%s-%d' % (localmachine, os.getpid())), 'w').close()

        #specify the GPU that should be used
        localGPU = machines[job_name][task_index][2]
        os.environ['CUDA_VISIBLE_DEVICES'] = localGPU

        #get a list of ports used on this machine
        localports = []
        for job in machines:
            for remote in machines[job]:
                if localmachine == remote[0] or remote[0] == 'localhost':
                    localports.append(remote[1])

        #check if this task is the first one on this machine
        First = True
        if job_name == 'worker':
            for machine in machines['ps'] + machines['worker'][:task_index]:
                First = First and not machine[0] == localmachine
        else:
            for machine in machines['ps'][:task_index]:
                First = First and not machine[0] == localmachine

        #the first task on a machine will create the cluster for this machine
        machinecluster = os.path.join(expdir, 'cluster',
                                      '%s-cluster' % localmachine)
        readyfile = os.path.join(expdir, 'cluster', '%s-ready' % localmachine)

        if First:
            with open(machinecluster, 'w') as fid:
                for job in machines:
                    for remote in machines[job]:

                        #create an ssh tunnel if the local machine is not the
                        #same as the remote machine
                        if localmachine != remote[0] and ssh_command != 'None':

                            #look for an available port
                            while (port in localports
                                   or not cluster.port_available(port)):

                                port += 1

                            #create the ssh tunnel
                            p = subprocess.Popen([
                                ssh_command, '-o', 'StrictHostKeyChecking=no',
                                '-o', 'UserKnownHostsFile=/dev/null', '-L',
                                '%d:127.0.0.1:%d' % (port, remote[1]), '-N',
                                remote[0]
                            ])

                            #report that the ssh tunnel is running
                            open(
                                os.path.join(expdir, 'processes',
                                             '%s-%d' % (localmachine, p.pid)),
                                'w').close()

                            fid.write('%s,localhost,%s,%s\n' %
                                      (job, port, remote[2]))

                            port += 1

                            #give the machine some time to open the ssh tunnel
                            #before opening a new one
                            sleep(0.1)

                        else:
                            if localmachine == remote[0]:
                                host = 'localhost'
                            else:
                                host = remote[0]
                            fid.write('%s,%s,%s,%s\n' %
                                      (job, host, remote[1], remote[2]))

            #notify that the cluster is ready
            open(readyfile, 'w').close()

        #wait for the clusterfile to be ready
        while not os.path.exists(readyfile):
            sleep(1)

        #read the cluster file for this machine
        machines = cluster.read_cluster(machinecluster)

        clusterdict = dict()
        clusterdict['worker'] = []
        clusterdict['ps'] = []
        for job in machines:
            for remote in machines[job]:
                clusterdict[job].append('%s:%d' % (remote[0], remote[1]))

        #create the cluster
        tfcluster = tf.train.ClusterSpec(clusterdict)

        #create the server for this task
        server = tf.train.Server(tfcluster, job_name, task_index)

    return server
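
create_server assumes a cluster.read_cluster helper that parses the same comma-separated 'job,host,port,gpu' lines the function itself writes out. A minimal, hypothetical sketch under that assumption (the real implementation may differ):

def read_cluster(clusterfile):
    '''read a cluster file with 'job,host,port,gpu' lines (sketch)

    Returns a dict mapping job names to lists of (host, port, gpu) tuples,
    with the port converted to an integer'''
    machines = {'ps': [], 'worker': []}
    with open(clusterfile) as fid:
        for line in fid:
            if line.strip():
                job, host, port, gpu = line.strip().split(',')
                machines[job].append((host, int(port), gpu))
    return machines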
Example #3
def main(_):
    '''main function'''

    #pointers to the config files
    computing_cfg_file = 'config/computing/non_distributed.cfg'
    database_cfg_file = 'config/asr_databases/TIMIT.conf'
    if FLAGS.type == 'asr':
        feat_cfg_file = 'config/features/fbank.cfg'
    classifier_cfg_file = 'config/asr/LASACNN.cfg'
    trainer_cfg_file = 'config/trainer/cross_entropytrainer.cfg'
    decoder_cfg_file = 'config/decoder/BeamSearchDecoder.cfg'

    #read the computing config file
    parsed_computing_cfg = configparser.ConfigParser()
    parsed_computing_cfg.read(computing_cfg_file)
    computing_cfg = dict(parsed_computing_cfg.items('computing'))

    #read the trainer config file
    parsed_trainer_cfg = configparser.ConfigParser()
    parsed_trainer_cfg.read(trainer_cfg_file)
    trainer_cfg = dict(parsed_trainer_cfg.items('trainer'))

    if os.path.isdir(os.path.join(FLAGS.expdir, 'processes')):
        shutil.rmtree(os.path.join(FLAGS.expdir, 'processes'))
    os.makedirs(os.path.join(FLAGS.expdir, 'processes'))

    if trainer_cfg['resume_training'] == 'True':
        if not os.path.isdir(FLAGS.expdir):
            raise Exception(
                'cannot find %s, please set resume_training to '
                'False if you want to start a new training process' %
                FLAGS.expdir)
    else:
        if os.path.isdir(os.path.join(FLAGS.expdir, 'logdir')):
            shutil.rmtree(os.path.join(FLAGS.expdir, 'logdir'))

        if not os.path.isdir(FLAGS.expdir):
            os.makedirs(FLAGS.expdir)

        if not os.path.isdir(os.path.join(FLAGS.expdir, 'model')):
            os.makedirs(os.path.join(FLAGS.expdir, 'model'))

        #copy the configs to the expdir so they can be read there and the
        #experiment settings are preserved
        shutil.copyfile(database_cfg_file,
                        os.path.join(FLAGS.expdir, 'database.cfg'))
        if FLAGS.type == 'asr':
            shutil.copyfile(
                feat_cfg_file,
                os.path.join(FLAGS.expdir, 'model', 'features.cfg'))
        shutil.copyfile(
            classifier_cfg_file,
            os.path.join(FLAGS.expdir, 'model', '%s.cfg' % FLAGS.type))

    shutil.copyfile(computing_cfg_file,
                    os.path.join(FLAGS.expdir, 'computing.cfg'))
    shutil.copyfile(trainer_cfg_file, os.path.join(FLAGS.expdir,
                                                   'trainer.cfg'))
    shutil.copyfile(decoder_cfg_file,
                    os.path.join(FLAGS.expdir, 'model', 'decoder.cfg'))

    if computing_cfg['distributed'] == 'condor_non-distributed':

        if not os.path.isdir(os.path.join(FLAGS.expdir, 'outputs')):
            os.makedirs(os.path.join(FLAGS.expdir, 'outputs'))

        subprocess.call([
            'condor_submit',
            'expdir=%s' % FLAGS.expdir,
            'memory=%s' % computing_cfg['minmemory'],
            'type=%s' % FLAGS.type,
            'nabu/distributed/condor/non_distributed.job'
        ])

    elif computing_cfg['distributed'] == 'non-distributed':

        if FLAGS.type == 'asr':
            train_asr(clusterfile=None,
                      job_name='local',
                      task_index=0,
                      ssh_command='None',
                      expdir=FLAGS.expdir)
        else:
            train_lm(clusterfile=None,
                     job_name='local',
                     task_index=0,
                     ssh_command='None',
                     expdir=FLAGS.expdir)

    elif computing_cfg['distributed'] == 'local':

        #create the directories
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'outputs')):
            os.makedirs(os.path.join(FLAGS.expdir, 'outputs'))
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'cluster')):
            os.makedirs(os.path.join(FLAGS.expdir, 'cluster'))

        #create the cluster file
        with open(os.path.join(FLAGS.expdir, 'cluster', 'cluster'),
                  'w') as fid:
            port = 1024
            for _ in range(int(computing_cfg['numps'])):
                while not cluster.port_available(port):
                    port += 1
                fid.write('ps,localhost,%d,\n' % port)
                port += 1
            for i in range(int(computing_cfg['numworkers'])):
                while not cluster.port_available(port):
                    port += 1
                fid.write('worker,localhost,%d,%d\n' % (port, i))
                port += 1

        #start the training
        local_cluster.local_cluster(FLAGS.expdir, FLAGS.type)

    elif computing_cfg['distributed'] == 'static':

        #read the cluster file
        machines = dict()
        machines['worker'] = []
        machines['ps'] = []
        with open(computing_cfg['clusterfile']) as fid:
            for line in fid:
                if line.strip():
                    split = line.strip().split(',')
                    machines[split[0]].append(split[1])

        #create the outputs directory
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'outputs')):
            os.makedirs(os.path.join(FLAGS.expdir, 'outputs'))

        #run all the jobs
        processes = dict()
        processes['worker'] = []
        processes['ps'] = []
        for job in machines:
            task_index = 0
            for machine in machines[job]:
                command = ('python -u train_%s.py --clusterfile=%s '
                           '--job_name=%s --task_index=%d --ssh_command=%s '
                           '--expdir=%s') % (
                               FLAGS.type, computing_cfg['clusterfile'], job,
                               task_index, computing_cfg['ssh_command'],
                               FLAGS.expdir)
                processes[job].append(
                    run_remote.run_remote(command=command, host=machine))
                task_index += 1

        #make sure the created processes are terminated at exit
        for job in processes:
            for process in processes[job]:
                atexit.register(process.terminate)

        #make sure all remotely created processes are terminated at exit
        atexit.register(kill_processes.kill_processes,
                        processdir=os.path.join(FLAGS.expdir, 'processes'))

        #wait for all worker processes to finish
        for process in processes['worker']:
            process.wait()

    elif computing_cfg['distributed'] == 'condor':

        #create the directories
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'outputs')):
            os.makedirs(os.path.join(FLAGS.expdir, 'outputs'))
        if os.path.isdir(os.path.join(FLAGS.expdir, 'cluster')):
            shutil.rmtree(os.path.join(FLAGS.expdir, 'cluster'))
        os.makedirs(os.path.join(FLAGS.expdir, 'cluster'))

        #submit the parameter server jobs
        subprocess.call([
            'condor_submit',
            'expdir=%s' % FLAGS.expdir,
            'numjobs=%s' % computing_cfg['numps'],
            'type=%s' % FLAGS.type,
            'ssh_command=%s' % computing_cfg['ssh_command'],
            'nabu/distributed/condor/ps.job'
        ])

        #submit the worker jobs
        subprocess.call([
            'condor_submit',
            'expdir=%s' % FLAGS.expdir,
            'numjobs=%s' % computing_cfg['numworkers'],
            'memory=%s' % computing_cfg['minmemory'],
            'type=%s' % FLAGS.type,
            'ssh_command=%s' % computing_cfg['ssh_command'],
            'nabu/distributed/condor/worker.job'
        ])

        ready = False

        try:
            print('waiting for the machines to report...')
            numworkers = 0
            numps = 0
            while not ready:
                #check the machines in the cluster
                machines = cluster.get_machines(
                    os.path.join(FLAGS.expdir, 'cluster'))

                if (len(machines['ps']) > numps
                        or len(machines['worker']) > numworkers):

                    numworkers = len(machines['worker'])
                    numps = len(machines['ps'])

                    print('parameter servers ready %d/%s' %
                          (len(machines['ps']), computing_cfg['numps']))

                    print(
                        'workers ready %d/%s' %
                        (len(machines['worker']), computing_cfg['numworkers']))

                    print('press Ctrl-C to run with the current machines')

                #check whether all the required machines have reported
                if (len(machines['worker']) == int(computing_cfg['numworkers'])
                        and len(machines['ps']) == int(
                            computing_cfg['numps'])):

                    ready = True

                sleep(1)

        except KeyboardInterrupt:

            #remove all jobs that are not running
            os.system('condor_rm -constraint \'JobStatus =!= 2\'')

            #check if enough machines are available
            if not machines['worker'] or not machines['ps']:

                #stop the ps jobs
                cidfile = os.path.join(FLAGS.expdir, 'cluster', 'ps-cid')
                if os.path.exists(cidfile):
                    with open(cidfile) as fid:
                        cid = fid.read()
                    subprocess.call(['condor_rm', cid])

                #stop the worker jobs
                cidfile = os.path.join(FLAGS.expdir, 'cluster', 'worker-cid')
                if os.path.exists(cidfile):
                    with open(cidfile) as fid:
                        cid = fid.read()
                    subprocess.call(['condor_rm', cid])

                raise Exception('at least one ps and one worker is needed')

        print('starting training with %s parameter servers and %s workers' %
              (len(machines['ps']), len(machines['worker'])))

        #create the cluster file
        with open(os.path.join(FLAGS.expdir, 'cluster', 'cluster'),
                  'w') as cfid:
            for job in machines:
                if job == 'ps':
                    GPU = ''
                else:
                    GPU = '0'
                for machine in machines[job]:
                    cfid.write('%s,%s,%d,%s\n' %
                               (job, machine[0], machine[1], GPU))

        #notify the machines that the cluster is ready
        open(os.path.join(FLAGS.expdir, 'cluster', 'ready'), 'w').close()

        print('training has started, look in %s/outputs for the job outputs' %
              FLAGS.expdir)

        print('waiting for worker jobs to finish')

        for machine in machines['worker']:
            machine_file = os.path.join(FLAGS.expdir, 'cluster',
                                        '%s-%d' % (machine[0], machine[1]))
            while os.path.exists(machine_file):
                sleep(1)

        #stop the ps jobs
        with open(os.path.join(FLAGS.expdir, 'cluster', 'ps-cid')) as fid:
            cid = fid.read()

        subprocess.call(['condor_rm', cid])

    elif computing_cfg['distributed'] == 'condor_local':

        #create the directories
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'outputs')):
            os.makedirs(os.path.join(FLAGS.expdir, 'outputs'))
        if not os.path.isdir(os.path.join(FLAGS.expdir, 'cluster')):
            os.makedirs(os.path.join(FLAGS.expdir, 'cluster'))

        #create the cluster file
        with open(os.path.join(FLAGS.expdir, 'cluster', 'cluster'),
                  'w') as fid:
            port = 1024
            for _ in range(int(computing_cfg['numps'])):
                while not cluster.port_available(port):
                    port += 1
                fid.write('ps,localhost,%d,\n' % port)
                port += 1
            for i in range(int(computing_cfg['numworkers'])):
                while not cluster.port_available(port):
                    port += 1
                fid.write('worker,localhost,%d,%d\n' % (port, i))
                port += 1

        #submit the job
        subprocess.call([
            'condor_submit',
            'expdir=%s' % FLAGS.expdir,
            'GPUs=%d' % (int(computing_cfg['numworkers'])),
            'memory=%s' % computing_cfg['minmemory'],
            'type=%s' % FLAGS.type, 'nabu/distributed/condor/local.job'
        ])

        print('job submitted, look in %s/outputs for the job outputs' %
              FLAGS.expdir)

    else:
        raise Exception('Unknown distributed type in %s' % computing_cfg_file)
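
The examples above reference command line FLAGS that are defined elsewhere. As a hypothetical sketch, the flags used in Example #3 could be wired up with the TensorFlow 1.x flag and app helpers roughly as follows; the flag names come from the code, but the defaults and help strings are assumptions.

import tensorflow as tf

tf.app.flags.DEFINE_string('expdir', 'expdir', 'the experiments directory')
tf.app.flags.DEFINE_string('type', 'asr', 'the training type, asr or lm')
FLAGS = tf.app.flags.FLAGS

if __name__ == '__main__':
    #parse the flags and call main(_)
    tf.app.run()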