Example #1
def main(_):
    '''main function'''
    cluster_dir = os.path.join(FLAGS.expdir, 'cluster')

    #the chief of the job should write the cluster id
    if int(FLAGS.pid) == 0:
        with open(os.path.join(cluster_dir, '%s-cid' % FLAGS.job_name)
                  , 'w') as fid:
            fid.write(FLAGS.cid)

    #wait for the preceding cluster tasks to report
    machines = cluster.get_machines(cluster_dir)

    while len(machines[FLAGS.job_name]) < int(FLAGS.pid):
        machines = cluster.get_machines(cluster_dir)
        sleep(1)

    port = 1024
    ip = socket.gethostbyname(socket.gethostname())
    machine_file = '%s/%s-%d' % (cluster_dir, ip, port)

    #look for an available port
    while os.path.exists(machine_file) or not cluster.port_available(port):

        port += 1
        machine_file = '%s/%s-%d' % (cluster_dir, ip, port)

    #report that the machine is ready
    with open(machine_file, 'w') as fid:
        fid.write(FLAGS.job_name)

    #wait until the main process has given the go-ahead
    print 'waiting for cluster to be ready...'

    #wait for the ready file to be created
    while not os.path.exists(cluster_dir + '/ready'):
        sleep(1)

    print 'cluster is ready'

    #start the training process
    train(clusterfile=cluster_dir + '/cluster',
          job_name=FLAGS.job_name,
          task_index=int(FLAGS.pid),
          ssh_command=FLAGS.ssh_command,
          expdir=FLAGS.expdir)

    #delete the file to notify that the process has finished
    os.remove(machine_file)
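
A note on example #1: it relies on two helpers from the cluster module that are not shown here. A minimal sketch of what they could look like, inferred only from how they are used above (the real nabu.computing.cluster implementation may differ):

import os
import socket


def port_available(port):
    '''check whether a TCP port on this machine can still be bound (sketch)'''
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.bind(('', port))
        return True
    except socket.error:
        return False
    finally:
        sock.close()


def get_machines(cluster_dir):
    '''collect the machines that reported by writing an <ip>-<port> file'''
    machines = {'ps': [], 'worker': []}
    for filename in os.listdir(cluster_dir):
        #skip the <job>-cid files and the ready/cluster files
        if '-' not in filename or filename.endswith('-cid'):
            continue
        ip, _, port = filename.rpartition('-')
        with open(os.path.join(cluster_dir, filename)) as fid:
            job_name = fid.read().strip()
        if job_name in machines:
            machines[job_name].append((ip, int(port)))
    return machines
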
Example #2
def main(expdir, recipe, mode, computing):
    '''main function'''

    if expdir is None:
        raise Exception('no expdir specified. Command usage: '
                        'nabu data --expdir=/path/to/recipe '
                        '--recipe=/path/to/recipe')

    if recipe is None:
        raise Exception('no recipe specified. Command usage: '
                        'nabu data --expdir=/path/to/recipe '
                        '--recipe=/path/to/recipe')

    if not os.path.isdir(recipe):
        raise Exception('cannot find recipe %s' % recipe)
    if mode not in ['non_distributed', 'single_machine', 'multi_machine']:
        raise Exception('unknown distributed mode: %s' % mode)
    if computing not in ['standard', 'condor']:
        raise Exception('unknown computing mode: %s' % computing)

    database_cfg_file = os.path.join(recipe, 'database.cfg')
    model_cfg_file = os.path.join(recipe, 'model.cfg')
    trainer_cfg_file = os.path.join(recipe, 'trainer.cfg')
    evaluator_cfg_file = os.path.join(recipe, 'validation_evaluator.cfg')

    if os.path.isdir(expdir):
        text = ''
        while text not in ('o', 'r'):
            text = raw_input('%s already exists, do you want to '
                             'resume training (r) or overwrite (o) '
                             '(respond with o or r)' % expdir)
        if text == 'o':
            while text not in ('y', 'n'):
                text = raw_input('%s will be deleted, are you sure (y or n)' %
                                 expdir)
            if text == 'y':
                shutil.rmtree(expdir)
            else:
                return 0

    if not os.path.isdir(expdir):

        os.makedirs(expdir)
        os.makedirs(os.path.join(expdir, 'model'))

        #copy the configs to the expdir so they can be read there and so
        #the experiment information is stored

        shutil.copyfile(database_cfg_file,
                        os.path.join(expdir, 'database.cfg'))
        shutil.copyfile(model_cfg_file, os.path.join(expdir, 'model.cfg'))
        shutil.copyfile(evaluator_cfg_file,
                        os.path.join(expdir, 'validation_evaluator.cfg'))

        shutil.copyfile(trainer_cfg_file, os.path.join(expdir, 'trainer.cfg'))

    computing_cfg_file = 'config/computing/%s/%s.cfg' % (computing, mode)

    if os.path.isdir(os.path.join(expdir, 'processes')):
        shutil.rmtree(os.path.join(expdir, 'processes'))
    os.makedirs(os.path.join(expdir, 'processes'))

    if computing == 'standard':

        if mode == 'non_distributed':

            train(clusterfile=None,
                  job_name='local',
                  task_index=0,
                  ssh_command='None',
                  expdir=expdir)

        elif mode == 'single_machine':

            #read the computing config file
            parsed_computing_cfg = configparser.ConfigParser()
            parsed_computing_cfg.read(computing_cfg_file)
            computing_cfg = dict(parsed_computing_cfg.items('computing'))

            #create the directories
            if os.path.isdir(os.path.join(expdir, 'cluster')):
                shutil.rmtree(os.path.join(expdir, 'cluster'))
            os.makedirs(os.path.join(expdir, 'cluster'))

            GPUs = computing_cfg['gpus'].split(' ')

            #create the cluster file
            with open(os.path.join(expdir, 'cluster', 'cluster'), 'w') as fid:
                port = 1024
                for _ in range(int(computing_cfg['numps'])):
                    while not cluster.port_available(port):
                        port += 1
                    fid.write('ps,localhost,%d,\n' % port)
                    port += 1
                for i in range(int(computing_cfg['numworkers'])):
                    while not cluster.port_available(port):
                        port += 1
                    fid.write('worker,localhost,%d,%s\n' % (port, GPUs[i]))
                    port += 1

            #start the training
            local_cluster.local_cluster(expdir)

        elif mode == 'multi_machine':

            #read the computing config file
            parsed_computing_cfg = configparser.ConfigParser()
            parsed_computing_cfg.read(computing_cfg_file)
            computing_cfg = dict(parsed_computing_cfg.items('computing'))

            #read the cluster file
            machines = dict()
            machines['worker'] = []
            machines['ps'] = []
            with open(computing_cfg['clusterfile']) as fid:
                for line in fid:
                    if line.strip():
                        split = line.strip().split(',')
                        hostip = socket.gethostbyname(split[1])
                        machines[split[0]].append(hostip)

            #create the outputs directory
            if os.path.isdir(os.path.join(expdir, 'cluster')):
                shutil.rmtree(os.path.join(expdir, 'cluster'))
            os.makedirs(os.path.join(expdir, 'cluster'))

            #run all the jobs
            processes = dict()
            processes['worker'] = []
            processes['ps'] = []
            for job in machines:
                task_index = 0
                for machine in machines[job]:
                    command = (
                        'python -u nabu/scripts/train.py '
                        '--clusterfile=%s '
                        '--job_name=%s --task_index=%d --ssh_command=%s '
                        '--expdir=%s') % (computing_cfg['clusterfile'], job,
                                          task_index,
                                          computing_cfg['ssh_command'], expdir)
                    processes[job].append(
                        run_remote.run_remote(command=command, host=machine))
                    task_index += 1

            #make sure the created processes are terminated at exit
            for job in processes:
                for process in processes[job]:
                    atexit.register(cond_term, process=process)

            #make sure all remotely created processes are terminated at exit
            atexit.register(kill_processes.kill_processes,
                            processdir=os.path.join(expdir, 'processes'))

            #wait for all worker processes to finish
            for process in processes['worker']:
                process.wait()

        else:
            raise Exception('unknown mode %s' % mode)

    elif computing == 'condor':

        if not os.path.isdir(os.path.join(expdir, 'outputs')):
            os.makedirs(os.path.join(expdir, 'outputs'))

        if mode == 'non_distributed':

            #read the computing config file
            parsed_computing_cfg = configparser.ConfigParser()
            parsed_computing_cfg.read(computing_cfg_file)
            computing_cfg = dict(parsed_computing_cfg.items('computing'))

            subprocess.call([
                'condor_submit',
                'expdir=%s' % expdir, 'script=nabu/scripts/train.py',
                'memory=%s' % computing_cfg['minmemory'],
                'nabu/computing/condor/non_distributed.job'
            ])

        elif mode == 'single_machine':

            #read the computing config file
            parsed_computing_cfg = configparser.ConfigParser()
            parsed_computing_cfg.read(computing_cfg_file)
            computing_cfg = dict(parsed_computing_cfg.items('computing'))

            if os.path.isdir(os.path.join(expdir, 'cluster')):
                shutil.rmtree(os.path.join(expdir, 'cluster'))
            os.makedirs(os.path.join(expdir, 'cluster'))

            #create the cluster file
            with open(os.path.join(expdir, 'cluster', 'cluster'), 'w') as fid:
                port = 1024
                for _ in range(int(computing_cfg['numps'])):
                    while not cluster.port_available(port):
                        port += 1
                    fid.write('ps,localhost,%d,\n' % port)
                    port += 1
                for i in range(int(computing_cfg['numworkers'])):
                    while not cluster.port_available(port):
                        port += 1
                    fid.write('worker,localhost,%d,%d\n' % (port, i))
                    port += 1

            #submit the job
            subprocess.call([
                'condor_submit',
                'expdir=%s' % expdir,
                'GPUs=%d' % (int(computing_cfg['numworkers'])),
                'memory=%s' % computing_cfg['minmemory'],
                'nabu/computing/condor/local.job'
            ])

            print('job submitted, look in %s/outputs for the job outputs' %
                  expdir)

        elif mode == 'multi_machine':

            #read the computing config file
            parsed_computing_cfg = configparser.ConfigParser()
            parsed_computing_cfg.read(computing_cfg_file)
            computing_cfg = dict(parsed_computing_cfg.items('computing'))

            if os.path.isdir(os.path.join(expdir, 'cluster')):
                shutil.rmtree(os.path.join(expdir, 'cluster'))
            os.makedirs(os.path.join(expdir, 'cluster'))

            #submit the parameter server jobs
            subprocess.call([
                'condor_submit',
                'expdir=%s' % expdir,
                'numjobs=%s' % computing_cfg['numps'],
                'ssh_command=%s' % computing_cfg['ssh_command'],
                'nabu/computing/condor/ps.job'
            ])

            #submit the worker jobs
            subprocess.call([
                'condor_submit',
                'expdir=%s' % expdir,
                'numjobs=%s' % computing_cfg['numworkers'],
                'memory=%s' % computing_cfg['minmemory'],
                'ssh_command=%s' % computing_cfg['ssh_command'],
                'nabu/computing/condor/worker.job'
            ])

            ready = False

            try:
                print 'waiting for the machines to report...'
                numworkers = 0
                numps = 0
                while not ready:
                    #check the machines in the cluster
                    machines = cluster.get_machines(
                        os.path.join(expdir, 'cluster'))

                    if (len(machines['ps']) > numps
                            or len(machines['worker']) > numworkers):

                        numworkers = len(machines['worker'])
                        numps = len(machines['ps'])

                        print('parameter servers ready %d/%s' %
                              (len(machines['ps']), computing_cfg['numps']))

                        print('workers ready %d/%s' % (len(
                            machines['worker']), computing_cfg['numworkers']))

                        print 'press Ctrl-C to run with the current machines'

                    #check if the required amount of machines has reported
                    if (len(machines['worker']) == int(
                            computing_cfg['numworkers'])
                            and len(machines['ps']) == int(
                                computing_cfg['numps'])):

                        ready = True

                    sleep(1)

            except KeyboardInterrupt:

                #remove all jobs that are not running
                os.system('condor_rm -constraint \'JobStatus =!= 2\'')

                #check if enough machines are available
                if not machines['worker'] or not machines['ps']:

                    #stop the ps jobs
                    cidfile = os.path.join(expdir, 'cluster', 'ps-cid')
                    if os.path.exists(cidfile):
                        with open(cidfile) as fid:
                            cid = fid.read()
                        subprocess.call(['condor_rm', cid])

                    #stop the worker jobs
                    cidfile = os.path.join(expdir, 'cluster', 'worker-cid')
                    if os.path.exists(cidfile):
                        with open(cidfile) as fid:
                            cid = fid.read()
                        subprocess.call(['condor_rm', cid])

                    raise Exception('at least one ps and one worker needed')

            print(
                'starting training with %s parameter servers and %s workers' %
                (len(machines['ps']), len(machines['worker'])))

            #create the cluster file
            with open(os.path.join(expdir, 'cluster', 'cluster'), 'w') as cfid:
                for job in machines:
                    if job == 'ps':
                        GPU = ''
                    else:
                        GPU = '0'
                    for machine in machines[job]:
                        cfid.write('%s,%s,%d,%s\n' %
                                   (job, machine[0], machine[1], GPU))

            #notify the machine that the cluster is ready
            open(os.path.join(expdir, 'cluster', 'ready'), 'w').close()

            print(
                'training has started, look in %s/outputs for the job outputs'
                % expdir)

        else:
            raise Exception('unknown mode %s' % mode)
    else:
        raise Exception('Unknown computing type %s' % computing)
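
The cluster file written in example #2 holds one 'job,host,port,gpu' line per task, with an empty gpu field for parameter servers. A small sketch of reading such a file back into per-job address lists, assuming that format (the reader actually used by nabu/scripts/train.py may differ):

def read_cluster_file(clusterfile):
    '''return a dict mapping job names to lists of (host:port, gpu) tuples'''
    clusterspec = {}
    with open(clusterfile) as fid:
        for line in fid:
            if not line.strip():
                continue
            job, host, port, gpu = line.strip().split(',')
            clusterspec.setdefault(job, []).append(
                ('%s:%s' % (host, port), gpu))
    return clusterspec

#for one ps and two workers on a single machine this would give
#{'ps': [('localhost:1024', '')],
# 'worker': [('localhost:1025', '0'), ('localhost:1026', '1')]}
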
Example #3
def main(expdir, recipe, mode, computing, resume, duplicates):
    '''main function'''

    if expdir is None:
        raise Exception('no expdir specified. Command usage: '
                        'nabu data --expdir=/path/to/recipe '
                        '--recipe=/path/to/recipe')

    if recipe is None:
        raise Exception('no recipe specified. Command usage: '
                        'nabu data --expdir=/path/to/recipe '
                        '--recipe=/path/to/recipe')

    if not os.path.isdir(recipe):
        raise Exception('cannot find recipe %s' % recipe)
    if mode not in ['non_distributed', 'single_machine', 'multi_machine']:
        raise Exception('unknown distributed mode: %s' % mode)
    if computing not in ['standard', 'condor']:
        raise Exception('unknown computing mode: %s' % computing)

    duplicates = int(duplicates)

    database_cfg_file = os.path.join(recipe, 'database.conf')
    model_cfg_file = os.path.join(recipe, 'model.cfg')
    trainer_cfg_file = os.path.join(recipe, 'trainer.cfg')
    evaluator_cfg_file = os.path.join(recipe, 'validation_evaluator.cfg')

    #read the trainer config file
    parsed_trainer_cfg = configparser.ConfigParser()
    parsed_trainer_cfg.read(trainer_cfg_file)
    trainer_cfg = dict(parsed_trainer_cfg.items('trainer'))

    for dupl_ind in range(duplicates):
        if duplicates > 1:
            expdir_run = expdir + '_dupl%i' % (dupl_ind)
        else:
            expdir_run = expdir

        if os.path.isdir(os.path.join(expdir_run, 'processes')):
            shutil.rmtree(os.path.join(expdir_run, 'processes'))
        os.makedirs(os.path.join(expdir_run, 'processes'))

        if resume == 'True':
            if not os.path.isdir(expdir_run):
                raise Exception(
                    'cannot find %s, please set resume to '
                    'False if you want to start a new training process' %
                    expdir_run)
        else:
            if os.path.isdir(os.path.join(expdir_run, 'logdir')):
                shutil.rmtree(os.path.join(expdir_run, 'logdir'))
            if not os.path.isdir(expdir_run):
                os.makedirs(expdir_run)
            if os.path.isdir(os.path.join(expdir_run, 'model')):
                shutil.rmtree(os.path.join(expdir_run, 'model'))
            os.makedirs(os.path.join(expdir_run, 'model'))

            if 'segment_lengths' in trainer_cfg:
                #create a separate directory for each training stage
                segment_lengths = trainer_cfg['segment_lengths'].split(' ')
                for seg_length in segment_lengths:
                    seg_expdir_run = os.path.join(expdir_run, seg_length)

                    if os.path.isdir(os.path.join(seg_expdir_run, 'logdir')):
                        shutil.rmtree(os.path.join(seg_expdir_run, 'logdir'))
                    if not os.path.isdir(seg_expdir_run):
                        os.makedirs(seg_expdir_run)
                    if os.path.isdir(os.path.join(seg_expdir_run, 'model')):
                        shutil.rmtree(os.path.join(seg_expdir_run, 'model'))
                    os.makedirs(os.path.join(seg_expdir_run, 'model'))

            #copy the configs to the expdir_run so they can be read there and
            #so the experiment information is stored

            shutil.copyfile(database_cfg_file,
                            os.path.join(expdir_run, 'database.cfg'))
            shutil.copyfile(model_cfg_file,
                            os.path.join(expdir_run, 'model.cfg'))
            shutil.copyfile(evaluator_cfg_file,
                            os.path.join(expdir_run, 'evaluator.cfg'))
            shutil.copyfile(trainer_cfg_file,
                            os.path.join(expdir_run, 'trainer.cfg'))

            if 'segment_lengths' in trainer_cfg:
                #create designated database and trainer config files for each training stage

                batch_size_perseg = trainer_cfg['batch_size'].split(' ')
                numbatches_to_aggregate_perseg = trainer_cfg[
                    'numbatches_to_aggregate'].split(' ')
                initial_learning_rate_perseg = trainer_cfg[
                    'initial_learning_rate'].split(' ')
                learning_rate_decay_perseg = trainer_cfg[
                    'learning_rate_decay'].split(' ')
                if len(learning_rate_decay_perseg) == 1:
                    learning_rate_decay_perseg = learning_rate_decay_perseg * len(
                        segment_lengths)

                parsed_database_cfg = configparser.ConfigParser()
                parsed_database_cfg.read(database_cfg_file)

                for i, seg_length in enumerate(segment_lengths):
                    seg_expdir_run = os.path.join(expdir_run, seg_length)

                    segment_parsed_trainer_cfg = configparser.ConfigParser()
                    segment_parsed_trainer_cfg.read(trainer_cfg_file)
                    segment_parsed_trainer_cfg.set('trainer', 'batch_size',
                                                   batch_size_perseg[i])
                    segment_parsed_trainer_cfg.set(
                        'trainer', 'numbatches_to_aggregate',
                        numbatches_to_aggregate_perseg[i])
                    segment_parsed_trainer_cfg.set(
                        'trainer', 'initial_learning_rate',
                        initial_learning_rate_perseg[i])
                    segment_parsed_trainer_cfg.set(
                        'trainer', 'learning_rate_decay',
                        learning_rate_decay_perseg[i])
                    with open(os.path.join(seg_expdir_run, 'trainer.cfg'),
                              'w') as fid:
                        segment_parsed_trainer_cfg.write(fid)

                    segment_parsed_database_cfg = configparser.ConfigParser()
                    segment_parsed_database_cfg.read(database_cfg_file)

                    for section in segment_parsed_database_cfg.sections():
                        if 'store_dir' in dict(
                                segment_parsed_database_cfg.items(
                                    section)).keys():
                            segment_parsed_database_cfg.set(
                                section, 'store_dir',
                                os.path.join(
                                    segment_parsed_database_cfg.get(
                                        section, 'store_dir'), seg_length))
                    with open(os.path.join(seg_expdir_run, 'database.cfg'),
                              'w') as fid:
                        segment_parsed_database_cfg.write(fid)

        computing_cfg_file = 'config/computing/%s/%s.cfg' % (computing, mode)

        if computing == 'standard':

            if mode == 'non_distributed':
                #manually set the visible GPU for this machine
                os.environ['CUDA_VISIBLE_DEVICES'] = '0'

                train(clusterfile=None,
                      job_name='local',
                      task_index=0,
                      ssh_command='None',
                      expdir=expdir_run)

            elif mode == 'single_machine':
                #read the computing config file
                parsed_computing_cfg = configparser.ConfigParser()
                parsed_computing_cfg.read(computing_cfg_file)
                computing_cfg = dict(parsed_computing_cfg.items('computing'))

                #create the directories
                if os.path.isdir(os.path.join(expdir_run, 'cluster')):
                    shutil.rmtree(os.path.join(expdir_run, 'cluster'))
                os.makedirs(os.path.join(expdir_run, 'cluster'))

                GPUs = computing_cfg['gpus'].split(' ')

                #create the cluster file
                with open(os.path.join(expdir_run, 'cluster', 'cluster'),
                          'w') as fid:
                    port = 1024
                    for _ in range(int(computing_cfg['numps'])):
                        while not cluster.port_available(port):
                            port += 1
                        fid.write('ps,localhost,%d,\n' % port)
                        port += 1
                    for i in range(int(computing_cfg['numworkers'])):
                        while not cluster.port_available(port):
                            port += 1
                        fid.write('worker,localhost,%d,%s\n' % (port, GPUs[i]))
                        port += 1

                #start the training
                local_cluster.local_cluster(expdir_run)

            elif mode == 'multi_machine':

                #read the computing config file
                parsed_computing_cfg = configparser.ConfigParser()
                parsed_computing_cfg.read(computing_cfg_file)
                computing_cfg = dict(parsed_computing_cfg.items('computing'))

                #read the cluster file
                machines = dict()
                machines['worker'] = []
                machines['ps'] = []
                with open(computing_cfg['clusterfile']) as fid:
                    for line in fid:
                        if line.strip():
                            split = line.strip().split(',')
                            hostip = socket.gethostbyname(split[1])
                            machines[split[0]].append(hostip)

                #create the outputs directory
                if os.path.isdir(os.path.join(expdir_run, 'cluster')):
                    shutil.rmtree(os.path.join(expdir_run, 'cluster'))
                os.makedirs(os.path.join(expdir_run, 'cluster'))

                #run all the jobs
                processes = dict()
                processes['worker'] = []
                processes['ps'] = []
                for job in machines:
                    task_index = 0
                    for machine in machines[job]:
                        command = (
                            'python -u nabu/scripts/train.py '
                            '--clusterfile=%s '
                            '--job_name=%s --task_index=%d --ssh_command=%s '
                            '--expdir=%s') % (
                                computing_cfg['clusterfile'], job, task_index,
                                computing_cfg['ssh_command'], expdir_run)
                        processes[job].append(
                            run_remote.run_remote(command=command,
                                                  host=machine))
                        task_index += 1

                #make sure the created processes are terminated at exit
                for job in processes:
                    for process in processes[job]:
                        atexit.register(cond_term, process=process)

                #make sure all remotely created processes are terminated at exit
                atexit.register(kill_processes.kill_processes,
                                processdir=os.path.join(
                                    expdir_run, 'processes'))

                #wait for all worker processes to finish
                for process in processes['worker']:
                    process.wait()

            else:
                raise Exception('unknown mode %s' % mode)

        elif computing == 'condor':

            if not os.path.isdir(os.path.join(expdir_run, 'outputs')):
                os.makedirs(os.path.join(expdir_run, 'outputs'))

            if mode == 'non_distributed':

                #read the computing config file
                parsed_computing_cfg = configparser.ConfigParser()
                parsed_computing_cfg.read(computing_cfg_file)
                computing_cfg = dict(parsed_computing_cfg.items('computing'))

                subprocess.call([
                    'condor_submit',
                    'expdir=%s' % expdir_run, 'script=nabu/scripts/train.py',
                    'memory=%s' % computing_cfg['minmemory'],
                    'nabu/computing/condor/non_distributed.job'
                ])

            elif mode == 'single_machine':

                #read the computing config file
                parsed_computing_cfg = configparser.ConfigParser()
                parsed_computing_cfg.read(computing_cfg_file)
                computing_cfg = dict(parsed_computing_cfg.items('computing'))

                if os.path.isdir(os.path.join(expdir_run, 'cluster')):
                    shutil.rmtree(os.path.join(expdir_run, 'cluster'))
                os.makedirs(os.path.join(expdir_run, 'cluster'))

                #create the cluster file
                with open(os.path.join(expdir_run, 'cluster', 'cluster'),
                          'w') as fid:
                    port = 1024
                    for _ in range(int(computing_cfg['numps'])):
                        while not cluster.port_available(port):
                            port += 1
                        fid.write('ps,localhost,%d,\n' % port)
                        port += 1
                    for i in range(int(computing_cfg['numworkers'])):
                        while not cluster.port_available(port):
                            port += 1
                        fid.write('worker,localhost,%d,%d\n' % (port, i))
                        port += 1

                #submit the job
                subprocess.call([
                    'condor_submit',
                    'expdir=%s' % expdir_run,
                    'GPUs=%d' % (int(computing_cfg['numworkers'])),
                    'memory=%s' % computing_cfg['minmemory'],
                    'nabu/computing/condor/local.job'
                ])

                print('job submitted, look in %s/outputs for the job outputs'
                      % expdir_run)

            elif mode == 'multi_machine':

                #read the computing config file
                parsed_computing_cfg = configparser.ConfigParser()
                parsed_computing_cfg.read(computing_cfg_file)
                computing_cfg = dict(parsed_computing_cfg.items('computing'))

                if os.path.isdir(os.path.join(expdir_run, 'cluster')):
                    shutil.rmtree(os.path.join(expdir_run, 'cluster'))
                os.makedirs(os.path.join(expdir_run, 'cluster'))

                #submit the parameter server jobs
                subprocess.call([
                    'condor_submit',
                    'expdir=%s' % expdir_run,
                    'numjobs=%s' % computing_cfg['numps'],
                    'ssh_command=%s' % computing_cfg['ssh_command'],
                    'nabu/computing/condor/ps.job'
                ])

                #submit the worker jobs
                subprocess.call([
                    'condor_submit',
                    'expdir=%s' % expdir_run,
                    'numjobs=%s' % computing_cfg['numworkers'],
                    'memory=%s' % computing_cfg['minmemory'],
                    'ssh_command=%s' % computing_cfg['ssh_command'],
                    'nabu/computing/condor/worker.job'
                ])

                ready = False

                try:
                    print 'waiting for the machines to report...'
                    numworkers = 0
                    numps = 0
                    while not ready:
                        #check the machines in the cluster
                        machines = cluster.get_machines(
                            os.path.join(expdir_run, 'cluster'))

                        if (len(machines['ps']) > numps
                                or len(machines['worker']) > numworkers):

                            numworkers = len(machines['worker'])
                            numps = len(machines['ps'])

                            print(
                                'parameter servers ready %d/%s' %
                                (len(machines['ps']), computing_cfg['numps']))

                            print('workers ready %d/%s' %
                                  (len(machines['worker']),
                                   computing_cfg['numworkers']))

                            print 'press Ctrl-C to run with the current machines'

                        #check if the required amount of machines has reported
                        if (len(machines['worker']) == int(
                                computing_cfg['numworkers'])
                                and len(machines['ps']) == int(
                                    computing_cfg['numps'])):

                            ready = True

                        sleep(1)

                except KeyboardInterrupt:

                    #remove all jobs that are not running
                    os.system('condor_rm -constraint \'JobStatus =!= 2\'')

                    #check if enough machines are available
                    if not machines['worker'] or not machines['ps']:

                        #stop the ps jobs
                        cidfile = os.path.join(expdir_run, 'cluster', 'ps-cid')
                        if os.path.exists(cidfile):
                            with open(cidfile) as fid:
                                cid = fid.read()
                            subprocess.call(['condor_rm', cid])

                        #stop the worker jobs
                        cidfile = os.path.join(expdir_run, 'cluster',
                                               'worker-cid')
                        if os.path.exists(cidfile):
                            with open(cidfile) as fid:
                                cid = fid.read()
                            subprocess.call(['condor_rm', cid])

                        raise Exception('at least one ps and one worker needed')

                print(
                    'starting training with %s parameter servers and %s workers'
                    % (len(machines['ps']), len(machines['worker'])))

                #create the cluster file
                with open(os.path.join(expdir_run, 'cluster', 'cluster'),
                          'w') as cfid:
                    for job in machines:
                        if job == 'ps':
                            GPU = ''
                        else:
                            GPU = '0'
                        for machine in machines[job]:
                            cfid.write('%s,%s,%d,%s\n' %
                                       (job, machine[0], machine[1], GPU))

                #notify the machine that the cluster is ready
                open(os.path.join(expdir_run, 'cluster', 'ready'), 'w').close()

                print(
                    'training has started, look in %s/outputs for the job outputs'
                    % expdir_run)

            else:
                raise Exception('unknown mode %s' % mode)
        else:
            raise Exception('Unknown computing type %s' % computing)
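
Examples #2 and #3 register a cond_term callback with atexit for every process they spawn, but the helper itself is not shown. A plausible sketch, assuming run_remote.run_remote returns a subprocess.Popen-like object (the real helper may differ):

def cond_term(process):
    '''terminate the process at exit, but only if it is still running'''
    if process.poll() is None:
        process.terminate()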