Exemplo n.º 1
0
def setup_train(args, machine_is_host=False):
    # set_up_gpu(args)
    exp_root, exp_group, exp_name = args.experiment_root, args.experiment_group, args.experiment_name
    assert exp_name is not None
    local_export_root = os.path.join(exp_root, exp_group, exp_name)

    if machine_is_host:
        remote_export_root = None
        communicator = None
        if os.path.exists(local_export_root):
            if exp_group == 'test':
                print('Removing local test export root {}'.format(
                    local_export_root))
                shutil.rmtree(local_export_root)
            else:
                print('Local export root exists {}'.format(local_export_root))
                exit(0)
        create_local_export_root(args, local_export_root)
        export_config(args, local_export_root)
        print('Export root', local_export_root)
    else:
        remote_export_root = os.path.join(REMOTE_ROOT, local_export_root)
        communicator = Communicator(HOST, PORT, USERNAME, PASSWORD)
        created = communicator.create_dir(remote_dir_path=remote_export_root)
        if not created:
            if os.path.exists(local_export_root):
                print('Local export root exists while checking status')
                exit(0)
            os.makedirs(local_export_root)
            local_status = os.path.join(local_export_root, 'status.txt')
            remote_status = os.path.join(remote_export_root, 'status.txt')
            try:
                communicator.sftp.get(remote_status, local_status)
                status = open(local_status).readline()
            except Exception:
                status = 'failed to download status'
            print('Checking status')
            if status == STATUS_RECOVERY:
                print('Status is recovery')
                with open(local_status, 'w') as f:
                    f.write('running\n')
                print("Write 'running' on remote")
                communicator.sftp.put(local_status, remote_status)
                print('Downloading remote export root')
                communicator.download_dir(remote_export_root,
                                          local_export_root)
                args.resume_training = True
            else:
                print('Status is not recovery')
                shutil.rmtree(local_export_root)
                print('Remote export root {} exists. Existing'.format(
                    remote_export_root))
                exit(0)
        else:
            print(
                'Created export_root={} in remote'.format(remote_export_root))
        create_local_export_root(args, local_export_root)
        export_config(args, local_export_root)
        print('Export root', local_export_root)
    return local_export_root, remote_export_root, communicator