Example #1
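# These examples assume module-level context defined elsewhere in the project:
# import csv, glob, json, os, subprocess, time
# plus the globals/helpers ssm, buckets, cfg, PATH_TO_HACONE, private_key,
# public_DNS, instance_id, user, server_DNS, debug, create_name, ...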
def main(job_id, params):
    job_id = str(job_id)
    command = []
    print(params)
    my_params = {}
    # Encode params to pass them through the run_command
    for key in params.keys():
        my_params[key.replace('"', '\'')] = int(params[key])
    my_params = json.dumps(my_params)
    dico_to_save = {}
    dico_to_save['job'] = job_id
    dico_to_save['params'] = my_params
    name = create_name(my_params)

    done_jobs = glob.glob(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output_0/_*')
    trained_jobs = [job.split('/')[-1] for job in done_jobs]
    # Define file_to_send unconditionally: it is also needed after the
    # if/else below, even when the model was already trained.
    file_to_send = PATH_TO_HACONE + "/jobs/job{}.txt".format(name)
    if name not in trained_jobs:
        file_to_get_back = '/home/ubuntu/accuracy_{}.txt'.format(job_id)
        with open(file_to_send, 'w') as fp:  # text mode: json.dump writes str
            json.dump(dico_to_save, fp)

        # now we need to send this file : job_file to the AWS instance.
        subprocess.call("scp -i {} {} ubuntu@{}:~/job_file_{}.txt".format(private_key, file_to_send, public_DNS, job_id), shell=True)

        command.append("export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH")
        # Launch the train and eval script

        command.append("./train_eval_image_classifier_bis.py 1 {} --max_number_of_steps=28125".format(job_id))
        print(command)

        command_id = ssm.run_command(instance_id, command)

        print('Training launched')
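        # listen_to_remote_process (defined elsewhere) is assumed to block
        # until the remote accuracy file appears, i.e. until training ends.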
        listen_to_remote_process(file_to_get_back)
        print('Training finished')

        subprocess.call("scp -i {} -r ubuntu@{}:~/outputs/cifar10_nns/{}/ {}/Spearmint-PESM/examples/cifar10/output/{}/".format(private_key, public_DNS, job_id, PATH_TO_HACONE, name), shell=True)
        subprocess.call("scp -i {} ubuntu@{}:~/accuracy_{}.txt {}/Spearmint-PESM/examples/cifar10/output/{}/accuracy.txt".format(private_key, public_DNS, job_id,PATH_TO_HACONE, name), shell=True)
        output_dir = os.path.join('outputs', str(job_id))

        with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/{}/accuracy.txt'.format(name)) as my_file:
            dico = json.load(my_file)

    else:
        with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output_0/{}/accuracy.txt'.format(name)) as my_file:
            dico = json.load(my_file)

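    # Ship the job description to the monitoring server, run inference there,
    # and fetch the measured hardware metrics back.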
    subprocess.call("scp {} {}@{}:/home/data/{}/jobs/".format(file_to_send, user, server_DNS,user),shell=True)
    subprocess.call("ssh {}@{} './monitor_inference_4_GPU.py {} {}\'".format(user, server_DNS, name, job_id), shell=True)
    subprocess.call("scp {}@{}:/home/data/arturo/hardware_metrics/hardware_metrics_{}.txt {}/Spearmint-PESM/examples/cifar10/output/{}/hardware_metrics.txt ".format(user, server_DNS, job_id, PATH_TO_HACONE,name), shell=True)
    with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/{}/hardware_metrics.txt'.format(name)) as my_file:
        dico_hardware = json.load(my_file)
    # Merge the accuracy results into the hardware metrics before saving.
    dico_hardware.update(dico)

    with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/{}/hardware_metrics.txt'.format(name), 'w') as my_file:
        json.dump(dico_hardware, my_file)
    return dico_hardware
Example #2
def main(job_id, params):
    #ssm.purge_queue()
    job_id = str(job_id)
    command = []
    print(params)
    my_params = {}
    #Encode params to pass it through the run_command
    for key in params.keys():
        my_params[key.replace('"', '\'')] = int(params[key])
    my_params = json.dumps(my_params)

    command.append(
        "export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH")
    #Launch the train and eval script
    command.append(
        "./train_eval_image_classifier.py --job_id={} --params='{}'".format(
            job_id, my_params))
    print(command)

    command_id = ssm.run_command(instance_id, command)

    dic = {}

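    # Poll SSM notifications until the training command leaves 'InProgress';
    # ssm.notif_listener (defined elsewhere) is assumed to block for the next
    # notification and return the updated {command_id: status} mapping.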
    while True:
        dic = ssm.notif_listener(dic)
        print('\nGlobal dic : ', dic, '\n')

        if dic[command_id] != 'InProgress':
            print(dic[command_id])
            command = []

            if dic[command_id] == 'Success':
                output_dir = os.path.join('outputs', str(job_id))

                #go to the output dir and copy everything to a bucket, then remove everything from the instance
                command.append('cd {}'.format(output_dir))
                command.append(
                    'aws s3 cp ./gpu_0 s3://astar-trainedmodels/{}/{}/awsrunShellScript/0.awsrunShellScript/ --recursive'
                    .format(command_id, instance_id))
                command.append('rm -rf ./gpu_0')
                command_id = ssm.run_command(instance_id,
                                             command,
                                             save_bucket=False)
            break

    # Download the trained model from S3 so that `f` is defined at the return
    # below; the bucket name matches the s3:// URI used in the upload above.
    f = buckets.download_from_s3('astar-trainedmodels')
    return f
Example #3
def main(job_id, params):
    job_id = str(job_id)
    command = []
    print(params)
    my_params = {}
    # Encode params to pass it through the run_command
    for key in params.keys():
        my_params[key.replace('"', '\'')] = int(params[key])
    my_params = json.dumps(my_params)
    dico_to_save = {}
    dico_to_save['platform'] = 'Phone'
    dico_to_save['job'] = job_id
    dico_to_save['params'] = my_params
    name = create_name(my_params)

    if debug:
        max_number_of_steps = 100
    else:
        max_number_of_steps = 28125

    if not is_already_trained(name):
        file_to_send = "{}/jobs/job{}.txt".format(PATH_TO_HACONE, name)
        file_to_get_back = '/home/ubuntu/accuracy_{}.txt'.format(job_id)
        with open(file_to_send, 'w') as fp:  # text mode: json.dump writes str
            json.dump(dico_to_save, fp)

        #check if job number X isn't already stored on the instance
        remote_output_dir = '/home/ubuntu/outputs/cifar10_nns/{}'.format(
            job_id)
        if exists_remote(remote_output_dir):
            ssm.run_command(instance_id, ['sudo rm -rf ' + remote_output_dir])

        # now we need to send this file : job_file to the AWS instance.
        subprocess.call("scp -i {} {} ubuntu@{}:~/job_file_{}.txt".format(
            private_key, file_to_send, public_DNS, job_id),
                        shell=True)
        command.append(
            "export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH"
        )

        # Launch the train and eval script
        command.append(
            "./train_eval_image_classifier_bis.py {} {} --max_number_of_steps={} --num_clones={}"
            .format(gpus_to_use, job_id, max_number_of_steps, nb_gpus_to_use))
        print(command)

        command_id = ssm.run_command(instance_id, command)

        print('Training launched')
        listen_to_remote_process(file_to_get_back)
        print('Training finished')

        #Get back the accuracy and checkpoints of the model
        subprocess.call(
            "scp -i {} -r ubuntu@{}:~/outputs/cifar10_nns/{}/ {}/Spearmint-PESM/examples/cifar10/output_0/{}/"
            .format(private_key, public_DNS, job_id, PATH_TO_HACONE, name),
            shell=True)
        subprocess.call(
            "scp -i {} ubuntu@{}:~/accuracy_{}.txt {}/Spearmint-PESM/examples/cifar10/output_0/{}/accuracy.txt"
            .format(private_key, public_DNS, job_id, PATH_TO_HACONE, name),
            shell=True)
        subprocess.call(
            "scp -r {0}/Spearmint-PESM/examples/cifar10/output_0/{1} [email protected]:/home/data/arturo/models_trained/{1}"
            .format(PATH_TO_HACONE, name),
            shell=True)
        output_dir = os.path.join('outputs', str(job_id))

        with open(PATH_TO_HACONE +
                  '/Spearmint-PESM/examples/cifar10/output_0/{}/accuracy.txt'.
                  format(name)) as my_file:
            dico = json.load(my_file)

    else:
        print("Model already trained")
        with open(PATH_TO_HACONE +
                  '/Spearmint-PESM/examples/cifar10/output_0/{}/accuracy.txt'.
                  format(name)) as my_file:
            dico = json.load(my_file)

    #Export the graph to a protobuf file
    cmd = "python {0}/tensorflow/nn_search/export_inference_graph_movidius.py \
    --job_name=cifar10_phone \
    --name_job={1} \
    --output_file={0}/Spearmint-PESM/examples/cifar10/output_0/{1}/inference_graph.pb \
    --PATH_TO_HACONE={0}".format(PATH_TO_HACONE, name)

    cmd_s = '#!/bin/bash\n(flock -w 3600 9 || exit 1; {}) 9>/var/lock/gpu'.format(
        cmd)  # Allows only one command at a time on the GPU
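    # flock grabs an exclusive lock on /var/lock/gpu through file descriptor 9,
    # waiting up to an hour (-w 3600), so concurrent jobs queue for the GPU
    # instead of sharing it.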

    command_file = os.path.join(PATH_TO_HACONE, 'Spearmint-PESM', 'examples',
                                'cifar10', 'command_files',
                                'pb_' + name + '.sh')
    with open(command_file, 'w') as f:
        f.write(cmd_s)

    subprocess.call('sh {0}; rm {0}'.format(command_file), shell=True)

    #Freeze the weights in the graph in a protobuf file
    cmd = "python {0}/tensorflow/nn_search/freeze_graph_16.py \
    --input_graph={0}/Spearmint-PESM/examples/cifar10/output_0/{1}/inference_graph.pb \
    --input_checkpoint={0}/Spearmint-PESM/examples/cifar10/output_0/{1}/model.ckpt-{2} \
    --input_binary=true \
    --output_graph={0}/Spearmint-PESM/examples/cifar10/output_0/{1}/{1}.pb \
    --output_node_names=CifarNet/Predictions/Reshape_1".format(
        PATH_TO_HACONE, name, max_number_of_steps)

    cmd_s = '#!/bin/bash\n(flock -w 3600 9 || exit 1; {}) 9>/var/lock/gpu'.format(
        cmd)  # Allows only one command at a time on the GPU

    command_file = os.path.join(PATH_TO_HACONE, 'Spearmint-PESM', 'examples',
                                'cifar10', 'command_files',
                                'fr_' + name + '.sh')
    with open(command_file, 'w') as f:
        f.write(cmd_s)

    subprocess.call('sh {0}; rm {0}'.format(command_file), shell=True)

    #Measure power and inference time on the phone
    subprocess.call(
        "cp {0}/Spearmint-PESM/examples/cifar10/output_0/{1}/{1}.pb {0}/snpe-sdk/models/cifarnet/tensorflow/"
        .format(PATH_TO_HACONE, name),
        shell=True)
    subprocess.call(
        "cd {0}/snpe-sdk; python ./models/cifarnet/scripts/setup_cifarnet.py -S {0}/snpe-sdk -A {3} -t {4} -a ./models/cifarnet/data -f {1} {2}"
        .format(PATH_TO_HACONE, name, '-d' * debug, ANDROID_NDK_ROOT,
                TENSORFLOW_HOME),
        shell=True)
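    # '-d' * debug relies on bool-int multiplication: the -d flag is passed
    # to setup_cifarnet.py only when debug is truthy.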

    dico_hardware = {}

    #Retrieve inference time in us
    stats_file = '{}/snpe-sdk/benchmarks/cifarnet/benchmark/{}/latest_results/benchmark_stats_CifarNet.csv'.format(
        PATH_TO_HACONE, name)
    # Wait up to an hour (120 polls x 30 s) for the benchmark results file.
    attempts = 0
    while not os.path.exists(stats_file) and attempts < 120:
        time.sleep(30)
        attempts += 1

    with open(stats_file) as fp:
        reader = csv.reader(fp, delimiter=',')
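        # Each row of the SNPE benchmark CSV names a metric and carries its
        # value in the fourth column (row[3]).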
        for row in reader:
            if 'Total Inference Time' in row:
                inference_time_us = float(row[3])  # in microseconds
                dico_hardware['time'] = inference_time_us / 1000000  # in seconds
            elif 'energy [J]' in row:
                energy_joules = float(row[3])
                dico_hardware['power'] = energy_joules  # energy in joules under the 'power' key

    #Retrieve accuracy
    with open('{}/Spearmint-PESM/examples/cifar10/output_0/{}/accuracy.txt'.
              format(PATH_TO_HACONE, name)) as acc:
        dico = json.load(acc)
        dico_hardware['f'] = dico['f']

    with open(
            PATH_TO_HACONE +
            '/Spearmint-PESM/examples/cifar10/output_0/{}/hardware_metrics.txt'
            .format(name), 'w') as my_file:
        json.dump(dico_hardware, my_file)
    return dico_hardware
Example #4
def train_on_cloud(name, job_id, public_DNS, instance_id, nb_gpus_to_use,
                   gpus_to_use, max_number_of_steps, dico_to_save, ID_NUMBER):
    '''
    Train the model on the cloud.
    Input:
        - name: String, name of the model (ex: _0_-2_2_3_0_2_3_2_-2_-1_1_4_1_0_5_3_4_4_0_3)
        - job_id: id of the current job
        - public_DNS: String, DNS of the AWS instance
        - instance_id: String, id of the AWS instance
        - nb_gpus_to_use: int, number of GPUs to use for the training of the model
        - gpus_to_use: String, ids of the GPUs to use
        - max_number_of_steps: int, number of steps to train the model for
        - dico_to_save: dict, holds the job_id and the parameters; it is
          written to the job file that is sent to the instance
        - ID_NUMBER: String, id of the set of GPUs used to train the model
    '''
    command = []

    if not is_already_trained(name):
        file_to_send = "{}/jobs/job{}.txt".format(cfg.PATH_TO_HACONE, name)
        file_to_get_back = '/home/ubuntu/accuracy_{}.txt'.format(job_id)
        with open(file_to_send, 'w') as fp:  # text mode: json.dump writes str
            json.dump(dico_to_save, fp)

        #check if job number X isn't already stored on the instance
        remote_output_dir = '/home/ubuntu/outputs/cifar10_nns/{}'.format(
            job_id)
        if exists_remote(remote_output_dir, public_DNS):
            ssm.run_command(instance_id, ['sudo rm -rf ' + remote_output_dir])

        # now we need to send this file : job_file to the AWS instance.
        subprocess.call("scp -i {} {} ubuntu@{}:~/job_file_{}.txt".format(
            cfg.private_key, file_to_send, public_DNS, job_id),
                        shell=True)
        command.append(
            "export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH"
        )

        # Launch the train and eval script
        command.append(
            "./train_eval_image_classifier_bis.py {} {} --max_number_of_steps={} --num_clones={}"
            .format(gpus_to_use, job_id, max_number_of_steps, nb_gpus_to_use))
        print(command)

        #Run the command to launch the training and waiting for it to finish
        lock_instance_gpu(
            command, 'lock_instance' + name, file_to_get_back, ID_NUMBER,
            instance_id,
            public_DNS)  #Allow us to be the only one to use the instance gpu

        #Get back the accuracy and checkpoints of the model
        subprocess.call(
            "scp -i {} -r ubuntu@{}:~/outputs/cifar10_nns/{}/ {}/Spearmint-PESM/examples/cifar10/output_0/{}/"
            .format(cfg.private_key, public_DNS, job_id, cfg.PATH_TO_HACONE,
                    name),
            shell=True)
        subprocess.call(
            "scp -i {} ubuntu@{}:~/accuracy_{}.txt {}/Spearmint-PESM/examples/cifar10/output_0/{}/accuracy.txt"
            .format(cfg.private_key, public_DNS, job_id, cfg.PATH_TO_HACONE,
                    name),
            shell=True)

        #Send the checkpoints and the accuracy on the server
        if not cfg.debug:
            subprocess.call(
                "scp -r {0}/Spearmint-PESM/examples/cifar10/output_0/{1} {2}:{3}/{1}"
                .format(cfg.PATH_TO_HACONE, name,
                        cfg.server_DNS_where_models_are_stored,
                        cfg.server_dir_models),
                shell=True)

    else:
        print("Model already trained")
        subprocess.call(
            "scp -r {2}:{3}/{1} {0}/Spearmint-PESM/examples/cifar10/output_0/{1}"
            .format(cfg.PATH_TO_HACONE, name,
                    cfg.server_DNS_where_models_are_stored,
                    cfg.server_dir_models),
            shell=True)
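
A minimal usage sketch for train_on_cloud, assuming the cfg module, ssm helper, and create_name from the examples above are in scope; every AWS identifier and parameter key below is a hypothetical placeholder:

# Hypothetical invocation; the DNS name, instance id, GPU ids and parameter
# keys are placeholders, not real resources.
my_params = json.dumps({'p0': 1, 'p1': -2})  # example parameter encoding
dico_to_save = {'job': '42', 'params': my_params}
train_on_cloud(name=create_name(my_params),
               job_id='42',
               public_DNS='ec2-0-0-0-0.compute-1.amazonaws.com',
               instance_id='i-0123456789abcdef0',
               nb_gpus_to_use=1,
               gpus_to_use='0',
               max_number_of_steps=28125,
               dico_to_save=dico_to_save,
               ID_NUMBER='0')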
Example #5
def main(job_id, params):
    job_id = str(job_id)
    command = []
    print(params)
    my_params = {}
    # Encode params to pass them through the run_command
    for key in params.keys():
        my_params[key.replace('"', '\'')] = int(params[key])
    my_params = json.dumps(my_params)
    dico_to_save = {}
    dico_to_save['job'] = job_id
    dico_to_save['params'] = my_params
    file_to_send = PATH_TO_HACONE + "/jobs/job_file_{}.txt".format(job_id)
    with open(file_to_send, 'w') as fp:  # text mode: json.dump writes str
        json.dump(dico_to_save, fp)

    # now we need to send this file : job_file to the AWS instance.
    subprocess.call("scp -i {} {} ubuntu@{}:~/job_file.txt".format(private_key, file_to_send, public_DNS), shell=True)

    command.append("export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH")
    # Launch the train and eval script

    command.append("./train_eval_image_classifier_bis.py")
    print(command)

    command_id = ssm.run_command(instance_id, command)

    dic = {}

    while True:
        dic = ssm.notif_listener(dic)
        print('\nGlobal dic : ', dic, '\n')

        if dic[command_id] != 'InProgress':
            print(dic[command_id])
            command_2 = []

            if dic[command_id] == 'Success':
                subprocess.call("scp -i {} -r ubuntu@{}:~/outputs/cifar10_nns/{}/ {}/Spearmint-PESM/examples/cifar10/output/".format(private_key, public_DNS, job_id, PATH_TO_HACONE), shell=True)
                subprocess.call("scp -i {} ubuntu@{}:~/accuracy.txt {}/Spearmint-PESM/examples/cifar10/output/accuracy_{}.txt".format(private_key, public_DNS, PATH_TO_HACONE, job_id), shell=True)
                output_dir = os.path.join('outputs', str(job_id))

                # go to the output dir and copy everything to a bucket, then remove everything from the instance
                command_2.append('cd {}'.format(output_dir))
                command_2.append('aws s3 cp ./{} s3://astar-trainedmodels/{}/{}/awsrunShellScript/0.awsrunShellScript/ --recursive'.format(output_dir, command_id, instance_id))
                command_2.append('rm -rf ./{}'.format(output_dir))
                command_id_2 = ssm.run_command(instance_id, command_2, save_bucket=False)

                while True:
                    dic = ssm.notif_listener(dic)
                    if dic[command_id_2] != 'InProgress':
                        print(dic[command_id_2])
                        dic.pop(command_id_2, None)
                        break

            dirname = buckets.download_from_s3('astar-trainedmodels')
            dic.pop(command_id, None)
            with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/accuracy_{}.txt'.format(job_id)) as my_file:
                dico = json.load(my_file)
            break
    dic = ssm.notif_listener(dic)
    # Now it's time to find the hardware characteristics: time, memory, power.
    subprocess.call(PATH_TO_HACONE + "/monitoring_GPU/monitor_inference.py {}".format(job_id), shell=True)
    with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/hardware_metrics_{}.txt'.format(job_id)) as my_file:
        dico_hardware = json.load(my_file)
    # Merge the accuracy results into the hardware metrics before saving.
    dico_hardware.update(dico)

    with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/hardware_metrics_{}.txt'.format(job_id), 'w') as my_file:
        json.dump(dico_hardware, my_file)
    return dico_hardware