def main(job_id, params):
    """Train the parametrized model on the AWS instance (unless already
    trained), then collect accuracy and hardware metrics measured on the
    remote GPU server.

    Input:
        - job_id: id of the current job (converted to str)
        - params: dict of hyper-parameters; values must be castable to int
    Returns:
        dict merging the hardware metrics read back from the GPU server
        with the accuracy results for this model.
    """
    job_id = str(job_id)
    command = []
    print(params)

    # Encode params to pass them through the run_command: double quotes
    # would break the JSON embedded in the shell command line.
    my_params = {}
    for key in params.keys():
        my_params[key.replace('"', '\'')] = int(params[key])
    my_params = json.dumps(my_params)

    dico_to_save = {}
    dico_to_save['job'] = job_id
    dico_to_save['params'] = my_params
    name = create_name(my_params)

    # FIX: file_to_send used to be defined only in the "not yet trained"
    # branch, so the scp to the GPU server below raised NameError for
    # already-trained models.  The job file is (re)written unconditionally.
    file_to_send = PATH_TO_HACONE + "/jobs/job{}.txt".format(name)
    # FIX: mode 'w' instead of 'wb' -- json.dump writes str, not bytes (Py3).
    with open(file_to_send, 'w') as fp:
        json.dump(dico_to_save, fp)

    # FIX: use PATH_TO_HACONE instead of the hard-coded /home/arthur/... path
    # used nowhere else in this function.
    done_jobs = glob.glob(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output_0/_*')
    trained_jobs = [job.split('/')[-1] for job in done_jobs]

    if name not in trained_jobs:
        file_to_get_back = '/home/ubuntu/accuracy_{}.txt'.format(job_id)
        # now we need to send this file : job_file to the AWS instance.
        subprocess.call("scp -i {} {} ubuntu@{}:~/job_file_{}.txt".format(private_key, file_to_send, public_DNS, job_id), shell=True)
        command.append("export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH")
        # Launch the train and eval script
        command.append("./train_eval_image_classifier_bis.py 1 {} --max_number_of_steps=28125".format(job_id))
        print(command)
        command_id = ssm.run_command(instance_id, command)
        print('Training launched')
        # Blocks until the accuracy file appears on the instance.
        listen_to_remote_process(file_to_get_back)
        print('Training finished')
        # Fetch checkpoints and the accuracy file back from the instance.
        subprocess.call("scp -i {} -r ubuntu@{}:~/outputs/cifar10_nns/{}/ {}/Spearmint-PESM/examples/cifar10/output/{}/".format(private_key, public_DNS, job_id, PATH_TO_HACONE, name), shell=True)
        subprocess.call("scp -i {} ubuntu@{}:~/accuracy_{}.txt {}/Spearmint-PESM/examples/cifar10/output/{}/accuracy.txt".format(private_key, public_DNS, job_id, PATH_TO_HACONE, name), shell=True)
        output_dir = os.path.join('outputs', str(job_id))
        with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/{}/accuracy.txt'.format(name)) as my_file:
            dico = json.load(my_file)
    else:
        # NOTE(review): the freshly-trained branch writes under output/ while
        # this cached lookup reads output_0/ -- confirm the asymmetry is wanted.
        with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output_0/{}/accuracy.txt'.format(name)) as my_file:
            dico = json.load(my_file)

    # Measure time/memory/power by running inference on the remote GPU server.
    subprocess.call("scp {} {}@{}:/home/data/{}/jobs/".format(file_to_send, user, server_DNS, user), shell=True)
    subprocess.call("ssh {}@{} './monitor_inference_4_GPU.py {} {}\'".format(user, server_DNS, name, job_id), shell=True)
    subprocess.call("scp {}@{}:/home/data/arturo/hardware_metrics/hardware_metrics_{}.txt {}/Spearmint-PESM/examples/cifar10/output/{}/hardware_metrics.txt ".format(user, server_DNS, job_id, PATH_TO_HACONE, name), shell=True)
    with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/{}/hardware_metrics.txt'.format(name)) as my_file:
        dico_hardware = json.load(my_file)

    # Merge the accuracy results into the hardware metrics and persist the union.
    for key, value in dico.items():
        dico_hardware["{}".format(key)] = value
    with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/{}/hardware_metrics.txt'.format(name), 'w') as my_file:
        json.dump(dico_hardware, my_file)
    return dico_hardware
def main(job_id, params):
    """Launch training on the AWS instance via SSM, wait for completion,
    archive the outputs to S3, and download the trained model locally.

    Input:
        - job_id: id of the current job (converted to str)
        - params: dict of hyper-parameters; values must be castable to int
    Returns:
        whatever buckets.download_from_s3 returns (the local download
        location of the trained model).
    """
    #ssm.purge_queue()
    job_id = str(job_id)
    command = []
    print(params)
    # Encode params to pass them through the run_command: double quotes
    # would break the JSON embedded in the shell command line.
    my_params = {}
    for key in params.keys():
        my_params[key.replace('"', '\'')] = int(params[key])
    my_params = json.dumps(my_params)
    command.append("export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH")
    # Launch the train and eval script
    command.append("./train_eval_image_classifier.py --job_id={} --params='{}'".format(job_id, my_params))
    print(command)
    command_id = ssm.run_command(instance_id, command)
    dic = {}
    while True:
        # Poll SSM notifications until our command leaves 'InProgress'.
        dic = ssm.notif_listener(dic)
        print('\nGlobal dic : ', dic, '\n')
        if dic[command_id] != 'InProgress':
            print(dic[command_id])
            command = []
            if dic[command_id] == 'Success':
                output_dir = os.path.join('outputs', str(job_id))
                # go to the output dir and copy everything to a bucket,
                # then remove everything from the instance
                command.append('cd {}'.format(output_dir))
                command.append('aws s3 cp ./gpu_0 s3://astar-trainedmodels/{}/{}/awsrunShellScript/0.awsrunShellScript/ --recursive'.format(command_id, instance_id))
                command.append('rm -rf ./gpu_0')
                command_id = ssm.run_command(instance_id, command, save_bucket=False)
            break
    # FIX: this call was commented out while `return f` was kept, which made
    # every invocation end in NameError.  Restored, with the bucket name
    # spelled the same way as in the `aws s3 cp` command above.
    f = buckets.download_from_s3('astar-trainedmodels')
    return f
def main(job_id, params):
    """Train the parametrized model on AWS (unless cached), export and freeze
    its inference graph, run the SNPE benchmark on the phone and return the
    measured hardware metrics merged with the model accuracy.

    Input:
        - job_id: id of the current job (converted to str)
        - params: dict of hyper-parameters; values must be castable to int
    Returns:
        dict with keys 'time' (seconds), 'power' (Joules) and 'f'
        (accuracy objective copied from accuracy.txt).
    """
    job_id = str(job_id)
    command = []
    print(params)
    # Encode params to pass them through the run_command: double quotes
    # would break the JSON embedded in the shell command line.
    my_params = {}
    for key in params.keys():
        my_params[key.replace('"', '\'')] = int(params[key])
    my_params = json.dumps(my_params)

    dico_to_save = {}
    dico_to_save['platform'] = 'Phone'
    dico_to_save['job'] = job_id
    dico_to_save['params'] = my_params
    name = create_name(my_params)

    # Short run in debug mode, full training otherwise.
    if debug:
        max_number_of_steps = 100
    else:
        max_number_of_steps = 28125

    if not is_already_trained(name):
        file_to_send = "{}/jobs/job{}.txt".format(PATH_TO_HACONE, name)
        file_to_get_back = '/home/ubuntu/accuracy_{}.txt'.format(job_id)
        # FIX: mode 'w' instead of 'wb' -- json.dump writes str, not bytes (Py3).
        with open(file_to_send, 'w') as fp:
            json.dump(dico_to_save, fp)

        # check if job number X isn't already stored on the instance
        remote_output_dir = '/home/ubuntu/outputs/cifar10_nns/{}'.format(job_id)
        if exists_remote(remote_output_dir):
            ssm.run_command(instance_id, ['sudo rm -rf ' + remote_output_dir])

        # now we need to send this file : job_file to the AWS instance.
        subprocess.call("scp -i {} {} ubuntu@{}:~/job_file_{}.txt".format(private_key, file_to_send, public_DNS, job_id), shell=True)
        command.append("export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH")
        # Launch the train and eval script
        command.append("./train_eval_image_classifier_bis.py {} {} --max_number_of_steps={} --num_clones={}".format(gpus_to_use, job_id, max_number_of_steps, nb_gpus_to_use))
        print(command)
        command_id = ssm.run_command(instance_id, command)
        print('Training launched')
        # Blocks until the accuracy file appears on the instance.
        listen_to_remote_process(file_to_get_back)
        print('Training finished')

        # Get back the accuracy and checkpoints of the model
        subprocess.call("scp -i {} -r ubuntu@{}:~/outputs/cifar10_nns/{}/ {}/Spearmint-PESM/examples/cifar10/output_0/{}/".format(private_key, public_DNS, job_id, PATH_TO_HACONE, name), shell=True)
        subprocess.call("scp -i {} ubuntu@{}:~/accuracy_{}.txt {}/Spearmint-PESM/examples/cifar10/output_0/{}/accuracy.txt".format(private_key, public_DNS, job_id, PATH_TO_HACONE, name), shell=True)
        # Archive the trained model on the storage server.
        subprocess.call("scp -r {0}/Spearmint-PESM/examples/cifar10/output_0/{1} [email protected]:/home/data/arturo/models_trained/{1}".format(PATH_TO_HACONE, name), shell=True)
        output_dir = os.path.join('outputs', str(job_id))
        with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output_0/{}/accuracy.txt'.format(name)) as my_file:
            dico = json.load(my_file)
    else:
        print("Model already trained")
        with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output_0/{}/accuracy.txt'.format(name)) as my_file:
            dico = json.load(my_file)

    # Export the graph to a protobuf file
    cmd = "python {0}/tensorflow/nn_search/export_inference_graph_movidius.py \
--job_name=cifar10_phone \
--name_job={1} \
--output_file={0}/Spearmint-PESM/examples/cifar10/output_0/{1}/inference_graph.pb \
--PATH_TO_HACONE={0}".format(PATH_TO_HACONE, name)
    # flock on /var/lock/gpu => only one command runs on the GPU at a time.
    cmd_s = '#!/bin/bash\n(flock -w 3600 9 || exit 1; {}) 9>/var/lock/gpu'.format(cmd)
    command_file = os.path.join(PATH_TO_HACONE, 'Spearmint-PESM', 'examples', 'cifar10', 'command_files', 'pb_' + name + '.sh')
    with open(command_file, 'w') as f:
        f.write(cmd_s)
    subprocess.call('sh {0}; rm {0}'.format(command_file), shell=True)

    # Freeze the weights in the graph in a protobuf file
    cmd = "python {0}/tensorflow/nn_search/freeze_graph_16.py \
--input_graph={0}/Spearmint-PESM/examples/cifar10/output_0/{1}/inference_graph.pb \
--input_checkpoint={0}/Spearmint-PESM/examples/cifar10/output_0/{1}/model.ckpt-{2} \
--input_binary=true \
--output_graph={0}/Spearmint-PESM/examples/cifar10/output_0/{1}/{1}.pb \
--output_node_names=CifarNet/Predictions/Reshape_1".format(PATH_TO_HACONE, name, max_number_of_steps)
    # Same GPU lock as above.
    cmd_s = '#!/bin/bash\n(flock -w 3600 9 || exit 1; {}) 9>/var/lock/gpu'.format(cmd)
    command_file = os.path.join(PATH_TO_HACONE, 'Spearmint-PESM', 'examples', 'cifar10', 'command_files', 'fr_' + name + '.sh')
    with open(command_file, 'w') as f:
        f.write(cmd_s)
    subprocess.call('sh {0}; rm {0}'.format(command_file), shell=True)

    # Measure power and inference time on the phone
    subprocess.call("cp {0}/Spearmint-PESM/examples/cifar10/output_0/{1}/{1}.pb {0}/snpe-sdk/models/cifarnet/tensorflow/".format(PATH_TO_HACONE, name), shell=True)
    # '-d' * debug yields '-d' when debug is truthy, '' otherwise.
    subprocess.call("cd {0}/snpe-sdk; python ./models/cifarnet/scripts/setup_cifarnet.py -S {0}/snpe-sdk -A {3} -t {4} -a ./models/cifarnet/data -f {1} {2}".format(PATH_TO_HACONE, name, '-d' * debug, ANDROID_NDK_ROOT, TENSORFLOW_HOME), shell=True)

    dico_hardware = {}
    # Poll (up to 120 * 30 s) for the benchmark results CSV to appear.
    stats_file = '{}/snpe-sdk/benchmarks/cifarnet/benchmark/{}/latest_results/benchmark_stats_CifarNet.csv'.format(PATH_TO_HACONE, name)
    timer = 0
    while not os.path.exists(stats_file) and timer < 120:
        time.sleep(30)
        timer += 1
    # NOTE(review): if the CSV never appears this open() raises
    # FileNotFoundError -- confirm that failing loudly here is intended.
    with open(stats_file) as fp:
        reader = csv.reader(fp, delimiter=',')
        for row in reader:
            if 'Total Inference Time' in row:
                inference_time_us = float(row[3])  # in micro-seconds
                dico_hardware['time'] = inference_time_us / 1000000  # in seconds
            elif 'energy [J]' in row:
                energy_joules = float(row[3])
                dico_hardware['power'] = energy_joules

    # Retrieve accuracy
    with open('{}/Spearmint-PESM/examples/cifar10/output_0/{}/accuracy.txt'.format(PATH_TO_HACONE, name)) as acc:
        dico = json.load(acc)
    dico_hardware['f'] = dico['f']
    with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output_0/{}/hardware_metrics.txt'.format(name), 'w') as my_file:
        json.dump(dico_hardware, my_file)
    return dico_hardware
def train_on_cloud(name, job_id, public_DNS, instance_id, nb_gpus_to_use,
                   gpus_to_use, max_number_of_steps, dico_to_save, ID_NUMBER):
    '''
    Train the model on the cloud and fetch (or restore) its outputs under
    Spearmint-PESM/examples/cifar10/output_0/<name>/.

    Input:
        - name: String, name of the model (ex: _0_-2_2_3_0_2_3_2_-2_-1_1_4_1_0_5_3_4_4_0_3)
        - job_id: id of the current job
        - public_DNS: String, DNS of the AWS instance
        - instance_id: String, id of the AWS instance
        - nb_gpus_to_use: int, number of GPUs to use for the training of the model
        - gpus_to_use: String, ids of the GPUs to use
        - max_number_of_steps: int, number of steps the model need to train
        - dico_to_save: dict containing the job_id and the parameters; it is
          serialized (json.dump) into the job file sent to the instance
          (the old docstring wrongly called it a path String)
        - ID_NUMBER: String, id of the set of GPUs to use to train the model
    '''
    command = []
    if not is_already_trained(name):
        file_to_send = "{}/jobs/job{}.txt".format(cfg.PATH_TO_HACONE, name)
        file_to_get_back = '/home/ubuntu/accuracy_{}.txt'.format(job_id)
        # FIX: mode 'w' instead of 'wb' -- json.dump writes str, not bytes (Py3).
        with open(file_to_send, 'w') as fp:
            json.dump(dico_to_save, fp)

        # check if job number X isn't already stored on the instance
        remote_output_dir = '/home/ubuntu/outputs/cifar10_nns/{}'.format(job_id)
        if exists_remote(remote_output_dir, public_DNS):
            ssm.run_command(instance_id, ['sudo rm -rf ' + remote_output_dir])

        # now we need to send this file : job_file to the AWS instance.
        subprocess.call("scp -i {} {} ubuntu@{}:~/job_file_{}.txt".format(cfg.private_key, file_to_send, public_DNS, job_id), shell=True)
        command.append("export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH")
        # Launch the train and eval script
        command.append("./train_eval_image_classifier_bis.py {} {} --max_number_of_steps={} --num_clones={}".format(gpus_to_use, job_id, max_number_of_steps, nb_gpus_to_use))
        print(command)
        # Run the command to launch the training and wait for it to finish.
        # Allow us to be the only one to use the instance gpu.
        lock_instance_gpu(command, 'lock_instance' + name, file_to_get_back, ID_NUMBER, instance_id, public_DNS)

        # Get back the accuracy and checkpoints of the model
        subprocess.call("scp -i {} -r ubuntu@{}:~/outputs/cifar10_nns/{}/ {}/Spearmint-PESM/examples/cifar10/output_0/{}/".format(cfg.private_key, public_DNS, job_id, cfg.PATH_TO_HACONE, name), shell=True)
        subprocess.call("scp -i {} ubuntu@{}:~/accuracy_{}.txt {}/Spearmint-PESM/examples/cifar10/output_0/{}/accuracy.txt".format(cfg.private_key, public_DNS, job_id, cfg.PATH_TO_HACONE, name), shell=True)

        # Send the checkpoints and the accuracy on the storage server
        if not cfg.debug:
            subprocess.call("scp -r {0}/Spearmint-PESM/examples/cifar10/output_0/{1} {2}:{3}/{1}".format(cfg.PATH_TO_HACONE, name, cfg.server_DNS_where_models_are_stored, cfg.server_dir_models), shell=True)
    else:
        print("Model already trained")
        # Restore the cached model from the storage server.
        subprocess.call("scp -r {2}:{3}/{1} {0}/Spearmint-PESM/examples/cifar10/output_0/{1}".format(cfg.PATH_TO_HACONE, name, cfg.server_DNS_where_models_are_stored, cfg.server_dir_models), shell=True)
def main(job_id, params):
    """Launch training on the AWS instance via SSM, archive the outputs to
    S3, then measure hardware metrics (time, memory, power) with the local
    GPU monitoring script.

    Input:
        - job_id: id of the current job (converted to str)
        - params: dict of hyper-parameters; values must be castable to int
    Returns:
        dict merging the hardware metrics with the accuracy results.
    """
    job_id = str(job_id)
    command = []
    print(params)
    # Encode params to pass them through the run_command: double quotes
    # would break the JSON embedded in the shell command line.
    my_params = {}
    for key in params.keys():
        my_params[key.replace('"', '\'')] = int(params[key])
    my_params = json.dumps(my_params)

    dico_to_save = {}
    dico_to_save['job'] = job_id
    dico_to_save['params'] = my_params

    file_to_send = PATH_TO_HACONE + "/jobs/job_file_{}.txt".format(job_id)
    # FIX: mode 'w' instead of 'wb' -- json.dump writes str, not bytes (Py3).
    with open(file_to_send, 'w') as fp:
        json.dump(dico_to_save, fp)

    # now we need to send this file : job_file to the AWS instance.
    subprocess.call("scp -i {} {} ubuntu@{}:~/job_file.txt".format(private_key, file_to_send, public_DNS), shell=True)
    command.append("export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH")
    # Launch the train and eval script
    command.append("./train_eval_image_classifier_bis.py")
    print(command)
    command_id = ssm.run_command(instance_id, command)
    dic = {}
    while True:
        # Poll SSM notifications until our command leaves 'InProgress'.
        dic = ssm.notif_listener(dic)
        print('\n Global dic : ', dic, '\n')
        if dic[command_id] != 'InProgress':
            print(dic[command_id])
            command_2 = []
            if dic[command_id] == 'Success':
                # FIX: destinations now use PATH_TO_HACONE so they match the
                # paths read back below (they were hard-coded to
                # /home/arthur/Documents/hacone, breaking any other checkout).
                subprocess.call("scp -i {} -r ubuntu@{}:~/outputs/cifar10_nns/{}/ {}/Spearmint-PESM/examples/cifar10/output/".format(private_key, public_DNS, job_id, PATH_TO_HACONE), shell=True)
                subprocess.call("scp -i {} ubuntu@{}:~/accuracy.txt {}/Spearmint-PESM/examples/cifar10/output/accuracy_{}.txt".format(private_key, public_DNS, PATH_TO_HACONE, job_id), shell=True)
                output_dir = os.path.join('outputs', str(job_id))
                # go to the output dir and copy everything to a bucket,
                # then remove everything from the instance
                command_2.append('cd {}'.format(output_dir))
                command_2.append('aws s3 cp ./{} s3://astar-trainedmodels/{}/{}/awsrunShellScript/0.awsrunShellScript/ --recursive'.format(output_dir, command_id, instance_id))
                command_2.append('rm -rf ./{}'.format(output_dir))
                command_id_2 = ssm.run_command(instance_id, command_2, save_bucket=False)
                # Wait for the upload/cleanup command to finish too.
                while True:
                    dic = ssm.notif_listener(dic)
                    if dic[command_id_2] != 'InProgress':
                        print(dic[command_id_2])
                        dic.pop(command_id_2, None)
                        break
                dirname = buckets.download_from_s3('astar-trainedmodels')
                dic.pop(command_id, None)
                with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/accuracy_{}.txt'.format(job_id)) as my_file:
                    dico = json.load(my_file)
            # NOTE(review): on a non-Success terminal status `dico` is never
            # bound and the merge below raises NameError -- confirm intended.
            break
        dic = ssm.notif_listener(dic)

    # Now it's time to find the hardware characteristics : time, memory, power.
    subprocess.call(PATH_TO_HACONE + "/monitoring_GPU/monitor_inference.py {}".format(job_id), shell=True)
    with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/hardware_metrics_{}.txt'.format(job_id)) as my_file:
        dico_hardware = json.load(my_file)
    # Merge the accuracy results into the hardware metrics and persist the union.
    for key, value in dico.items():
        dico_hardware["{}".format(key)] = value
    with open(PATH_TO_HACONE + '/Spearmint-PESM/examples/cifar10/output/hardware_metrics_{}.txt'.format(job_id), 'w') as my_file:
        json.dump(dico_hardware, my_file)
    return dico_hardware