def output_experiment_detail(self, res_path):
    """Dump this trial's plots and per-epoch measures under ``res_path/<experiment_id>/<trial_id>``.

    Writes one CSV (``measures_by_epochs.csv``) with a row per tracked metric
    vector (train/dev/test x loss/acc/auc), each value stringified.

    :param res_path: base results directory; per-experiment and per-trial
                     subdirectories are created beneath it as needed.
    """
    exp = nni.get_experiment_id()
    trial = nni.get_trial_id()
    trial_path = os.path.join(res_path, exp, trial)
    # makedirs creates both the experiment and trial levels in one call;
    # exist_ok avoids a race when several trials create the experiment dir.
    os.makedirs(trial_path, exist_ok=True)
    # TODO whatever you want
    p_loss = self.plot_line(LOSS_PLOT, show_plot=False)
    p_acc = self.plot_line(ACCURACY_PLOT, show_plot=False)
    p_auc = self.plot_line(AUC_PLOT, show_plot=False)
    measures_table = [
        ["train_loss_vec"] + [str(x) for x in self.loss_train_vec],
        ["train_acc_vec"] + [str(x) for x in self.accuracy_train_vec],
        ["train_auc_vec"] + [str(x) for x in self.auc_train_vec],
        ["dev_loss_vec"] + [str(x) for x in self.loss_dev_vec],
        ["dev_acc_vec"] + [str(x) for x in self.accuracy_dev_vec],
        ["dev_auc_vec"] + [str(x) for x in self.auc_dev_vec],
        ["test_loss_vec"] + [str(x) for x in self.loss_test_vec],
        ["test_acc_vec"] + [str(x) for x in self.accuracy_test_vec],
        ["test_auc_vec"] + [str(x) for x in self.auc_test_vec]
    ]
    # BUG FIX: "wt" was previously passed to os.path.join as a path component
    # instead of to open() as the mode, so the file was opened read-only at a
    # nonexistent path. It is now the mode argument of open().
    with open(os.path.join(trial_path, "measures_by_epochs.csv"), "wt", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(measures_table)
def __init__(self, model_key, run_id, run_dir):
    """Record run identity and scoring setup for this wrapper.

    :param model_key: key identifying the model under evaluation.
    :param run_id: identifier of this run.
    :param run_dir: directory associated with this run.
    """
    # Run identity supplied by the caller.
    self.model_key = model_key
    self.run_id = run_id
    self.run_dir = run_dir
    # Trial/experiment ids come straight from the NNI runtime.
    self.exp_id = nni.get_experiment_id()
    self.trial_id = nni.get_trial_id()
    # Default metric used when scoring this run.
    self.scoring = accuracy_score
def get_nni_or_mlflow_experiment_and_trial() -> Tuple[Optional[str], Optional[str]]:
    """ Helper function which returns NNI experiment name and trial ID if NNI isn't in Standalone mode or, otherwise, returns MLFlow experiment name and run ID if there is an active MLFlow run.
    Returns (None, None) if NNI is in standalone mode and there is no active MLFLow run.
    """
    # Managed NNI run: the NNI ids are authoritative.
    if not is_nni_run_standalone():
        return (nni.get_experiment_id(), nni.get_trial_id())
    # Standalone: fall back to MLFlow's active experiment/run, if any.
    exp, run = deepcv.utils.mlflow_get_experiment_run_info()
    if exp is None:
        return (None, None)
    return (exp.name, str(run.run_id))
def main(args, params):
    """
    Main program:
    - Prepare dataset
    - Build network
    - Train the model
    - Report accuracy to tuner
    - Save best current metrics
    - Save best current model

    :param args: parsed CLI args; ``args.output`` is the base output directory
                 used for the TensorBoard log dir.
    :param params: hyperparameter dict with keys 'conv_size', 'hidden_size',
                   'dropout_rate', 'learning_rate', 'batch_size', 'epochs'.
    """
    (x_train, y_train), (x_test, y_test) = load_dataset()
    _logger.info('Dataset loaded')
    # Small CNN: two conv/pool stages, then a dense head with dropout and a
    # 10-way softmax (sparse integer labels, per the loss below).
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(filters=32, kernel_size=params['conv_size'], activation='relu'),
        tf.keras.layers.MaxPool2D(pool_size=2),
        tf.keras.layers.Conv2D(filters=64, kernel_size=params['conv_size'], activation='relu'),
        tf.keras.layers.MaxPool2D(pool_size=2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(units=params['hidden_size'], activation='relu'),
        tf.keras.layers.Dropout(rate=params['dropout_rate']),
        tf.keras.layers.Dense(units=10, activation='softmax')
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(lr=params['learning_rate']),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])
    _logger.info('Model built')
    # Setup TensorBoard -- one log dir per NNI trial so parallel trials don't
    # overwrite each other's event files.
    log_dir = '{output}/tensorboard/'.format(
        output=args.output) + nni.get_trial_id()
    tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    # ReportIntermediates presumably forwards per-epoch metrics to NNI -- defined elsewhere in this file.
    model.fit(x_train,
              y_train,
              batch_size=params['batch_size'],
              epochs=params['epochs'],
              callbacks=[ReportIntermediates(), tensorboard],
              validation_data=(x_test, y_test))
    _logger.info('Training completed')
    loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
    # send final accuracy to NNI tuner and web UI
    nni.report_final_result(accuracy)
    # save the best metrics so they are displayed in the Workflow Task
    is_best_accuracy = save_best_metrics(loss, accuracy)
    _logger.info('Final accuracy reported: %s', accuracy)
    # save the model if accuracy is better than previous model
    if is_best_accuracy:
        save_data(args, params, model)
def main(args):
    """Train an MNIST classifier for one NNI trial and report accuracy to the tuner.

    :param args: dict-like config with keys 'no_cuda', 'seed', 'data_dir',
                 'batch_size', 'hidden_size', 'lr', 'momentum', 'epochs'.
    """
    #### basic torch setup
    use_cuda = not args['no_cuda'] and torch.cuda.is_available()  # use cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    torch.manual_seed(args['seed'])  # seed
    #### data pipeline
    # Per-trial data directory so concurrent trials don't collide on download.
    data_dir = os.path.join(args['data_dir'], nni.get_trial_id())
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        data_dir,
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            # MNIST channel mean/std used by the standard torchvision example.
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
        batch_size=args['batch_size'],
        shuffle=True,
        **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        data_dir,
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
        batch_size=1000,
        shuffle=True,
        **kwargs)
    #### define model
    hidden_size = args['hidden_size']
    model = Net(hidden_size=hidden_size).to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=args['lr'],
                          momentum=args['momentum'])
    #### train
    for epoch in range(1, args['epochs'] + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test_acc = test(args, model, device, test_loader)
        if epoch < args['epochs']:
            # report intermediate result
            nni.report_intermediate_result(test_acc)
            logger.debug('test accuracy %g', test_acc)
            logger.debug('Pipe send intermediate result done.')
        else:
            # report final result
            nni.report_final_result(test_acc)
            logger.debug('Final result is %g', test_acc)
            logger.debug('Send final result done.')
def train_eval(esargs, RCV_CONFIG, seqid):
    """ train and eval the model

    Builds TFRecord-backed train/val datasets, trains the global `net` for a
    warmup schedule derived from `seqid`, checkpoints best val_accuracy weights
    under the NNI experiment dir, and returns (best_acc, last_epoch_index).

    :param esargs: hyperparameters forwarded to parse_rev_args.
    :param RCV_CONFIG: received configuration forwarded to parse_rev_args.
    :param seqid: trial sequence id; seqid // args.slave selects the warmup stage.
    """
    global net
    global best_acc
    global bs_explore
    global gpus
    global hp_path
    best_acc = 0
    parse_rev_args(RCV_CONFIG, esargs)
    # train procedure
    trial_id = nni.get_trial_id()
    available_devices = os.environ["CUDA_VISIBLE_DEVICES"]
    gpus = len(available_devices.split(","))
    is_training = True
    filenames = ds.get_filenames(args.train_data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.flat_map(tf.data.TFRecordDataset)
    ds_train = ds.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=bs_explore,
        shuffle_buffer=shuffle_buffer,
        parse_record_fn=ds.parse_record,
        num_epochs=args.epochs,
        npc=args.num_parallel_calls,
        num_gpus=gpus,
        examples_per_epoch=examples_per_epoch if is_training else None,
        dtype=tf.float32)
    is_training = False
    filenames = ds.get_filenames(args.val_data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.flat_map(tf.data.TFRecordDataset)
    ds_val = ds.process_record_dataset(dataset=dataset,
                                       is_training=is_training,
                                       batch_size=bs_explore,
                                       shuffle_buffer=shuffle_buffer,
                                       parse_record_fn=ds.parse_record,
                                       num_epochs=args.epochs,
                                       npc=args.num_parallel_calls,
                                       num_gpus=gpus,
                                       examples_per_epoch=None,
                                       dtype=tf.float32)
    # run epochs and patience: later warmup stages get more patience (capped at 20)
    loopnum = seqid // args.slave
    patience = min(int(6 + (2 * loopnum)), 20)
    if loopnum == 0:
        run_epochs = int(args.warmup_1)
    elif loopnum == 1:
        run_epochs = int(args.warmup_2)
    elif loopnum == 2:
        run_epochs = int(args.warmup_3)
    else:
        run_epochs = int(args.epochs)
    # if loopnum < 4:
    #     patience = int(8 + (2 * loopnum))
    #     run_epochs = int(10 + (20 * loopnum))
    # else:
    #     patience = 16
    #     run_epochs = args.epochs
    # lr strategy: linear decay from initial_lr to 0 over args.epochs
    def scheduler2(epoch):
        lr_max = args.initial_lr
        total_epochs = args.epochs
        lr_each_epoch = lr_max - lr_max * epoch / total_epochs
        return lr_each_epoch
    callback = tf.keras.callbacks.LearningRateScheduler(scheduler2)
    # save weights (best val_accuracy only) under the per-trial checkpoint dir
    checkpoint_dir = os.environ["HOME"] + "/nni/experiments/" + str(
        nni.get_experiment_id()) + "/checkpoint/" + str(nni.get_trial_id())
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_filepath = checkpoint_dir + "/weights." + "epoch." + str(
        run_epochs) + ".hdf5"
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        save_freq='epoch',
        save_weights_only=True,
    )
    history = net.fit(ds_train,
                      epochs=run_epochs,
                      steps_per_epoch=Ntrain // bs_explore // gpus,
                      validation_data=ds_val,
                      validation_steps=Nvalidation // bs_explore // gpus,
                      verbose=1,
                      shuffle=False,
                      callbacks=[
                          SendMetrics(hp_path), callback,
                          EarlyStopping(min_delta=0.001, patience=patience),
                          model_checkpoint_callback
                      ])
    # trial report final acc to tuner: best val_accuracy over all epochs
    acc = 0
    acc_list = history.history['val_accuracy']
    for acc_n in acc_list:
        if float(acc_n) > acc:
            acc = float(acc_n)
    try:
        # predict acc: for mid-length runs, extrapolate the accuracy curve to
        # epoch 90 via utils.predict_acc -- TODO confirm predictor semantics
        if run_epochs >= 10 and run_epochs < 80:
            epoch_x = range(1, len(acc_list) + 1)
            pacc = utils.predict_acc(trial_id, epoch_x, acc_list, 90, True)
            best_acc = float(pacc)
    except Exception as E:
        # best-effort: prediction failure falls back to observed accuracy below
        print("Predict failed.")
    if acc > best_acc:
        best_acc = acc
    logger.debug("Final result is: %.3f", acc)
    return best_acc, history.epoch[-1]
if __name__ == '__main__':
    # Build the CLI from (flag, help) pairs.
    parser = argparse.ArgumentParser()
    for flag, help_text in (
            ('--save_dir', 'Location of checkpoint files'),
            ('--vocab_file', 'Vocabulary file'),
            ('--train_prefix', 'Prefix for train files')):
        parser.add_argument(flag, help=help_text)
    args = parser.parse_args()

    ### NNI modification ###
    # Defaults below are overridden by whatever the NNI tuner proposes.
    params = {
        'epoch': 1,
        'batch_size': 8,
        'optimizer': 'Adam',
        'inter_op_parallelism_threads': 1,
        'intra_op_parallelism_threads': 2,
        'infer_shapes': 0,
        'place_pruned_graph': 0,
        'enable_bfloat16_sendrecv': 0,
        'do_common_subexpression_elimination': 0,
        'max_folded_constant': 2,
        'do_function_inlining': 0,
        'global_jit_level': 1,
        'tf_gpu_thread_mode': "global"
    }
    params.update(nni.get_next_parameter())
    t_id = nni.get_trial_id()
    ### NNI modification ###

    main(args)
def test_get_trial_id(self):
    """The trial id reported by nni should match the fake id injected by the test fixture."""
    expected_trial_id = 'fakeidtr'
    self.assertEqual(nni.get_trial_id(), expected_trial_id)
# NOTE(review): this script chunk is truncated in the provided source -- the
# final `if len(lines) > args.slave:` has no body here; verify against the
# original file before relying on this section.
if __name__ == "__main__":
    example_start_time = time.time()
    net = None
    args = get_args()
    try:
        # Per-experiment working directory under the shared mount.
        experiment_path = os.environ[
            "HOME"] + "/mountdir/nni/experiments/" + str(
                nni.get_experiment_id())
        lock = multiprocessing.Lock()
        # REQ socket to the coordinator at <args.ip>:800081.
        context = zmq.Context()
        socket = context.socket(zmq.REQ)
        tmpstr = 'tcp://' + args.ip + ':800081'
        socket.connect(tmpstr)
        os.makedirs(experiment_path + "/trials/" + str(nni.get_trial_id()))
        get_next_parameter_start = time.time()
        nni.get_next_parameter(socket)
        get_next_parameter_end = time.time()
        # Busy-wait until another process has written graph.txt.
        while True:
            lock.acquire()
            with open(experiment_path + "/graph.txt", "a+") as f:
                f.seek(0)
                lines = f.readlines()
            lock.release()
            if lines:
                break
        if len(lines) > args.slave:
class ClassifyParam:
    """Static parameters for the classify task.

    local_model_path is unique per NNI experiment/trial so concurrent trials
    don't overwrite each other's cached model; top_n_list holds the top-N
    cutoffs used at evaluation time.
    """
    local_model_path = os.path.join(
        'data', 'cache',
        'classify_{}_{}.model'.format(nni.get_experiment_id(),
                                      nni.get_trial_id()))
    top_n_list = [*range(1, 11), 15, 20]
# NOTE(review): this script chunk is truncated in the provided source -- the
# final `for group in parser._action_groups:` has no body here; verify against
# the original file before relying on this section.
parser.add_argument('--report_metric','-rm',type=str,default=None)
parser.add_argument('--nni','-n', action='store_true')
# Add data and model specific args
parser = LearningDataSet.add_model_specific_args(parser)
parser = NeuralNetwork.add_model_specific_args(parser)
# add all the available trainer options to argparse
parser = pl.Trainer.add_argparse_args(parser)
args = parser.parse_args()
# Under NNI, results go into a per-trial folder and hyperparameters come from the tuner.
if args.nni:
    trail_id = nni.get_trial_id()
    results_folder = os.path.join(args.results_path, args.project_name,trail_id)
    tuned_params = nni.get_next_parameter()
else:
    results_folder = os.path.join(args.results_path, args.project_name)
    tuned_params = None
# Seed every RNG in play (torch, numpy, stdlib) for reproducibility.
if args.seed is not None:
    seed = args.seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
arg_groups = {}
for group in parser._action_groups:
class NextLocParam:
    """Static parameters for the next-location task.

    local_model_path is unique per NNI experiment/trial so concurrent trials
    don't overwrite each other's cached model; local_result_path caches
    results; top_n_list holds the top-N cutoffs used at evaluation time.
    """
    local_model_path = os.path.join(
        'data', 'cache',
        'next_loc_{}_{}.model'.format(nni.get_experiment_id(),
                                      nni.get_trial_id()))
    local_result_path = os.path.join('data', 'cache', 'next_loc_result.h5')
    top_n_list = [1, 2, 3, 4, 5, 10, 20]
def generate_parameters(self, parameter_id, **kwargs):
    """
    Returns a set of trial neural architecture, as a serializable object.

    Generates `rate` architectures per call, where `rate` is derived from the
    measured train_time/generate_time ratio persisted under the experiment dir
    (clamped to [1, 4]); each generated graph is serialized to JSON and appended
    to the shared graph.txt under the file lock. Timing is logged per trial and
    the average per-graph generate time is written back for the next call.

    Parameters
    ----------
    parameter_id : int
    """
    #If there is no history, slave node will use the fake model.
    if not self.history:
        print("If there is no history, generate_parameters should not be called!")
        exit(1)
    total_start=time.time()
    rate = 1
    # Scale how many graphs to generate by how much longer training takes
    # than generation (both timings persisted by earlier calls).
    if (os.path.exists(os.environ["HOME"] + "/mountdir/nni/experiments/" +
                       str(nni.get_experiment_id()) + "/generate_time")
            and os.path.exists(os.environ["HOME"] + "/mountdir/nni/experiments/" +
                               str(nni.get_experiment_id()) + "/train_time")):
        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" +
                  str(nni.get_experiment_id()) + "/generate_time", "r") as f:
            generate_time = float(f.read())
        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" +
                  str(nni.get_experiment_id()) + "/train_time", "r") as f:
            train_time = float(f.read())
        if (generate_time != 0) and (train_time != 0):
            realrate = int(train_time / generate_time)
            if (realrate < 5) and (realrate > 1):
                rate = int(realrate)
            if (realrate <= 1):
                rate = 1
    for i in range(rate):
        start=time.time()
        new_father_id = None
        generated_graph = None
        if not self.training_queue:
            new_father_id, generated_graph = self.generate()
            father_id,json_out,new_model_id = self.total_data[parameter_id]
            self.training_queue.append((generated_graph, new_father_id, new_model_id))
            #self.descriptors.append(generated_graph.extract_descriptor())
        else:
            print("training_queue should be an empty list.")
            exit(1)
        graph, father_id, model_id = self.training_queue.pop(0)
        # from graph to json
        json_model_path = os.path.join(self.path, str(model_id) + ".json")
        json_out = graph_to_json(graph, json_model_path)
        end=time.time()
        #self.total_data[parameter_id] = (json_out, father_id, model_id)
        json_and_id="json_out="+str(json_out)+"+father_id="+str(father_id)+"+parameter_id="+str(parameter_id)+"+history="+"True"
        # Append per-graph timing and the serialized graph under the shared lock.
        lock.acquire()
        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" +
                  str(nni.get_experiment_id()) + "/trials/" + str(nni.get_trial_id()) +
                  "/output.log","a+") as f:
            f.write("single_generate=" + str(end - start)+"\n")
        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" +
                  str(nni.get_experiment_id()) + "/graph.txt","a+") as f:
            f.write(json_and_id+"\n")
        lock.release()
    total_end=time.time()
    lock.acquire()
    with open(os.environ["HOME"] + "/mountdir/nni/experiments/" +
              str(nni.get_experiment_id()) + "/trials/" + str(nni.get_trial_id()) +
              "/output.log","a+") as f:
        f.write("total_generate=" + str(total_end - total_start)+"\n")
    lock.release()
    totime = total_end - total_start
    if totime<0:
        totime = 0-totime
    # Persist average per-graph generation time for the next call's rate estimate.
    with open (os.environ["HOME"] + "/mountdir/nni/experiments/" +
               str(nni.get_experiment_id()) + "/generate_time","w+") as f:
        gt = totime/rate
        f.write(str(gt))
def _start_mlflow_run(self, run_params: Dict[str, Any], pipeline: Pipeline):
    """ Log basic informations to MLFlow about pipeline if this pipeline is tagged with 'train' (creates a new MLFLow experiment and/or run named after training pipeline if it doesn't exists yet)
    NOTE: If NNI is in dry run mode (mode used to generate NNI Classic NAS search space JSON file from a model which contains NNI NAS Mutables `LayerChoice` and/or `InputChoice`) we avoid creating any new MLFlow experiment/run nor logging anything else to mlflow during this dry run

    :param run_params: Kedro run parameters dict; 'tags', 'pipeline_name' and 'run_id' are read here.
    :param pipeline: the Kedro Pipeline about to run; its nodes' tags decide whether this is a training run.
    """
    node_tags = functools.reduce(set.union, [n.tags for n in pipeline.nodes])
    if not deepcv.meta.nni_tools.is_nni_gen_search_space_mode() and (
            'train' in run_params['tags'] or 'train' in node_tags):
        if mlflow.active_run() is None:
            # Create MLFlow run in an experiment named after pipeline involved in training and log various pipeline/datasets informations to mlflow. If we are running an NNI hp/nas search, mlflow experiment and run will be named after NNI experiment and trial ids for better consitency.
            # TODO: find another way to name experiment as pipeline name is only available when running `kedro run --pipeline=<pipeline_name>` (e.g. special tag to node after which experiment is named)
            if not deepcv.meta.nni_tools.is_nni_run_standalone(
            ):  # 'STANDALONE' is NNI default experiment ID if python process haven't been started by NNI
                nni_experiment = nni.get_experiment_id()
                mlflow.set_experiment(nni_experiment)
                mlflow.start_run(run_name=nni.get_trial_id())
                # Flag indicating whether we are using NNI HP or Classic NAS API (Hyperparameter and/or Classic Neural Architecture search using NNI)
                mlflow.set_tag('nni_standalone_mode', False)
                mlflow.set_tag('nni_experiment_id', nni_experiment)
                mlflow.set_tag('nni_trial_id', nni.get_trial_id())
                mlflow.set_tag('nni_sequence_id', nni.get_sequence_id())
            else:
                pipeline_name = run_params['pipeline_name'].lower(
                ) if run_params['pipeline_name'] else 'default'
                mlflow.set_experiment(
                    f'{self.project_ctx.project_name.lower()}_{pipeline_name}')
                mlflow.start_run(
                    run_name=f'{pipeline_name.lower()}_run_{run_params["run_id"]}')
                mlflow.set_tag('nni_standalone_mode', True)
        # Log basic informations about Kedro training pipeline to mlflow
        mlflow.set_tags({
            f'kedro_node_tag_{i}': tag for i, tag in enumerate(node_tags)
        })
        mlflow.log_params({n: v for n, v in run_params.items() if v})
        mlflow.log_param('pipeline.json', pipeline.to_json())
        mlflow.log_param('pipeline.describe', pipeline.describe())
        mlflow.log_param('pipeline.pipeline_datasets', pipeline.data_sets())
        """ The following code creates special mlflow tags about current repository infos, which is not done by mlflow when starting an MLFlow run from code instead of from `mlflow run` command
        Code inspired from [`mlflow.projects._create_run`](https://www.mlflow.org/docs/latest/_modules/mlflow/projects.html) which doesn't seems to be called by `mlflow.start_run`
        """
        tags = {
            mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME:
                self.project_ctx.package_name,
            mlflow.utils.mlflow_tags.MLFLOW_SOURCE_TYPE:
                mlflow.entities.SourceType.to_string(
                    mlflow.entities.SourceType.PROJECT),
            mlflow.utils.mlflow_tags.MLFLOW_PROJECT_ENTRY_POINT:
                inspect.getsourcefile(type(self.project_ctx))
        }
        try:
            repo = git.Repo(self.project_ctx.project_path,
                            search_parent_directories=True)
            git_repo_url = repo.remote(
            ).url if 'origin' in repo.remotes else (
                repo.remotes[0].url if len(repo.remotes) > 0 else '')
            git_repo_url = re.sub(r'git@([.\w]+):', r'https://\1/',
                                  git_repo_url)  # Convert SSH git URL to http URL
            # BUG FIX: str.rstrip('.git') strips any trailing run of the
            # characters '.', 'g', 'i', 't' (e.g. it would mangle a repo name
            # ending in "it"); use an anchored regex to remove only the exact
            # '.git' suffix.
            git_repo_url = re.sub(r'\.git$', '', git_repo_url)
            mlflow.log_param(
                'commit_url',
                git_repo_url + f'/commit/{repo.head.commit.hexsha}/')
            # We also set MLFLOW_SOURCE_NAME to repo URL so that MLFlow web UI is able to parse it and render commit and source hyperlinks (MLFLow only supports github URLs for now)
            tags.update({
                mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME:
                    git_repo_url if git_repo_url else self.project_ctx.project_name,
                mlflow.utils.mlflow_tags.MLFLOW_GIT_BRANCH:
                    repo.active_branch.name,
                mlflow.utils.mlflow_tags.MLFLOW_GIT_REPO_URL:
                    git_repo_url,
                mlflow.utils.mlflow_tags.MLFLOW_GIT_COMMIT:
                    repo.head.commit.hexsha
            })
            # Change mlflow user to be git repository user instead of system user (if any git user is specified)
            git_config_reader = repo.config_reader()
            git_config_reader.read()
            user = git_config_reader.get_value('user', 'name', default=None)
            email = git_config_reader.get_value('user', 'email', default=None)
            if user or email:
                tags[mlflow.utils.mlflow_tags.MLFLOW_USER] = (
                    str(user) + (f' <{email}>' if email else '')
                ) if user else str(email)
        except (ImportError, OSError, ValueError, IOError, KeyError,
                git.GitError, configparser.Error) as e:
            # Best-effort: repository metadata is optional; log and continue.
            logging.warning(
                f'Failed to import Git or to get repository informations. Error: {e}')
        mlflow.set_tags(tags)
def get_next_parameter(socket):
    """
    Get the hyper parameters generated by tuner.
    For a multiphase experiment, it returns a new group of hyper parameters at each call of get_next_parameter.
    For a non-multiphase (multiPhase is not configured or set to False) experiment, it returns hyper parameters only on the first call for each trial job, it returns None since second call.
    This API should be called only once in each trial job of an experiment which is not specified as multiphase.

    NOTE(review): unlike stock nni.get_next_parameter, this variant also asks a
    remote tuner process over the given ZMQ REQ `socket` and appends the
    received parameters to the shared graph.txt -- confirm protocol with the
    coordinator side.

    Returns
    -------
    dict
        A dict object contains the hyper parameters generated by tuner, the keys of the dict are defined in search space. Returns None if no more hyper parameters can be generated by tuner.
    """
    global _params
    #_params = platform.get_next_parameter()  # v1.1
    father_id = -1
    start = time.time()
    _params = platform.get_next_parameter()
    end = time.time()
    if _params is None:
        return None
    # Ask the remote tuner for its state.
    socket.send_pyobj({"type": "get_next_parameter"})
    message = socket.recv_pyobj()
    tuner = message["tuner"]
    if tuner.history:
        # Kick off asynchronous generation for this sequence id.
        p0 = multiprocessing.Process(target=tuner.generate_parameters,
                                     args=(int(get_sequence_id()), ))
        p0.start()
        # Trial concurrency == number of NodeName entries in the slurm config.
        trial_concurrency = os.popen(
            'cat /etc/slurm-llnl/slurm.conf|grep NodeName|wc -l')
        trial_concurrency = int(trial_concurrency.read().strip())
        if get_sequence_id() < trial_concurrency:
            lock.acquire()
            with open(
                    os.environ["HOME"] + "/mountdir/nni/experiments/" +
                    str(nni.get_experiment_id()) + "/graph.txt", "a+") as f:
                json_and_id = 'json_out=' + str(
                    _params['parameters']) + '+history' + "=False or True?"
                f.write(json_and_id + "\n")
            lock.release()
    else:
        # No tuner history yet: record timing and the parameters with history=False.
        socket.send_pyobj({"type": "generated_parameter"})
        message = socket.recv_pyobj()
        lock.acquire()
        with open(
                os.environ["HOME"] + "/mountdir/nni/experiments/" +
                str(nni.get_experiment_id()) + "/trials/" +
                str(nni.get_trial_id()) + "/output.log", "a+") as f:
            f.write(" generate=" + str(end - start) + "\n")
        with open(
                os.environ["HOME"] + "/mountdir/nni/experiments/" +
                str(nni.get_experiment_id()) + "/graph.txt", "a+") as f:
            json_and_id = 'json_out=' + str(
                _params['parameters']) + '+history' + "=False"
            f.write(json_and_id + "\n")
        lock.release()
def main(args):
    """Train an MNIST classifier for one NNI trial with checkpoint restore/save.

    Loads model weights from args['load_checkpoint_dir']/model.pth when present
    (PBT-style warm start), trains for args['epochs'] (the perturbation
    interval), reports accuracy to NNI, and saves the final weights to
    args['save_checkpoint_dir']/model.pth.

    :param args: dict-like config with keys 'no_cuda', 'seed', 'data_dir',
                 'batch_size', 'hidden_size', 'lr', 'momentum', 'epochs',
                 'save_checkpoint_dir', 'load_checkpoint_dir'.
    """
    use_cuda = not args['no_cuda'] and torch.cuda.is_available()
    torch.manual_seed(args['seed'])
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    # Per-trial data directory so concurrent trials don't collide on download.
    data_dir = os.path.join(args['data_dir'], nni.get_trial_id())
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_dir,
                       train=True,
                       download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args['batch_size'],
        shuffle=True,
        **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_dir,
                       train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=1000,
        shuffle=True,
        **kwargs)
    hidden_size = args['hidden_size']
    model = Net(hidden_size=hidden_size).to(device)
    save_checkpoint_dir = args['save_checkpoint_dir']
    save_checkpoint_path = os.path.join(save_checkpoint_dir, 'model.pth')
    load_checkpoint_path = os.path.join(args['load_checkpoint_dir'], 'model.pth')
    # Warm-start from a previous checkpoint when one exists.
    if os.path.isfile(load_checkpoint_path):
        model_state_dict = load_checkpoint(load_checkpoint_path)
        logger.info("test : " + load_checkpoint_path)
        logger.info(type(model_state_dict))
        model.load_state_dict(model_state_dict)
    optimizer = optim.SGD(model.parameters(),
                          lr=args['lr'],
                          momentum=args['momentum'])
    #epoch is perturbation interval
    for epoch in range(1, args['epochs'] + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test_acc = test(args, model, device, test_loader)
        if epoch < args['epochs']:
            # report intermediate result
            nni.report_intermediate_result(test_acc)
            logger.debug('test accuracy %g', test_acc)
            logger.debug('Pipe send intermediate result done.')
        else:
            # report final result
            nni.report_final_result(test_acc)
            logger.debug('Final result is %g', test_acc)
            logger.debug('Send final result done.')
    # Persist the trained weights for the next perturbation interval.
    if not os.path.exists(save_checkpoint_dir):
        os.makedirs(save_checkpoint_dir)
    save_checkpoint(model, save_checkpoint_path)
def estimate(esargs):
    """Train the global `net` on CIFAR-10 with the given hyperparameters and
    return (best_acc, epochs_actually_run).

    Uses a DistributedSampler (one process per rank); rank 0 additionally keeps
    an `epoch=N` progress line up to date in the trial's output.log via sed.

    :param esargs: hyperparameter dict with 'learning_rate' and 'batch_size'.
    """
    global best_acc
    global trainloader
    global testloader
    global net
    global criterion
    global optimizer
    global rank
    # Reset the early-stopping tracker (mode="max": stop when accuracy plateaus).
    early_stop = utils.EarlyStopping(mode="max")
    global best_acc
    best_acc = 0
    lr_explore = esargs['learning_rate']
    bs_explore = int(esargs['batch_size'])
    global trainloader
    transform_train, transform_test = utils.data_transforms_cifar10(args)
    trainset = torchvision.datasets.CIFAR10(root="/root/mountdir/data/",
                                            train=True,
                                            download=True,
                                            transform=transform_train)
    # shuffle=False because DistributedSampler handles the shuffling per rank.
    trainsampler = DistributedSampler(trainset)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=bs_explore,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=False,
                                              sampler=trainsampler)
    op = optim.SGD(net.parameters(),
                   lr=lr_explore,
                   momentum=0.9,
                   weight_decay=5e-4)
    for ep in range(args.epochs):
        current_ep = ep + 1
        if rank == 0:
            # Keep a single `epoch=N` line in output.log: replace it if present
            # (sed /^epoch/c), otherwise append it (sed $a).
            if os.popen("grep epoch " + experiment_path + "/trials/" +
                        str(nni.get_trial_id()) + "/output.log").read():
                os.system("sed -i '/^epoch/cepoch=" + str(ep + 1) + "' " +
                          experiment_path + "/trials/" +
                          str(nni.get_trial_id()) + "/output.log")
            else:
                os.system("sed -i '$a\\epoch=" + str(ep + 1) + "' " +
                          experiment_path + "/trials/" +
                          str(nni.get_trial_id()) + "/output.log")
        try:
            train_acc = train(ep, op)
        except Exception as exception:
            # Training failure: record it, score this config 0 and bail out.
            f11 = open('/root/log', 'a+')
            f11.write('###### training is error \n')
            f11.write(str(exception) + "\n")
            f11.close()
            acclist.append(0)
            return 0, current_ep
        test_acc, best_acc = test(ep)
        logger.debug(test_acc)
        if early_stop.step(test_acc):
            break
    # NOTE(review): `list` shadows the builtin here -- kept as-is.
    list = [best_acc, bs_explore, str(lr_explore)[0:7]]
    reslist.append(list)
    acclist.append(best_acc)
    return best_acc, current_ep
def train_search(config, params=None, warm_start_NN=None, restore_old_checkpoint=False, workers=1, verbosity=0):
    """
    train_search is practically the same as the train function from training_torch, just made for NNI experiments.

    Trains a network chosen by `params` on datasets built from `config`,
    reporting -log10(RMSE) to NNI after every epoch and once more as the final
    result, then plots the trained net against the validation set.

    :param config: experiment config dict; 'epochs', 'scheduler',
                   'scheduler_milestones' are read here.
    :param params: NNI-tuned hyperparameters ('batch_size', 'optimizer',
                   'learning_rate', 'loss').
    :param warm_start_NN: unused in this body -- TODO confirm.
    :param restore_old_checkpoint: unused in this body -- TODO confirm.
    :param workers: unused in this body -- TODO confirm.
    :param verbosity: 0 -> INFO logging, >=1 -> DEBUG logging.
    :return: None (reports results to NNI as a side effect).
    """
    if verbosity == 0:
        logger.setLevel(logging.INFO)
    if verbosity >= 1:
        logger.setLevel(logging.DEBUG)
    start = time.time()
    logger.info('Preparing Datasets')
    train_dataset, validation_dataset = prepare_dataset_torch(config)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=params['batch_size'],
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(validation_dataset,
                                              batch_size=params['batch_size'],
                                              shuffle=True)
    logger.info('Initializing Torch Network')
    net = map_model(config, params)
    logger.info('Optimizer Initialize')
    optimizer = map_optimizer(params['optimizer'], net.parameters(),
                              params['learning_rate'])
    loss_func = map_loss_func(params['loss'])
    criterion = torch.nn.MSELoss()
    if config['scheduler']:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=config['scheduler_milestones'], gamma=0.1)
    else:
        scheduler = None
    epochs = config['epochs']
    # Track the losses to determine early stopping
    avg_train_loss = []
    avg_valid_loss = []
    # initalize the early_stopping object
    early_stopping = EarlyStopping(verbose=True, trace_func=logger.info)
    logger.info('Start Training!')
    for epoch in range(epochs):
        train_loss, validation_loss, RMSE = train_epoch(
            net,
            optimizer,
            loss_func,
            train_loader=train_loader,
            test_loader=test_loader,
            scheduler=scheduler,
            criterion=criterion)
        # NNI metric: -log10(RMSE), so a larger value means a smaller error.
        nni.report_intermediate_result(-math.log10(RMSE))
        if early_stopping is not None:
            early_stopping(validation_loss, net, RMSE)
            # After this, RMSE holds the best validation RMSE seen so far.
            RMSE = early_stopping.RMSE
        avg_train_loss.append(train_loss)
        avg_valid_loss.append(validation_loss)
        logger.info(
            'Epoch {}; Train Loss: {:.5}; Valid Loss: {:.5}; Best Validation RMSE: {:.5}'
            .format(epoch, train_loss, validation_loss, RMSE))
        print(
            'Epoch {}; Train Loss: {:.5}; Valid Loss: {:.5}; Validation RMSE: {:.5}'
            .format(epoch, train_loss, validation_loss, RMSE))
        if early_stopping.early_stop:
            logger.info('Early Stopping')
            RMSE = early_stopping.RMSE
            break
    nni.report_final_result(-math.log10(RMSE))
    end = time.time()
    logger.info(
        'Training Completed: Time elapsed: {:.2} Seconds'.format(end - start))
    plot_against_scaling(net,
                         validation_dataset,
                         criterion,
                         trial_id=str(nni.get_trial_id()),
                         exp_id=str(nni.get_experiment_id()))
def prepare_hyper_search(cfg_kwargs: dict,
                         reporthook=None, final_reporthook=None,
                         primary_key=None, max_key=True, reporter_cls=None,
                         with_keys: (list, str, None) = None,
                         final_keys: (list, str, None) = None,
                         dump=False, disable=False):
    """
    Updated in v1.3.18.
    Fetch hyperparameters from the nni package and update the configuration
    kwargs with them. When nni is unavailable, or this is not an nni search
    run, the parameters are left unchanged.

    .. code-block :: python

        cfg_kwargs, reporthook, final_reporthook, tag = prepare_hyper_search(
            cfg_kwargs, Configuration, reporthook, final_reporthook,
            primary_key="macro_avg:f1"
        )

        _cfg = Configuration(**cfg_kwargs)
        model = Model(_cfg)
        ...
        for epoch in range(_cfg.begin_epoch, _cfg.end_epoch):
            for batch_data in dataset:
                train_model(batch_data)
            data = evaluate_model()
            reporthook(data)
        final_reporthook()

    Parameters
    ----------
    cfg_kwargs: dict
        Keyword arguments that will be passed to the configuration object.
    reporthook
        Per-iteration callback reporting intermediate results; defaults to
        ``nni.report_intermediate_result`` via the internal Reporter.
    final_reporthook
        End-of-run callback reporting the final result; defaults to
        ``nni.report_final_result`` via the internal Reporter.
    primary_key:
        The key used to evaluate the model; becomes the ``default`` metric in
        ``nni.report_intermediate_result`` and ``nni.report_final_result``.
    max_key: bool
        True when a larger primary-key value is better.
    reporter_cls
        Optional replacement for the internal Reporter instance.
    with_keys: list or str
        Additional metrics to report; in the final report they default to
        their values at the best primary_key iteration.
    final_keys: list or str
        Subset of with_keys that should use the last reported result instead
        of the value at the best primary_key iteration.
    dump: bool
        When True, the configuration's workspace is changed to
        ``workspace/nni.get_experiment_id()/nni.get_trial_id()`` so that nni
        intermediate artifacts get persisted.
    disable
        When True, return immediately without touching nni.

    Returns
    -------
    cfg_kwargs: dict
        The configuration kwargs with nni hyperparameters merged in.
    reporthook: function
        Per-iteration callback used to report intermediate results
        (default ``nni.report_intermediate_result``).
    final_reporthook:
        End-of-run callback used to report the final result
        (default ``nni.report_final_result``).
    dump: bool
        Same value as the ``dump`` argument.

    Examples
    --------
    .. code-block :: python

        class CFG(Configuration):
            hyper_params = {"hidden_num": 100}
            learning_rate = 0.001
            workspace = ""

        cfg_kwargs, reporthook, final_reporthook, dump = prepare_hyper_search(
            {"learning_rate": 0.1}, CFG, primary_key="macro_avg:f1",
            with_keys="accuracy"
        )
        # cfg_kwargs: {'learning_rate': 0.1}

    when nni start (e.g., using ``nni create --config _config.yml``),
    suppose in ``_config.yml``:

    .. code-block: yml

        searchSpacePath: _search_space.json

    and in ``_search_space.json``

    .. code-block :: json

        {
            "hidden_num": {"_type": "choice", "_value": [500, 600, 700, 835, 900]},
        }

    one of the return cfg_kwargs is
    ``{'hyper_params': {'hidden_num': 50}, 'learning_rate': 0.1}``
    """
    if disable:
        return cfg_kwargs, None, None, None
    try:
        import nni
        from nni import get_next_parameter, report_intermediate_result, report_final_result

        assert primary_key is not None

        def _as_key_list(_keys: (list, str, None)):
            # Normalize str ("a;b" or "a"), list, or None into a list of keys.
            if isinstance(_keys, str):
                if ";" in _keys:
                    _keys = _keys.split(";")
                else:
                    _keys = [_keys]
            elif isinstance(_keys, list):
                pass
            elif _keys is None:
                _keys = []
            return _keys

        with_keys = _as_key_list(with_keys)
        final_keys = _as_key_list(final_keys)

        class Reporter(BaseReporter):
            # Collects every reported result so the final report can pick the
            # best iteration according to primary_key.
            def __init__(self):
                self.datas = []

            def intermediate(self, data):
                # 'default' is what the nni tuner optimizes on.
                feed_dict = {
                    'default': float(get_by_key(data, key_parser(primary_key))),
                    primary_key: get_by_key(data, key_parser(primary_key))
                }
                for key in with_keys:
                    feed_dict[key] = get_by_key(data, key_parser(key))
                report_intermediate_result(feed_dict)
                self.datas.append(data)

            def final(self):
                best_fn = get_min if max_key is False else get_max
                _with_keys = (with_keys if with_keys else []) + [primary_key]
                _final_keys = set(final_keys if final_keys else [])
                final_result = best_fn(
                    self.datas, primary_key,
                    with_keys=";".join(_with_keys),
                    merge=False
                )
                feed_dict = {
                    'default': float(final_result[0][primary_key])
                }
                appendix_dict = dict(final_result[1][primary_key])
                for key in _with_keys:
                    if key in _final_keys:
                        # Use the last reported value for keys in final_keys.
                        feed_dict[key] = get_by_key(self.datas[-1], key_parser(key))
                    else:
                        # Otherwise use the value at the best primary_key iteration.
                        feed_dict[key] = appendix_dict[key]
                report_final_result(feed_dict)

        rc = Reporter() if reporter_cls is None else reporter_cls
        reporthook = reporthook if reporthook is not None else rc.intermediate
        final_reporthook = final_reporthook if final_reporthook is not None else rc.final
        cfg_cls_params = get_params(get_next_parameter())
        # Empty params means nni is in standalone mode (no tuner attached).
        using_nni_tag = True if cfg_cls_params else False
        nested_update(cfg_kwargs, cfg_cls_params)
        if using_nni_tag is True and dump is True:  # pragma: no cover
            cfg_kwargs["workspace"] = cfg_kwargs.get("workspace", "") + path_append(
                nni.get_experiment_id(), nni.get_trial_id(), to_str=True
            )
        return cfg_kwargs, reporthook, final_reporthook, dump
    except ModuleNotFoundError:  # pragma: no cover
        warnings.warn("nni package not found, skip")
        return cfg_kwargs, reporthook, final_reporthook, dump
def train_eval(esargs):
    """Train and evaluate the global ``net`` for one NNI hyper-parameter trial.

    Args:
        esargs: dict of tuner-suggested hyper-parameters; must contain
            'learning_rate' and 'batch_size'.

    Returns:
        (best_acc, last_epoch): best accuracy recorded so far (updated only on
        MPI rank 0) and the index of the last epoch actually run (EarlyStopping
        may end training before ``args.epochs``).

    Raises:
        ValueError: if ``args.optimizer`` names an unsupported optimizer.
    """
    global trainloader
    global testloader
    global net
    global best_acc
    global rank
    # NOTE(review): resetting the global here means best_acc never carries
    # over between trials — confirm this is intended.
    best_acc = 0
    lr_explore = esargs['learning_rate']
    bs_explore = int(esargs['batch_size'])

    # All supported optimizers share the same lr/decay settings; SGD
    # additionally pins momentum to 0, matching the original behavior.
    common = {"lr": lr_explore, "decay": args.weight_decay}
    optimizer_factories = {
        "SGD": lambda: SGD(momentum=0, **common),
        "Adadelta": lambda: Adadelta(**common),
        "Adagrad": lambda: Adagrad(**common),
        "Adam": lambda: Adam(**common),
        "Adamax": lambda: Adamax(**common),
        "RMSprop": lambda: RMSprop(**common),
    }
    try:
        optimizer = optimizer_factories[args.optimizer]()
    except KeyError:
        # The original code only logged here and later crashed with an
        # UnboundLocalError on `optimizer`; fail fast with a clear error.
        logger.debug("Input A Wrong optimizer")
        raise ValueError("unsupported optimizer: {}".format(args.optimizer))

    # Compile the model
    net.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
    (x_train, y_train) = trainloader
    (x_test, y_test) = testloader

    # Trace which trial ran on which MPI rank (debug breadcrumb file).
    trial_id = nni.get_trial_id()
    with open("/root/keras_trace" + str(rank), "a+") as trace_file:
        trace_file.write("rank-" + str(rank) + str(trial_id) + "\n")

    available_devices = os.environ["CUDA_VISIBLE_DEVICES"]
    # Number of visible GPUs; the global batch size is scaled by this count.
    gpus = len(available_devices.split(","))
    history = net.fit(
        x=x_train,
        y=y_train,
        batch_size=bs_explore * gpus,
        validation_data=(x_test, y_test),
        epochs=args.epochs,
        shuffle=True,
        callbacks=[
            SendMetrics(),
            Epoch_num_record(experiment_path, trial_id),
            EarlyStopping(min_delta=0.001, patience=10),
            TensorBoard(log_dir=TENSORBOARD_DIR),
        ],
    )

    # Only rank 0 evaluates and records the final accuracy for the tuner.
    if rank == 0:
        _, acc = net.evaluate(x_test, y_test)
        # Append the accuracy reached during this search step to /root/log.
        with open("/root/log", "a+") as log_file:
            log_file.write("######acc:" + str(acc) + "\n")
        if acc > best_acc:
            best_acc = acc
        logger.debug("Final result is: %.3f", acc)
        # Renamed from `list`, which shadowed the builtin.
        result_row = [best_acc, bs_explore, str(lr_explore)[0:7]]
        reslist.append(result_row)
        acclist.append(best_acc)
    return best_acc, history.epoch[-1]
# gpuid=get_gpuid() # client_send(gpuid, 0) # print('gpuid:',gpuid) # os.environ["CUDA_VISIBLE_DEVICES"]="{}".format(gpuid) # print(os.environ["CUDA_VISIBLE_DEVICES"]) # print(torch.cuda.device_count()) args=get_args() tuner_params = nni.get_next_parameter() config=args.__dict__ config.update(tuner_params) SetSeed(args.seed) # config['device']='cuda:{}'.format(gpuid) time_stamp = datetime.datetime.now()+datetime.timedelta(hours=8) config['ex_name'] = time_stamp.strftime('%Y.%m.%d-%H:%M:%S')+nni.get_trial_id() for handler in logging.root.handlers[:]: logging.root.removeHandler(handler) res= Path('/usr/data/gzy/GraphPool/ex_cmp/results')/args.dataset/'{}'.format(config['ex_name']) print(res) res.mkdir(parents=True, exist_ok=True) writer = SummaryWriter(res/'log') sv_param = os.path.join(res, 'model_param.json') with open(sv_param, 'w') as file_obj: json.dump(args.__dict__, file_obj) logging.basicConfig(level=logging.INFO,#控制台打印的日志级别
def is_nni_run_standalone() -> bool:
    """ Simple helper function which returns whether NNI is in standalone trial run mode """
    # Outside a managed NNI run, both IDs are the literal 'STANDALONE' and the
    # sequence id is 0.
    standalone_marker = r'STANDALONE'
    return (nni.get_experiment_id() == standalone_marker
            and nni.get_trial_id() == standalone_marker
            and nni.get_sequence_id() == 0)
# params["gs_research_workflow.time_series.gs_steps.model_steps:FitStep > steps_per_epoch "] = 1 # params["gs_research_workflow.time_series.gs_steps.model_steps:FitStep > validation_steps "] = 1 # params["gs_research_workflow.time_series.models.inception_time:InceptionTime.HP > depth"] = 5 # params["gs_research_workflow.time_series.models.inception_time:InceptionTime.HP > use_residual"] = True if cfg_alias_cls: params = { cfg_alias_cls.get_cfg_loc(k): v for k, v in params.items() } trial_uuid = generate_uuid() experiment_id = generate_uuid() else: os.environ[ENV_KEY_TRIAL_IN_NNI] = "1" params = nni.get_next_parameter() experiment_id = nni.get_experiment_id() trial_uuid = nni.get_trial_id() if cfg_alias_cls: params = { cfg_alias_cls.get_cfg_loc(k): v for k, v in params.items() } # 对 item 进行 unescape params = {k: unescape_nni_choice_item(v) for k, v in params.items()} yml_path = os.path.join(os.path.dirname(__file__), "../../..", args.cfg) if not os.path.isfile(yml_path): logger.error(f"Default cfg file {yml_path} is not existed!") sys.exit(0) trial_task = HPOTrialPodSideEnv(args.name, yml_path, params, trial_uuid, experiment_id)
# client_send(gpuid, 0) # print('gpuid:',gpuid) # os.environ["CUDA_VISIBLE_DEVICES"]="{}".format(gpuid) # print(os.environ["CUDA_VISIBLE_DEVICES"]) # print(torch.cuda.device_count()) args = get_args() tuner_params = nni.get_next_parameter() config = args.__dict__ config.update(tuner_params) SetSeed(args.seed) # config['device']='cuda:{}'.format(gpuid) time_stamp = datetime.datetime.now() + datetime.timedelta(hours=8) config['ex_name'] = time_stamp.strftime( '%Y.%m.%d-%H:%M:%S') + nni.get_trial_id() for handler in logging.root.handlers[:]: logging.root.removeHandler(handler) res = Path('/usr/data/gzy/GraphPool/ex_cmp/results' ) / args.dataset / '{}'.format(config['ex_name']) print(res) res.mkdir(parents=True, exist_ok=True) writer = SummaryWriter(res / 'log') sv_param = os.path.join(res, 'model_param.json') with open(sv_param, 'w') as file_obj: json.dump(args.__dict__, file_obj) logging.basicConfig(