def train(args, params):
    """Train the model, report the final metric to NNI and save the model.

    Args:
        args: Parsed arguments; ``dataset_name``, ``num_train`` and
            ``epochs`` are read from it.
        params: Hyper-parameters forwarded to ``build_model``.
    """
    model = build_model(params)
    X_train, Y1_train, Y2_train, X_test, Y1_test, Y2_test = load_dataset(
        args.dataset_name, args.num_train)

    print('Fitting model...')
    model.fit(
        X_train, [Y1_train, Y2_train],
        epochs=args.epochs,
        verbose=1,
        validation_data=(X_test, [Y1_test, Y2_test]),
        callbacks=[SendMetrics(), TensorBoard(log_dir=TENSORBOARD_DIR)])

    # evaluate() is unpacked into five values; only the last one is used
    # and reported to NNI as the final result.
    _, _, _, _, subcat_acc = model.evaluate(
        X_test, [Y1_test, Y2_test], verbose=0)

    # '%s' rather than '%d': '%d' would truncate a fractional metric to an
    # integer in the log output.
    LOG.debug('Final result is: %s', subcat_acc)
    nni.report_final_result(subcat_acc)
    # print() does not interpolate %-style arguments, so format explicitly.
    print('Final result is: {}'.format(subcat_acc))

    # The NNI sequence id is used to give every trial its own file names.
    model_id = nni.get_sequence_id()

    # serialize model to JSON
    model_json = model.to_json()
    with open("model-{}.json".format(model_id), "w") as json_file:
        json_file.write(model_json)

    # serialize weights to HDF5
    model.save_weights("model-{}.h5".format(model_id))
    print("Saved model to disk")
def train(params):
    """Train the MNIST model, report the final result to NNI and save the model.

    Args:
        params: Hyper-parameter mapping. ``params['batch_size']`` is read
            here and the whole mapping is passed to ``create_mnist_model``.
    """
    x_train, y_train, x_test, y_test = load_mnist_data()
    model = create_mnist_model(params)

    epochs = 10
    model.fit(
        x_train, y_train,
        batch_size=params['batch_size'],
        epochs=epochs,
        verbose=1,
        validation_data=(x_test, y_test),
        callbacks=[SendMetrics()])

    _, acc = model.evaluate(x_test, y_test, verbose=0)
    # '%s' rather than '%d': '%d' would truncate a fractional metric to an
    # integer in the log output.
    logger.debug('Final result is: %s', acc)
    nni.report_final_result(acc)

    # The NNI sequence id is used to give every trial its own file names.
    model_id = nni.get_sequence_id()

    # Architecture as JSON, weights as HDF5, both under ./ckpt
    model_json = model.to_json()
    with open('./ckpt/model-{}.json'.format(model_id), 'w') as json_file:
        json_file.write(model_json)
    model.save_weights('./ckpt/model-{}.h5'.format(model_id))
def setup_experiment(
        runtime_config: RuntimeConfig,
        enable_nni: bool = False,
        logger_blacklist: Optional[List[str]] = None) -> RuntimeConfig:
    """Prepare directories, seeding and logging for an experiment run.

    Fills in the unset directory fields of ``runtime_config`` (``output_dir``,
    ``checkpoint_dir``, ``tb_log_dir``), creates those directories, configures
    logging to ``<output_dir>/stdout.log`` and stores the config in the
    module-level ``_runtime_config``.

    Args:
        runtime_config: Configuration object; it is modified in place.
        enable_nni: When true, ``nni`` is imported and, unless the NNI
            experiment id is ``'STANDALONE'``, the output directory is nested
            under ``<experiment_id>/<sequence_id>``.
        logger_blacklist: Names of loggers passed to ``mute_logger``.
            Defaults to ``['numba']``.

    Returns:
        The same ``runtime_config`` object that was passed in.
    """
    if logger_blacklist is None:
        logger_blacklist = ['numba']

    setup_distributed_training()
    seed_everything(runtime_config.seed)

    # Output directory: explicit value > PT_OUTPUT_DIR env var > ./outputs
    if runtime_config.output_dir is None:
        if 'PT_OUTPUT_DIR' in os.environ:
            runtime_config.output_dir = Path(os.environ['PT_OUTPUT_DIR'])
        else:
            runtime_config.output_dir = Path('./outputs')

    if enable_nni:
        import nni
        experiment_id = nni.get_experiment_id()
        if experiment_id != 'STANDALONE':
            runtime_config.output_dir = (
                runtime_config.output_dir / experiment_id
                / str(nni.get_sequence_id()))

    # parents=True: the NNI layout above adds two directory levels at once,
    # and the intermediate <experiment_id> directory may not exist yet.
    runtime_config.output_dir.mkdir(parents=True, exist_ok=True)

    if runtime_config.checkpoint_dir is None:
        runtime_config.checkpoint_dir = runtime_config.output_dir / 'checkpoints'
    runtime_config.checkpoint_dir.mkdir(parents=True, exist_ok=True)

    if runtime_config.tb_log_dir is None:
        runtime_config.tb_log_dir = runtime_config.output_dir / 'tb'
    runtime_config.tb_log_dir.mkdir(parents=True, exist_ok=True)

    reset_logger()
    setup_logger(
        '',
        log_file=(runtime_config.output_dir / 'stdout.log').as_posix(),
        log_level=logging.DEBUG if runtime_config.debug else logging.INFO)
    for logger in logger_blacklist:
        mute_logger(logger)

    global _runtime_config
    _runtime_config = runtime_config
    return runtime_config
parser.add_argument('--BatchSize', default='32', type=int) # device parameter parser.add_argument('--Device', default='0', type=str) # version control parser.add_argument('--CodeVersion', default='V') # Merge parameter parser.add_argument('--MergeIndex', default=6, type=int) parser.add_argument('--MergeWay', default='sum', type=str) return parser parser = stmeta_param_parser() args = vars(parser.parse_args()) nni_params = nni.get_next_parameter() nni_sid = nni.get_sequence_id() if nni_params: args.update(nni_params) args['CodeVersion'] += str(nni_sid) model_dir = os.path.join('model_dir', args['City']) code_version = 'ST_MMGCN_{}_K{}L{}_{}_F{}'.format( ''.join([e[0] for e in args['Graph'].split('-')]), args['K'], args['L'], args['CodeVersion'], int(args['MergeIndex']) * 5) deviceIDs = GPUtil.getAvailable(order='memory', limit=2, maxLoad=1, maxMemory=0.7, includeNan=False,
parser.add_argument('--Device', default='0,1', type=str) # version control parser.add_argument('--Group', default='Xian') parser.add_argument('--CodeVersion', default='ParamTuner') return parser parser = cpt_stmeta_param_parser() args = vars(parser.parse_args()) args.update(nni.get_next_parameter()) model_dir = os.path.join(model_dir_path, args['Group']) code_version = 'CPT_STMeta_{}_K{}L{}_{}'.format( ''.join([e[0] for e in args['Graph'].split('-')]), args['K'], args['L'], args['CodeVersion'] + nni.get_sequence_id()) # Config data loader data_loader = NodeTrafficLoader( dataset=args['Dataset'], city=args['City'], data_range=args['DataRange'], train_data_length=args['TrainDays'], test_ratio=0.1, C_T=int(args['CT']), P_T=int(args['PT']), T_T=int(args['TT']), TI=args['TI'], TD=args['TD'], TC=args['TC'], normalize=True if args['Normalize'] == 'True' else False,
def test_get_sequence_id(self):
    """Check that ``nni.get_sequence_id()`` returns 0."""
    sequence_id = nni.get_sequence_id()
    self.assertEqual(sequence_id, 0)
f.seek(0) lines = f.readlines() lock.release() if lines: break if len(lines) > args.slave: x = random.randint(1, args.slave) json_and_id_str = lines[-x].replace("\n", "") else: json_and_id_str = lines[-1].replace("\n", "") with open( experiment_path + "/trials/" + str(nni.get_trial_id()) + "/output.log", "a+") as f: f.write("sequence_id=" + str(nni.get_sequence_id()) + "\n") json_and_id = dict((l.split('=') for l in json_and_id_str.split('+'))) if str(json_and_id['history']) == "True": socket.send_pyobj({ "type": "generated_parameter", "parameters": json_and_id['json_out'], "father_id": int(json_and_id['father_id']), "parameter_id": int(nni.get_sequence_id()) }) message = socket.recv_pyobj() elif str(json_and_id['history']) == "False": socket.send_pyobj({"type": "generated_parameter"}) message = socket.recv_pyobj() RCV_CONFIG = json_and_id['json_out'] start_time = time.time()
def _start_mlflow_run(self, run_params: Dict[str, Any], pipeline: Pipeline):
    """ Log basic information about the pipeline to MLFlow if this pipeline is tagged with 'train' (creates a new MLFlow experiment and/or run named after the training pipeline if it doesn't exist yet)

    NOTE: If NNI is in dry run mode (mode used to generate NNI Classic NAS search space JSON file from a model which contains NNI NAS Mutables `LayerChoice` and/or `InputChoice`) we avoid creating any new MLFlow experiment/run or logging anything else to mlflow during this dry run.

    Args:
        run_params: Run parameters; `tags`, `pipeline_name` and `run_id` entries are read here and all truthy entries are logged as MLFlow params.
        pipeline: Pipeline whose node tags and description are logged.
    """
    # `set().union(*...)` gives the same result as reducing with `set.union` but also works for a pipeline without any node (`functools.reduce` without an initial value raises `TypeError` on an empty sequence).
    node_tags = set().union(*(n.tags for n in pipeline.nodes))

    # NOTE(review): SOURCE had lost its indentation; the nesting below (pipeline logging done for every training run, run creation only when no run is active) is reconstructed - confirm against the original file.
    if not deepcv.meta.nni_tools.is_nni_gen_search_space_mode() and ('train' in run_params['tags'] or 'train' in node_tags):
        if mlflow.active_run() is None:
            # Create MLFlow run in an experiment named after pipeline involved in training and log various pipeline/datasets informations to mlflow. If we are running an NNI hp/nas search, mlflow experiment and run will be named after NNI experiment and trial ids for better consistency.
            # TODO: find another way to name experiment as pipeline name is only available when running `kedro run --pipeline=<pipeline_name>` (e.g. special tag to node after which experiment is named)
            if not deepcv.meta.nni_tools.is_nni_run_standalone():  # 'STANDALONE' is NNI default experiment ID if python process haven't been started by NNI
                nni_experiment = nni.get_experiment_id()
                mlflow.set_experiment(nni_experiment)
                mlflow.start_run(run_name=nni.get_trial_id())
                # Flag indicating whether we are using NNI HP or Classic NAS API (Hyperparameter and/or Classic Neural Architecture search using NNI)
                mlflow.set_tag('nni_standalone_mode', False)
                mlflow.set_tag('nni_experiment_id', nni_experiment)
                mlflow.set_tag('nni_trial_id', nni.get_trial_id())
                mlflow.set_tag('nni_sequence_id', nni.get_sequence_id())
            else:
                pipeline_name = run_params['pipeline_name'].lower() if run_params['pipeline_name'] else 'default'
                mlflow.set_experiment(f'{self.project_ctx.project_name.lower()}_{pipeline_name}')
                mlflow.start_run(run_name=f'{pipeline_name.lower()}_run_{run_params["run_id"]}')
                mlflow.set_tag('nni_standalone_mode', True)

        # Log basic informations about Kedro training pipeline to mlflow
        mlflow.set_tags({f'kedro_node_tag_{i}': tag for i, tag in enumerate(node_tags)})
        mlflow.log_params({n: v for n, v in run_params.items() if v})
        mlflow.log_param('pipeline.json', pipeline.to_json())
        mlflow.log_param('pipeline.describe', pipeline.describe())
        mlflow.log_param('pipeline.pipeline_datasets', pipeline.data_sets())

        # The following code creates special mlflow tags about current repository infos, which is not done by mlflow when starting an MLFlow run from code instead of from `mlflow run` command.
        # Code inspired from [`mlflow.projects._create_run`](https://www.mlflow.org/docs/latest/_modules/mlflow/projects.html) which doesn't seem to be called by `mlflow.start_run`.
        tags = {
            mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME: self.project_ctx.package_name,
            mlflow.utils.mlflow_tags.MLFLOW_SOURCE_TYPE: mlflow.entities.SourceType.to_string(mlflow.entities.SourceType.PROJECT),
            mlflow.utils.mlflow_tags.MLFLOW_PROJECT_ENTRY_POINT: inspect.getsourcefile(type(self.project_ctx))
        }
        try:
            repo = git.Repo(self.project_ctx.project_path, search_parent_directories=True)
            git_repo_url = repo.remote().url if 'origin' in repo.remotes else (repo.remotes[0].url if len(repo.remotes) > 0 else '')
            # Convert SSH git URL to http URL
            git_repo_url = re.sub(r'git@([.\w]+):', r'https://\1/', git_repo_url)
            # Remove the '.git' suffix explicitly: `str.rstrip('.git')` strips any trailing run of the characters '.', 'g', 'i', 't' and would also eat the end of the repository name.
            if git_repo_url.endswith('.git'):
                git_repo_url = git_repo_url[:-len('.git')]
            mlflow.log_param('commit_url', git_repo_url + f'/commit/{repo.head.commit.hexsha}/')

            # We also set MLFLOW_SOURCE_NAME to repo URL so that MLFlow web UI is able to parse it and render commit and source hyperlinks (MLFLow only supports github URLs for now)
            tags.update({
                mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME: git_repo_url if git_repo_url else self.project_ctx.project_name,
                mlflow.utils.mlflow_tags.MLFLOW_GIT_REPO_URL: git_repo_url,
                mlflow.utils.mlflow_tags.MLFLOW_GIT_COMMIT: repo.head.commit.hexsha
            })
            try:
                tags[mlflow.utils.mlflow_tags.MLFLOW_GIT_BRANCH] = repo.active_branch.name
            except TypeError:
                # GitPython raises TypeError when HEAD is detached: there is no branch to record, the other git tags are still set
                pass

            # Change mlflow user to be git repository user instead of system user (if any git user is specified)
            git_config_reader = repo.config_reader()
            git_config_reader.read()
            user = git_config_reader.get_value('user', 'name', default=None)
            email = git_config_reader.get_value('user', 'email', default=None)
            if user or email:
                tags[mlflow.utils.mlflow_tags.MLFLOW_USER] = (str(user) + (f' <{email}>' if email else '')) if user else str(email)
        except (ImportError, OSError, ValueError, IOError, KeyError, git.GitError, configparser.Error) as e:
            logging.warning(f'Failed to import Git or to get repository informations. Error: {e}')

        mlflow.set_tags(tags)
f1 = open(experiment_path + "/graph.txt", "a+") f1.seek(0) lines = f1.readlines() f1.close() lock.release() if lines: break json_and_id_str = lines[-1].replace("\n", "") #逆序读取并记录,数据组成字典 json_and_id = dict( (l.split('=') for l in json_and_id_str.split('+'))) if str(json_and_id['history']) == "True": socket.send_pyobj({ "type": "generated_parameter", "parameters": json_and_id['json_out'], "father_id": int(json_and_id['father_id']), "parameter_id": int(nni.get_sequence_id()) }) f11 = open('/root/log', 'a+') f11.write('histtory is True so \nsend parameters') f11.close() message = socket.recv_pyobj() f11 = open('/root/log', 'a+') f11.write('recv message: ' + str(message) + '\n') f11.close() elif str(json_and_id['history']) == "False": socket.send_pyobj({"type": "generated_parameter"}) f11 = open('/root/log', 'a+') f11.write('history is false so \nsend generated_parameter\n') f11.close() message = socket.recv_pyobj() f11 = open('/root/log', 'a+')
def is_nni_run_standalone() -> bool:
    """ Tell whether NNI reports standalone values: experiment id and trial id equal to 'STANDALONE' and sequence id equal to 0 """
    standalone_id = r'STANDALONE'
    # Guard clauses keep the original left-to-right, short-circuit evaluation order
    if nni.get_experiment_id() != standalone_id:
        return False
    if nni.get_trial_id() != standalone_id:
        return False
    return nni.get_sequence_id() == 0
help='set tolerance of termination criterion') parser.add_argument('--verbose', type=int, default=0, help='train verbosity level (0 is less verbosity)') parser.add_argument('--debug', action="store_true", help="info debug information") parser.add_argument('--log-to-file', type=bool, default=False, help="Save log to file") args = vars(parser.parse_args()) # return as dictionary args['id'] = nni.get_sequence_id() log_level = logging.DEBUG if args['debug'] else logging.INFO if args['log_to_file']: log_file = os.path.join(args['output'], 'execution-{}.log'.format(args['id'])) logging.basicConfig(filename=log_file, level=log_level) else: logging.basicConfig(level=log_level) RECEIVED_PARAMS = nni.get_next_parameter() has_nni = True if RECEIVED_PARAMS is None: """this only occurs if you call this python module from the command line with using nnictl """ RECEIVED_PARAMS = default_params() has_nni = False