from config import CONFIG #from config import CONFIG_DEV as CONFIG # Only for development. from constants import REPO_DIR logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setFormatter( logging.Formatter( '%(asctime)s - %(levelname)s - %(message)s - %(pathname)s: line %(lineno)d' )) logger.addHandler(handler) # Get the current run. run = Run.get_context() offline_run = run.id.startswith("OfflineRun") if offline_run: utils_dir_path = REPO_DIR / "cgmml/common/model_utils" utils_paths = glob.glob(os.path.join(utils_dir_path, "*.py")) temp_model_util_dir = Path(__file__).parent / "tmp_model_util" # Remove old temp_path if os.path.exists(temp_model_util_dir): shutil.rmtree(temp_model_util_dir) # Copy os.mkdir(temp_model_util_dir) os.system(f'touch {temp_model_util_dir}/__init__.py') for p in utils_paths:
def main():
    """Train the regression model inside an Azure ML run and publish it.

    Parses ``--build_id``, ``--model_name`` and ``--step_output`` from the
    command line, reads the ``training_data`` input dataset of the current
    run, splits it 80/20, trains via ``train_model`` and writes the fitted
    model both to the step output folder and to ``outputs/``. The run (and
    its parent) are tagged with the build id and, when ``BUILDURI_BASE`` is
    set in the environment, the build URI.

    Raises:
        Exception: if the ``training_data`` input dataset is empty/falsy.
    """
    print("Running train.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--build_id",
        type=str,
        help="The build ID of the build triggering this pipeline run",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="sklearn_regression_model.pkl",
    )
    parser.add_argument(
        "--step_output",
        type=str,
        help=("output for passing data to next step")
    )
    args = parser.parse_args()

    print("Argument [build_id]: %s" % args.build_id)
    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)

    model_name = args.model_name
    build_id = args.build_id
    step_output_path = args.step_output

    print("Getting training parameters")
    alpha = 0.5
    print("Parameter alpha: %s" % alpha)

    run = Run.get_context()

    # Get the dataset
    dataset = run.input_datasets['training_data']
    if (dataset):
        df = dataset.to_pandas_dataframe()
        # The label column must not be part of the feature matrix; leaving it
        # in leaks the target into training and changes the feature count
        # the model expects at scoring time.
        X = df.drop(columns=['Y']).values
        y = df.Y
    else:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    data = {"train": {"X": X_train, "y": y_train},
            "test": {"X": X_test, "y": y_test}}

    reg = train_model(run, data, alpha)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    joblib.dump(value=reg, filename=model_output_path)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=reg, filename=output_path)

    # Add properties to identify this specific training run
    run.parent.tag("BuildId", value=build_id)
    run.tag("BuildId", value=build_id)
    run.tag("run_type", value="train")
    builduri_base = os.environ.get("BUILDURI_BASE")
    if (builduri_base is not None):
        build_uri = builduri_base + build_id
        run.tag("BuildUri", value=build_uri)
        run.parent.tag("BuildUri", value=build_uri)
    print(f"tags now present for run: {run.tags}")

    run.complete()
def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
    """Materialise AutoML fit inputs from a dataprep JSON description.

    ``dataprep_json`` is parsed as JSON. If it contains an ``activities`` key
    it is treated as serialized dataflows; otherwise it is treated as a set
    of read options (``datastoreName``, ``dataPath`` and ``label`` are
    mandatory) from which a CSV dataflow is built.

    Args:
        dataprep_json: JSON string, either serialized dataflows or options.
        automl_settings_obj: settings object; only ``task_type`` is read here.
        logger: logger used for progress and error messages.

    Returns:
        dict of fit-iteration parameters (``X``, ``y`` and, depending on the
        branch, validation data, sample weights, CV split indices or raw
        column names).

    Raises:
        RuntimeError: for any failure while resolving the dataflows; the
            original exception is attached as ``__cause__`` and has been
            tagged with an ``error_type`` before being logged.
        Exception: whatever importing the dataprep utilities raises.
    """
    current_run = Run.get_submitted_run()
    parent_run_id = _get_parent_run_id(current_run._run_id)

    print("[ParentRunId:{}]: Start getting data using dataprep.".format(
        parent_run_id))
    logger.info(
        "[ParentRunId:{}]: Start getting data using dataprep.".format(
            parent_run_id))

    try:
        import azureml.train.automl._dataprep_utilities as dataprep_utilities
    except Exception as e:
        e.error_type = ErrorTypes.Unclassified
        log_traceback(e, logger)
        logger.error(e)
        # Bare raise keeps the original traceback intact.
        raise

    fit_iteration_parameters_dict = dict()

    class RetrieveNumpyArrayError(Exception):
        """Marker raised when the label column cannot be retrieved."""

        def __init__(self):
            super().__init__()

    try:
        print("Resolving Dataflows...")
        logger.info("Resolving Dataflows...")
        dataprep_json_obj = json.loads(dataprep_json)
        if 'activities' in dataprep_json_obj:
            # json is serialized dataflows
            dataflow_dict = dataprep_utilities.load_dataflows_from_json(
                dataprep_json)
            for k in [
                    'X', 'X_valid', 'sample_weight', 'sample_weight_valid'
            ]:
                fit_iteration_parameters_dict[
                    k] = dataprep_utilities.try_retrieve_pandas_dataframe(
                        dataflow_dict.get(k))
            for k in ['y', 'y_valid']:
                try:
                    fit_iteration_parameters_dict[
                        k] = dataprep_utilities.try_retrieve_numpy_array(
                            dataflow_dict.get(k))
                except IndexError:
                    raise RetrieveNumpyArrayError()

            # Collect consecutively numbered CV split dataflows, if any.
            cv_splits_dataflows = []
            i = 0
            while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                cv_splits_dataflows.append(
                    dataflow_dict['cv_splits_indices_{0}'.format(i)])
                i = i + 1
            fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
        else:
            # json is dataprep options
            print('Creating Dataflow from options...\r\nOptions:')
            logger.info('Creating Dataflow from options...')
            print(dataprep_json_obj)
            datastore_name = dataprep_json_obj[
                'datastoreName']  # mandatory
            data_path = dataprep_json_obj['dataPath']  # mandatory
            label_column = dataprep_json_obj['label']  # mandatory
            separator = dataprep_json_obj.get('columnSeparator', ',')
            header = dataprep_json_obj.get('promoteHeader', True)
            encoding = dataprep_json_obj.get('encoding', None)
            quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
            skip_rows = dataprep_json_obj.get('skipRows', 0)
            feature_columns = dataprep_json_obj.get('features', [])

            from azureml.core import Datastore
            import azureml.dataprep as dprep
            if header:
                header = dprep.PromoteHeadersMode.CONSTANTGROUPED
            else:
                header = dprep.PromoteHeadersMode.NONE
            try:
                encoding = dprep.FileEncoding[encoding]
            except Exception:
                # Unknown or missing encoding name: fall back to UTF-8.
                # (Not a bare except, so Ctrl-C / SystemExit still propagate.)
                encoding = dprep.FileEncoding.UTF8

            ws = Run.get_context().experiment.workspace
            datastore = Datastore(ws, datastore_name)
            dflow = dprep.read_csv(path=datastore.path(data_path),
                                   separator=separator,
                                   header=header,
                                   encoding=encoding,
                                   quoting=quoting,
                                   skip_rows=skip_rows)

            if len(feature_columns) == 0:
                X = dflow.drop_columns(label_column)
            else:
                X = dflow.keep_columns(feature_columns)

            print('Inferring types for feature columns...')
            logger.info('Inferring types for feature columns...')
            sct = X.builders.set_column_types()
            sct.learn()
            sct.ambiguous_date_conversions_drop()
            X = sct.to_dataflow()

            y = dflow.keep_columns(label_column)
            if automl_settings_obj.task_type.lower() == 'regression':
                y = y.to_number(label_column)

            print('X:')
            print(X)
            logger.info('X:')
            logger.info(X)

            print('y:')
            print(y)
            logger.info('y:')
            logger.info(y)

            try:
                from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                _X = try_retrieve_pandas_dataframe_adb(X)
                fit_iteration_parameters_dict['X'] = _X.values
                fit_iteration_parameters_dict[
                    'x_raw_column_names'] = _X.columns.values
            except ImportError:
                logger.info(
                    "SDK version does not support column names extraction, fallback to old path"
                )
                fit_iteration_parameters_dict[
                    'X'] = dataprep_utilities.try_retrieve_pandas_dataframe(
                        X)

            try:
                fit_iteration_parameters_dict[
                    'y'] = dataprep_utilities.try_retrieve_numpy_array(y)
            except IndexError:
                raise RetrieveNumpyArrayError()

        logger.info("Finish getting data using dataprep.")
        return fit_iteration_parameters_dict
    except Exception as e:
        print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".
              format(parent_run_id, e.__class__, e))
        logger.error(
            "[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".
            format(parent_run_id, e.__class__, e))
        # Classify the failure so it can be attributed to user vs. client.
        if isinstance(e, RetrieveNumpyArrayError):
            logger.debug("Label column (y) does not exist in user's data.")
            e.error_type = ErrorTypes.User
        elif "The provided path is not valid." in str(e):
            logger.debug("User's data is not accessible from remote run.")
            e.error_type = ErrorTypes.User
        elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(
                e):
            logger.debug(
                "User should use Datastore to data that requires secrets.")
            e.error_type = ErrorTypes.User
        else:
            e.error_type = ErrorTypes.Client
        log_traceback(e, logger)
        # Chain the cause so the root error is visible in the traceback.
        raise RuntimeError("Error during extracting Dataflows") from e
def main():
    """Entry point of the training step (train_aml.py).

    Reads the CLI arguments, loads the ``training`` section of
    ``parameters.json``, resolves (or registers) the dataset, trains and
    evaluates the model, and writes the model file to both the step output
    folder and ``outputs/``. Parameters and metrics are logged to the current
    run and to its parent run.

    Raises:
        Exception: if ``--dataset_name`` is missing or empty.
    """
    print("Running train_aml.py")

    arg_parser = argparse.ArgumentParser("train")
    arg_parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="porto_seguro_safe_driver_model.pkl",
    )
    arg_parser.add_argument(
        "--step_output",
        type=str,
        help=("output for passing data to next step"),
        default="outputs",
    )
    arg_parser.add_argument(
        "--dataset_version",
        type=str,
        help=("dataset version"),
        default=1,
    )
    arg_parser.add_argument(
        "--data_file_path",
        type=str,
        help=("data file path, if specified,\
               a new version of the dataset will be registered"),
    )
    arg_parser.add_argument(
        "--caller_run_id",
        type=str,
        help=("caller run id, for example ADF pipeline run id"),
    )
    arg_parser.add_argument(
        "--dataset_name",
        type=str,
        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"),
    )
    args = arg_parser.parse_args()

    print(f"Argument [model_name]: {args.model_name}")
    print(f"Argument [step_output]: {args.step_output}")
    print(f"Argument [dataset_version]: {args.dataset_version}")
    print(f"Argument [data_file_path]: {args.data_file_path}")
    print(f"Argument [caller_run_id]: {args.caller_run_id}")
    print(f"Argument [dataset_name]: {args.dataset_name}")

    run = Run.get_context()

    def log_everywhere(values):
        # Mirror every value on the step run and on the parent pipeline run.
        for name, value in values.items():
            run.log(name, value)
            run.parent.log(name, value)

    print("Getting training parameters")

    # Training parameters live under the "training" key of parameters.json.
    with open("parameters.json") as params_file:
        all_params = json.load(params_file)
    try:
        train_args = all_params["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    print(f"Parameters: {train_args}")
    log_everywhere(train_args)

    # A dataset name is mandatory from here on.
    if not args.dataset_name:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    workspace = run.experiment.workspace
    if args.data_file_path == 'none':
        # Fetch the requested version of an already registered dataset.
        dataset = Dataset.get_by_name(workspace,
                                      args.dataset_name,
                                      args.dataset_version)
    else:
        # A data file was given: register it as a (new version of the) dataset.
        dataset = register_dataset(workspace,
                                   args.dataset_name,
                                   os.environ.get("DATASTORE_NAME"),
                                   args.data_file_path)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['train_dataset'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split, train, evaluate
    train_data, valid_data = split_data(dataset.to_pandas_dataframe())
    model = train_model(train_data, valid_data, train_args)
    log_everywhere(get_model_metrics(model, train_data, valid_data))

    # Hand the model to the next step ...
    os.makedirs(args.step_output, exist_ok=True)
    joblib.dump(value=model,
                filename=os.path.join(args.step_output, args.model_name))

    # ... and keep a copy in the run outputs for history.
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(value=model,
                filename=os.path.join('outputs', args.model_name))

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
def main():
    """Train (or only evaluate) the segmentation model and checkpoint it.

    Relies on module-level configuration (``args``, ``experiment_name``,
    ``model_choice``, ``device``, ``dtype``, data loaders, ...). Builds the
    chosen U-Net variant, optionally resumes from
    ``starting_checkpoint_path``, then for every epoch trains, evaluates on
    the validation loader, logs the validation loss/accuracy and saves a
    checkpoint. When ``evaluate_only`` is set, a single validation pass is
    run and the function returns.
    """
    num_classes = 3

    # create checkpoint dir
    out_dir = './outputs' if args.out_dir is None else args.out_dir
    checkpoint_dir = os.path.join(out_dir, experiment_name, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)

    # write logs to ./logs, which AML uploads to Artifact Service and makes available to a TensorBoard instance.
    # also log some metrics through AML's Run object
    run = Run.get_context()
    logger_train = Logger('train', './logs', run)
    logger_val = Logger('val', './logs', run)
    log_sample_img_gt(sample_images_train, sample_images_val, logger_train, logger_val)
    logging.info('Logged ground truth image samples')

    # larger model
    if model_choice == 'unet':
        model = Unet(feature_scale=feature_scale, n_classes=num_classes, is_deconv=True,
                     in_channels=3, is_batchnorm=True)
    # year 2 best solution XD_XD's model, as the baseline model
    elif model_choice == 'unet_baseline':
        model = UnetBaseline(feature_scale=feature_scale, n_classes=num_classes, is_deconv=True,
                             in_channels=3, is_batchnorm=True)
    else:
        sys.exit('Invalid model_choice {}, choose unet_baseline or unet'.format(model_choice))

    model = model.to(device=device, dtype=dtype)  # move the model parameters to CPU/GPU

    criterion = nn.CrossEntropyLoss(weight=loss_weights).to(device=device, dtype=dtype)

    # can also use Nesterov momentum in optim.SGD
    # optimizer = optim.SGD(model.parameters(), lr=learning_rate,
    #                     momentum=0.9, nesterov=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # resume from a checkpoint if provided
    starting_epoch = 0
    best_acc = 0.0

    # Guard against an unset (None/empty) path: os.path.isfile(None) raises.
    if starting_checkpoint_path and os.path.isfile(starting_checkpoint_path):
        logging.info('Loading checkpoint from {0}'.format(starting_checkpoint_path))
        # map_location lets a checkpoint saved on GPU be restored on a
        # CPU-only machine (and vice versa).
        checkpoint = torch.load(starting_checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        starting_epoch = checkpoint['epoch']
        best_acc = checkpoint.get('best_acc', 0.0)
    else:
        logging.info('No valid checkpoint is provided. Start to train from scratch...')
        model.apply(weights_init)

    if evaluate_only:
        val_loss, val_acc = evaluate(loader_val, model, criterion)
        print('Evaluated on val set, loss is {}, accuracy is {}'.format(val_loss, val_acc))
        return

    step = starting_epoch * len(dset_train)

    for epoch in range(starting_epoch, num_epochs):
        logging.info('Epoch {} of {}'.format(epoch, num_epochs))

        # train for one epoch
        step = train(loader_train, model, criterion, optimizer, epoch, step, logger_train)

        # evaluate on val set
        logging.info('Evaluating model on the val set at the end of epoch {}...'.format(epoch))
        val_loss, val_acc = evaluate(loader_val, model, criterion)
        # Three placeholders, three values: previously `step` was passed as
        # an extra argument and shifted the loss/accuracy by one slot.
        logging.info('\nEpoch {}, val loss is {}, val accuracy is {}\n'.format(epoch, val_loss, val_acc))
        logger_val.scalar_summary('val_loss', val_loss, step + 1)
        logger_val.scalar_summary('val_acc', val_acc, step + 1)
        # TODO log the val images too

        # record the best accuracy; save checkpoint for every epoch
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)

        checkpoint_path = os.path.join(checkpoint_dir,
                                       'checkpoint_epoch{}_{}.pth.tar'.format(
                                           epoch, strftime("%Y-%m-%d-%H-%M-%S", localtime())))
        logging.info(
            'Saving to checkoutpoint file at {}. Is it the highest accuracy checkpoint so far: {}'.format(
                checkpoint_path, str(is_best)))
        save_checkpoint({
            'epoch': epoch + 1,  # saved checkpoints are numbered starting from 1
            'arch': model_choice,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_acc': best_acc
        }, is_best, checkpoint_path, checkpoint_dir)
def main():
    """Entry point of the COVID19Articles training step.

    Resolves the training dataset (existing registered dataset, a newly
    registered one from ``--data_file_path``, or generated sample data),
    trains a model with ``max_depth=5``, logs its metrics to the run and
    saves it as ``outputs/COVID19Articles_model.pkl``.

    An omitted ``--data_file_path`` (``None``) is treated the same as an
    empty string, i.e. "no data file specified".
    """
    print("Running train_aml.py")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="COVID19Articles_model_github.pkl",
    )
    parser.add_argument(
        "--step_output",
        type=str,
        help=("output for passing data to next step")
    )
    parser.add_argument(
        "--dataset_version",
        type=str,
        help=("dataset version")
    )
    parser.add_argument(
        "--data_file_path",
        type=str,
        help=("data file path, if specified, "
              "a new version of the dataset will be registered")
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        help=("Dataset name. Dataset must be passed by name "
              "to always get the desired dataset version "
              "rather than the one used while the pipeline creation")
    )
    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    datastore_name = os.environ.get("DATASTORE_NAME")
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name
    # NOTE(review): --model_name and --step_output are parsed and echoed but
    # not used below; the model is always written under a fixed file name.

    run = Run.get_context()
    workspace = run.experiment.workspace

    # Get the dataset. `not data_file_path` covers both "" and the None that
    # argparse yields when the option is omitted.
    if dataset_name:
        if not data_file_path:
            if dataset_name in Dataset.get_all(workspace).keys():
                dataset = Dataset.get_by_name(workspace,
                                              dataset_name,
                                              version=dataset_version)
            else:
                create_sample_data_csv(workspace, datastore_name)
                dataset = register_dataset(workspace,
                                           dataset_name,
                                           datastore_name)
        else:
            dataset = register_dataset(workspace,
                                       dataset_name,
                                       datastore_name,
                                       data_file_path)
    else:
        if not data_file_path:
            data_file_path = "COVID19Articles.csv"
            create_sample_data_csv(workspace, datastore_name)
        dataset_name = "COVID19Articles_Training_githubactions"
        dataset = register_dataset(workspace,
                                   dataset_name,
                                   datastore_name,
                                   data_file_path)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    class_args = {"max_depth": 5}
    # Train the model
    model = train_model(data, class_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)

    # files saved in the "outputs" folder are automatically uploaded into run history
    model_file_name = "COVID19Articles_model.pkl"
    # Make sure the folder exists so the dump also works outside of AML.
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, os.path.join('outputs', model_file_name))

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
def run_train_from_args(
        args,
        hyperdrive_hyperparameter_overrides: Dict[str, str] = None) -> None:
    """Train a model on three parallel datasets and optionally test it.

    Seeds the random generators, loads a dataset/model pair for
    ``args.data_path`` and for the sibling ``test2`` / ``test3`` folders,
    trains (unless ``args.load_trained_model`` is given) and, when
    ``args.run_test`` is set, evaluates on the TEST fold of all three
    datasets and reports the metric to NNI and (if enabled) Azure ML.

    Args:
        args: parsed command-line namespace.
        hyperdrive_hyperparameter_overrides: optional overrides; ``None``
            (the default) means no overrides.

    Raises:
        ValueError: if building a dataset/model fails; the error is printed
            and re-raised instead of continuing with undefined variables.
    """
    # A fresh dict per call avoids the shared-mutable-default pitfall.
    if hyperdrive_hyperparameter_overrides is None:
        hyperdrive_hyperparameter_overrides = {}

    # Get the housekeeping going and start logging:
    os.makedirs(args.save_dir, exist_ok=True)
    run_id = make_run_id(args.model, args.task)
    log_file = os.path.join(args.save_dir, f"{run_id}.log")

    def log(msg):
        log_line(log_file, msg)

    log(f"Setting random seed {args.random_seed}.")
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.random.set_seed(args.random_seed)

    data_root = os.path.split(args.data_path)[0]
    data_path = RichPath.create(args.data_path, args.azure_info)
    # second / third input paths live next to the primary one
    data_path_2 = RichPath.create(data_root + '/test2', args.azure_info)
    data_path_3 = RichPath.create(data_root + '/test3', args.azure_info)

    def load_model_and_dataset(path):
        # Same configuration for every input; only the data path differs.
        return get_model_and_dataset(
            msg_passing_implementation=args.model,
            task_name=args.task,
            data_path=path,
            trained_model_file=args.load_saved_model,
            cli_data_hyperparameter_overrides=args.data_param_override,
            cli_model_hyperparameter_overrides=args.model_param_override,
            hyperdrive_hyperparameter_overrides=hyperdrive_hyperparameter_overrides,
            folds_to_load={DataFold.TRAIN, DataFold.VALIDATION},
            load_weights_only=args.load_weights_only,
            case_name=args.case,  # add by zjq
        )

    try:
        dataset, model = load_model_and_dataset(data_path)
        # Only the datasets of the extra inputs are used; their models are
        # discarded.
        dataset2, _ = load_model_and_dataset(data_path_2)
        dataset3, _ = load_model_and_dataset(data_path_3)
    except ValueError as err:
        print(err.args)
        # Nothing below can work without the datasets, so surface the real
        # error rather than failing later on an unbound name.
        raise

    log(f"Dataset parameters: {json.dumps(unwrap_tf_tracked_data(dataset._params))}"
        )
    log(f"Model parameters: {json.dumps(unwrap_tf_tracked_data(model._params))}"
        )

    if args.azureml_logging:
        from azureml.core.run import Run

        aml_run = Run.get_context()
    else:
        aml_run = None

    if not args.load_trained_model:
        trained_model_path = train(
            model,
            dataset,
            dataset2,
            dataset3,
            log_fun=log,
            run_id=run_id,
            max_epochs=args.max_epochs,
            patience=args.patience,
            save_dir=args.save_dir,
            quiet=args.quiet,
            aml_run=aml_run,
        )
    else:
        trained_model_path = args.load_trained_model

    if args.run_test:
        log("== Running on test dataset")
        log(f"Loading data from {data_path}.")
        dataset.load_data(data_path, {DataFold.TEST})
        dataset2.load_data(data_path_2, {DataFold.TEST})
        dataset3.load_data(data_path_3, {DataFold.TEST})
        log(f"Restoring best model state from {trained_model_path}.")
        load_weights_verbosely(trained_model_path, model)
        test_data_1 = dataset.get_tensorflow_dataset(DataFold.TEST)
        test_data_2 = dataset2.get_tensorflow_dataset(DataFold.TEST)
        test_data_3 = dataset3.get_tensorflow_dataset(DataFold.TEST)
        _, _, test_results = model.run_one_epoch_new(test_data_1,
                                                     test_data_2,
                                                     test_data_3,
                                                     training=False,
                                                     quiet=args.quiet)
        test_metric, test_metric_string = model.compute_epoch_metrics(
            test_results)
        log(test_metric_string)
        # The reported value is the last space-separated token of the
        # metric string.
        nni.report_final_result(float(test_metric_string.split(" ")[-1]))
        if aml_run is not None:
            aml_run.log("task_test_metric", float(test_metric))
def main(unused_argv):
    """Run one task of the distributed MNIST training job.

    The cluster layout and this task's role are read from the ``TF_CONFIG``
    environment variable (a ``master`` task is treated as a ``worker``).
    Worker 0 downloads MNIST and writes a sentinel file; every other task
    waits for that file before reading the data. Parameter servers join the
    in-process server; workers build the two-layer network, train until the
    global step reaches ``FLAGS.train_steps`` and print the validation cross
    entropy, which worker 0 also logs to the Azure ML run.

    Args:
        unused_argv: ignored.

    Raises:
        ValueError: if ``TF_CONFIG`` is missing or empty.
    """
    data_root = os.path.join("outputs", "MNIST")
    mnist = None
    tf_config = os.environ.get("TF_CONFIG")
    if not tf_config:
        raise ValueError("TF_CONFIG not found.")
    tf_config_json = json.loads(tf_config)
    cluster = tf_config_json.get('cluster')
    job_name = tf_config_json.get('task', {}).get('type')
    task_index = tf_config_json.get('task', {}).get('index')
    job_name = "worker" if job_name == "master" else job_name
    sentinel_path = os.path.join(data_root, "complete.txt")

    if job_name == "worker" and task_index == 0:
        mnist = input_data.read_data_sets(data_root, one_hot=True)
        with open(sentinel_path, 'w+') as f:
            f.write("download complete")
    else:
        # Wait until worker 0 has finished downloading the data.
        while not os.path.exists(sentinel_path):
            time.sleep(0.01)
        mnist = input_data.read_data_sets(data_root, one_hot=True)

    if FLAGS.download_only:
        sys.exit(0)

    print("job name = %s" % job_name)
    print("task index = %d" % task_index)
    print("number of GPUs = %d" % FLAGS.num_gpus)

    # Construct the cluster and start the server
    cluster_spec = tf.train.ClusterSpec(cluster)

    # Get the number of workers.
    num_workers = len(cluster_spec.task_indices("worker"))

    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(
            cluster_spec, job_name=job_name, task_index=task_index)
        if job_name == "ps":
            server.join()

    is_chief = (task_index == 0)
    if FLAGS.num_gpus > 0:
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (task_index % FLAGS.num_gpus)
        worker_device = "/job:worker/task:%d/gpu:%d" % (task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, cpu)
    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    with tf.device(
            tf.train.replica_device_setter(
                worker_device=worker_device,
                ps_device="/job:ps/cpu:0",
                cluster=cluster)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Variables of the hidden layer
        hid_w = tf.Variable(
            tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
            name="hid_w")
        hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")

        # Variables of the softmax layer
        sm_w = tf.Variable(
            tf.truncated_normal(
                [FLAGS.hidden_units, 10],
                stddev=1.0 / math.sqrt(FLAGS.hidden_units)),
            name="sm_w")
        sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

        # Ops: located on the worker specified with task_index
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
        y_ = tf.placeholder(tf.float32, [None, 10])

        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
        hid = tf.nn.relu(hid_lin)

        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
        cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate

            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")

        train_step = opt.minimize(cross_entropy, global_step=global_step)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op

            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp()

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                recovery_wait_secs=1,
                global_step=global_step)

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps",
                            "/job:worker/task:%d" % task_index])

        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  task_index)

        if FLAGS.existing_servers:
            # Connect to this worker's own address from the cluster spec.
            # (Concatenating the integer task index raised TypeError and was
            # not an address in the first place.)
            server_grpc_url = "grpc://" + cluster_spec.task_address(
                "worker", task_index)
            print("Using existing server at: %s" % server_grpc_url)

            sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

        print("Worker %d: Session initialization complete." % task_index)

        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        while True:
            # Training feed
            batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
            train_feed = {x: batch_xs, y_: batch_ys}

            _, step = sess.run([train_step, global_step], feed_dict=train_feed)
            local_step += 1

            now = time.time()
            print("%f: Worker %d: training step %d done (global step: %d)" %
                  (now, task_index, local_step, step))

            if step >= FLAGS.train_steps:
                break

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)

        # Validation feed
        val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        val_xent = sess.run(cross_entropy, feed_dict=val_feed)
        print("After %d training step(s), validation cross entropy = %g" %
              (FLAGS.train_steps, val_xent))
        if job_name == "worker" and task_index == 0:
            run = Run.get_context()
            run.log("CrossEntropy", val_xent)
def main(unused_argv):
    """Run one task of the distributed MNIST training job.

    The cluster layout and this task's role are read from the ``TF_CONFIG``
    environment variable (a ``master`` task is treated as a ``worker``).
    Worker 0 downloads MNIST and writes a sentinel file; every other task
    waits for that file before reading the data. Parameter servers join the
    in-process server; workers build the two-layer network, train until the
    global step reaches ``FLAGS.train_steps`` and print the validation cross
    entropy, which worker 0 also logs to the Azure ML run.

    Args:
        unused_argv: ignored.

    Raises:
        ValueError: if ``TF_CONFIG`` is missing or empty.
    """
    data_root = os.path.join("outputs", "MNIST")
    mnist = None
    tf_config = os.environ.get("TF_CONFIG")
    if not tf_config:
        raise ValueError("TF_CONFIG not found.")
    tf_config_json = json.loads(tf_config)
    cluster = tf_config_json.get('cluster')
    job_name = tf_config_json.get('task', {}).get('type')
    task_index = tf_config_json.get('task', {}).get('index')
    job_name = "worker" if job_name == "master" else job_name
    sentinel_path = os.path.join(data_root, "complete.txt")

    if job_name == "worker" and task_index == 0:
        mnist = input_data.read_data_sets(data_root, one_hot=True)
        with open(sentinel_path, 'w+') as f:
            f.write("download complete")
    else:
        # Wait until worker 0 has finished downloading the data.
        while not os.path.exists(sentinel_path):
            time.sleep(0.01)
        mnist = input_data.read_data_sets(data_root, one_hot=True)

    if FLAGS.download_only:
        sys.exit(0)

    print("job name = %s" % job_name)
    print("task index = %d" % task_index)
    print("number of GPUs = %d" % FLAGS.num_gpus)

    # Construct the cluster and start the server
    cluster_spec = tf.train.ClusterSpec(cluster)

    # Get the number of workers.
    num_workers = len(cluster_spec.task_indices("worker"))

    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(cluster_spec,
                                 job_name=job_name,
                                 task_index=task_index)
        if job_name == "ps":
            server.join()

    is_chief = (task_index == 0)
    if FLAGS.num_gpus > 0:
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (task_index % FLAGS.num_gpus)
        worker_device = "/job:worker/task:%d/gpu:%d" % (task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, cpu)
    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    with tf.device(
            tf.train.replica_device_setter(worker_device=worker_device,
                                           ps_device="/job:ps/cpu:0",
                                           cluster=cluster)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Variables of the hidden layer
        hid_w = tf.Variable(tf.truncated_normal(
            [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
            stddev=1.0 / IMAGE_PIXELS),
                            name="hid_w")
        hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")

        # Variables of the softmax layer
        sm_w = tf.Variable(tf.truncated_normal([FLAGS.hidden_units, 10],
                                               stddev=1.0 /
                                               math.sqrt(FLAGS.hidden_units)),
                           name="sm_w")
        sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

        # Ops: located on the worker specified with task_index
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
        y_ = tf.placeholder(tf.float32, [None, 10])

        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
        hid = tf.nn.relu(hid_lin)

        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
        cross_entropy = -tf.reduce_sum(
            y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate

            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")

        train_step = opt.minimize(cross_entropy, global_step=global_step)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op

            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp()

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=train_dir,
                                     init_op=init_op,
                                     recovery_wait_secs=1,
                                     global_step=global_step)

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps",
                            "/job:worker/task:%d" % task_index])

        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  task_index)

        if FLAGS.existing_servers:
            # Connect to this worker's own address from the cluster spec.
            # (Concatenating the integer task index raised TypeError and was
            # not an address in the first place.)
            server_grpc_url = "grpc://" + cluster_spec.task_address(
                "worker", task_index)
            print("Using existing server at: %s" % server_grpc_url)

            sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                                  config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)

        print("Worker %d: Session initialization complete." % task_index)

        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        while True:
            # Training feed
            batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
            train_feed = {x: batch_xs, y_: batch_ys}

            _, step = sess.run([train_step, global_step],
                               feed_dict=train_feed)
            local_step += 1

            now = time.time()
            print("%f: Worker %d: training step %d done (global step: %d)" %
                  (now, task_index, local_step, step))

            if step >= FLAGS.train_steps:
                break

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)

        # Validation feed
        val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        val_xent = sess.run(cross_entropy, feed_dict=val_feed)
        print("After %d training step(s), validation cross entropy = %g" %
              (FLAGS.train_steps, val_xent))
        if job_name == "worker" and task_index == 0:
            run = Run.get_context()
            run.log("CrossEntropy", val_xent)
def main(root_dir: str, model_info_dir: str, val_dir: str, output_dir: str, labels: str) -> None:
    """
    Evaluate a trained Form Recognizer custom model and log its detection rates.

    Reads the model description from ``model.json`` in the model info
    directory, runs ``get_detection_rates`` over every file found in the
    validation directory and logs the resulting rates on the parent run.

    Parameters
    ----------
    root_dir: str
        Root datastore being used; the three directories below are joined
        onto it
    model_info_dir: str
        Directory containing trained custom model information
    val_dir: str
        Directory containing test images
    output_dir: str
        Path to save outputs to
    labels: str
        Comma-separated labels or fields to extract results from
    """
    log.info("Evaluation step")

    # get context of current run
    run = Run.get_context()

    # set form recognizer credentials
    form_credentials = {
        "key": run.get_secret("formkey"),
        "endpoint": run.get_secret("formendpoint")
    }

    # process labels string to array
    labels = [label.strip() for label in labels.split(",")]

    model_info_dir = join(root_dir, model_info_dir)
    val_dir = join(root_dir, val_dir)
    output_dir = join(root_dir, output_dir)

    # read in model information
    log.info("Compile model information")
    model_fname = "model.json"
    with open(join(model_info_dir, model_fname), "r") as model_info_file:
        model_info = json.load(model_info_file)
    log.info(model_info)

    # Build the frame directly from full image paths. Naming the column
    # explicitly keeps "image" present even when val_dir holds no files;
    # a frame built from an empty list of dicts has no columns at all and
    # the later column lookup would raise KeyError.
    image_paths = [join(val_dir, file_name) for file_name in os.listdir(val_dir)]
    image_df = pd.DataFrame({"image": image_paths})

    log.info("Evaluate Form Recognizer Model")
    detection_rates = get_detection_rates(form_credentials=form_credentials,
                                          model_id=model_info["modelId"],
                                          image_df=image_df,
                                          output_dir=output_dir,
                                          labels=labels)

    # Log metrics
    for metric_info in detection_rates:
        # extract and log detection rate for each object per video
        scene_rate = metric_info["scene_detection_rate"]
        take_rate = metric_info["take_detection_rate"]
        run.parent.log(name="scene_detection_rate", value=scene_rate)
        run.parent.log(name="take_detection_rate", value=take_rate)

    log.info("Finished model evaluation")
def main():
    """Train the regression model, upload it to the parent run and tag the runs.

    Command-line arguments:
        --build_id: build ID of the build triggering this pipeline run.
        --model_name: file name the fitted model is dumped to.
        --dataset_name: registered dataset to train on; when omitted the
            scikit-learn diabetes data is used instead.

    ``alpha`` is read from ``config.json`` (``training.alpha``) and falls back
    to 0.5 when that key is missing.
    """
    print("Running train.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--build_id",
        type=str,
        help="The build ID of the build triggering this pipeline run",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="sklearn_regression_model.pkl",
    )
    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset with the training data"))
    args = parser.parse_args()

    print("Argument [build_id]: %s" % args.build_id)
    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    build_id = args.build_id
    dataset_name = args.dataset_name

    print("Getting training parameters")
    with open("config.json") as f:
        pars = json.load(f)
    try:
        alpha = pars["training"]["alpha"]
    except KeyError:
        alpha = 0.5
    print("Parameter alpha: %s" % alpha)

    run = Run.get_context()
    ws = run.experiment.workspace

    if dataset_name:
        dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)
        df = dataset.to_pandas_dataframe()
        y = df.Y
        # The target column must not be part of the feature matrix,
        # otherwise the model is trained on the label it has to predict.
        X = df.drop("Y", axis=1).values
    else:
        X, y = load_diabetes(return_X_y=True)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)
    data = {
        "train": {"X": X_train, "y": y_train},
        "test": {"X": X_test, "y": y_test},
    }

    reg = train_model(run, data, alpha)

    joblib.dump(value=reg, filename=model_name)

    # upload model file explicitly into artifacts for parent run
    run.parent.upload_file(name="./outputs/" + model_name,
                           path_or_stream=model_name)
    print("Uploaded the model {} to experiment {}".format(
        model_name, run.experiment.name))
    dirpath = os.getcwd()
    print(dirpath)
    print("Following files are uploaded ")
    print(run.parent.get_file_names())

    run.parent.tag("BuildId", value=build_id)

    # Add properties to identify this specific training run
    run.tag("BuildId", value=build_id)
    run.tag("run_type", value="train")
    builduri_base = os.environ.get("BUILDURI_BASE")
    if builduri_base is not None:
        build_uri = builduri_base + build_id
        run.tag("BuildUri", value=build_uri)
        run.parent.tag("BuildUri", value=build_uri)
    print(f"tags now present for run: {run.tags}")

    run.complete()
from __future__ import division from __future__ import print_function import sys import os import shutil import argparse import math import tensorflow as tf from azureml.core.run import Run ##### Modified # Get run when running in remote ##### Modified if 'run' not in locals(): ##### Modified run = Run.get_context() ##### Modified FLAGS = None batch_size = 100 # # define functions for Estimator # def _my_input_fn(filepath, num_epochs): # image - 784 (=28 x 28) elements of grey-scaled integer value [0, 1] # label - digit (0, 1, ..., 9) data_queue = tf.train.string_input_producer( [filepath], num_epochs=num_epochs ) # data is repeated and it raises OutOfRange when data is over
def run(args):
    """Train a model, evaluate it on the hold-out set and save the results.

    The path attributes of ``args`` (``label_encoder``, ``all_imgs_csv``,
    ``val_imgs_csv``, ``test_imgs_csv``, ``results_dir``) are rewritten in
    place so that they are rooted at ``args.data_root_dir``.

    Side effects visible in this function: tags and metrics are logged to the
    Azure ML run, a label encoder is fitted and pickled when none exists at
    ``args.label_encoder``, the prediction CSV is written to both the
    per-run results directory and ``outputs``, and the model weights are
    saved in the results directory.

    Returns:
        A tuple ``(run_metrics_df, all_predictions_df)``: the logged metrics
        as a frame with columns ``val_imgs_csv``, ``dataset``, ``name`` and
        ``value``, and the concatenated predictions of all evaluations.
    """
    if args.supress_warnings:
        warnings.simplefilter("ignore")

    def adjust_path(p):
        return os.path.join(args.data_root_dir, p)

    args.label_encoder = adjust_path(args.label_encoder)
    args.all_imgs_csv = adjust_path(args.all_imgs_csv)
    args.val_imgs_csv = adjust_path(args.val_imgs_csv)
    args.test_imgs_csv = adjust_path(args.test_imgs_csv)
    args.results_dir = adjust_path(args.results_dir)

    print(args)

    from multihead_trainer import train
    from multihead_trainer import torch_transform

    # TODO: consolidate logid
    def build_logid_string(args, add_timestamp=True):
        param_str = "lr{}_dr{}_lrpatience{}_lrfactor{}_{}".format(
            args.init_lr, args.dropout, args.lr_patience, args.lr_factor,
            args.appearance_network)
        if add_timestamp:
            param_str += "_" + datetime.datetime.now().strftime("%Y%m%d%H%M")
        return param_str

    param_str = build_logid_string(args)

    # Azure ML
    from azureml.core.run import Run
    run = Run.get_context()

    # log arguments if it's not called by train_cv
    if not hasattr(args, 'folds_csv_dir'):
        for k, v in vars(args).items():
            run.tag(k, str(v))

    save_path = os.path.join(args.results_dir, param_str)
    os.makedirs(save_path, exist_ok=True)
    print("save_path", save_path)

    logger.info(
        f"cuda.is_available={torch.cuda.is_available()}, n_gpu={torch.cuda.device_count()}"
    )

    # encode the classes
    from sklearn.preprocessing import LabelEncoder
    import pickle
    if not os.path.exists(args.label_encoder):
        logger.warning(f"Fitting a new label encoder at {args.label_encoder}")

        all_imgs_df = pd.read_csv(args.all_imgs_csv)
        label_encoder = LabelEncoder()
        label_encoder.fit(all_imgs_df['label'])

        # Write through a context manager so the handle is closed (and the
        # pickle fully flushed) before anything else reads the file.
        with open(args.label_encoder, "wb") as pickle_file:
            pickle.dump(label_encoder, pickle_file)
    else:
        logger.info(f"Loading label encoder: {args.label_encoder}")

        # NOTE: unpickling executes code from the file; only point
        # args.label_encoder at trusted locations.
        with open(args.label_encoder, 'rb') as pickle_file:
            label_encoder = pickle.load(pickle_file)

    logger.info(f"label_encoder.classes_={label_encoder.classes_}")
    logger.info("The label encoder has {} classes.".format(
        len(label_encoder.classes_)))

    # Load image list
    all_images_df = pd.read_csv(args.all_imgs_csv)
    val_df = pd.read_csv(args.val_imgs_csv)
    test_df = pd.read_csv(args.test_imgs_csv)

    for df in [all_images_df, val_df, test_df]:
        df['image_path'] = df['image_path'].apply(
            lambda x: os.path.join(args.data_root_dir, args.img_dir, x))

    # Everything that is neither a validation nor a test image is training data.
    val_test_image_paths = list(val_df['image_path'].values) + list(
        test_df['image_path'].values)
    train_df = all_images_df[~all_images_df['image_path'].
                             isin(val_test_image_paths)]

    ref_only_df = train_df[train_df['is_ref']]
    cons_train_df = train_df[train_df['is_ref'] == False]
    cons_val_df = val_df

    print("all_images", len(all_images_df), "train", len(train_df), "val",
          len(val_df), "test", len(test_df))
    run.log("all_images_size", len(all_images_df))
    run.log("train_size", len(train_df))
    run.log("val_size", len(val_df))
    run.log("test_size", len(test_df))

    print("ref_only_df", len(ref_only_df), "cons_train_df",
          len(cons_train_df), "cons_val_df", len(cons_val_df))

    import classif_utils
    classif_utils.ClassificationDataset.set_datadir(
        os.path.join(args.data_root_dir, args.img_dir))

    def plot_pr_curve(plt, dataset_name):
        run.log_image(name='{}_{}_{}'.format(
            dataset_name,
            datetime.datetime.now().strftime("%H:%M:%S"), 'PR-curve'),
                      plot=plt)
        plt.close()

    def log_metrics(metrics_results, dataset_name):
        from metrics import create_prec_inds_str
        import matplotlib
        matplotlib.use('Agg')  #backend that doesn't display to the user
        import matplotlib.pyplot as plt
        import matplotlib.image as mpimg

        run_metrics = []
        for k, v in metrics_results.items():
            if ('p_indices' in k) and not ('sanity' in dataset_name):
                pind_str = create_prec_inds_str(v, label_encoder)

                run.log("{}_{}".format(dataset_name, k), pind_str)
                run_metrics.append([
                    os.path.split(args.val_imgs_csv)[1], dataset_name, k,
                    pind_str
                ])
            elif isinstance(v, (int, float)):
                run.log("{}_{}".format(dataset_name, k), v)
                run_metrics.append(
                    [os.path.split(args.val_imgs_csv)[1], dataset_name, k, v])

        return run_metrics

    #if da_train, models is actually a dictionary with F1, F2 and G
    model, val_metrics = train(ref_only_df,
                               cons_train_df,
                               cons_val_df,
                               label_encoder,
                               torch_transform,
                               'label',
                               args.batch_size,
                               len(label_encoder.classes_),
                               args,
                               args.max_epochs,
                               results_dir=save_path,
                               add_perspective=args.add_persp_aug)
    print('completed train()')
    print('val_metrics', val_metrics)

    run_metrics_list = log_metrics(val_metrics, 'val')
    predictions_dfs_list = []

    from sanitytest_eval import create_eval_dataloaders

    evaluator = MetricEmbeddingEvaluator(
        model,
        args.metric_simul_sidepairs_eval,
        sidepairs_agg_method=args.sidepairs_agg,
        metric_evaluator_type=args.metric_evaluator_type)

    logit_evaluator = LogitEvaluator(model,
                                     args.metric_simul_sidepairs_eval,
                                     sidepairs_agg_method=args.sidepairs_agg)

    #figures out label column for sanity test
    def get_labelcol_eval(de_imgs_df):
        #figuring out if it is a pilltype_id or label_prod_code encoder
        #to set the label column of the sanity test set
        labels_df = pd.DataFrame({'label': label_encoder.classes_})
        img_df = pd.merge(de_imgs_df,
                          labels_df,
                          left_on=['label_prod_code'],
                          right_on=['label'],
                          how='inner')

        if len(img_df) > 1:
            labelcol = 'label_prod_code'
        else:
            labelcol = 'pilltype_id'
        print('Selecting {} for sanity test label'.format(labelcol))

        return de_imgs_df[labelcol]

    def test_model(de_imgs_df,
                   evaluator,
                   dataset_name,
                   run_metrics_list,
                   predictions_dfs_list,
                   rotate_aug=None):
        if rotate_aug is not None:
            dataset_name += "_rotate_aug{}".format(rotate_aug)

        print("Evaluating", dataset_name)
        eval_dataloader, eval_dataset = create_eval_dataloaders(
            de_imgs_df,
            label_encoder,
            torch_transform,
            'label',
            24,
            rotate_aug=rotate_aug)

        ref_dataloader, _ = create_eval_dataloaders(ref_only_df,
                                                    label_encoder,
                                                    torch_transform,
                                                    'label',
                                                    24,
                                                    rotate_aug=rotate_aug)
        dataloader = {'ref': ref_dataloader, 'eval': eval_dataloader}

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        print("Eval {}: {} images from {} total images".format(
            dataset_name, len(eval_dataset), len(de_imgs_df)))

        metrics_results, predictions = evaluator.eval_model(
            device, dataloader, do_pr_metrics=True, add_single_side_eval=True)

        plot_pr_curve(metrics_results['PR-curve'], dataset_name)

        # Both lists are extended in place so the caller sees the additions.
        run_metrics_list += log_metrics(metrics_results, dataset_name)

        predictions['dataset'] = dataset_name
        predictions['val_imgs_csv'] = os.path.split(args.val_imgs_csv)[1]
        predictions_dfs_list.append(predictions)

        return metrics_results, predictions

    test_model(test_df, logit_evaluator, 'holdout-logit', run_metrics_list,
               predictions_dfs_list)
    test_model(test_df, evaluator, 'holdout', run_metrics_list,
               predictions_dfs_list)

    run_metrics_df = pd.DataFrame(
        run_metrics_list, columns=['val_imgs_csv', 'dataset', 'name', 'value'])
    all_predictions_df = pd.concat(predictions_dfs_list, ignore_index=True)

    # make sure to save both
    for target_save_dir in [save_path, 'outputs']:
        print(f'saving predictions {target_save_dir}')
        # TODO: this csv can be large. Update the format for the numpy array of prediction scores.
        os.makedirs(target_save_dir, exist_ok=True)
        all_predictions_df.to_csv(
            os.path.join(
                target_save_dir, 'eval_predictions_{}'.format(
                    os.path.basename(args.val_imgs_csv))))

    torch.save(
        model.state_dict(),
        os.path.join(save_path,
                     '{}.pth'.format(os.path.basename(args.val_imgs_csv))))

    return run_metrics_df, all_predictions_df
def load_and_clean(dataset_name): dataframe = load_data(dataset_name) dataframe = extract_features(dataframe) features, labels = clean_data(dataframe) return features, labels try: # Get workspace if run locally ws = Workspace.from_config() except: # Get workspace if run remotely ws = Run.get_context().experiment.workspace # Run run = Run.get_context() # Load and clean data features_train, labels_train = load_and_clean('energy-forecast-data-training') features_val, labels_val = load_and_clean('energy-forecast-data-validation') def main(): # Add arguments to script parser = argparse.ArgumentParser() parser.add_argument('--n_estimators', type=int,
test_size=0.2, random_state=42) vectorizer = CountVectorizer() vectorizer.fit(x_train) X_train = vectorizer.transform(x_train) X_test = vectorizer.transform(x_test) test_data = X_test[1, :] test_data_array = test_data.toarray() test_data_list = test_data_array.tolist() print("len test_data_list", len(test_data_list)) print("len test_data_list 0", len(test_data_list[0])) with open("test_data.txt", "w") as fp: json.dump(test_data_list, fp) run = Run.get_context(allow_offline=True) def main(): # Add arguments to script parser = argparse.ArgumentParser() parser.add_argument( "--C", type=float, default=1.0, help= "Inverse of regularization strength. Smaller values cause stronger regularization" ) parser.add_argument("--max_iter", type=int,
def _log_metric_bar_chart(run, output_eval_dir, fig_num, title, type_names,
                          values, color, value_fmt, log_name, file_name,
                          y_limits=None):
    """Draw one per-entity-type bar chart, log it to the run and save it."""
    fig = plt.figure(fig_num)
    plt.title(title)
    plt.bar(range(len(type_names)), values, tick_label=type_names, fc=color)
    # Print each bar's value just above the bar.
    for x, y in zip(range(len(type_names)), values):
        plt.text(x, y, value_fmt % y, ha='center', va='bottom')
    if y_limits is not None:
        plt.ylim(y_limits)
    plt.ylabel(title)
    plt.xlabel('Name Entity Type')
    run.log_image(log_name, plot=fig)
    fig.savefig(os.path.join(output_eval_dir, file_name))


def plot(y_true, y_pred, output_eval_dir):
    """Plot F1, precision, recall and support per entity type.

    The metrics come from ``get_metrics(y_true, y_pred)``. Each chart is
    logged to the current run under ``metrics/...`` and written as a PNG
    into ``output_eval_dir``.
    """
    run = Run.get_context()

    # # Confusion matrix
    # skplt.metrics.plot_confusion_matrix(convert_sentence_to_token(y_true),
    #                                     convert_sentence_to_token(y_pred), normalize=True)
    # run.log_image("metrics/confusion_matrix", plot=plt)
    # plt.savefig(os.path.join(output_eval_dir, 'confusion_matrix.png'))
    # # plt.show()

    # Metric
    df_metrics = get_metrics(y_true, y_pred)
    type_name_list = df_metrics['type_name'].tolist()
    ps = df_metrics['precision'].tolist()
    rs = df_metrics['recall'].tolist()
    f1s = df_metrics['f1-score'].tolist()
    s = df_metrics['support'].tolist()

    # (figure number, title, values, bar colour, value format, log name,
    #  file name, y-axis limits). The three ratio charts share a fixed
    # 0..1.1 axis; the count chart keeps matplotlib's automatic limits.
    chart_specs = [
        (2, 'F1-Score', f1s, 'b', "%0.4f", "metrics/f1_score",
         'f1_score.png', [0, 1.1]),
        (3, 'Precision', ps, 'y', "%0.4f", "metrics/precision",
         'precision.png', [0, 1.1]),
        (4, 'Recall', rs, 'y', "%0.4f", "metrics/recall",
         'recall.png', [0, 1.1]),
        (5, 'AllTrueInstanceCnt', s, 'g', "%d", "metrics/ground_truth",
         'ground_truth.png', None),
    ]
    for (fig_num, title, values, color, value_fmt, log_name, file_name,
         y_limits) in chart_specs:
        _log_metric_bar_chart(run, output_eval_dir, fig_num, title,
                              type_name_list, values, color, value_fmt,
                              log_name, file_name, y_limits=y_limits)
def main():
    """Train the sales model on a registered dataset and save it.

    Command-line arguments:
        --model_name: file name of the saved model.
        --step_output: directory used to pass the model to the next step.
        --dataset_version: version requested from ``Dataset.get_by_name``.
        --data_file_path: when not the literal string ``none``, the dataset
            is registered from this path instead of being looked up.
        --caller_run_id: only echoed to stdout here.
        --dataset_name: name of the dataset; required.

    Training parameters are read from ``parameters.json`` (key ``training``)
    and logged on both the step run and its parent. The loaded frame is also
    published as the module-level ``original_df``.

    Raises:
        Exception: when no dataset name is supplied.
    """
    global original_df

    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="sales_model.h5",
    )
    parser.add_argument("--step_output",
                        type=str,
                        help=("output for passing data to next step"))
    parser.add_argument("--dataset_version",
                        type=str,
                        help=("dataset version"))
    parser.add_argument(
        "--data_file_path",
        type=str,
        help=("data file path, if specified, "
              "a new version of the dataset will be registered"))
    parser.add_argument(
        "--caller_run_id",
        type=str,
        help=("caller run id, for example ADF pipeline run id"))
    parser.add_argument(
        "--dataset_name",
        type=str,
        help=("Dataset name. Dataset must be passed by name "
              "to always get the desired dataset version "
              "rather than the one used while the pipeline creation"))
    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Get the dataset
    if not dataset_name:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)
    if data_file_path == 'none':
        dataset = Dataset.get_by_name(run.experiment.workspace,
                                      dataset_name,
                                      dataset_version)
    else:
        dataset = register_dataset(run.experiment.workspace,
                                   dataset_name,
                                   os.environ.get("DATASTORE_NAME"),
                                   data_file_path)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    original_df = dataset.to_pandas_dataframe()
    (train, test) = tts(original_df)
    (X_train, y_train, X_test, y_test, scaler_object) = \
        scale_data(train, test)

    # Train the model exactly once: a second training run would double the
    # cost and leave the saved model different from the one the metrics
    # below were collected for.
    model = lstm_model(train, test)

    # Log the metrics for the model
    metrics = get_model_metrics()
    print(metrics)
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    keras.models.save_model(model, model_output_path)
    print("Saved model in model_output_path")

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    keras.models.save_model(model, output_path)
    print("Model saved")

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
def display_instances(image, out_folder, out_filename, boxes, masks, class_ids, class_names,
                      scores=None, title="",
                      figsize=(16, 16), ax=None,
                      show_mask=True, show_bbox=True,
                      colors=None, captions=None):
    """
    Draw detected instances on an image, log the figure to the current run
    as "prediction/<out_filename>" and save it as <out_filename>.jpg inside
    out_folder.

    image: the image to draw on
    out_folder: directory the rendered figure is written to
    out_filename: base name (without extension) of the saved figure
    boxes: [num_instance, (y1, x1, y2, x2, class_id)] in image coordinates.
    masks: [height, width, num_instances]
    class_ids: [num_instances]
    class_names: list of class names of the dataset
    scores: (optional) confidence scores for each box
    title: (optional) Figure title
    figsize: (optional) the size of the image
    ax: (optional) matplotlib axes to draw on; a new figure is created and
        closed again when omitted
    show_mask, show_bbox: To show masks and bounding boxes or not
    colors: (optional) An array or colors to use with each object
    captions: (optional) A list of strings to use as captions for each object
    """
    run = Run.get_context()

    # Number of instances
    N = boxes.shape[0]
    if not N:
        print("\n*** No instances to display *** \n")
    else:
        assert boxes.shape[0] == masks.shape[-1] == class_ids.shape[0]

    # If no axis is passed, create one. Remember that the figure is ours so
    # it can be released at the end; otherwise every call leaves another
    # open figure behind.
    owns_figure = ax is None
    if owns_figure:
        _, ax = plt.subplots(1, figsize=figsize)

    # Generate random colors
    colors = colors or random_colors(N)

    # Show area outside image boundaries.
    height, width = image.shape[:2]
    ax.set_ylim(height + 10, -10)
    ax.set_xlim(-10, width + 10)
    ax.axis('off')
    ax.set_title(title)

    masked_image = image.astype(np.uint32).copy()
    for i in range(N):
        color = colors[i]

        # Bounding box
        if not np.any(boxes[i]):
            # Skip this instance. Has no bbox. Likely lost in image cropping.
            continue
        y1, x1, y2, x2 = boxes[i]
        if show_bbox:
            p = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2,
                                  alpha=0.7, linestyle="dashed",
                                  edgecolor=color, facecolor='none')
            ax.add_patch(p)

        # Label
        if not captions:
            class_id = class_ids[i]
            score = scores[i] if scores is not None else None
            label = class_names[class_id]
            caption = "{} {:.3f}".format(label, score) if score else label
        else:
            caption = captions[i]
        ax.text(x1, y1 + 8, caption,
                color='w', size=11, backgroundcolor="none")

        # Mask
        mask = masks[:, :, i]
        if show_mask:
            masked_image = apply_mask(masked_image, mask, color)

        # Mask Polygon
        # Pad to ensure proper polygons for masks that touch image edges.
        padded_mask = np.zeros(
            (mask.shape[0] + 2, mask.shape[1] + 2), dtype=np.uint8)
        padded_mask[1:-1, 1:-1] = mask
        contours = find_contours(padded_mask, 0.5)
        for verts in contours:
            # Subtract the padding and flip (y, x) to (x, y)
            verts = np.fliplr(verts) - 1
            p = Polygon(verts, facecolor="none", edgecolor=color)
            ax.add_patch(p)

    ax.imshow(masked_image.astype(np.uint8))

    fig = ax.get_figure()
    run.log_image("prediction/" + out_filename, plot=fig)
    fig.savefig(os.path.join(out_folder, '{}.jpg'.format(out_filename)))

    # A caller-supplied axes stays open: the caller owns that figure.
    if owns_figure:
        plt.close(fig)
def main(
    training_data_path=None,
    validation_data_path=None,
    use_gpu=False,
    save_filepath=None,
    model="resnet50",
    epochs=_EPOCHS,
    batch_size=_BATCHSIZE,
    fp16_allreduce=False,
    base_lr=0.0125,
    warmup_epochs=5,
):
    """Train an image classifier, optionally distributed with Horovod.

    Args:
        training_data_path: image-folder root of the training set; fake data
            is used when None.
        validation_data_path: image-folder root of the validation set;
            validation is skipped when None or when fake training data is
            in use.
        use_gpu: run on CUDA and move the model to the GPU.
        save_filepath: when given, passed to ``save_checkpoint``.
        model: key looked up in ``models.__dict__`` to build the network.
        epochs: number of training epochs.
        batch_size: batch size for both data loaders.
        fp16_allreduce: select fp16 Horovod compression when distributed.
        base_lr: forwarded to ``train``.
        warmup_epochs: forwarded to ``train``.
    """
    logger = logging.getLogger(__name__)

    device = torch.device("cuda" if use_gpu else "cpu")
    logger.info(f"Running on {device}")
    if _DISTRIBUTED:
        # Horovod: initialize Horovod.
        logger.info("Running Distributed")
        torch.manual_seed(_SEED)
        if use_gpu:
            # Horovod: pin GPU to local rank.
            torch.cuda.set_device(hvd.local_rank())
            torch.cuda.manual_seed(_SEED)

    logger.info("PyTorch version {}".format(torch.__version__))

    # Only the first worker (or the single process when not distributed)
    # talks to the run and writes TensorBoard logs.
    is_logging_worker = not _DISTRIBUTED or hvd.rank() == 0

    # Horovod: write TensorBoard logs on first worker.
    if is_logging_worker:
        run = Run.get_context()
        run.tag("model", value=model)

        logs_dir = os.path.join(os.curdir, "logs")
        if os.path.exists(logs_dir):
            logger.debug(f"Log directory {logs_dir} found | Deleting")
            shutil.rmtree(logs_dir)
        summary_writer = SummaryWriter(logdir=logs_dir)

    # Stays None unless a real validation set is loaded below, so later code
    # never hands a missing dataset to the sampler or the DataLoader.
    validation_dataset = None
    if training_data_path is None:
        logger.info("Setting up fake loaders")
        train_dataset = FakeData(n_classes=1000,
                                 data_transform=torch.FloatTensor)
    else:
        normalize = transforms.Normalize(_RGB_MEAN, _RGB_SD)

        logger.info("Setting up loaders")
        logger.info(f"Loading training from {training_data_path}")
        train_dataset = datasets.ImageFolder(
            training_data_path,
            transforms.Compose([
                transforms.RandomResizedCrop(_WIDTH),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]),
        )

        if validation_data_path is not None:
            logger.info(f"Loading validation from {validation_data_path}")
            validation_dataset = datasets.ImageFolder(
                validation_data_path,
                transforms.Compose([
                    transforms.Resize(256),
                    transforms.CenterCrop(224),
                    transforms.ToTensor(),
                    normalize,
                ]),
            )

    run_validation = validation_dataset is not None

    train_sampler = _get_sampler(train_dataset)
    kwargs = {"num_workers": 5, "pin_memory": True}
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    if run_validation:
        val_sampler = _get_sampler(validation_dataset)
        val_loader = torch.utils.data.DataLoader(validation_dataset,
                                                 batch_size=batch_size,
                                                 sampler=val_sampler,
                                                 **kwargs)

    # Autotune
    cudnn.benchmark = True

    logger.info("Loading model")

    # Load symbol; from here on `model` is the network, not its name.
    model = models.__dict__[model](pretrained=False)

    if use_gpu:
        # Move model to GPU.
        model.cuda()

    num_gpus = hvd.size() if _DISTRIBUTED else 1
    # Horovod: scale learning rate by the number of GPUs.
    optimizer = optim.SGD(model.parameters(), lr=_LR * num_gpus, momentum=0.9)
    if _DISTRIBUTED:
        compression = hvd.Compression.fp16 if fp16_allreduce else hvd.Compression.none

        # Horovod: wrap optimizer with DistributedOptimizer.
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=model.named_parameters(),
            compression=compression,
        )

        # Horovod: broadcast parameters & optimizer state.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    criterion = F.cross_entropy

    # Main training-loop
    # NOTE(review): the nesting of this loop was reconstructed from a
    # flattened source; confirm that metric logging belongs inside the timer
    # and that the checkpoint is meant to be written every epoch.
    logger.info("Training ...")
    for epoch in range(epochs):
        with Timer(output=logger.info, prefix=f"Training epoch {epoch} ") as t:
            model.train()
            if _DISTRIBUTED:
                train_sampler.set_epoch(epoch)
            metrics = train(train_loader, model, criterion, optimizer,
                            base_lr, warmup_epochs, epoch)

            if is_logging_worker:
                run.log_row("Training metrics", epoch=epoch, **metrics)
                summary_writer.add_scalar("Train/Loss", metrics["loss"], epoch)
                summary_writer.add_scalar("Train/Acc", metrics["acc"], epoch)
                summary_writer.add_scalar("Train/BatchTime",
                                          metrics["batch_time"], epoch)

        if run_validation:
            model.eval()
            metrics = validate(val_loader, model, criterion, device)
            if is_logging_worker:
                run.log_row("Validation metrics", epoch=epoch, **metrics)
                summary_writer.add_scalar("Validation/Loss", metrics["loss"],
                                          epoch)
                summary_writer.add_scalar("Validation/Acc", metrics["acc"],
                                          epoch)

        if save_filepath is not None:
            save_checkpoint(model, optimizer, save_filepath)

    # NOTE(review): `t` is the timer of the last epoch only, while the sample
    # count covers all epochs - confirm this is the intended summary.
    _log_summary(epochs * len(train_dataset), t.elapsed, batch_size)
def main():
    """Run the network-flow IoT classification tutorial end to end.

    Reads the flow log (``conn.log``) and the MAC-label CSV from
    ``--data_dir`` into cuDF, merges the labels onto originator and
    responder, explodes each flow into one row per participant, aggregates
    into one-hour bins, trains an XGBoost ``multi:softmax`` model with the
    ``gpu_hist`` tree method, and logs the split sizes, the test error rate
    and the runtime to the current run.
    """
    parser = argparse.ArgumentParser("RAPIDS_DBSCAN")
    parser.add_argument("--data_dir", type=str, help="Location of data")
    # added for notebook execution scenarios
    parser.add_argument('-f', type=str, default='')

    args = parser.parse_args()
    data_dir = args.data_dir

    run = Run.get_context()

    # specify the location of the data files
    DATA_PATH = data_dir

    # the sample PCAP file used for explanation
    DATA_PCAP = DATA_PATH + "/small_sample.pcap"

    # the flow connection log (conn.log) file
    DATA_SOURCE = DATA_PATH + "/conn.log"

    # the data label file (matches IP addresses with MAC addresses)
    DATA_LABELS = DATA_PATH + "/lab_mac_labels_cats.csv"

    print("Running NETWORK FLOW on GPU...")
    t1 = datetime.now()

    # ### Background

    # ##### Types of Network Data
    # The most detailed type of data that is typically collected on a network
    # is full Packet CAPture (PCAP) data. This information is detailed and
    # contains everything about the communication, including: source address,
    # destination address, protocols used, bytes transferred, and even the raw
    # data (e.g., image, audio file, executable). PCAP data is fine-grained,
    # meaning that there is a record for each frame being transmitted. A
    # typical communication is composed of many individual packets/frames.
    #
    # If we aggregate PCAP data so that there is one row of data per
    # communication session, we call that flow level data. A simplified
    # example of this relationship is shown in the figure below.
    #
    # ![PCAP_flow_relationship](images/pcap_vs_flow.png "PCAP vs FLOW")
    #
    # For this tutorial, we use data from the University of New South Wales.
    # In a lab environment, they [collected nearly three weeks of IoT data
    # from 21 IoT devices](http://149.171.189.1). They also kept a detailed
    # [list of devices by MAC address](http://149.171.189.1/resources/List_Of_Devices.txt),
    # so we have ground-truth with respect to each IoT device's behavior on
    # the network.
    #
    # **Our goal is to utilize the behavior exhibited in the network data to
    # classify IoT devices.**

    # ##### The Internet of Things and Data at a Massive Scale
    # Gartner estimates there are currently over 8.4 billion Internet of
    # Things (IoT) devices. By 2020, that number is [estimated to surpass 20
    # billion](https://www.zdnet.com/article/iot-devices-will-outnumber-the-worlds-population-this-year-for-the-first-time/).
    # These types of devices range from consumer devices (e.g., Amazon Echo,
    # smart TVs, smart cameras, door bells) to commercial devices (e.g.,
    # building automation systems, keycard entry). All of these devices
    # exhibit behavior on the Internet as they communicate back with their
    # own clouds and user-specified integrations.

    # ### Data Investigation

    # Let's first see some of the data. We'll load a PCAP file in using Scapy.
    # If you don't want to or can't install Scapy, feel free to skip this
    # section.
    cap = rdpcap(DATA_PCAP)

    # get the frames
    eth_frame = cap[3]

    # `show()` already prints the layer-by-layer view itself and returns
    # None, so wrapping it in print() only appended a stray "None" line.
    eth_frame.show()

    # There's really a lot of features there. In addition to having multiple
    # layers (which may differ between packets), there are a number of other
    # issues with working directly with PCAP. Often the payload (the `Raw`
    # section above) is encrypted, rendering it useless. The lack of
    # aggregation also makes it difficult to differentiate between packets.
    # What we really care about for this application is what a *session*
    # looks like. In other words, how a Roku interacts with the network is
    # likely quite different than how a Google Home interacts.
    #
    # To save time for the tutorial, all three weeks of PCAP data have already
    # been transformed to flow data, and we can load that in to a typical
    # Pandas dataframe. Due to how the data was created, we have a header row
    # (with column names) as well as a footer row. We've already removed
    # those rows, so nothing to do here.
    #
    # For this application, we used [Zeek](https://www.zeek.org) (formerly
    # known as Bro) to construct the flow data. To include MAC addresses in
    # the conn log, we used the [mac-logging.zeek
    # script](https://github.com/bro/bro/blob/master/scripts/policy/protocols/conn/mac-logging.zeek).
    #
    # If you've skipped installing Scapy, you can pick up here.

    # pdf = pd.read_csv(DATA_SOURCE, sep=\'\t')
    # print("==> pdf shape: ", pdf.shape)

    # We can look at what this new aggregated data looks like, and get a
    # better sense of the columns and their data types. Let's do this the way
    # we're familiar with, using Pandas.

    # print(pdf.head())
    # pdf.dtypes

    # That's Pandas, and we could continue the analysis there if we wanted.
    # But what about [cuDF](https://github.com/rapidsai/cudf)? Let's pivot to
    # that for the majority of this tutorial.
    #
    # One thing cuDF needs is for us to specify the data types. We'll write a
    # function to make this easier. As of version 0.6, [strings are supported
    # in cuDF](https://rapidsai.github.io/projects/cudf/en/latest/10min.html?highlight=string#String-Methods).
    # We'll make use of that here.
    def get_dtypes(fn, delim, floats, strings):
        """Map each header column of file ``fn`` to a dtype name.

        Columns whose name contains 'date' become 'date', columns listed in
        ``floats`` become 'float64', columns listed in ``strings`` become
        'str', and everything else becomes 'int64'. Column order follows the
        header line.
        """
        with open(fn, errors='replace') as fp:
            header = fp.readline().strip()

        types = []
        for col in header.split(delim):
            if 'date' in col:
                types.append((col, 'date'))
            elif col in floats:
                types.append((col, 'float64'))
            elif col in strings:
                types.append((col, 'str'))
            else:
                types.append((col, 'int64'))

        return OrderedDict(types)

    dtypes_data_processed = get_dtypes(DATA_SOURCE,
                                       '\t',
                                       floats=['ts', 'duration'],
                                       strings=[
                                           'uid', 'id.orig_h', 'id.resp_h',
                                           'proto', 'service', 'conn_state',
                                           'local_orig', 'local_resp',
                                           'history', 'tunnel_parents',
                                           'orig_l2_addr', 'resp_l2_addr'
                                       ])

    raw_cdf = cd.io.csv.read_csv(DATA_SOURCE,
                                 delimiter='\t',
                                 names=list(dtypes_data_processed),
                                 dtype=list(dtypes_data_processed.values()),
                                 skiprows=1)

    # Those data types seem right. Let's see what this data looks like now
    # that it's in cuDF.

    # ### Adding ground truth labels back to the data

    # We'll need some labels for our classification task, so we've already
    # prepared a file with those labels.
    dtypes_labels_processed = get_dtypes(
        DATA_LABELS,
        ',',
        floats=[],
        strings=['device', 'mac', 'connection', 'category'])

    labels_cdf = cd.io.csv.read_csv(DATA_LABELS,
                                    delimiter=',',
                                    names=list(dtypes_labels_processed),
                                    dtype=list(
                                        dtypes_labels_processed.values()),
                                    skiprows=1)

    print('Labels...')
    print(labels_cdf.head())

    # We now perform a series of merges to add the ground truth data (device
    # name, connection, category, and categoryID) back to the dataset. Since
    # each row of netflow has two participants, we'll have to do this twice -
    # once for the originator (source) and once for the responder
    # (destination).
    labels_cdf.columns = [
        'orig_device', 'orig_l2_addr', 'orig_connection', 'orig_category',
        'orig_category_id'
    ]
    merged_cdf = cd.merge(raw_cdf, labels_cdf, how='left', on='orig_l2_addr')

    labels_cdf.columns = [
        'resp_device', 'resp_l2_addr', 'resp_connection', 'resp_category',
        'resp_category_id'
    ]
    merged_cdf = cd.merge(merged_cdf, labels_cdf, how='left')

    # Restore the neutral column names on the labels frame.
    labels_cdf.columns = [
        'device', 'mac', 'connection', 'category', 'category_id'
    ]

    # Let's just look at our new dataset to make sure everything's okay.
    print('Merged...')
    print(merged_cdf.head())

    # ### Exploding the Netflow Data into Originator and Responder Rows

    # We now have netflow that has one row per (sessionized) communication
    # between an originator and responder. However, in order to classify an
    # individual device, we need to explode data. Instead of one row that
    # contains both originator and responder, we'll explode to one row for
    # originator information (orig_bytes, orig_pkts, orig_ip_bytes) and one
    # for responder information (resp_bytes, resp_pkts, resp_ip_bytes).
    #
    # The easiest way to do this is to create two new dataframes, rename all
    # of the columns, then `concat` them back together. Just for sanity,
    # we'll also check the new shape of our exploded data frame.
    orig_comms_cdf = merged_cdf[[
        'ts', 'id.orig_h', 'id.orig_p', 'proto', 'service', 'duration',
        'orig_bytes', 'orig_pkts', 'orig_ip_bytes', 'orig_device',
        'orig_l2_addr', 'orig_category', 'orig_category_id'
    ]]
    orig_comms_cdf.columns = [
        'ts', 'ip', 'port', 'proto', 'service', 'duration', 'bytes', 'pkts',
        'ip_bytes', 'device', 'mac', 'category', 'category_id'
    ]

    resp_comms_cdf = merged_cdf[[
        'ts', 'id.resp_h', 'id.resp_p', 'proto', 'service', 'duration',
        'resp_bytes', 'resp_pkts', 'resp_ip_bytes', 'resp_device',
        'resp_l2_addr', 'resp_category', 'resp_category_id'
    ]]
    resp_comms_cdf.columns = [
        'ts', 'ip', 'port', 'proto', 'service', 'duration', 'bytes', 'pkts',
        'ip_bytes', 'device', 'mac', 'category', 'category_id'
    ]

    exploded_cdf = cd.multi.concat([orig_comms_cdf, resp_comms_cdf])
    print("==> shape (original) =", merged_cdf.shape)
    print("==> shape =", exploded_cdf.shape)

    num_categories = labels_cdf['category_id'].unique().shape[0]
    print("==> number of IoT categories =", num_categories)

    # We currently need to remove null values before we proceed. Although
    # `dropna` doesn't exist in cuDF yet, we can use a workaround to get us
    # there. Also, due to what's available currently, we can't have any nulls
    # in any place in the DF.
    print('Check if any missing...')
    for col in exploded_cdf.columns:
        print(col, exploded_cdf[col].null_count)

    # -999 is a sentinel for "no category"; those rows are filtered out below.
    exploded_cdf['category_id'] = exploded_cdf['category_id'].fillna(-999)
    exploded_cdf['device'] = exploded_cdf['device'].str.fillna("none")
    exploded_cdf['category'] = exploded_cdf['category'].str.fillna("none")

    print('After missing observations imputation...')
    for col in exploded_cdf.columns:
        print(col, exploded_cdf[col].null_count)

    # Looks like all the null values are gone, so now we can proceed. If an IP
    # doesn't have a category ID, we can't use it. So we'll filter those out.
    exploded_cdf = exploded_cdf[exploded_cdf['category_id'] != -999]

    # ### Binning the Data and Aggregating the Features
    #
    # But wait, there's still more data wrangling to be done! While we've
    # exploded the flows into rows for orig/resp, we may want to bin the data
    # further by time. The rationale is that any single communication may not
    # be an accurate representation of how a device typically reacts in its
    # environment. Imagine the simple case of how a streaming camera
    # typically operates (most of its data will be uploaded from the device
    # to a destination) versus how it operates during a firmware update (most
    # of the data will be pushed down to the device, after which a brief
    # disruption in connectivity will occur).
    #
    # There's a lot of different time binning we could do. It also would be
    # useful to investigate what the average duration of connection is
    # relative to how many connections per time across various time
    # granularities. With that said, we'll just choose a time bin of 1 hour
    # to begin with. In order to bin, we'll use the following formula:
    #
    # $$\text{hour_time_bin}=\left\lfloor{\frac{ts}{60*60}}\right\rfloor$$
    exploded_cdf['hour_time_bin'] = exploded_cdf['ts'].applymap(
        lambda x: math.floor(x / (60 * 60))).astype(int)

    # We also have to make a choice about how we'll aggregate the binned
    # data. One of the simplest ways is to sum the bytes and packets. There
    # are really two choices for bytes, `bytes` and `ip_bytes`. With Bro,
    # `bytes` is taken from the TCP sequence numbers and is potentially
    # inaccurate, so we select `ip_bytes` instead for both originator and
    # responder. We'll also use the sum of the number of packets.
    one_hour_time_bin_cdf = (exploded_cdf[[
        'bytes', 'pkts', 'ip_bytes', 'mac', 'category_id', 'hour_time_bin'
    ]].groupby(['mac', 'category_id', 'hour_time_bin']).agg({
        'category_id': 'min',
        'bytes': 'sum',
        'pkts': 'sum',
        'ip_bytes': 'sum'
    })[['min_category_id', 'sum_bytes', 'sum_pkts', 'sum_ip_bytes']])

    one_hour_time_bin_cdf.columns = [
        'category_id', 'bytes', 'pkts', 'ip_bytes'
    ]

    # ### Creating the Training and Testing Datasets

    # We'll take a traditional 70/30 train/test split, and we'll randomly
    # sample into a train and test data frame.
    # NOTE(review): the mask is drawn without a fixed seed, so the split
    # differs from run to run.
    cdf_msk = np.random.rand(len(one_hour_time_bin_cdf)) < 0.7
    train_cdf = one_hour_time_bin_cdf[cdf_msk]
    test_cdf = one_hour_time_bin_cdf[~cdf_msk]

    print("==> train length =", len(train_cdf))
    print("==> test length =", len(test_cdf))

    run.log('Train length', len(train_cdf))
    run.log('Test length', len(test_cdf))

    # Prepare the training input (`train_X`), training target (`train_Y`),
    # test input (`test_X`) and test target (`test_Y`) datasets.
    train_X = train_cdf[['pkts', 'ip_bytes']]
    train_Y = train_cdf[['category_id']]

    test_X = test_cdf[['pkts', 'ip_bytes']]
    test_Y = test_cdf[['category_id']]

    # ### Configure XGBoost

    # We choose a classification algorithm that utilizes the GPU -
    # [XGBoost](https://xgboost.readthedocs.io/en/latest/). The package
    # provides support for gradient boosted trees and can leverage
    # distributed GPU compute environments.

    # Getting data into a format for XGBoost is really easy. Just make a
    # `DMatrix` for both training and testing.
    xg_train = xgb.DMatrix(train_X, label=train_Y)
    xg_test = xgb.DMatrix(test_X, label=test_Y)

    # Like any good ML package, there's quite a few parameters to set. We're
    # going to start with the softmax objective function. This will let us
    # get a predicted category out of our model. We'll also set other
    # parameters like the maximum depth and number of threads. You can read
    # more about the parameters
    # [here](https://xgboost.readthedocs.io/en/latest/parameter.html).
    # Experiment with them!
    param = {}
    param['objective'] = 'multi:softmax'
    param['eta'] = 0.1
    param['max_depth'] = 8
    param['silent'] = 1
    param['nthread'] = 4
    param['num_class'] = num_categories
    param['max_features'] = 'auto'
    param['n_gpus'] = 1
    param['tree_method'] = 'gpu_hist'

    # XGBoost allows us to define a watchlist so that we can keep track of
    # performance as the algorithm trains. We'll configure a simple watchlist
    # that is watching `xg_train` and `xg_test` error rates.
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 20

    # ### Training our First XGBoost Model

    # Now it's time to train
    bst = xgb.train(param, xg_train, num_round, watchlist)

    # Prediction is also easy (and fast).
    pred = bst.predict(xg_test)

    # We might want to get a sense of how our model is by calculating the
    # error rate.
    pred_cdf = cd.from_pandas(pd.DataFrame(pred, columns=['pred']))
    pred_cdf.add_column('category_id', test_Y['category_id'])
    error_rate = (pred_cdf[pred_cdf['pred'] != pred_cdf['category_id']]
                  ['pred'].count()) / test_Y.shape[0]
    run.log('Error rate', error_rate)

    t2 = datetime.now()

    # NOTE(review): this logs a `timedelta` object; confirm the run's metric
    # store accepts it, otherwise log `(t2 - t1).total_seconds()` instead.
    run.log('Runtime', t2 - t1)
def run_train_from_args(
        args,
        hyperdrive_hyperparameter_overrides: Dict[str, str] = None) -> None:
    """Train a model as described by ``args`` and optionally test it.

    Args:
        args: Parsed command-line namespace. This function reads
            ``save_dir``, ``model``, ``task``, ``run_name``, ``random_seed``,
            ``data_path``, ``azure_info``, ``load_saved_model``,
            ``data_param_override``, ``model_param_override``,
            ``load_weights_only``, ``azureml_logging``, ``max_epochs``,
            ``patience``, ``quiet`` and ``run_test``.
        hyperdrive_hyperparameter_overrides: Extra hyperparameter overrides
            forwarded to ``get_model_and_dataset``. ``None`` (the default)
            means "no overrides" and is replaced by a fresh empty dict, so
            no dict object is shared between calls.
    """
    if hyperdrive_hyperparameter_overrides is None:
        hyperdrive_hyperparameter_overrides = {}

    # Get the housekeeping going and start logging:
    os.makedirs(args.save_dir, exist_ok=True)
    run_id = make_run_id(args.model, args.task, args.run_name)
    log_file = os.path.join(args.save_dir, f"{run_id}.log")

    def log(msg):
        log_line(log_file, msg)

    # Seed every RNG in play so runs are repeatable for a given seed.
    log(f"Setting random seed {args.random_seed}.")
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.random.set_seed(args.random_seed)

    data_path = RichPath.create(args.data_path, args.azure_info)
    dataset, model = get_model_and_dataset(
        msg_passing_implementation=args.model,
        task_name=args.task,
        data_path=data_path,
        trained_model_file=args.load_saved_model,
        cli_data_hyperparameter_overrides=args.data_param_override,
        cli_model_hyperparameter_overrides=args.model_param_override,
        hyperdrive_hyperparameter_overrides=hyperdrive_hyperparameter_overrides,
        folds_to_load={DataFold.TRAIN, DataFold.VALIDATION},
        load_weights_only=args.load_weights_only,
    )

    log(f"Dataset parameters: {json.dumps(unwrap_tf_tracked_data(dataset._params))}"
        )
    log(f"Model parameters: {json.dumps(unwrap_tf_tracked_data(model._params))}"
        )

    # Import lazily so azureml is only required when logging is requested.
    if args.azureml_logging:
        from azureml.core.run import Run

        aml_run = Run.get_context()
    else:
        aml_run = None

    trained_model_path = train(
        model,
        dataset,
        log_fun=log,
        run_id=run_id,
        max_epochs=args.max_epochs,
        patience=args.patience,
        save_dir=args.save_dir,
        quiet=args.quiet,
        aml_run=aml_run,
    )

    if args.run_test:
        data_path = RichPath.create(args.data_path, args.azure_info)
        log("== Running on test dataset")
        log(f"Loading data from {data_path}.")
        dataset.load_data(data_path, {DataFold.TEST})
        log(f"Restoring best model state from {trained_model_path}.")
        load_weights_verbosely(trained_model_path, model)

        # Test 1: Simply compute same metrics used during training/validation:
        test_data = dataset.get_tensorflow_dataset(DataFold.TEST)
        _, _, test_results = model.run_one_epoch(test_data,
                                                 training=False,
                                                 quiet=args.quiet)
        test_metric, test_metric_string = model.compute_epoch_metrics(
            test_results)
        log(test_metric_string)
        if aml_run is not None:
            aml_run.log("task_test_metric", float(test_metric))

        # Test 2: Try to compute fancier metrics, if implemented:
        try:
            eval_metrics = model.evaluate_model(test_data)
            for metric_name, metric_value in eval_metrics.items():
                log(f"{metric_name:<30}: {metric_value:8.4f}")
                if aml_run is not None:
                    aml_run.log(f"task_test_{metric_name}", metric_value)
        except NotImplementedError:
            pass  # ignore if there are no fancier metrics
def __init__(self, args):
    """Keep ``args``, capture the current run context, then initialise the base class.

    Args:
        args: Stored on ``self.args`` and passed unchanged to the base-class
            ``__init__``.
    """
    self.args = args
    # Run context obtained before the base class is initialised.
    self.run = Run.get_context()
    super(AMLChannel, self).__init__(args)
    # -1 presumably means "no message handled yet" -- confirm against the
    # code that advances this index.
    self.current_message_index = -1
def run(arguments):
    """Train a Graph2Class model, report test accuracy and upload artefacts.

    ``arguments`` is a mapping of command-line options. Keys read here:
    ``--aml``, ``--azure-info``, ``TRAIN_DATA_PATH``, ``VALID_DATA_PATH``,
    ``TEST_DATA_PATH``, ``MODEL_FILENAME``, ``--restore-path``,
    ``--max-num-epochs``, ``--minibatch-size``, ``--amp``, ``--quiet`` and
    ``--sequential-run``.

    With ``--aml`` set, the Azure ML run context is used for per-epoch
    logging, CUDA is required, and the model file plus the log are uploaded
    at the end.
    """
    aml_ctx = None
    if arguments["--aml"]:
        from azureml.core.run import Run

        aml_ctx = Run.get_context()
        assert torch.cuda.is_available(), "No CUDA available. Aborting training."

    log_path = configure_logging(aml_ctx)
    azure_info_path = arguments.get("--azure-info")

    def lazy_folder(path_key, shuffle):
        # Resolve the folder now; defer the actual loading until iteration.
        folder = RichPath.create(arguments[path_key], azure_info_path)
        return LazyDataIterable(lambda: load_from_folder(folder, shuffle=shuffle))

    def epoch_end_hook(split):
        # Build a hook that forwards epoch metrics under the given split name.
        return lambda model, nn, epoch, metrics: log_run(aml_ctx, split, model, epoch, metrics)

    training_data = lazy_folder("TRAIN_DATA_PATH", True)
    validation_data = lazy_folder("VALID_DATA_PATH", False)

    model_path = Path(arguments["MODEL_FILENAME"])
    assert model_path.name.endswith(".pkl.gz"), "MODEL_FILENAME must have a `.pkl.gz` suffix."

    # Either resume from an explicit restore path, resume from an existing
    # model file when running under AML, or start from a fresh model.
    restore_path = arguments.get("--restore-path")
    if restore_path:
        model, nn = Graph2Class.restore_model(Path(restore_path))
        initialize_metadata = False
    elif arguments["--aml"] and model_path.exists():
        model, nn = Graph2Class.restore_model(model_path)
        initialize_metadata = False
    else:
        nn = None
        model = create_graph2class_gnn_model()
        initialize_metadata = True

    def create_optimizer(parameters):
        return torch.optim.Adam(parameters, lr=0.00025)

    trainer = ModelTrainer(
        model,
        model_path,
        max_num_epochs=int(arguments["--max-num-epochs"]),
        minibatch_size=int(arguments["--minibatch-size"]),
        optimizer_creator=create_optimizer,
        clip_gradient_norm=1,
        target_validation_metric="Accuracy",
        target_validation_metric_higher_is_better=True,
        enable_amp=arguments["--amp"],
    )
    if nn is not None:
        # Reuse the restored neural module instead of building a new one.
        trainer.neural_module = nn

    trainer.register_train_epoch_end_hook(epoch_end_hook("train"))
    trainer.register_validation_epoch_end_hook(epoch_end_hook("valid"))

    trainer.train(
        training_data,
        validation_data,
        show_progress_bar=not arguments["--quiet"],
        initialize_metadata=initialize_metadata,
        parallelize=not arguments["--sequential-run"],
        patience=10,
        store_tensorized_data_in_memory=True,
    )

    test_data = lazy_folder("TEST_DATA_PATH", False)
    eval_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    acc = model.report_accuracy(test_data, trainer.neural_module, device=eval_device)
    print(f"Test accuracy: {acc:%}")

    if aml_ctx is None:
        return
    aml_ctx.log("Test Accuracy", acc)
    aml_ctx.upload_file(name="model.pkl.gz", path_or_stream=str(model_path))
    aml_ctx.upload_file(name="full.log", path_or_stream=log_path)
def run_train_from_args(
        args,
        hyperdrive_hyperparameter_overrides: Dict[str, str] = None) -> None:
    """Train (or load) a two-dataset model described by ``args`` and optionally test it.

    The input at ``args.data_path`` is first passed to
    ``DataSplit.Preprocess``; the two datasets are then read from the
    ``tem/ast`` and ``tem/cdfg`` folders next to ``args.data_path``.

    Args:
        args: Parsed command-line namespace. This function reads
            ``save_dir``, ``model``, ``task``, ``random_seed``,
            ``data_path``, ``azure_info``, ``load_saved_model``,
            ``data_param_override``, ``model_param_override``,
            ``load_weights_only``, ``case``, ``azureml_logging``,
            ``load_trained_model``, ``max_epochs``, ``patience``, ``quiet``
            and ``run_test``.
        hyperdrive_hyperparameter_overrides: Extra hyperparameter overrides
            forwarded to ``get_model_and_dataset``. ``None`` (the default)
            means "no overrides" and is replaced by a fresh empty dict, so
            no dict object is shared between calls.

    Raises:
        ValueError: Re-raised from ``get_model_and_dataset`` after its
            arguments are printed. Previously the error was swallowed and
            the function then failed on the next line because ``dataset``
            and ``model`` were never bound.
    """
    if hyperdrive_hyperparameter_overrides is None:
        hyperdrive_hyperparameter_overrides = {}

    # Get the housekeeping going and start logging:
    os.makedirs(args.save_dir, exist_ok=True)
    run_id = make_run_id(args.model, args.task)
    log_file = os.path.join(args.save_dir, f"{run_id}.log")

    def log(msg):
        log_line(log_file, msg)

    log(f"Setting random seed {args.random_seed}.")
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.random.set_seed(args.random_seed)

    # data split
    DataSplit.Preprocess(args.data_path)
    data_path = RichPath.create(
        os.path.split(args.data_path)[0] + '/tem/ast', args.azure_info)
    # second path
    data_path_2 = RichPath.create(
        os.path.split(args.data_path)[0] + '/tem/cdfg', args.azure_info)

    try:
        dataset, model = get_model_and_dataset(
            msg_passing_implementation=args.model,
            task_name=args.task,
            data_path=data_path,
            trained_model_file=args.load_saved_model,
            cli_data_hyperparameter_overrides=args.data_param_override,
            cli_model_hyperparameter_overrides=args.model_param_override,
            hyperdrive_hyperparameter_overrides=hyperdrive_hyperparameter_overrides,
            folds_to_load={DataFold.TRAIN, DataFold.VALIDATION},
            load_weights_only=args.load_weights_only,
            case_name=args.case,  # add by zjq
        )
        # second dataset; only the dataset half of the returned pair is used.
        dataset2, _ = get_model_and_dataset(
            msg_passing_implementation=args.model,
            task_name=args.task,
            data_path=data_path_2,
            trained_model_file=args.load_saved_model,
            cli_data_hyperparameter_overrides=args.data_param_override,
            cli_model_hyperparameter_overrides=args.model_param_override,
            hyperdrive_hyperparameter_overrides=hyperdrive_hyperparameter_overrides,
            folds_to_load={DataFold.TRAIN, DataFold.VALIDATION},
            load_weights_only=args.load_weights_only,
            case_name=args.case,  # add by zjq
        )
    except ValueError as err:
        print(err.args)
        # Nothing below can work without the datasets and the model.
        raise

    log(f"Dataset parameters: {json.dumps(unwrap_tf_tracked_data(dataset._params))}"
        )
    log(f"Model parameters: {json.dumps(unwrap_tf_tracked_data(model._params))}"
        )

    # Import lazily so azureml is only required when logging is requested.
    if args.azureml_logging:
        from azureml.core.run import Run

        aml_run = Run.get_context()
    else:
        aml_run = None

    # add by zjq: skip training when an already-trained model is supplied.
    if not args.load_trained_model:
        trained_model_path = train(
            model,
            dataset,
            dataset2,
            log_fun=log,
            run_id=run_id,
            max_epochs=args.max_epochs,
            patience=args.patience,
            save_dir=args.save_dir,
            quiet=args.quiet,
            aml_run=aml_run,
        )
    else:
        trained_model_path = args.load_trained_model

    if args.run_test:
        data_path = RichPath.create(
            os.path.split(args.data_path)[0] + '/tem/ast', args.azure_info)
        data_path_2 = RichPath.create(
            os.path.split(args.data_path)[0] + '/tem/cdfg', args.azure_info)
        log("== Running on test dataset")
        log(f"Loading data from {data_path}.")
        dataset.load_data(data_path, {DataFold.TEST})
        dataset2.load_data(data_path_2, {DataFold.TEST})
        log(f"Restoring best model state from {trained_model_path}.")
        load_weights_verbosely(trained_model_path, model)

        test_data_1 = dataset.get_tensorflow_dataset(DataFold.TEST)
        test_data_2 = dataset2.get_tensorflow_dataset(DataFold.TEST)
        _, _, test_results = model.run_one_epoch_new(test_data_1,
                                                     test_data_2,
                                                     training=False,
                                                     quiet=args.quiet)

        # compute_epoch_metrics yields (value, display string) pairs; only
        # the first value and the display strings are used below.
        valid_ACC, val_stracc, \
            best_valid_Pre, best_val_strpre, \
            best_valid_metric_RE, best_val_strre, \
            best_valid_metric_f1, best_val_strf1, \
            best_valid_metric_TPR, best_val_strtpr, \
            best_valid_metric_FPR, best_val_strfpr, \
            best_valid_metric_TNR, best_val_strtnr, \
            best_valid_metric_FNR, best_val_strfnr, = model.compute_epoch_metrics(test_results)

        log(
            f" {val_stracc}|{best_val_strpre} | {best_val_strre} | {best_val_strf1} |"
            f"{best_val_strtpr} | {best_val_strfpr} | {best_val_strtnr} | {best_val_strfnr} |",
        )

        if aml_run is not None:
            aml_run.log("task_test_metric", float(valid_ACC))
def main():
    """Train a LightGBM classifier on the run's input datasets and log its metrics.

    Reads the ``output_split_train`` and ``output_split_test`` input datasets
    of the current run, fits an ``LGBMClassifier`` to predict the ``Exited``
    column, saves the model to ``./outputs/model/saved_model.joblib``, and
    logs the hyperparameters, accuracy and weighted AUC to the run.
    """
    # Add arguments to script
    parser = argparse.ArgumentParser()

    # See lightgbm library for python for a list of parameters:
    # https://lightgbm.readthedocs.io/en/latest/Parameters.html
    parser.add_argument('--n_estimators',
                        type=int,
                        default=100,
                        help="number of boosting iterations")
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help="shrinkage rate")
    parser.add_argument('--max_depth',
                        type=int,
                        default=-1,
                        help="max depth for tree model")
    parser.add_argument(
        '--subsample',
        type=float,
        default=1.0,
        help="randomly select part of data without resampling. useful to speed up training and prevent over-fitting"
    )

    args = parser.parse_args()

    run = Run.get_context()

    # The builtin int/float replace the np.int/np.float aliases, which were
    # deprecated in NumPy 1.20 and removed in 1.24.
    # see here for more ideas = https://bit.ly/3c2zJOm & https://bit.ly/3o6OAth
    run.log("n_estimators:", int(args.n_estimators))
    run.log("learning_rate:", float(args.learning_rate))
    run.log("max_depth:", int(args.max_depth))
    run.log("subsample:", float(args.subsample))

    # training set
    train_split_data = run.input_datasets["output_split_train"]
    # train_split_data = train_split_data.parse_parquet_files()
    train_split_df = train_split_data.to_pandas_dataframe()
    print(train_split_df.head(10))
    x_train = train_split_df.loc[:, train_split_df.columns != 'Exited']
    y_train = train_split_df.loc[:, train_split_df.columns == 'Exited']

    # evaluation set
    test_split_data = run.input_datasets["output_split_test"]
    test_split_df = test_split_data.to_pandas_dataframe()
    x_test = test_split_df.loc[:, test_split_df.columns != 'Exited']
    y_test = test_split_df.loc[:, test_split_df.columns == 'Exited']

    print(x_train.head(10))
    print(x_test.head(10))

    # declaring our model with parameters - default and those declared in our
    # hyperparameter space
    model = LGBMClassifier(n_estimators=args.n_estimators,
                           learning_rate=args.learning_rate,
                           max_depth=args.max_depth,
                           subsample=args.subsample).fit(x_train, y_train)

    # save model
    # files saved in the "outputs" folder are automatically uploaded into run
    # history
    os.makedirs('./outputs/model', exist_ok=True)
    joblib.dump(model, './outputs/model/saved_model.joblib')

    accuracy = model.score(x_test, y_test)
    print(model)
    print(x_test.head(10))
    # source: https://bit.ly/3mTxEWR && https://bit.ly/3hgonXx
    run.log("Accuracy", float(accuracy))

    y_pred = model.predict(x_test)
    # NOTE(review): the AUC is computed from hard class predictions, not from
    # predicted probabilities -- confirm this is the intended metric.
    auc_weighted = roc_auc_score(y_test, y_pred, average='weighted')
    # source: https://bit.ly/3mTxEWR && https://bit.ly/3hgonXx
    run.log("AUC_weighted", float(auc_weighted))

    # creating a confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
(tgt / '__init__.py').touch(exist_ok=False) paths_to_copy = list(src.glob(glob_pattern)) logger.info(f"Copying to {tgt} the following files: {str(paths_to_copy)}") for p in paths_to_copy: destpath = tgt / p.relative_to(src) destpath.parent.mkdir(parents=True, exist_ok=True) shutil.copy(p, destpath) def is_offline_run(run: Run) -> bool: return run.id.startswith("OfflineRun") # Get the current run. RUN = Run.get_context() from cgmml.common.evaluation.eval_utilities import ( # noqa: E402, F401 is_ensemble_evaluation, is_multiartifact_evaluation) from cgmml.common.evaluation.evaluation_classes import ( # noqa: E402, F401 Evaluation, EnsembleEvaluation, MultiartifactEvaluation) from cgmml.common.model_utils.run_initialization import OfflineRunInitializer, OnlineRunInitializer # noqa: E402 logging.basicConfig( level=logging.INFO, format= '%(asctime)s - %(levelname)s - %(message)s - %(pathname)s: line %(lineno)d' ) QA_CONFIG_MODULES = [ 'qa_config_weight', # takes 14min in CI
def on_init_end(self, args, state, control, **kwargs):
    """Capture the current run context once, on the world-zero process only.

    Does nothing when ``self.azureml_run`` is already set or when
    ``state.is_world_process_zero`` is false.
    """
    already_attached = self.azureml_run is not None
    if already_attached or not state.is_world_process_zero:
        return
    self.azureml_run = Run.get_context()
import glob import os import pandas as pd from azureml.core.run import Run from lightning_base import set_seed from lightning_glue import GLUETransformer, parse_args from pytorch_lightning import Trainer from pytorch_lightning.loggers import LightningLoggerBase from pytorch_lightning.utilities import rank_zero_only # get the Azure ML run object child_run = Run.get_context() run = child_run.parent class AzureMLLogger(LightningLoggerBase): def __init__(self): super().__init__() @rank_zero_only def log_hyperparams(self, params): pass @rank_zero_only def log_metrics(self, metrics, step): for k, v in {**{"step": step}, **metrics}.items(): run.log(k, v) @property
def main():
    """Train the sales model as an Azure ML pipeline step.

    Parses the step arguments, logs the ``training`` section of
    ``parameters.json`` to the run and its parent, fetches (or registers) the
    named dataset, trains the model with ``lstm_model``, logs the resulting
    metrics, and saves the model both to the step output folder and to
    ``outputs`` before tagging and completing the run.

    Raises:
        Exception: If no ``--dataset_name`` was supplied.
    """
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        default="sales_model.h5",
        type=str,
        help="Name of the Model",
    )
    parser.add_argument(
        "--step_output",
        type=str,
        help=("output for passing data to next step"),
    )
    parser.add_argument(
        "--dataset_version",
        type=str,
        help=("dataset version"),
    )
    parser.add_argument(
        "--data_file_path",
        type=str,
        help=("data file path, if specified,\
               a new version of the dataset will be registered"),
    )
    parser.add_argument(
        "--caller_run_id",
        type=str,
        help=("caller run id, for example ADF pipeline run id"),
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"),
    )
    args = parser.parse_args()

    # Echo every argument in the order it was declared.
    for arg_name in ("model_name", "step_output", "dataset_version",
                     "data_file_path", "caller_run_id", "dataset_name"):
        print("Argument [%s]: %s" % (arg_name, getattr(args, arg_name)))

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    def log_everywhere(values):
        # Mirror each value on this step's run and on its parent run.
        for name, value in values.items():
            run.log(name, value)
            run.parent.log(name, value)

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as params_file:
        pars = json.load(params_file)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    log_everywhere(train_args)

    # Get the dataset
    if not dataset_name:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    if data_file_path == 'none':
        # The literal string 'none' means: use the already registered dataset.
        dataset = Dataset.get_by_name(run.experiment.workspace,
                                      dataset_name,
                                      dataset_version)
    else:
        dataset = register_dataset(run.experiment.workspace,
                                   dataset_name,
                                   os.environ.get("DATASTORE_NAME"),
                                   data_file_path)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    train, test = tts(df)

    # Train the model
    model = lstm_model(train, test)

    # Evaluate and log the metrics returned from the train function
    log_everywhere(get_model_metrics(model, train, test))

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    keras.models.save_model(model, model_output_path)
    print("Saved model in model_output_path")

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    keras.models.save_model(model, output_path)
    print("Model saved")

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
def main():
    """Train a model on an Azure ML dataset and persist it for the next step.

    Command-line arguments:
        --model-name: file name used when dumping the trained model.
        --dataset-name: name of the dataset to fetch or register (required
            in practice; an exception is raised when it is empty).
        --step-output: directory used to pass the model to the next step.
        --dataset-version: version handed to ``Dataset.get_by_name``.
        --dataset-file-path: path of a new file to register as the dataset.
            Omitting it, or passing ``none`` (any letter case), means "use
            the already registered dataset".

    Training parameters are read from the ``training`` key of
    ``parameters.json`` in the current working directory and logged to both
    the run and its parent, as are the metrics returned by
    ``get_model_metrics``. The model is dumped twice with joblib: once into
    the step output directory and once into ``outputs``.

    Raises:
        Exception: if no dataset name is provided.
    """
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--model-name', type=str, default='diabetes_model.pkl',
        help='the name of the model')
    # Implicit string concatenation instead of a backslash continuation, so
    # the help text does not carry the source indentation inside it.
    arg('--dataset-name', type=str,
        help='dataset_name is always required to get the exact version '
             'of dataset wanted instead of the one during pipeline creation')
    arg('--step-output', type=str,
        help='output of data passing to next step')
    arg('--dataset-version', type=str, help='wanted dataset version')
    arg('--dataset-file-path', type=str, help='new dataset to register')
    args = parser.parse_args()

    model_name = args.model_name
    dataset_name = args.dataset_name
    step_output = args.step_output
    dataset_file_path = args.dataset_file_path
    dataset_version = args.dataset_version

    pprint.pprint({
        'model_name': model_name,
        'dataset_name': dataset_name,
        'step_output': step_output,
        'dataset_version': dataset_version,
        'dataset_file_path': dataset_file_path,
    })

    run = Run.get_context()

    with open('parameters.json', encoding='utf-8') as f:
        pars = json.load(f)
    try:
        train_args = pars['training']
    except KeyError:
        print('training key is not found!')
        train_args = {}
    print(f'training params:{train_args}')

    for key, value in train_args.items():
        run.log(key, value)
        run.parent.log(key, value)

    if not dataset_name:
        raise Exception('No dataset is provided')

    # argparse yields None when --dataset-file-path is omitted; treat that
    # (and an empty string) the same as the explicit 'none' sentinel so we
    # never try to register a dataset from a missing path.
    register_new_file = (
        bool(dataset_file_path) and dataset_file_path.lower() != 'none'
    )
    if register_new_file:
        dataset = register_dataset(
            workspace=run.experiment.workspace,
            datastore_name=os.environ.get('DATASTORE_NAME'),
            dataset_name=dataset_name,
            file_path=dataset_file_path)
    else:
        dataset = Dataset.get_by_name(
            workspace=run.experiment.workspace,
            name=dataset_name,
            version=dataset_version)

    # Attach the dataset to the run and tag the parent with its id.
    run.input_datasets['training_data'] = dataset
    run.parent.tag('dataset_id', dataset.id)

    df = dataset.to_pandas_dataframe()
    data = split_data(df)
    model = train_model(data, train_args)

    metrics = get_model_metrics(model, data)
    for key, value in metrics.items():
        run.log(key, value)
        run.parent.log(key, value)

    # Hand the model file over to the next pipeline step.
    os.makedirs(step_output, exist_ok=True)
    model_path = os.path.join(step_output, model_name)
    joblib.dump(value=model, filename=model_path)

    # Second copy under 'outputs'.
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag('run_type', value='train')
    run.complete()
# Script: fit one Ridge regression per alpha on the scikit-learn diabetes
# dataset, inside the current Azure ML run context.
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from azureml.core.run import Run
import os
import numpy as np
import mylib

# ``sklearn.externals.joblib`` was removed in scikit-learn 0.23; prefer the
# standalone package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

os.makedirs('./outputs', exist_ok=True)

X, y = load_diabetes(return_X_y=True)

run = Run.get_context()

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
data = {
    "train": {"X": X_train, "y": y_train},
    "test": {"X": X_test, "y": y_test},
}

# list of numbers from 0.0 to 1.0 with a 0.05 interval
alphas = mylib.get_alphas()

for alpha in alphas:
    # Use Ridge algorithm to create a regression model
    reg = Ridge(alpha=alpha)
    reg.fit(data["train"]["X"], data["train"]["y"])
def train(config, evaluate_only=False, outdir=".", detail=False, azureml=False):
    """Train and/or evaluate the keyword spotter model described by ``config``.

    Unless ``evaluate_only`` is set, the model is created, fitted on the
    training/validation sets, saved to ``outdir``, exported to ONNX and a
    ``train_results.json`` summary is written. In both modes the model is
    evaluated on the test set and per-run results go to ``results.txt``.

    Args:
        config: configuration object; this function reads
            ``config.model`` (filename, hidden_units, architecture,
            num_layers, sparsify), ``config.dataset`` (categories, path) and
            ``config.training`` (batch_size, use_gpu). When training it also
            writes ``config.model.filename`` (if empty) and several
            ``config.dataset`` fields taken from the test data.
        evaluate_only: skip training and load a previously saved model.
        outdir: directory for all output files; created if missing.
        detail: forwarded unchanged to ``model.fit``.
        azureml: when true, the Azure ML run context is fetched and passed
            to ``model.fit``.

    Returns:
        Tuple ``(rate, log)``: the accuracy returned by ``model.evaluate`` on
        the test data, and the value returned by ``model.fit`` (``None``
        when ``evaluate_only`` is set).

    Raises:
        ValueError: if ``config.model.num_layers`` is not 1, 2 or 3.
        SystemExit: if the dataset folder or one of the ``*_list.npz`` files
            is missing.
    """
    filename = config.model.filename
    categories_file = config.dataset.categories
    wav_directory = config.dataset.path
    batch_size = config.training.batch_size
    hidden_units = config.model.hidden_units
    architecture = config.model.architecture
    num_layers = config.model.num_layers
    use_gpu = config.training.use_gpu

    run = None
    if azureml:
        # Imported lazily so azureml is only required when requested.
        from azureml.core.run import Run
        run = Run.get_context()
        if run is None:
            print("### Run.get_context() returned None")
        else:
            print("### Running in Azure Context")

    valid_layers = [1, 2, 3]
    if num_layers not in valid_layers:
        raise ValueError(
            "--num_layers can only be one of these values {}".format(
                valid_layers))

    os.makedirs(outdir, exist_ok=True)

    if not filename:
        filename = "{}{}KeywordSpotter.pt".format(architecture, hidden_units)
        config.model.filename = filename

    # load the featurized data
    if not os.path.isdir(wav_directory):
        print("### Error: please specify valid --dataset folder location: {}".
              format(wav_directory))
        sys.exit(1)

    if not categories_file:
        categories_file = os.path.join(wav_directory, "categories.txt")

    with open(categories_file, "r") as f:
        keywords = [line.strip() for line in f]

    training_file = os.path.join(wav_directory, "training_list.npz")
    testing_file = os.path.join(wav_directory, "testing_list.npz")
    validation_file = os.path.join(wav_directory, "validation_list.npz")

    # Same check order as before: training, validation, testing.
    for required_file in (training_file, validation_file, testing_file):
        if not os.path.isfile(required_file):
            print("Missing file {}".format(required_file))
            print("Please run make_datasets.py")
            sys.exit(1)

    model = None

    device = torch.device("cpu")
    if use_gpu:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            print("### CUDA not available!!")

    print("Loading {}...".format(testing_file))
    test_data = AudioDataset(testing_file, config.dataset, keywords)

    log = None
    training_time = None
    if not evaluate_only:
        print("Loading {}...".format(training_file))
        training_data = AudioDataset(training_file, config.dataset, keywords,
                                     training=True)

        print("Loading {}...".format(validation_file))
        validation_data = AudioDataset(validation_file, config.dataset,
                                       keywords)

        if training_data.mean is not None:
            fname = os.path.join(outdir, "mean.npy")
            print("Saving {}".format(fname))
            np.save(fname, training_data.mean)
            fname = os.path.join(outdir, "std.npy")
            print("Saving {}".format(fname))
            np.save(fname, training_data.std)

            # use the training_data mean and std variation
            test_data.mean = training_data.mean
            test_data.std = training_data.std
            validation_data.mean = training_data.mean
            validation_data.std = training_data.std

        print("Training model {}".format(filename))
        model = create_model(config.model, training_data.input_size,
                             training_data.num_keywords)
        if device.type == 'cuda':
            model.cuda()  # move the processing to GPU

        start = time.time()
        log = model.fit(training_data, validation_data, config.training,
                        config.model.sparsify, device, detail, run)
        training_time = time.time() - start

        _, _, rate = model.evaluate(training_data, batch_size, device)
        print("Training accuracy = {:.3f} %".format(rate * 100))

        torch.save(model.state_dict(), os.path.join(outdir, filename))

    print(
        "Evaluating {} keyword spotter using {} rows of featurized test audio..."
        .format(architecture, test_data.num_rows))

    if model is None:
        msg = "Loading trained model with input size {}, hidden units {} and num keywords {}"
        print(
            msg.format(test_data.input_size, hidden_units,
                       test_data.num_keywords))
        model = create_model(config.model, test_data.input_size,
                             test_data.num_keywords)

        # Training writes the checkpoint under outdir, so fall back to that
        # location when the bare filename does not resolve to a file.
        model_path = filename
        if not os.path.isfile(model_path):
            candidate = os.path.join(outdir, filename)
            if os.path.isfile(candidate):
                model_path = candidate

        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        model.load_dict(torch.load(model_path, map_location=device))
        if model and device.type == 'cuda':
            model.cuda()  # move the processing to GPU

    results_file = os.path.join(outdir, "results.txt")
    _, _, rate = model.evaluate(test_data, batch_size, device, results_file)
    print("Testing accuracy = {:.3f} %".format(rate * 100))

    if not evaluate_only:
        name = os.path.splitext(filename)[0] + ".onnx"
        print("saving onnx file: {}".format(name))
        model.export(os.path.join(outdir, name), device)

        # Record the dataset properties in the config before serializing it.
        config.dataset.sample_rate = test_data.sample_rate
        config.dataset.input_size = test_data.audio_size
        config.dataset.num_filters = test_data.input_size
        config.dataset.window_size = test_data.window_size
        config.dataset.shift = test_data.shift

        logdata = {
            "accuracy_val": rate,
            "training_time": training_time,
            "log": log
        }
        logdata.update(TrainingConfig.to_dict(config))

        logname = os.path.join(outdir, "train_results.json")
        save_json(logdata, logname)

    return rate, log