def project_to_latent_space(vggface_path, pca_path):
    """Part 2a of the paper, where we project VGG-Face extracted features
    to a latent-dimensional space using PCA."""
    X = np.load(os.path.join(vggface_path, 'X.npy'))
    nb_components = 40
    pca = PCA(nb_components).fit(X)
    X_transformed = pca.transform(X)
    np.save(os.path.join(pca_path, 'X_latent.npy'), X_transformed)

    # save model to run
    run = Run.get_submitted_run()
    model_filepath = os.path.join("outputs", 'pca.pkl')
    joblib.dump(pca, model_filepath)
    run.register_model(model_name="pca", model_path=model_filepath)

    # track explained variance per number of principal components in run
    exp_variance = np.cumsum(
        np.round(pca.explained_variance_ratio_, decimals=4) * 100)
    run.log_list('Explained variance', exp_variance.tolist())
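# The same projection step can be reproduced outside Azure ML (no Run object) for
# quick local experiments. A minimal sketch, assuming X.npy already holds the
# VGG-Face features; the paths and the function name are placeholders, not part
# of the original script.
import os

import joblib
import numpy as np
from sklearn.decomposition import PCA


def project_locally(features_path, out_dir, nb_components=40):
    """Fit PCA on saved features and store the latent matrix plus the model."""
    X = np.load(os.path.join(features_path, 'X.npy'))
    pca = PCA(n_components=nb_components).fit(X)
    np.save(os.path.join(out_dir, 'X_latent.npy'), pca.transform(X))
    joblib.dump(pca, os.path.join(out_dir, 'pca.pkl'))
    return pca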
def _get_automl_settings(automl_settings, logger):
    automl_settings_obj = None
    current_run = Run.get_submitted_run()
    found_data_store = False
    data_store = None
    start = time.time()
    try:
        experiment = current_run.experiment
        parent_run_id = _get_parent_run_id(current_run._run_id)
        print("parent run id {}".format(parent_run_id))
        automl_settings_obj = _AutoMLSettings.from_string_or_dict(automl_settings)
        data_store = experiment.workspace.get_default_datastore()
        found_data_store = True
    except Exception as e:
        logger.warning("Failed to get the default datastore, falling back to default: {}".format(e))
        print("failed to get default data store {}".format(e))
        found_data_store = False
    end = time.time()
    print("Caching supported {}, time taken to get default datastore: {}".format(
        sdk_has_cache_capability and found_data_store, end - start))
    return automl_settings_obj, found_data_store, data_store
def fit_classifier(vggface_path, pca_path, output_path):
    """Part 2b of the paper, where we fit a classifier on the latent matrix."""
    X = np.load(os.path.join(pca_path, 'X_latent.npy'))
    y = np.load(os.path.join(vggface_path, 'y.npy'))
    run = Run.get_submitted_run()

    knn = KNeighborsClassifier()
    cv_results = cross_validate(knn, X, y,
                                scoring=['accuracy'],
                                cv=5,
                                verbose=1,
                                return_train_score=True,
                                n_jobs=1)

    # track train accuracy in run
    train_accuracy = round(np.mean(cv_results['train_accuracy']), 2)
    run.log("mean training accuracy", train_accuracy)
    test_accuracy = round(np.mean(cv_results['test_accuracy']), 2)
    run.log("mean testing accuracy", test_accuracy)

    # fit on the full latent matrix before persisting: cross_validate works on
    # clones, so `knn` itself would otherwise be saved unfitted
    knn.fit(X, y)

    # register model to run
    model_filepath = os.path.join(output_path, 'classifier.pkl')
    joblib.dump(knn, model_filepath)
    run.register_model(model_name="classifier", model_path=model_filepath)
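# A hedged sketch of how the two registered artifacts could be used together at
# inference time. `pca_file`, `classifier_file` and `features_file` are hypothetical
# paths standing in for the files produced above; none of this is in the original script.
import joblib
import numpy as np


def predict_from_saved_models(pca_file, classifier_file, features_file):
    """Project unseen VGG-Face features into the latent space and classify them."""
    pca = joblib.load(pca_file)                 # PCA model saved by project_to_latent_space
    classifier = joblib.load(classifier_file)   # KNN saved by fit_classifier
    features = np.load(features_file)
    latent = pca.transform(features)            # same 40-dimensional projection as training
    return classifier.predict(latent)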
def main():
    # get command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, help='directory of training data')
    parser.add_argument('--test_dir', type=str, help='directory of test data', default=None)
    parser.add_argument('--output_dir', type=str, help='output directory', default="./outputs")
    parser.add_argument('--classifier', type=str, default='svm')
    parser.add_argument('--number_of_samples', type=int, help='number of training samples', default=400)
    parser.add_argument('--color_insensitive', type=int, help='1 if color should not be used as a feature', default=0)
    parser.add_argument('--shape', type=int, help='image width and height, e.g. 64 for (64, 64)', default=64)
    parser.add_argument('--fbeta_beta', type=float, default=0.5)
    # note: argparse's type=bool is fragile -- any non-empty string (including "False")
    # parses as True; see the str2bool sketch below for a more robust variant
    parser.add_argument('--is_local', type=bool, default=False)
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    print(f'scikit-learn version: {sklearn.__version__}')
    print("data directory is: " + args.data_dir)

    shape = (args.shape, args.shape)

    run = None
    if not args.is_local:
        run = Run.get_submitted_run()
        run.log('fbeta_beta', args.fbeta_beta)
        run.log('classifier', args.classifier)
        run.log('shape', args.shape)
        run.log('number_of_samples', args.number_of_samples)
        run.log('shape_and_samples', int(str(args.number_of_samples) + str(args.shape)))
        run.tag('color_insensitive', str(args.color_insensitive))
        run.log('color_insensitive', args.color_insensitive)
        run.log('data_dir', args.data_dir)
        run.log('test_dir', args.test_dir)

    train(args.data_dir, args.test_dir, args.classifier, args.number_of_samples,
          shape=shape, output_directory=args.output_dir, beta=args.fbeta_beta,
          color_insensitive=args.color_insensitive, is_local=args.is_local, run=run)
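# A common workaround for the type=bool pitfall noted above: a small converter that maps
# explicit true/false strings to real booleans. This is a hedged sketch, not part of the
# original script; `str2bool` is a hypothetical helper name.
import argparse


def str2bool(value):
    """Parse 'true'/'false' style strings into a boolean for argparse."""
    if isinstance(value, bool):
        return value
    if value.lower() in ('true', 't', '1', 'yes'):
        return True
    if value.lower() in ('false', 'f', '0', 'no'):
        return False
    raise argparse.ArgumentTypeError("Expected a boolean value, got {!r}".format(value))

# Usage in the parser above would then be:
#     parser.add_argument('--is_local', type=str2bool, default=False)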
def setup_wrapper(script_directory,
                  dataprep_json,
                  entry_point,
                  automl_settings,
                  task_type,
                  preprocess,
                  enable_subsampling,
                  num_iterations,
                  **kwargs):
    automl_settings_obj = _AutoMLSettings.from_string_or_dict(automl_settings)
    logger, sdk_has_custom_dimension_logger = _init_logger(automl_settings_obj)
    try:
        child_run_id = Run.get_submitted_run()._run_id
        parent_run_id = _get_parent_run_id(child_run_id)
        if sdk_has_custom_dimension_logger:
            logger.update_default_properties({
                "parent_run_id": parent_run_id,
                "child_run_id": child_run_id
            })
        logger.info("[ParentRunId:{}]: remote setup script begins.".format(parent_run_id))

        script_directory = _init_directory(directory=script_directory, logger=logger)

        logger.info("Preparing data for set_problem_info now.")
        fit_iteration_parameters_dict = _prepare_data(
            dataprep_json=dataprep_json,
            automl_settings_obj=automl_settings_obj,
            script_directory=script_directory,
            entry_point=entry_point,
            logger=logger)
        fit_iteration_parameters_dict = _get_auto_cv_dict(
            fit_iteration_parameters_dict, automl_settings_obj, logger)

        print("Setting Problem Info now.")
        _set_problem_info_for_setup(
            fit_iteration_parameters_dict=fit_iteration_parameters_dict,
            automl_settings_obj=automl_settings_obj,
            task_type=task_type,
            preprocess=preprocess,
            enable_subsampling=enable_subsampling,
            num_iterations=num_iterations,
            logger=logger)
    except Exception as e:
        logger.error("setup_wrapper hit an exception: {}".format(e))
        log_traceback(e, logger)
        raise Exception(e)

    _post_setup(logger=logger)
    logger.info("[ParentRunId:{}]: remote setup script finishes.".format(parent_run_id))
    return


# PLACEHOLDER for RemoteScript helper functions
def main():
    # get command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, help='directory of training data')
    parser.add_argument('--test_dir', type=str, help='directory of test data', default=None)
    parser.add_argument('--output_dir', type=str, help='output directory', default="./outputs")
    parser.add_argument('--classifier', type=str, default='svm')
    parser.add_argument('--number_of_samples', type=int, help='number of training samples', default=320)
    parser.add_argument('--fbeta_beta', type=float, default=0.5)
    parser.add_argument('--is_local', type=bool, default=False)
    args = parser.parse_args()

    # create the output directory before opening the results file inside it
    os.makedirs(args.output_dir, exist_ok=True)

    print(f'scikit-learn version: {sklearn.__version__}')
    print("data directory is: " + args.data_dir)

    with open(f'{args.output_dir}/results.csv', 'w') as csvfile:
        fieldnames = ['number_of_samples', 'shape', 'color_insensitive', 'accuracy', 'f_score']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        run = None
        if not args.is_local:
            run = Run.get_submitted_run()
            run.log('data_dir', args.data_dir)
            run.log('test_dir', args.test_dir)
            run.log('fbeta_beta', args.fbeta_beta)

        for shape in ['32', '64', '128', '256']:
            shape = (int(shape), int(shape))
            for color_insensitive in [0, 1]:
                start = time.time()
                X, y = get_data(args.data_dir, args.number_of_samples, shape, color_insensitive)
                loading_time = time.time() - start
                log(run, 'data_loading_time', loading_time)
                for number_of_samples in [args.number_of_samples / 8,
                                          args.number_of_samples / 4,
                                          args.number_of_samples / 2,
                                          args.number_of_samples]:
                    print(number_of_samples, shape, color_insensitive)
                    X_sliced, y_sliced = shuffle(X, y, n_samples=int(number_of_samples))
                    X_train, X_test, y_train, y_test = train_test_split(
                        X_sliced, y_sliced, test_size=0.3, random_state=42)
                    train(X_train, y_train, X_test, y_test, args.classifier,
                          number_of_samples=int(number_of_samples), shape=shape,
                          output_directory=args.output_dir, beta=args.fbeta_beta,
                          color_insensitive=color_insensitive, is_local=args.is_local, run=run)
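# Nothing in this snippet writes data rows to results.csv; presumably train() (not shown)
# reports metrics elsewhere. A hedged sketch of how a row matching the header above could
# be appended -- `accuracy` and `f_score` here are hypothetical stand-ins, not values the
# original code exposes at this point:
#
#     writer.writerow({'number_of_samples': int(number_of_samples),
#                      'shape': shape[0],
#                      'color_insensitive': color_insensitive,
#                      'accuracy': accuracy,
#                      'f_score': f_score})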
def _set_problem_info_for_setup(fit_iteration_parameters_dict,
                                automl_settings_obj,
                                task_type,
                                preprocess,
                                enable_subsampling,
                                num_iterations,
                                logger):
    current_run = Run.get_submitted_run()
    logger.info("Start to set problem info for the setup for run id {}.".format(current_run._run_id))

    logger.info("Setup experiment.")
    try:
        experiment = current_run.experiment
        parent_run_id = _get_parent_run_id(current_run._run_id)
        data_store = experiment.workspace.get_default_datastore()
        found_data_store = True
        logger.info("Using data store.")
    except Exception as e:
        logger.warning("Failed to get the data store, falling back to default: {}".format(e))
        found_data_store = False

    logger.info("Caching supported {}.".format(sdk_has_cache_capability and found_data_store))
    print("caching supported {}".format(sdk_has_cache_capability and found_data_store))

    if sdk_has_validate_data_dict:
        # The newest version of validate_training_data_dict should contain check_x_y
        logger.info("Using validate_training_data_dict now.")
        validate_training_data_dict(data_dict=fit_iteration_parameters_dict,
                                    automl_settings=automl_settings_obj)
    else:
        logger.info("Using validate_training_data now.")
        validate_training_data(X=fit_iteration_parameters_dict.get('X'),
                               y=fit_iteration_parameters_dict.get('y'),
                               X_valid=fit_iteration_parameters_dict.get('X_valid'),
                               y_valid=fit_iteration_parameters_dict.get('y_valid'),
                               sample_weight=fit_iteration_parameters_dict.get('sample_weight'),
                               sample_weight_valid=fit_iteration_parameters_dict.get('sample_weight_valid'),
                               cv_splits_indices=fit_iteration_parameters_dict.get('cv_splits_indices'),
                               automl_settings=automl_settings_obj)
        check_x_y(fit_iteration_parameters_dict.get('X'),
                  fit_iteration_parameters_dict.get('y'),
                  automl_settings_obj)

    if sdk_has_cache_capability and found_data_store:
        data_splits_validated = True
        try:
            start = time.time()
            transformed_data_context = _get_transformed_data_context(
                X=fit_iteration_parameters_dict.get('X'),
                y=fit_iteration_parameters_dict.get('y'),
                X_valid=fit_iteration_parameters_dict.get('X_valid'),
                y_valid=fit_iteration_parameters_dict.get('y_valid'),
                sample_weight=fit_iteration_parameters_dict.get('sample_weight'),
                sample_weight_valid=fit_iteration_parameters_dict.get('sample_weight_valid'),
                x_raw_column_names=fit_iteration_parameters_dict.get('x_raw_column_names'),
                cv_splits_indices=fit_iteration_parameters_dict.get('cv_splits_indices'),
                automl_settings_obj=automl_settings_obj,
                data_store=data_store,
                run_target='remote',
                parent_run_id=parent_run_id,
                logger=logger
            )
            end = time.time()
            print("time taken for transform {}".format(end - start))
            logger.info("time taken for transform {}".format(end - start))

            if sdk_has_validate_data_splits:
                try:
                    logger.info("Validating data splits now.")
                    _validate_data_splits(X=transformed_data_context.X,
                                          y=transformed_data_context.y,
                                          X_valid=transformed_data_context.X_valid,
                                          y_valid=transformed_data_context.y_valid,
                                          cv_splits=transformed_data_context.cv_splits,
                                          automl_settings=automl_settings_obj)
                    data_splits_validated = True
                except Exception as data_split_exception:
                    data_splits_validated = False
                    logger.error("Hit validation errors: {}.".format(data_split_exception))
                    log_traceback(data_split_exception, logger)
                    raise data_split_exception

            logger.info("Start setting problem info.")
            automl.set_problem_info(transformed_data_context.X,
                                    transformed_data_context.y,
                                    automl_settings_obj.task_type,
                                    current_run=current_run,
                                    preprocess=automl_settings_obj.preprocess,
                                    lag_length=automl_settings_obj.lag_length,
                                    transformed_data_context=transformed_data_context,
                                    enable_cache=automl_settings_obj.enable_cache,
                                    subsampling=enable_subsampling)
        except Exception as e:
            if sdk_has_validate_data_splits and not data_splits_validated:
                logger.error("sdk_has_validate_data_splits is True and data_splits_validated is False: {}.".format(e))
                log_traceback(e, logger)
                raise e
            else:
                logger.warning("Setup failed, falling back to the old path: {}".format(e))
                print("Setup failed, falling back to the old path: {}".format(e))
                automl.set_problem_info(
                    X=fit_iteration_parameters_dict.get('X'),
                    y=fit_iteration_parameters_dict.get('y'),
                    task_type=task_type,
                    current_run=current_run,
                    preprocess=preprocess,
                    subsampling=enable_subsampling
                )
    else:
        logger.info("Start setting problem info using the old path.")
        if sdk_has_validate_data_splits:
            _validate_data_splits(X=fit_iteration_parameters_dict.get('X'),
                                  y=fit_iteration_parameters_dict.get('y'),
                                  X_valid=fit_iteration_parameters_dict.get('X_valid'),
                                  y_valid=fit_iteration_parameters_dict.get('y_valid'),
                                  cv_splits=fit_iteration_parameters_dict.get('cv_splits_indices'),
                                  automl_settings=automl_settings_obj)
        automl.set_problem_info(
            X=fit_iteration_parameters_dict.get('X'),
            y=fit_iteration_parameters_dict.get('y'),
            task_type=task_type,
            current_run=current_run,
            preprocess=preprocess,
            subsampling=enable_subsampling
        )
def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
    current_run = Run.get_submitted_run()
    parent_run_id = _get_parent_run_id(current_run._run_id)
    print("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
    logger.info("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
    try:
        import azureml.train.automl._dataprep_utilities as dataprep_utilities
    except Exception as e:
        e.error_type = ErrorTypes.Unclassified
        log_traceback(e, logger)
        logger.error(e)
        raise e

    fit_iteration_parameters_dict = dict()

    class RetrieveNumpyArrayError(Exception):
        def __init__(self):
            super().__init__()

    try:
        print("Resolving Dataflows...")
        logger.info("Resolving Dataflows...")
        dataprep_json_obj = json.loads(dataprep_json)
        if 'activities' in dataprep_json_obj:
            # json is serialized dataflows
            dataflow_dict = dataprep_utilities.load_dataflows_from_json(dataprep_json)
            for k in ['X', 'X_valid', 'sample_weight', 'sample_weight_valid']:
                fit_iteration_parameters_dict[k] = \
                    dataprep_utilities.try_retrieve_pandas_dataframe(dataflow_dict.get(k))
            for k in ['y', 'y_valid']:
                try:
                    fit_iteration_parameters_dict[k] = \
                        dataprep_utilities.try_retrieve_numpy_array(dataflow_dict.get(k))
                except IndexError:
                    raise RetrieveNumpyArrayError()

            cv_splits_dataflows = []
            i = 0
            while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                cv_splits_dataflows.append(dataflow_dict['cv_splits_indices_{0}'.format(i)])
                i = i + 1
            fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
        else:
            # json is dataprep options
            print('Creating Dataflow from options...\r\nOptions:')
            logger.info('Creating Dataflow from options...')
            print(dataprep_json_obj)
            datastore_name = dataprep_json_obj['datastoreName']  # mandatory
            data_path = dataprep_json_obj['dataPath']  # mandatory
            label_column = dataprep_json_obj['label']  # mandatory
            separator = dataprep_json_obj.get('columnSeparator', ',')
            header = dataprep_json_obj.get('promoteHeader', True)
            encoding = dataprep_json_obj.get('encoding', None)
            quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
            skip_rows = dataprep_json_obj.get('skipRows', 0)
            feature_columns = dataprep_json_obj.get('features', [])

            from azureml.core import Datastore
            import azureml.dataprep as dprep

            if header:
                header = dprep.PromoteHeadersMode.CONSTANTGROUPED
            else:
                header = dprep.PromoteHeadersMode.NONE
            try:
                encoding = dprep.FileEncoding[encoding]
            except Exception:
                encoding = dprep.FileEncoding.UTF8

            ws = Run.get_context().experiment.workspace
            datastore = Datastore(ws, datastore_name)
            dflow = dprep.read_csv(path=datastore.path(data_path),
                                   separator=separator,
                                   header=header,
                                   encoding=encoding,
                                   quoting=quoting,
                                   skip_rows=skip_rows)

            if len(feature_columns) == 0:
                X = dflow.drop_columns(label_column)
            else:
                X = dflow.keep_columns(feature_columns)

            print('Inferring types for feature columns...')
            logger.info('Inferring types for feature columns...')
            sct = X.builders.set_column_types()
            sct.learn()
            sct.ambiguous_date_conversions_drop()
            X = sct.to_dataflow()

            y = dflow.keep_columns(label_column)
            if automl_settings_obj.task_type.lower() == 'regression':
                y = y.to_number(label_column)

            print('X:')
            print(X)
            logger.info('X:')
            logger.info(X)
            print('y:')
            print(y)
            logger.info('y:')
            logger.info(y)

            try:
                from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                _X = try_retrieve_pandas_dataframe_adb(X)
                fit_iteration_parameters_dict['X'] = _X.values
                fit_iteration_parameters_dict['x_raw_column_names'] = _X.columns.values
            except ImportError:
                logger.info("SDK version does not support column name extraction, falling back to the old path")
                fit_iteration_parameters_dict['X'] = dataprep_utilities.try_retrieve_pandas_dataframe(X)

            try:
                fit_iteration_parameters_dict['y'] = dataprep_utilities.try_retrieve_numpy_array(y)
            except IndexError:
                raise RetrieveNumpyArrayError()

        logger.info("Finished getting data using dataprep.")
        return fit_iteration_parameters_dict
    except Exception as e:
        print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
        logger.error("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
        if isinstance(e, RetrieveNumpyArrayError):
            logger.debug("Label column (y) does not exist in user's data.")
            e.error_type = ErrorTypes.User
        elif "The provided path is not valid." in str(e):
            logger.debug("User's data is not accessible from the remote run.")
            e.error_type = ErrorTypes.User
        elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(e):
            logger.debug("User should use a Datastore to access data that requires secrets.")
            e.error_type = ErrorTypes.User
        else:
            e.error_type = ErrorTypes.Client
        log_traceback(e, logger)
        raise RuntimeError("Error while extracting Dataflows")
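# For reference, a hedged example of the "dataprep options" shape parsed by the
# else-branch above. The field names are taken directly from the keys read in the code;
# the values shown are illustrative placeholders, not real settings.
import json

example_dataprep_json = json.dumps({
    "datastoreName": "workspaceblobstore",   # mandatory
    "dataPath": "training-data/train.csv",   # mandatory
    "label": "target",                       # mandatory
    "columnSeparator": ",",
    "promoteHeader": True,
    "encoding": "UTF8",
    "ignoreNewlineInQuotes": False,
    "skipRows": 0,
    "features": []                           # empty list means "all columns except the label"
})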
os.environ["AZUREML_SERVICE_ENDPOINT"] = parameters["SERVICE_ENDPOINT"] return setup_run() def setup_run(): global script_directory setup_wrapper( script_directory=script_directory, dataprep_json=dataprep_json, entry_point=entry_point, automl_settings=automl_settings, task_type=task_type, preprocess=preprocess, enable_subsampling=enable_subsampling, num_iterations=num_iterations ) return "Setup run completed successfully!" if __name__ == '__main__': try: result = setup_run() except Exception as e: errors = {'errors': {'exception': e, 'traceback': traceback.format_exc()}} try: current_run = Run.get_submitted_run() current_run.add_properties(errors) except Exception: pass raise print(result)
from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, models, transforms
import numpy as np
import time
import os
import copy
import argparse

from azureml.core.run import Run

# get the Azure ML run object
run = Run.get_submitted_run()


def load_data(data_dir):
    """Load the train/val data."""
    # Data augmentation and normalization for training
    # Just normalization for validation
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
def get_logger():
    try:
        return Run.get_submitted_run()
    except Exception:
        return LocalLogger()
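# LocalLogger is not shown in this snippet. A minimal sketch of what such a fallback
# might look like (an assumption, not the project's actual implementation) -- it only
# needs to mirror the Run methods the training code calls, such as log():
class LocalLogger:
    """Prints metrics locally when no submitted Azure ML run is available."""

    def log(self, name, value):
        print("{}: {}".format(name, value))

    def log_list(self, name, values):
        print("{}: {}".format(name, values))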
def train():
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir, fake_data=FLAGS.fake_data)

    sess = tf.InteractiveSession()
    # Create a multilayer model.

    # Input placeholders
    with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, 784], name='x-input')
        y_ = tf.placeholder(tf.int64, [None], name='y-input')

    with tf.name_scope('input_reshape'):
        image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
        tf.summary.image('input', image_shaped_input, 10)

    # We can't initialize these variables to 0 - the network will get stuck.
    def weight_variable(shape):
        """Create a weight variable with appropriate initialization."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_variable(shape):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def variable_summaries(var):
        """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
        with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.summary.scalar('stddev', stddev)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))
            tf.summary.histogram('histogram', var)

    def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu):
        """Reusable code for making a simple neural net layer.

        It does a matrix multiply, bias add, and then uses ReLU to nonlinearize.
        It also sets up name scoping so that the resultant graph is easy to read,
        and adds a number of summary ops.
        """
        # Adding a name scope ensures logical grouping of the layers in the graph.
        with tf.name_scope(layer_name):
            # This Variable will hold the state of the weights for the layer
            with tf.name_scope('weights'):
                weights = weight_variable([input_dim, output_dim])
                variable_summaries(weights)
            with tf.name_scope('biases'):
                biases = bias_variable([output_dim])
                variable_summaries(biases)
            with tf.name_scope('Wx_plus_b'):
                preactivate = tf.matmul(input_tensor, weights) + biases
                tf.summary.histogram('pre_activations', preactivate)
            activations = act(preactivate, name='activation')
            tf.summary.histogram('activations', activations)
            return activations

    hidden1 = nn_layer(x, 784, 500, 'layer1')

    with tf.name_scope('dropout'):
        keep_prob = tf.placeholder(tf.float32)
        tf.summary.scalar('dropout_keep_probability', keep_prob)
        dropped = tf.nn.dropout(hidden1, keep_prob)

    # Do not apply softmax activation yet, see below.
    y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity)

    with tf.name_scope('cross_entropy'):
        # The raw formulation of cross-entropy,
        #
        # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
        #                               reduction_indices=[1]))
        #
        # can be numerically unstable.
        #
        # So here we use tf.losses.sparse_softmax_cross_entropy on the
        # raw logit outputs of the nn_layer above, and then average across
        # the batch.
        with tf.name_scope('total'):
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
    tf.summary.scalar('cross_entropy', cross_entropy)

    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(
            FLAGS.learning_rate).minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(y, 1), y_)
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # Merge all the summaries and write them out to
    # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
    tf.global_variables_initializer().run()

    saver = tf.train.Saver()

    # Train the model, and also write summaries.
    # Every 10th step, measure test-set accuracy, and write test summaries
    # All other steps, run train_step on training data, & add training summaries

    def feed_dict(train):
        """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
        if train or FLAGS.fake_data:
            xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
            k = FLAGS.dropout
        else:
            xs, ys = mnist.test.images, mnist.test.labels
            k = 1.0
        return {x: xs, y_: ys, keep_prob: k}

    for i in range(FLAGS.max_steps):
        if i % 10 == 0:  # Record summaries and test-set accuracy
            summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
            test_writer.add_summary(summary, i)
            print('Accuracy at step %s: %s' % (i, acc))
            if i % 50 == 0:
                Run.get_submitted_run().log('Accuracy_test', acc)
        else:  # Record train set summaries, and train
            if i % 100 == 99:  # Record execution stats
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                summary, _ = sess.run([merged, train_step],
                                      feed_dict=feed_dict(True),
                                      options=run_options,
                                      run_metadata=run_metadata)
                train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
                train_writer.add_summary(summary, i)
                print('Adding run metadata for', i)
            else:  # Record a summary
                summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
                train_writer.add_summary(summary, i)
    train_writer.close()
    test_writer.close()

    Run.get_submitted_run().log('Accuracy', acc)

    os.makedirs('./outputs/model', exist_ok=True)
    saver.save(sess, './outputs/model/mnist.model')
    parser.add_argument('--fake_data', nargs='?', const=True,
                        type=bool, default=False,
                        help='If true, uses fake data for unit testing.')
    parser.add_argument('--max_steps', type=int, default=1000,
                        help='Number of steps to run trainer.')
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Initial learning rate')
    parser.add_argument('--dropout', type=float, default=0.9,
                        help='Keep probability for training dropout.')
    parser.add_argument('--data_dir', type=str,
                        default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
                                             'tensorflow/mnist/input_data'),
                        help='Directory for storing input data')
    parser.add_argument('--log_dir', type=str,
                        default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
                                             'tensorflow/mnist/logs/mnist_with_summaries'),
                        help='Summaries log directory')
    FLAGS, unparsed = parser.parse_known_args()

    Run.get_submitted_run().log('learning_rate', FLAGS.learning_rate)
    Run.get_submitted_run().log('dropout', FLAGS.dropout)

    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
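# The FileWriter calls in train() write event files under FLAGS.log_dir. A hedged
# usage note (run outside this script) for inspecting them with TensorBoard,
# assuming the default log directory above:
#
#     tensorboard --logdir /tmp/tensorflow/mnist/logs/mnist_with_summaries
#
# The 'train' and 'test' subdirectories then show up as separate runs in the UI.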
def main(unused_argv):
    data_root = os.path.join("outputs", "MNIST")
    mnist = None

    tf_config = os.environ.get("TF_CONFIG")
    if not tf_config or tf_config == "":
        raise ValueError("TF_CONFIG not found.")
    tf_config_json = json.loads(tf_config)
    cluster = tf_config_json.get('cluster')
    job_name = tf_config_json.get('task', {}).get('type')
    task_index = tf_config_json.get('task', {}).get('index')
    job_name = "worker" if job_name == "master" else job_name

    sentinel_path = os.path.join(data_root, "complete.txt")
    if job_name == "worker" and task_index == 0:
        mnist = input_data.read_data_sets(data_root, one_hot=True)
        with open(sentinel_path, 'w+') as f:
            f.write("download complete")
    else:
        while not os.path.exists(sentinel_path):
            time.sleep(0.01)
        mnist = input_data.read_data_sets(data_root, one_hot=True)

    if FLAGS.download_only:
        sys.exit(0)

    print("job name = %s" % job_name)
    print("task index = %d" % task_index)
    print("number of GPUs = %d" % FLAGS.num_gpus)

    # Construct the cluster and start the server
    cluster_spec = tf.train.ClusterSpec(cluster)

    # Get the number of workers.
    num_workers = len(cluster_spec.task_indices("worker"))

    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(cluster_spec, job_name=job_name, task_index=task_index)
        if job_name == "ps":
            server.join()

    is_chief = (task_index == 0)
    if FLAGS.num_gpus > 0:
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (task_index % FLAGS.num_gpus)
        worker_device = "/job:worker/task:%d/gpu:%d" % (task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, cpu)

    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    with tf.device(
            tf.train.replica_device_setter(worker_device=worker_device,
                                           ps_device="/job:ps/cpu:0",
                                           cluster=cluster)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Variables of the hidden layer
        hid_w = tf.Variable(tf.truncated_normal(
            [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
            stddev=1.0 / IMAGE_PIXELS), name="hid_w")
        hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")

        # Variables of the softmax layer
        sm_w = tf.Variable(tf.truncated_normal(
            [FLAGS.hidden_units, 10],
            stddev=1.0 / math.sqrt(FLAGS.hidden_units)), name="sm_w")
        sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

        # Ops: located on the worker specified with task_index
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
        y_ = tf.placeholder(tf.float32, [None, 10])

        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
        hid = tf.nn.relu(hid_lin)

        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
        cross_entropy = -tf.reduce_sum(
            y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate

            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")

        train_step = opt.minimize(cross_entropy, global_step=global_step)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op

            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp()

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=train_dir,
                                     init_op=init_op,
                                     recovery_wait_secs=1,
                                     global_step=global_step)

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps", "/job:worker/task:%d" % task_index])

        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." % task_index)

        if FLAGS.existing_servers:
            # look up this worker's pre-started server address from the TF_CONFIG cluster spec
            server_grpc_url = "grpc://" + cluster["worker"][task_index]
            print("Using existing server at: %s" % server_grpc_url)
            sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

        print("Worker %d: Session initialization complete." % task_index)

        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        while True:
            # Training feed
            batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
            train_feed = {x: batch_xs, y_: batch_ys}

            _, step = sess.run([train_step, global_step], feed_dict=train_feed)
            local_step += 1

            now = time.time()
            print("%f: Worker %d: training step %d done (global step: %d)" %
                  (now, task_index, local_step, step))

            if step >= FLAGS.train_steps:
                break

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)

        # Validation feed
        val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        val_xent = sess.run(cross_entropy, feed_dict=val_feed)
        print("After %d training step(s), validation cross entropy = %g" %
              (FLAGS.train_steps, val_xent))

        if job_name == "worker" and task_index == 0:
            run = Run.get_submitted_run()
            run.log("CrossEntropy", val_xent)
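# For reference, a hedged example of the TF_CONFIG structure that main() parses above
# (job names and host:port values are illustrative placeholders):
#
#     TF_CONFIG = json.dumps({
#         "cluster": {
#             "ps": ["ps0.example.com:2222"],
#             "worker": ["worker0.example.com:2222", "worker1.example.com:2222"]
#         },
#         "task": {"type": "worker", "index": 0}
#     })
#
# The script reads cluster, task.type and task.index, and treats a "master" task
# as a worker.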