Example #1
def train_and_evaluate(args):
    iris_model = model.model_fn()
    # Train the model
    from sklearn.datasets import load_iris
    iris_data = load_iris()  # load the iris dataset
    _X = iris_data.data
    _y = iris_data.target

    X = _X
    import numpy as np
    from sklearn.preprocessing import OneHotEncoder
    ohe = OneHotEncoder()
    y = ohe.fit_transform(np.reshape(_y, (-1, 1))).toarray()

    # Feature scaling is important for convergence of the neural network,
    # but it is left disabled here:
    # scaler = StandardScaler()
    # X = scaler.fit_transform(_X)

    # Split the data set into training and testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=2)
    iris_model.fit(X_train, y_train, verbose=1, batch_size=5, epochs=50)

    # Save the model to Google Cloud Storage. An alternative is to save
    # locally and copy the file over with file_io:
    # iris_model.save('model.pb')
    # with file_io.FileIO('model.pb', mode='rb') as input_f:
    #     with file_io.FileIO(args.job_dir + '/model.pb', mode='wb') as output_f:
    #         output_f.write(input_f.read())
    iris_model.save(os.path.join(args.job_dir, 'export'))
Example #2
def train_model(args):
    mnist_model = model.model_fn(args.learning_rate)
    train_dataset, eval_dataset = model.get_dataset(args.train_batch_size)

    history = mnist_model.fit(train_dataset, epochs=args.num_epochs)

    eval_loss, eval_acc = mnist_model.evaluate(eval_dataset)
    print('Eval loss: {}, Eval Accuracy: {}'.format(eval_loss, eval_acc))
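Example #2 assumes a `model` module that provides `model_fn` and `get_dataset`, neither of which is shown on this page. A self-contained sketch of what they might look like for MNIST, assuming tf.keras and the built-in dataset loader (the layer sizes are illustrative, not from the original):

import tensorflow as tf

def model_fn(learning_rate):
    # A small fully connected classifier for 28x28 grayscale digits.
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

def get_dataset(batch_size):
    # Batched tf.data pipelines over the train and test splits.
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    train = tf.data.Dataset.from_tensor_slices(
        (x_train / 255.0, y_train)).shuffle(60000).batch(batch_size)
    eval_ = tf.data.Dataset.from_tensor_slices(
        (x_test / 255.0, y_test)).batch(batch_size)
    return train, eval_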
Example #3
def train_and_evaluate(args):
  INPUT_DIM = args.training_history*ONE_HOUR
  CLASS_SIZE = len(bins)+1
  hidden_units = args.hidden_units
  # hidden_units = [int(units) for units in args.hidden_units.split(',')]
  learning_rate = args.learning_rate
  disk_model = model.model_fn(INPUT_DIM, CLASS_SIZE,
                              hidden_units, learning_rate)
  try:
    os.makedirs(args.job_dir)
  except OSError:
    pass

  # Unhappy hack to work around h5py not being able to write to GCS.
  # Force snapshots and saves to local filesystem, then copy them over to GCS.
  checkpoint_path = CHECKPOINT_FILE_PATH
  if not args.job_dir.startswith('gs://'):
    checkpoint_path = os.path.join(args.job_dir, checkpoint_path)

  # Model checkpoint callback.
  checkpoint = ModelCheckpoint(
      checkpoint_path,
      monitor='val_loss',
      verbose=1,
      period=args.checkpoint_epochs,
      mode='min')

  # Continuous eval callback.
  # evaluation = ContinuousEval(args.eval_frequency, args.eval_files,
  #                             args.learning_rate, args.job_dir)

  # Tensorboard logs callback.
  tb_log = TensorBoard(
      log_dir=os.path.join(args.job_dir, 'logs'),
      histogram_freq=0,
      write_graph=True,
      embeddings_freq=0)

  callbacks = [checkpoint, tb_log]

  history = disk_model.fit_generator(
      model.generator_input(args.train_files, args.training_history,
                            args.train_batch_size),
      validation_data=model.generator_input(args.eval_files,
                                            args.training_history,
                                            args.eval_batch_size),
      steps_per_epoch=args.train_steps,
      validation_steps=10,
      epochs=args.num_epochs,
      callbacks=callbacks)

  # Unhappy hack to work around h5py not being able to write to GCS.
  # Force snapshots and saves to local filesystem, then copy them over to GCS.
  if args.job_dir.startswith('gs://'):
    disk_model.save(DISK_MODEL)
    copy_file_to_gcs(args.job_dir, DISK_MODEL)
  else:
    disk_model.save(os.path.join(args.job_dir, DISK_MODEL))
  # pickle requires a binary-mode file handle.
  with file_io.FileIO(
          os.path.join(args.job_dir, 'history'), mode='wb') as output_f:
      pickle.dump(history.history, output_f)
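Example #3 and most of the examples below call a `copy_file_to_gcs` helper that is never defined on this page. A minimal sketch, mirroring the commented-out copy in Example #1 and assuming the TF 1.x `file_io` module:

import os
from tensorflow.python.lib.io import file_io

def copy_file_to_gcs(job_dir, file_path):
    # Stream a local file into the GCS job directory; binary mode so
    # HDF5/pickle payloads survive the round trip.
    with file_io.FileIO(file_path, mode='rb') as input_f:
        with file_io.FileIO(os.path.join(job_dir, file_path),
                            mode='wb') as output_f:
            output_f.write(input_f.read())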
Example #4
def dispatch(train_files, eval_files, job_dir, train_steps, eval_steps,
             train_batch_size, eval_batch_size, learning_rate, eval_frequency,
             first_layer_size, num_layers, scale_factor, eval_num_epochs,
             num_epochs, checkpoint_epochs):
    census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE)

    try:
        os.makedirs(job_dir)
    except OSError:
        pass

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback
    checkpoint = ModelCheckpoint(checkpoint_path,
                                 monitor='val_loss',
                                 verbose=1,
                                 period=checkpoint_epochs,
                                 mode='min')  # val_loss should be minimized

    # Continuous eval callback
    evaluation = ContinuousEval(eval_frequency, eval_files, learning_rate,
                                job_dir)

    # Tensorboard logs callback
    tblog = TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                        histogram_freq=0,
                        write_graph=True)

    callbacks = [checkpoint, evaluation, tblog]

    census_model.fit_generator(model.generator_input(train_files,
                                                     chunk_size=CHUNK_SIZE),
                               steps_per_epoch=train_steps,
                               epochs=num_epochs,
                               callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if job_dir.startswith("gs://"):
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(job_dir, CENSUS_MODEL))

    # Convert the Keras model to TensorFlow SavedModel
    if os.path.exists(os.path.join(job_dir, 'export')):
        shutil.rmtree(os.path.join(job_dir, 'export'))

    model.to_savedmodel(census_model, os.path.join(job_dir, 'export'))
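`model.to_savedmodel` is also left undefined here. For a TF 1.x Keras model it is typically a thin wrapper around SavedModelBuilder; a sketch, with the signature tensor names ('input'/'output') chosen only for illustration:

import tensorflow as tf
from keras import backend as K

def to_savedmodel(model, export_path):
    # Export the live Keras session graph as a serving SavedModel.
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)
    signature = tf.saved_model.signature_def_utils.predict_signature_def(
        inputs={'input': model.inputs[0]},
        outputs={'output': model.outputs[0]})
    with K.get_session() as sess:
        builder.add_meta_graph_and_variables(
            sess=sess,
            tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                    signature})
        builder.save()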
Example #5
def build_and_run_exports(latest, job_dir, serving_input_fn, hidden_units):
    """Given the latest checkpoint file export the saved model.

  Args:
    latest (string): Latest checkpoint file
    job_dir (string): Location of checkpoints and model files
    name (string): Name of the checkpoint to be exported. Used in building the
      export path.
    hidden_units (list): Number of hidden units
    learning_rate (float): Learning rate for the SGD
  """

    prediction_graph = tf.Graph()
    exporter = tf.saved_model.builder.SavedModelBuilder(
        os.path.join(job_dir, 'export'))
    with prediction_graph.as_default():
        features, inputs_dict = serving_input_fn()
        prediction_dict = model.model_fn(
            model.PREDICT,
            features.copy(),
            None,  # labels
            hidden_units=hidden_units,
            learning_rate=None  # learning_rate unused in prediction mode
        )
        saver = tf.train.Saver()

        inputs_info = {
            name: tf.saved_model.utils.build_tensor_info(tensor)
            for name, tensor in six.iteritems(inputs_dict)
        }
        output_info = {
            name: tf.saved_model.utils.build_tensor_info(tensor)
            for name, tensor in six.iteritems(prediction_dict)
        }
        signature_def = tf.saved_model.signature_def_utils.build_signature_def(
            inputs=inputs_info,
            outputs=output_info,
            method_name=sig_constants.PREDICT_METHOD_NAME)

    with tf.Session(graph=prediction_graph) as session:
        session.run(
            [tf.local_variables_initializer(),
             tf.tables_initializer()])
        saver.restore(session, latest)
        exporter.add_meta_graph_and_variables(
            session,
            tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                sig_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature_def
            },
            legacy_init_op=main_op())

    exporter.save()
Example #6
def build_and_run_exports(latest, job_dir, serving_input_fn, hidden_units):
  """Given the latest checkpoint file export the saved model.

  Args:
    latest (str): Latest checkpoint file.
    job_dir (str): Location of checkpoints and model files.
    serving_input_fn (function): Serving input function.
    hidden_units (list): Number of hidden units.
  """

  prediction_graph = tf.Graph()
  # Create exporter.
  exporter = tf.saved_model.builder.SavedModelBuilder(
      os.path.join(job_dir, 'export'))
  with prediction_graph.as_default():
    features, inputs_dict = serving_input_fn()
    prediction_dict = model.model_fn(
        model.PREDICT,
        features.copy(),
        None,  # labels
        hidden_units=hidden_units,
        learning_rate=None  # learning_rate unused in prediction mode
    )
    saver = tf.train.Saver()

    inputs_info = {
        name: tf.saved_model.utils.build_tensor_info(tensor)
        for name, tensor in six.iteritems(inputs_dict)
    }
    output_info = {
        name: tf.saved_model.utils.build_tensor_info(tensor)
        for name, tensor in six.iteritems(prediction_dict)
    }
    signature_def = tf.saved_model.signature_def_utils.build_signature_def(
        inputs=inputs_info,
        outputs=output_info,
        method_name=sig_constants.PREDICT_METHOD_NAME
    )

  with tf.Session(graph=prediction_graph) as session:
    session.run([tf.local_variables_initializer(), tf.tables_initializer()])
    saver.restore(session, latest)
    exporter.add_meta_graph_and_variables(
        session,
        tags=[tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            sig_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature_def
        },
        legacy_init_op=main_op()
    )
  exporter.save()
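To sanity-check an export produced by Examples #5/#6, the SavedModel can be loaded back and its serving signature inspected with the TF 1.x loader API (the export path is whatever `job_dir/export` resolved to):

import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], 'job_dir/export')
    sig = meta_graph.signature_def[
        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    print('inputs: ', {name: info.name for name, info in sig.inputs.items()})
    print('outputs:', {name: info.name for name, info in sig.outputs.items()})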
Example #7
def train_and_evaluate(args):
  census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE)
  try:
    os.makedirs(args.job_dir)
  except OSError:
    pass

  # Unhappy hack to work around h5py not being able to write to GCS.
  # Force snapshots and saves to local filesystem, then copy them over to GCS.
  checkpoint_path = CHECKPOINT_FILE_PATH
  if not args.job_dir.startswith('gs://'):
    checkpoint_path = os.path.join(args.job_dir, checkpoint_path)

  # Model checkpoint callback.
  checkpoint = ModelCheckpoint(
      checkpoint_path,
      monitor='val_loss',
      verbose=1,
      period=args.checkpoint_epochs,
      mode='min')

  # Continuous eval callback.
  evaluation = ContinuousEval(args.eval_frequency, args.eval_files,
                              args.learning_rate, args.job_dir)

  # Tensorboard logs callback.
  tb_log = TensorBoard(
      log_dir=os.path.join(args.job_dir, 'logs'),
      histogram_freq=0,
      write_graph=True,
      embeddings_freq=0)

  callbacks = [checkpoint, evaluation, tb_log]

  census_model.fit_generator(
      model.generator_input(args.train_files, chunk_size=CHUNK_SIZE),
      steps_per_epoch=args.train_steps,
      epochs=args.num_epochs,
      callbacks=callbacks)

  # Unhappy hack to work around h5py not being able to write to GCS.
  # Force snapshots and saves to local filesystem, then copy them over to GCS.
  if args.job_dir.startswith('gs://'):
    census_model.save(CENSUS_MODEL)
    copy_file_to_gcs(args.job_dir, CENSUS_MODEL)
  else:
    census_model.save(os.path.join(args.job_dir, CENSUS_MODEL))

  # Convert the Keras model to TensorFlow SavedModel.
  model.to_savedmodel(census_model, os.path.join(args.job_dir, 'export'))
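`ContinuousEval`, used in most of these examples, is a custom Keras callback rather than a built-in. A simplified sketch of its usual shape, re-evaluating the newest checkpoint every few epochs (the `model.compile_model` and `model.generator_input` helpers and `CHUNK_SIZE` are assumed to come from the same package as above):

import glob
import os

from keras.callbacks import Callback
from keras.models import load_model

class ContinuousEval(Callback):
    def __init__(self, eval_frequency, eval_files, learning_rate, job_dir,
                 steps=1000):
        self.eval_frequency = eval_frequency
        self.eval_files = eval_files
        self.learning_rate = learning_rate
        self.job_dir = job_dir
        self.steps = steps

    def on_epoch_begin(self, epoch, logs=None):
        if epoch > 0 and epoch % self.eval_frequency == 0:
            checkpoints = sorted(
                glob.glob(os.path.join(self.job_dir, 'checkpoint.*')))
            if checkpoints:
                # Reload the newest snapshot and score it on the eval set.
                census_model = load_model(checkpoints[-1])
                census_model = model.compile_model(census_model,
                                                   self.learning_rate)
                loss, acc = census_model.evaluate_generator(
                    model.generator_input(self.eval_files,
                                          chunk_size=CHUNK_SIZE),
                    steps=self.steps)
                print('Evaluation epoch[{}] metrics[{:.2f}, {:.2f}]'.format(
                    epoch, loss, acc))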
Example #8
def dispatch(train_files, eval_files, job_dir, learning_rate, eval_frequency,
             num_epochs, checkpoint_epochs):

    # setting the seed for reproducibility
    np.random.seed(13)

    forecast_model = model.model_fn()

    scaler = model.build_scaler(train_files + eval_files)

    try:
        os.makedirs(job_dir)
    except Exception as e:
        print(e)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 verbose=1,
                                                 period=checkpoint_epochs)

    # Continuous eval callback
    with ContinuousEval(eval_frequency, eval_files, learning_rate, job_dir,
                        scaler) as evaluation:

        # Tensorboard logs callback
        tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(
            job_dir, 'logs'),
                                            histogram_freq=0,
                                            write_graph=True,
                                            embeddings_freq=0)

        callbacks = [checkpoint, evaluation, tblog]

        x, y = model.load_features(train_files, scaler)
        forecast_model.fit(x, y, epochs=num_epochs, callbacks=callbacks)

        # Unhappy hack to work around h5py not being able to write to GCS.
        # Force snapshots and saves to local filesystem, then copy them over to GCS.
        if job_dir.startswith("gs://"):
            forecast_model.save(MODEL_FILENAME)
            copy_file_to_gcs(job_dir, MODEL_FILENAME)
        else:
            forecast_model.save(os.path.join(job_dir, MODEL_FILENAME))
Example #9
def train_evaluate(model_name, hidden_units, train_file, valid_file, ckpt_folder, optimizer, batch_size, max_steps, lr, eval_steps):
    
    estimator = model_fn(model_name, hidden_units, ckpt_folder, optimizer, lr)
    
    train_input_fn = lambda: input_fn(file=train_file, batch_size=batch_size, train=True)
    valid_input_fn = lambda: input_fn(file=valid_file, batch_size=batch_size, train=False)

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=max_steps)
    
    export_latest = tf.estimator.FinalExporter("bclassifier", serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(input_fn=valid_input_fn, 
                                      steps=eval_steps,
                                      exporters=export_latest)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
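Example #9 hands a `serving_input_fn` to `FinalExporter` without showing it. A minimal TF 1.x sketch, assuming a hypothetical dense float feature `x` of width 10:

import tensorflow as tf

def serving_input_fn():
    # Placeholder fed at serving time; the feature name and shape are
    # illustrative, not taken from the original code.
    inputs = {'x': tf.placeholder(tf.float32, shape=[None, 10], name='x')}
    return tf.estimator.export.ServingInputReceiver(features=inputs,
                                                    receiver_tensors=inputs)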
Example #10
def train_and_evaluate(args):
    logic_nn_model = model.model_fn(**vars(args))
    try:
        os.makedirs(args.job_dir)
    except OSError:
        pass

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_FILE_PATH
    if not args.job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(args.job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = ModelCheckpoint(checkpoint_path,
                                 monitor='val_loss',
                                 verbose=1,
                                 period=args.checkpoint_epochs,
                                 mode='min')

    # Continuous eval callback.
    evaluation = ContinuousEval(args.eval_frequency, args.learning_rate,
                                args.job_dir, args.eval_steps)

    # Tensorboard logs callback.
    tb_log = TensorBoard(log_dir=os.path.join(args.job_dir, 'logs'),
                         histogram_freq=0,
                         write_graph=True,
                         embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tb_log]

    logic_nn_model.fit_generator(model.generator_input(),
                                 steps_per_epoch=args.train_steps,
                                 epochs=args.num_epochs,
                                 callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if args.job_dir.startswith('gs://'):
        logic_nn_model.save(LOGICAL_NN_MODEL)
        copy_file_to_gcs(args.job_dir, LOGICAL_NN_MODEL)
    else:
        logic_nn_model.save(os.path.join(args.job_dir, LOGICAL_NN_MODEL))

    # Convert the Keras model to TensorFlow SavedModel.
    model.to_saved_model(logic_nn_model, os.path.join(args.job_dir, 'export'))
    print("...Finished actions for build and train with model...")
Example #11
def train_and_evaluate(hparams):
    census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE)
    try:
        os.makedirs(hparams.job_dir)
    except OSError:
        pass

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_FILE_PATH
    if not hparams.job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(hparams.job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = ModelCheckpoint(checkpoint_path,
                                 monitor='val_loss',
                                 verbose=1,
                                 period=hparams.checkpoint_epochs,
                                 mode='min')

    # Continuous eval callback.
    evaluation = ContinuousEval(hparams.eval_frequency, hparams.eval_files,
                                hparams.learning_rate, hparams.job_dir)

    # Tensorboard logs callback.
    tb_log = TensorBoard(log_dir=os.path.join(hparams.job_dir, 'logs'),
                         histogram_freq=0,
                         write_graph=True,
                         embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tb_log]

    census_model.fit_generator(model.generator_input(hparams.train_files,
                                                     chunk_size=CHUNK_SIZE),
                               steps_per_epoch=hparams.train_steps,
                               epochs=hparams.num_epochs,
                               callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if hparams.job_dir.startswith('gs://'):
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(hparams.job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(hparams.job_dir, CENSUS_MODEL))

    # Convert the Keras model to TensorFlow SavedModel.
    model.to_savedmodel(census_model, os.path.join(hparams.job_dir, 'export'))
Example #12
def dispatch(data_file, job_dir, num_epochs):
    job_dir = create_job_dir(job_dir)
    nb_chars, embedding_matrix, x_train, y_train, x_val, y_val = \
        model.get_training_data(data_file, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH, VALIDATION_SPLIT, EMBEDDING_FILE_GCS)
    my_model = model.model_fn(nb_chars, embedding_matrix)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 monitor='val_loss',
                                                 verbose=1,
                                                 save_best_only=True,
                                                 mode='min')

    # Tensorboard logs callback
    tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                        write_graph=True,
                                        embeddings_freq=0)

    callbacks = [checkpoint, tblog]

    my_model = model.compile_model(my_model)
    my_model.fit(x_train,
                 y_train,
                 validation_data=(x_val, y_val),
                 epochs=num_epochs,
                 batch_size=128,
                 callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if job_dir.startswith("gs://"):
        my_model.save(MY_MODEL_NAME)
        copy_file_to_gcs(job_dir, MY_MODEL_NAME)
    else:
        my_model.save(os.path.join(job_dir, MY_MODEL_NAME))

    # Convert the Keras model to TensorFlow SavedModel
    model.to_savedmodel(my_model, os.path.join(job_dir, 'export'))
Example #13
def train_and_evaluate(args):
  # Showcasing the hypertuning parameters here.
  # The first-layer-size is being tuned in this example
  hidden_units = [args.first_layer_size, 70, 50, 20]
  census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE, hidden_units)
  try:
    os.makedirs(args.job_dir)
  except OSError:
    pass

  # Unhappy hack to work around h5py not being able to write to GCS.
  # Force snapshots and saves to local filesystem, then copy them over to GCS.
  checkpoint_path = CHECKPOINT_FILE_PATH
  if not args.job_dir.startswith('gs://'):
    checkpoint_path = os.path.join(args.job_dir, checkpoint_path)

  # Model checkpoint callback.
  checkpoint = ModelCheckpoint(
      checkpoint_path,
      monitor='val_loss',
      verbose=1,
      save_best_only=False,
      period=args.checkpoint_epochs,
      mode='min')

  # Continuous eval callback.
  evaluation = ContinuousEval(args.eval_frequency, args.eval_files,
                              args.learning_rate, args.job_dir)

  # Tensorboard logs callback.
  tb_log = TensorBoard(
      log_dir=os.path.join(args.job_dir, 'logs'),
      histogram_freq=0,
      write_graph=True,
      embeddings_freq=0)

  callbacks = [checkpoint, evaluation, tb_log]

  census_model.fit_generator(
      model.generator_input(args.train_files, chunk_size=CHUNK_SIZE),
      steps_per_epoch=args.train_steps,
      epochs=args.num_epochs,
      use_multiprocessing=args.distributed,
      callbacks=callbacks)

  # Unhappy hack to work around h5py not being able to write to GCS.
  # Force snapshots and saves to local filesystem, then copy them over to GCS.
  if args.job_dir.startswith('gs://'):
    census_model.save(CENSUS_MODEL)
    copy_file_to_gcs(args.job_dir, CENSUS_MODEL)
  else:
    census_model.save(os.path.join(args.job_dir, CENSUS_MODEL))

  # Convert the Keras model to TensorFlow SavedModel.
  model.to_savedmodel(census_model, os.path.join(args.job_dir, 'export'))

  # The following is for hyperparameter tuning and is adapted from:
  # https://cloud.google.com/ml-engine/docs/tensorflow/using-hyperparameter-tuning
  # Note: last_loss_val is updated after each checkpoint, but the summary is
  # only written once.
  summary = Summary(value=[Summary.Value(tag='val_loss', simple_value=evaluation.last_loss_val)])

  # More hypertune info:
  # https://cloud.google.com/solutions/machine-learning/recommendation-system-tensorflow-train-cloud-ml-engine

  job_dir = args.job_dir

  if args.hypertune:
      # if tuning, join the trial number to the output path
      trial = json.loads(os.environ.get('TF_CONFIG', '{}')).get('task', {}).get('trial', '')
      output_dir = os.path.join(job_dir, trial)
  else:
      output_dir = job_dir

  eval_path = os.path.join(output_dir, 'val_loss')
  summary_writer = tf.summary.FileWriter(eval_path)

  # Note: adding the summary to the writer is enough for hyperparameter tuning.
  # ML Engine looks for any summary added with the hyperparameter metric tag.
  summary_writer.add_summary(summary)
  summary_writer.flush()
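The summary written above is what the hyperparameter tuner reads. For reference, the same metric could instead be reported with the `cloudml-hypertune` helper package; this is an alternative under the assumption that the package is installed, not what the original code does:

import hypertune

hpt = hypertune.HyperTune()
hpt.report_hyperparameter_tuning_metric(
    hyperparameter_metric_tag='val_loss',
    metric_value=evaluation.last_loss_val,
    global_step=args.num_epochs)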
Example #14
def run(target, cluster_spec, is_chief, hparams):
    """Runs the training and evaluation graph.

  Args:
    target (str): Tensorflow server target.
    cluster_spec: (cluster spec) Cluster specification.
    is_chief (bool): Boolean flag to specify a chief server.
    hparams (HParams): Input arguments.
  """

    # Calculate the number of hidden units
    hidden_units = [
        max(2, int(hparams.first_layer_size * hparams.scale_factor**i))
        for i in range(hparams.num_layers)
    ]

    # If this server is the chief (the `master` task):
    # in between-graph replication, the chief is the one node in the
    # cluster with extra responsibilities, and by default it is worker
    # task zero. Here the master is assigned as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on
    # distributed TensorFlow and the motivation for a chief.
    if is_chief:
        tf.logging.info('Created DNN hidden units {}'.format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():

            # Features and label tensors
            features, labels = model.input_fn(
                hparams.eval_files,
                num_epochs=None if hparams.eval_steps else 1,
                batch_size=hparams.eval_batch_size,
                shuffle=False)
            # Accuracy and AUROC metrics
            # model.model_fn returns the dict when EVAL mode
            metric_dict = model.model_fn(model.EVAL,
                                         features.copy(),
                                         labels,
                                         hidden_units=hidden_units,
                                         learning_rate=hparams.learning_rate)

        hooks = [
            EvaluationRunHook(
                hparams.job_dir,
                metric_dict,
                evaluation_graph,
                hparams.eval_frequency,
                eval_steps=hparams.eval_steps,
            )
        ]
    else:
        hooks = []

    # Create a new graph and specify that as default.
    with tf.Graph().as_default():
        # Placement of ops on devices using replica device setter
        # which automatically places the parameters on the `ps` server
        # and the `ops` on the workers.
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):

            # Features and label tensors as read using filename queue.
            features, labels = model.input_fn(
                hparams.train_files,
                num_epochs=hparams.num_epochs,
                batch_size=hparams.train_batch_size)

            # Returns the training graph and global step tensor.
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=hparams.learning_rate)

        # Creates a MonitoredSession for training.
        # MonitoredSession is a Session-like object that handles
        # initialization, recovery and hooks
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=hparams.job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as session:
            # Global step to keep track of global number of steps particularly in
            # distributed setting
            step = global_step_tensor.eval(session=session)

            # Run the training graph which returns the step number as tracked by
            # the global step tensor.
            # When train epochs is reached, session.should_stop() will be true.
            while (hparams.train_steps is None or
                   step < hparams.train_steps) and not session.should_stop():
                step, _ = session.run([global_step_tensor, train_op])

        # Find the filename of the latest saved checkpoint file
        latest_checkpoint = tf.train.latest_checkpoint(hparams.job_dir)

        # Only perform this if chief
        if is_chief:
            build_and_run_exports(
                latest_checkpoint, hparams.job_dir,
                model.SERVING_INPUT_FUNCTIONS[hparams.export_format],
                hidden_units)
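Examples #14 and #19 expect `target`, `cluster_spec`, and `is_chief` to be derived from the `TF_CONFIG` environment variable set by the training service. A sketch of that dispatch logic under TF 1.x, with argument order chosen to match the `run` signature above:

import json
import os

import tensorflow as tf

def dispatch(args):
    tf_config = os.environ.get('TF_CONFIG')
    if not tf_config:
        # Local run: empty target, no cluster, acting as chief.
        return run('', None, True, args)

    tf_config_json = json.loads(tf_config)
    cluster = tf_config_json.get('cluster')
    job_name = tf_config_json.get('task', {}).get('type')
    task_index = tf_config_json.get('task', {}).get('index')

    cluster_spec = tf.train.ClusterSpec(cluster)
    server = tf.train.Server(cluster_spec,
                             job_name=job_name,
                             task_index=task_index)
    if job_name == 'ps':
        server.join()  # Parameter servers only serve variables.
    elif job_name in ['master', 'worker']:
        return run(server.target, cluster_spec, job_name == 'master', args)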
Example #15
def train_and_evaluate(args):

    CLASS_SIZE = len(bins) + 1

    # hidden_units = [int(units) for units in args.hidden_units.split(',')]

    try:
        os.makedirs(args.job_dir)
    except OSError:
        pass

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_FILE_PATH
    if not args.job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(args.job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = ModelCheckpoint(checkpoint_path,
                                 monitor='val_loss',
                                 verbose=1,
                                 period=args.checkpoint_epochs,
                                 mode='min')

    tb_log = TensorBoard(log_dir=os.path.join(args.job_dir, 'logs'),
                         histogram_freq=0,
                         write_graph=True,
                         embeddings_freq=0)

    callbacks = [checkpoint, tb_log]
    sequential_train = [
        int(float(hour)) for hour in args.sequential_train.split(',')
    ]
    seq_id = 0
    sequential_models = []
    weights_all = []
    # Collect histories across all stages; initializing this inside the loop
    # would discard all but the last stage's history.
    history_all = []
    for hours in sequential_train:

        if seq_id == 0:
            hidden_units = args.hidden_units
            learning_rate = args.learning_rate
            INPUT_DIM = hours * ONE_HOUR

            ###########fully connected model#############
            first_model = initial_model.model_fn(INPUT_DIM, CLASS_SIZE,
                                                 hidden_units, 0.0001)
            assign_w = 0.03
            first_model.compile(loss=initial_model.weighted_loss(assign_w),
                                optimizer=keras.optimizers.Adam(lr=0.0001),
                                metrics=[
                                    initial_model.first_class_accuracy,
                                    initial_model.other_class_accuracy,
                                    initial_model.single_class_accuracy(1),
                                    initial_model.single_class_accuracy(2),
                                    initial_model.single_class_accuracy(3),
                                    initial_model.single_class_accuracy(4),
                                    initial_model.single_class_accuracy(5),
                                    initial_model.single_class_accuracy(6),
                                    initial_model.single_class_accuracy(7),
                                    'accuracy'
                                ])
            train_file_names = os.path.join(args.train_files,
                                            str(hours) + 'hrs', 'train',
                                            '*npz')
            eval_file_names = os.path.join(args.eval_files,
                                           str(hours) + 'hrs', 'eval', '*npz')
            print("\n\ntraining " + str(hours) + 'hrs!\n\n')
            history_all.append(
                first_model.fit_generator(
                    initial_model.generator_input(train_file_names,
                                                  args.train_batch_size),
                    validation_data=initial_model.generator_input(
                        eval_file_names, args.eval_batch_size),
                    steps_per_epoch=args.train_steps,
                    validation_steps=args.eval_steps,
                    epochs=args.num_epochs,
                    callbacks=callbacks))

            weights = first_model.get_weights()
            weights_all.append(weights)
            with open(
                    os.path.join(args.job_dir, 'weights',
                                 str(hours) + 'hrs_weights'), 'wb') as fp:
                pickle.dump(weights, fp)
            DISK_MODEL = 'disk_model.hdf5'
            if args.job_dir.startswith('gs://'):
                first_model.save(DISK_MODEL)
                copy_file_to_gcs(args.job_dir, DISK_MODEL)
            else:
                first_model.save(os.path.join(args.job_dir, DISK_MODEL))
            data, label = initial_model.generator_input_once(
                str(args.train_files) + str(hours) + 'hrs/train/input_' +
                str(hours) + 'hrs_8.npz', 3)
            first_model.compile(loss=initial_model.weighted_loss(0.00081),
                                optimizer=keras.optimizers.Adam(lr=0.0001),
                                metrics=[
                                    initial_model.first_class_accuracy,
                                    initial_model.other_class_accuracy,
                                    initial_model.single_class_accuracy(1),
                                    initial_model.single_class_accuracy(2),
                                    initial_model.single_class_accuracy(3),
                                    initial_model.single_class_accuracy(4),
                                    initial_model.single_class_accuracy(5),
                                    initial_model.single_class_accuracy(6),
                                    initial_model.single_class_accuracy(7),
                                    'accuracy'
                                ])
            first_model.fit_generator(
                initial_model.generator_input(train_file_names,
                                              args.train_batch_size),
                validation_data=initial_model.generator_input(
                    eval_file_names, args.eval_batch_size),
                steps_per_epoch=args.train_steps,
                validation_steps=args.eval_steps,
                epochs=50,
                callbacks=callbacks)
            scores = first_model.evaluate(x=data,
                                          y=label,
                                          batch_size=None,
                                          verbose=1,
                                          sample_weight=None,
                                          steps=1)
            print("\ntest " + str(hours) + 'hrs after train\n')
            print(scores)
            seq_id = seq_id + 1

        else:
            with open(
                    os.path.join(
                        args.job_dir, 'weights',
                        str(sequential_train[seq_id - 1]) + 'hrs_weights'),
                    'rb') as fp:
                weights_0 = pickle.load(fp)
            # Sequential(weights, CONCAT_UNIT_SIZE, INPUT_SHAPE, learning_rate)
            hours = sequential_train[seq_id]
            seq = Sequential(weights_0, args.CONCAT_UNIT_SIZE,
                             hours * ONE_HOUR, 'zeros')
            model = seq.build_sequential_model()
            # assign_w = 0.016+0.005*seq_id
            assign_w = 0.03
            model.compile(loss=initial_model.weighted_loss(assign_w),
                          optimizer=keras.optimizers.Adam(lr=0.0001),
                          metrics=[
                              initial_model.first_class_accuracy,
                              initial_model.other_class_accuracy,
                              initial_model.single_class_accuracy(1),
                              initial_model.single_class_accuracy(2),
                              initial_model.single_class_accuracy(3),
                              initial_model.single_class_accuracy(4),
                              initial_model.single_class_accuracy(5),
                              initial_model.single_class_accuracy(6),
                              initial_model.single_class_accuracy(7),
                              'accuracy'
                          ])
            data, label = initial_model.generator_input_once(
                str(args.train_files) + str(hours) + 'hrs/train/input_' +
                str(hours) + 'hrs_8.npz', 6)

            scores = model.evaluate(x=data,
                                    y=label,
                                    batch_size=None,
                                    verbose=1,
                                    sample_weight=None,
                                    steps=1)
            print("\ntest " + str(hours) + 'hrs beofre train\n')
            print(scores)

            # data,label = initial_model.generator_input_once('/Volumes/TOSHIBA EXT/train_input/24hrs/train/input_24hrs_8.npz', 24)
            #
            # scores = model.evaluate(x=data, y=label, batch_size=None, verbose=1, sample_weight=None, steps=1)
            # print(scores)

            ###########sequential model#############

            train_file_names = os.path.join(str(args.train_files),
                                            str(hours) + 'hrs', 'train',
                                            '*npz')
            eval_file_names = os.path.join(args.eval_files,
                                           str(hours) + 'hrs', 'eval', '*npz')
            print("\n\ntraining " + str(hours) + 'hrs!\n\n')
            history_all.append(
                model.fit_generator(
                    initial_model.generator_input(train_file_names,
                                                  args.train_batch_size),
                    validation_data=initial_model.generator_input(
                        eval_file_names, args.eval_batch_size),
                    steps_per_epoch=args.train_steps,
                    validation_steps=args.eval_steps,
                    epochs=args.num_epochs,
                    callbacks=callbacks))
            weights = model.get_weights()
            weights_all.append(weights)
            weights_0 = []
            for i in range(int(len(weights) / 4)):
                if i == int(len(weights) / 4) - 1:
                    weights_0.extend([
                        np.concatenate((weights[i * 4 + 2], weights[i * 4]),
                                       axis=0),
                        (weights[i * 4 + 1] + weights[i * 4 + 3])
                    ])
                elif i == int(len(weights) / 4) - 2:
                    weights_0.extend([
                        np.concatenate((weights[i * 4 + 2], weights[i * 4]),
                                       axis=1),
                        np.concatenate(
                            (weights[i * 4 + 3], weights[i * 4 + 1]))
                    ])
                else:

                    weights_0.extend([
                        np.concatenate((weights[i * 4], weights[i * 4 + 2]),
                                       axis=1),
                        np.concatenate(
                            (weights[i * 4 + 1], weights[i * 4 + 3]))
                    ])
            # weights_0 = [np.concatenate((weights[0], weights[2]), axis=1),
            #              np.concatenate((weights[1], weights[3])),
            #              np.concatenate((weights[4], weights[6]), axis=1),
            #              np.concatenate((weights[5], weights[7])),
            #              np.concatenate((weights[8], weights[10]), axis=0)
            #               weights[9] + weights[11]]
            data, label = initial_model.generator_input_once(
                str(args.train_files) + str(hours) + 'hrs/train/input_' +
                str(hours) + 'hrs_8.npz', 6)

            scores = model.evaluate(x=data,
                                    y=label,
                                    batch_size=None,
                                    verbose=1,
                                    sample_weight=None,
                                    steps=1)
            print("\ntest " + str(hours) + 'hrs after train\n')
            print(scores)
            with open(
                    os.path.join(str(args.job_dir), 'weights',
                                 str(hours) + 'hrs_weights'), 'wb') as fp:
                pickle.dump(weights_0, fp)
            DISK_MODEL = 'disk_model' + str(hours) + '.hdf5'
            if args.job_dir.startswith('gs://'):
                model.save(DISK_MODEL)
                copy_file_to_gcs(args.job_dir, DISK_MODEL)
            else:
                model.save(os.path.join(args.job_dir, DISK_MODEL))
            seq_id = seq_id + 1

    with open(os.path.join(str(args.job_dir), 'history_all'), 'wb') as fp:
        pickle.dump(history_all, fp)
Example #16
def dispatch(train_files, eval_files, job_dir, train_steps, eval_steps,
             train_batch_size, eval_batch_size, learning_rate, eval_frequency,
             eval_num_epochs, num_epochs, checkpoint_epochs,
             image_input_prefix, debug_mode):
    census_model = model.model_fn()

    try:
        os.makedirs(job_dir)
    except OSError:
        pass

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    meta_data = get_meta(train_files)
    indexes = list(range(len(meta_data)))
    random.shuffle(indexes)
    meta_data = meta_data.loc[indexes].reset_index(drop=True)

    # Model checkpoint callback
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 monitor='val_loss',
                                                 verbose=1,
                                                 period=checkpoint_epochs,
                                                 mode='min')  # val_loss should be minimized

    # Continuous eval callback
    evaluation = ContinuousEval(eval_frequency, meta_data, image_input_prefix,
                                eval_files, learning_rate, job_dir, debug_mode)

    # Tensorboard logs callback
    tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                        histogram_freq=0,
                                        write_graph=True,
                                        embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tblog]

    train_data_sequence = DataSequence(image_input_prefix,
                                       train_files,
                                       debug_mode,
                                       meta_data,
                                       batch_size=train_batch_size,
                                       data_type='train')
    census_model.fit_generator(
        #model.generator_input(train_files, chunk_size=CHUNK_SIZE),
        train_data_sequence,
        steps_per_epoch=train_data_sequence.length,
        epochs=num_epochs,
        callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if job_dir.startswith("gs://"):
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(job_dir, CENSUS_MODEL))

    # Convert the Keras model to TensorFlow SavedModel
    model.to_savedmodel(census_model, os.path.join(job_dir, 'export'))
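`DataSequence` in Examples #16 and #17 subclasses `keras.utils.Sequence`; the original fields are not shown, but the required interface is small. A minimal in-memory version (hypothetical, arrays instead of files):

import math

from keras.utils import Sequence

class DataSequence(Sequence):
    def __init__(self, x, y, batch_size=32):
        self.x, self.y = x, y
        self.batch_size = batch_size

    def __len__(self):
        # Number of batches per epoch; this is what fit_generator uses
        # unless steps_per_epoch overrides it.
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        lo = idx * self.batch_size
        return self.x[lo:lo + self.batch_size], self.y[lo:lo + self.batch_size]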
Example #17
def dispatch(multi_gpu, train_files, eval_files, job_dir, train_steps,
             train_batch_size, num_epochs, learning_rate, stddev, eval_steps,
             eval_batch_size, eval_num_epochs, eval_frequency,
             checkpoint_epochs, gpus, workers, verbose):
    """
    Main training method:
    """

    # random seed
    random.seed(42)

    # load encoder:
    encoder = Encoder(alphabet=ALPHABET,
                      maxlen=MAXLEN,
                      num_classes=NUM_CLASSES,
                      clear_accents=CLEAR_ACCENTS)

    # prepare data generator sequences:
    train_sequence = DataSequence(
        input_file=train_files,
        label_column=LABEL_COLUMN,
        data_columns=DATA_COLUMNS,
        encoder=encoder,
        backwards=REVERSE_ENCODING,
        batch_size=train_batch_size,
        # workaround: Sequence.__len__ otherwise overrides the fit_generator arg
        steps_per_epoch=train_steps,
        shuffle=SHUFFLE)

    eval_sequence = DataSequence(
        input_file=eval_files,
        label_column=LABEL_COLUMN,
        data_columns=DATA_COLUMNS,
        encoder=encoder,
        backwards=REVERSE_ENCODING,
        batch_size=eval_batch_size,
        # workaround: Sequence.__len__ otherwise overrides the fit_generator arg
        steps_per_epoch=eval_steps,
        shuffle=SHUFFLE)

    # prepare log dictionaries
    job_dir += '/' + time.strftime("%Y%m%d-%H%M%S")
    try:
        os.makedirs(job_dir)
    except OSError:
        print("ERROR: Directory 'job-dir' could not be created.")

    # Workaround because h5py cannot write to GCS:
    # save to the local filesystem, then copy over to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Learning rate scheduler callback --unused for the moment
    cb_learning_rate_scheduler = LearningRateScheduler(learning_rate_scheduler)

    # Detached model checkpoint callback to snapshot multi-gpu models
    detached_checkpoint = ModelCheckpointDetached(checkpoint_path,
                                                  monitor='acc',
                                                  verbose=1,
                                                  period=checkpoint_epochs,
                                                  mode='max')

    # Continuous eval callback, eval & copy checkpoints to gcs
    evaluation = ContinuousEval(
        eval_frequency=eval_frequency,
        eval_sequence=eval_sequence,
        # eval_generator=eval_generator,
        learning_rate=learning_rate,
        momentum=MOMENTUM,
        job_dir=job_dir,
        steps=eval_steps)

    # Tensorboard logs callback
    tblog = TensorBoard(log_dir=os.path.join(job_dir, 'tb-logs'),
                        histogram_freq=0,
                        write_graph=True,
                        embeddings_freq=0)

    callbacks = [
        # cb_learning_rate_scheduler,
        detached_checkpoint,
        evaluation,
        tblog,
    ]

    # load model:
    with tf.device('/cpu:0'):
        conv_model = model.model_fn(
            maxlen=MAXLEN,
            vocab_size=encoder.vocab_size,
            conv_filters=CONV_FILTERS_SMALL,
            conv_kernels=CONV_KERNELS,
            # conv_padding=conv_padding,
            # conv_activation=conv_activation,
            max_pooling=MAX_POOLING,
            dense_output_units=DENSE_OUTPUT_UNITS_SMALL,
            # dense_activation=dense_activation,
            dropout_probs=DROPOUT_PROBS,
            output_cats=NUM_CLASSES,
            # output_activation=output_activation,
            # optimizer=optimizer,
            learning_rate=learning_rate,
            momentum=MOMENTUM,
            stddev=stddev,
            # loss=loss,
            # metrics=metrics
        )

    if multi_gpu:
        # Replicate the model on multiple GPUs:
        parallel_model = multi_gpu_model(conv_model, gpus=gpus)
        parallel_model.compile(loss='categorical_crossentropy',
                               optimizer=SGD(lr=learning_rate,
                                             momentum=MOMENTUM),
                               metrics=['categorical_accuracy'])

    with tf.device('/cpu:0'):
        # compile local model
        conv_model.compile(loss='categorical_crossentropy',
                           optimizer=SGD(lr=learning_rate, momentum=MOMENTUM),
                           metrics=['categorical_accuracy'])
        conv_model.summary()

    if multi_gpu:
        parallel_model.fit_generator(
            callbacks=callbacks,
            # generator=train_generator,
            generator=train_sequence,
            steps_per_epoch=train_steps,
            epochs=num_epochs,
            workers=workers,
            # verbose: 0 = silent, 1 = progress bar, 2 = one line per epoch
            verbose=verbose)
        conv_model.set_weights(parallel_model.get_weights())
    else:
        conv_model.fit_generator(callbacks=callbacks,
                                 generator=train_sequence,
                                 steps_per_epoch=train_steps,
                                 epochs=num_epochs,
                                 workers=workers,
                                 verbose=verbose)

    # Workaround because h5py cannot write to GCS:
    # save to the local filesystem, then copy over to GCS.
    if job_dir.startswith("gs://"):
        conv_model.save(CONV_MODEL)
        copy_file_to_gcs(job_dir, CONV_MODEL)
    else:
        conv_model.save(os.path.join(job_dir, CONV_MODEL))

    # Convert the Keras model to TensorFlow SavedModel
    model.to_savedmodel(conv_model, os.path.join(job_dir, 'export'))
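Examples #17 and #20 both save the CPU-resident template model rather than the `multi_gpu_model` wrapper: the two share weights, and the template reloads cleanly on any device. The pattern in isolation, as a runnable sketch assuming Keras 2.x and at least two GPUs (the toy model and random data are illustrative):

import numpy as np
import tensorflow as tf
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import SGD
from keras.utils import multi_gpu_model

# Build the template once on the CPU so its weights live in host memory.
with tf.device('/cpu:0'):
    template = Sequential([
        Dense(64, activation='relu', input_shape=(20,)),
        Dense(10, activation='softmax'),
    ])

parallel = multi_gpu_model(template, gpus=2)  # replicate onto 2 GPUs
parallel.compile(loss='categorical_crossentropy',
                 optimizer=SGD(lr=0.01, momentum=0.9),
                 metrics=['categorical_accuracy'])

x = np.random.rand(256, 20)
y = np.eye(10)[np.random.randint(0, 10, size=256)]
parallel.fit(x, y, epochs=2, batch_size=64)

# The weights are shared, so saving the template captures the trained state.
template.save('model.h5')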
Example #18
def train_and_evaluate(args):

    # confirm whether training datasets need to be created
    if args.create_data:
        import trainer.create_data_func as create_data_func

        logging.info('Begin creating datasets')
        for data_part in ['train', 'val', 'test']:
            create_data_func.create_data_func(data_part, args.project_id,
                                              args.bucket_name,
                                              args.dataset_id)

        logging.info('End creating datasets')

    # Create config file and store project id there so that model.py can read it.
    with open('config.py', 'w') as f:
        f.write("PROJECT_ID=\"{}\"\n".format(args.project_id))
        f.write("BUCKET_NAME=\"{}\"\n".format(args.bucket_name))

    # Import only after the datasets are created, since they are referenced
    # as soon as this module is initialized.
    import trainer.model as model

    # If new datasets are created, the scaler also needs to be re-created.
    if args.create_data:
        import trainer.create_scaler_func as create_scaler_func

        logging.info('Begin fitting scaler')
        create_scaler_func.create_scaler_func(args.train_files,
                                              model.CSV_COLUMNS,
                                              model.LABEL_COLUMN,
                                              args.bucket_name,
                                              args.project_id)

        logging.info('End fitting scalers')

    # download the scaler
    if not path.exists('x_scaler'):
        logging.info('Downloading scaler')
        storage_client = storage.Client(project=args.project_id)
        bucket = storage_client.get_bucket(args.bucket_name)
        blob = bucket.blob('scalers/x_scaler')
        blob.download_to_filename('x_scaler')
        logging.info('Downloaded scaler')

    x_scaler = joblib.load('x_scaler')

    # build the model
    census_model = model.model_fn(
        learning_rate=args.learning_rate,
        num_deep_layers=args.num_deep_layers,
        first_deep_layer_size=args.first_deep_layer_size,
        first_wide_layer_size=args.first_wide_layer_size,
        wide_scale_factor=args.wide_scale_factor,
        dropout_rate=args.dropout_rate)
    logging.info(census_model.summary())

    try:
        os.makedirs(args.job_dir)
    except OSError:
        pass

    checkpoint_path = os.path.join(args.job_dir, CHECKPOINT_FILE_PATH)

    # Model checkpoint callback.
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_mse',  # 'mean_squared_error'
        verbose=1,
        period=args.checkpoint_epochs,
        save_best_only=True,
        mode='min')

    # Early stopping callback.
    early_stop = EarlyStopping(monitor='val_mse',
                               patience=10)  # 'mean_squared_error'

    # Tensorboard logs callback.
    tb_log = TensorBoard(log_dir=os.path.join(args.job_dir, 'logs'),
                         histogram_freq=0,
                         write_graph=True,
                         embeddings_freq=0)

    callbacks = [checkpoint, early_stop, tb_log]

    # fit the model on the training set
    census_model.fit_generator(
        generator=model.generator_input(args.train_files,
                                        chunk_size=CHUNK_SIZE,
                                        project_id=args.project_id,
                                        bucket_name=args.bucket_name,
                                        x_scaler=x_scaler),
        steps_per_epoch=args.train_steps,
        epochs=args.num_epochs,
        callbacks=callbacks,
        validation_data=model.generator_input(args.eval_files,
                                              chunk_size=CHUNK_SIZE,
                                              project_id=args.project_id,
                                              bucket_name=args.bucket_name,
                                              x_scaler=x_scaler),
        validation_steps=args.eval_steps)

    # evaluate model on test set
    loss, mae, mse = census_model.evaluate_generator(
        model.generator_input(args.test_files,
                              chunk_size=CHUNK_SIZE,
                              project_id=args.project_id,
                              bucket_name=args.bucket_name,
                              x_scaler=x_scaler),
        steps=args.test_steps)
    logging.info('\nTest evaluation metrics[{:.2f}, {:.2f}, {:.2f}] {}'.format(
        loss, mae, mse, census_model.metrics_names))

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if args.job_dir.startswith('gs://'):
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(args.job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(args.job_dir, CENSUS_MODEL))

    # Convert the Keras model to TensorFlow SavedModel.
    model.to_savedmodel(census_model, os.path.join(args.job_dir, 'export'))
Example #19
def run(target, cluster_spec, is_chief, args):

  """Runs the training and evaluation graph.

  Args:
    target (str): Tensorflow server target.
    cluster_spec: (cluster spec) Cluster specification.
    is_chief (bool): Boolean flag to specify a chief server.
    args (argparse.Namespace): Input arguments.
  """

  # Calculate the number of hidden units
  hidden_units = [
      max(2, int(args.first_layer_size * args.scale_factor**i))
      for i in range(args.num_layers)
  ]

  # If this server is the chief (the `master` task):
  # in between-graph replication, the chief is the one node in the
  # cluster with extra responsibilities, and by default it is worker
  # task zero. Here the master is assigned as the chief.
  #
  # See https://youtu.be/la_M6bCV91M?t=1203 for details on
  # distributed TensorFlow and the motivation for a chief.
  if is_chief:
    tf.logging.info('Created DNN hidden units {}'.format(hidden_units))
    evaluation_graph = tf.Graph()
    with evaluation_graph.as_default():

      # Features and label tensors
      features, labels = model.input_fn(
          args.eval_files,
          num_epochs=None if args.eval_steps else 1,
          batch_size=args.eval_batch_size,
          shuffle=False
      )
      # Accuracy and AUROC metrics
      # model.model_fn returns the dict when EVAL mode
      metric_dict = model.model_fn(
          model.EVAL,
          features.copy(),
          labels,
          hidden_units=hidden_units,
          learning_rate=args.learning_rate
      )

    hooks = [EvaluationRunHook(
        args.job_dir,
        metric_dict,
        evaluation_graph,
        args.eval_frequency,
        eval_steps=args.eval_steps,
    )]
  else:
    hooks = []

  # Create a new graph and specify that as default.
  with tf.Graph().as_default():
    # Placement of ops on devices using replica device setter
    # which automatically places the parameters on the `ps` server
    # and the `ops` on the workers.
    #
    # See:
    # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
    with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):

      # Features and label tensors as read using filename queue.
      features, labels = model.input_fn(
          args.train_files,
          num_epochs=args.num_epochs,
          batch_size=args.train_batch_size
      )

      # Returns the training graph and global step tensor.
      train_op, global_step_tensor = model.model_fn(
          model.TRAIN,
          features.copy(),
          labels,
          hidden_units=hidden_units,
          learning_rate=args.learning_rate
      )

    # Creates a MonitoredSession for training.
    # MonitoredSession is a Session-like object that handles
    # initialization, recovery and hooks
    # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
    with tf.train.MonitoredTrainingSession(master=target,
                                           is_chief=is_chief,
                                           checkpoint_dir=args.job_dir,
                                           hooks=hooks,
                                           save_checkpoint_secs=20,
                                           save_summaries_steps=50) as session:
      # Global step to keep track of global number of steps particularly in
      # distributed setting
      step = global_step_tensor.eval(session=session)

      # Run the training graph, which returns the step number as tracked
      # by the global step tensor. When the training epochs are exhausted,
      # session.should_stop() will be True.
      while (args.train_steps is None or
             step < args.train_steps) and not session.should_stop():
        step, _ = session.run([global_step_tensor, train_op])

    # Find the filename of the latest saved checkpoint file
    latest_checkpoint = tf.train.latest_checkpoint(args.job_dir)

    # Only perform this if chief
    if is_chief:
      build_and_run_exports(latest_checkpoint,
                            args.job_dir,
                            model.SERVING_INPUT_FUNCTIONS[args.export_format],
                            hidden_units)
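
In isolation, the MonitoredTrainingSession loop above reduces to a minimal, self-contained sketch (TF 1.x; the stand-in loss, checkpoint directory, and step count are invented illustration values):

import tensorflow as tf

def minimal_monitored_loop(checkpoint_dir='/tmp/example-ckpts', max_steps=100):
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()
        loss = tf.Variable(1.0)  # stand-in for a real model loss
        train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
            loss, global_step=global_step)
        # MonitoredTrainingSession handles variable initialization,
        # checkpoint recovery, and periodic checkpoint/summary saving.
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=checkpoint_dir,
                save_checkpoint_secs=20) as session:
            step = 0
            while step < max_steps and not session.should_stop():
                step, _ = session.run([global_step, train_op])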
Exemplo n.º 20
def dispatch(train_files, eval_files, job_dir, train_steps, eval_steps,
             learning_rate, eval_frequency, num_epochs, checkpoint_epochs,
             gpus):

    # With several GPUs we use two models: one for training and one for saving.
    # The saving model is assigned to the CPU; the training model runs on the
    # GPUs and is generated using multi_gpu_model.
    if gpus <= 1:
        model_train = model.model_fn(NUM_CHARS, window_size=WINDOWS_SIZE)
        model_save = model_train
    else:
        with tf.device("/cpu:0"):
            model_save = model.model_fn(NUM_CHARS, window_size=WINDOWS_SIZE)
        model_train = multi_gpu_model(model_save, gpus=gpus)
        model.compile_model(model_save, learning_rate)
        model_save.summary()
    model.compile_model(model_train, learning_rate)
    model_train.summary()

    try:
        os.makedirs(job_dir)
    except OSError:
        pass

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem,
    # then copy them over to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 monitor='val_loss',
                                                 verbose=0,
                                                 period=checkpoint_epochs,
                                                 mode='min')

    # Continuous eval callback
    evaluation = ContinuousEval(eval_frequency,
                                eval_files,
                                learning_rate,
                                job_dir,
                                steps=eval_steps)

    # Tensorboard logs callback
    tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                        histogram_freq=0,
                                        write_graph=True,
                                        embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tblog]

    x, y = model.get_array_x_y(train_files, train_steps, WINDOWS_SIZE,
                               NUM_CHARS)

    model_train.fit(x,
                    y,
                    epochs=num_epochs,
                    callbacks=callbacks,
                    batch_size=500)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem,
    # then copy them over to GCS.
    if job_dir.startswith("gs://"):
        model_save.save(BEIRAS_MODEL)
        copy_file_to_gcs(job_dir, BEIRAS_MODEL)
    else:
        model_save.save(os.path.join(job_dir, BEIRAS_MODEL))

    # Convert the Keras model to TensorFlow SavedModel
    model.to_savedmodel(model_save, os.path.join(job_dir, 'export'))
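
The CPU-template pattern used above (save through the template on the CPU, train on the multi-GPU replica that shares its weights) can be sketched on its own; the tf.keras imports and layer sizes below are assumptions, not taken from the sample:

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import multi_gpu_model  # available in TF 1.x Keras

def build_models(gpus):
    # Keep the template (and its weights) on the CPU so saving is safe.
    with tf.device('/cpu:0'):
        model_save = Sequential([
            Dense(64, activation='relu', input_shape=(20,)),
            Dense(1),
        ])
    # Train on the replicated model when several GPUs are available.
    model_train = multi_gpu_model(model_save, gpus=gpus) if gpus > 1 else model_save
    model_train.compile(optimizer='adam', loss='mse')
    return model_train, model_save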
Exemplo n.º 21
def dispatch(train_files, eval_files, job_dir, learning_rate, eval_frequency,
             num_epochs, checkpoint_epochs):

    # setting the seed for reproducibility
    np.random.seed(13)

    # Get all data and build the label encoders and one-hot encoder.
    full_dataset = model.get_all_data(train_files + eval_files)

    # Convert values in categorical columns to numerical codes 0..n-1.
    labelencoder_DayOfWeek = model.build_labelencoder('DayOfWeek',
                                                      full_dataset)
    labelencoder_StoreType = model.build_labelencoder('StoreType',
                                                      full_dataset)
    labelencoder_Assortment = model.build_labelencoder('Assortment',
                                                       full_dataset)

    # NOTE: apply the label encoders before building the one-hot encoder.
    model.apply_labelencoder('DayOfWeek', labelencoder_DayOfWeek, full_dataset)
    model.apply_labelencoder('StoreType', labelencoder_StoreType, full_dataset)
    model.apply_labelencoder('Assortment', labelencoder_Assortment,
                             full_dataset)

    # DayOfWeek should be treated as categorical data, not as numerical.
    onehotencoder = model.build_onehotencoder(
        ['DayOfWeek', 'StoreType', 'Assortment'], full_dataset)
    #onehotencoder_DayOfWeek = model.build_onehotencoder_DayOfWeek(full_dataset)
    full_dataset = model.getOneHotEncodedData(onehotencoder, full_dataset)

    # NOTE: must be called after applying the Label- and OneHot-Encoders.
    scaler = model.build_scaler(full_dataset)

    # finally we can create our model
    input_data_shape = model.get_input_shape(full_dataset)
    forecast_model = model.model_fn(input_data_shape)

    try:
        os.makedirs(job_dir)
    except Exception as e:
        print(e)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 verbose=1,
                                                 period=checkpoint_epochs)

    # Continuous eval callback
    with ContinuousEval(eval_frequency, eval_files, learning_rate, job_dir,
                        scaler, labelencoder_DayOfWeek, labelencoder_StoreType,
                        labelencoder_Assortment, onehotencoder) as evaluation:

        # Tensorboard logs callback
        tblog = keras.callbacks.TensorBoard(
            log_dir=os.path.join(job_dir, 'logs'),
            histogram_freq=0,
            write_graph=True,
            embeddings_freq=0)

        callbacks = [checkpoint, evaluation, tblog]

        x, y = model.load_features(train_files, scaler, labelencoder_DayOfWeek,
                                   labelencoder_StoreType,
                                   labelencoder_Assortment, onehotencoder)
        forecast_model.fit(x, y, epochs=num_epochs, callbacks=callbacks)

        # Unhappy hack to work around h5py not being able to write to GCS.
        # Force snapshots and saves to local filesystem, then copy them over to GCS.
        if job_dir.startswith("gs://"):
            forecast_model.save(MODEL_FILENAME)
            copy_file_to_gcs(job_dir, MODEL_FILENAME)
        else:
            forecast_model.save(os.path.join(job_dir, MODEL_FILENAME))
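
The model.build_labelencoder / apply_labelencoder / build_onehotencoder helpers are not shown in these samples. A plausible scikit-learn sketch of them, assuming the dataset is a pandas DataFrame, might be:

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def build_labelencoder(column, df):
    # Fit on the full dataset so train and eval share the same codes.
    le = LabelEncoder()
    le.fit(df[column])
    return le

def apply_labelencoder(column, le, df):
    # Replace the raw categorical values with their numerical codes.
    df[column] = le.transform(df[column])

def build_onehotencoder(columns, df):
    # Fit on the already label-encoded categorical columns.
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(df[columns])
    return ohe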
Exemplo n.º 22
def run(target, cluster_spec, is_chief, train_steps, eval_steps, job_dir,
        train_files, eval_files, train_batch_size, eval_batch_size,
        learning_rate, eval_frequency, first_layer_size, num_layers,
        scale_factor, num_epochs, export_format):
    """Run the training and evaluation graph.
  Args:
    target (string): Tensorflow server target
    is_chief (bool): Boolean flag to specify a chief server
    train_steps (int): Maximum number of training steps
    eval_steps (int): Number of steps to run evaluation for at each checkpoint.
      if eval_steps is None, evaluation will run for 1 epoch.
    job_dir (string): Output dir for checkpoint and summary
    train_files (string): List of CSV files to read train data
    eval_files (string): List of CSV files to read eval data
    train_batch_size (int): Batch size for training
    eval_batch_size (int): Batch size for evaluation
    learning_rate (float): Learning rate for Gradient Descent
    eval_frequency (int): Run evaluation frequency every n training steps.
      Do not evaluate too frequently otherwise you will
      pay for performance and do not evaluate too in-frequently
      otherwise you will not know how soon to stop training.
      Use default values to start with
    first_layer_size (int): Size of the first DNN layer
    num_layers (int): Number of hidden layers in the DNN
    scale_factor (float): Decay rate for the size of hidden layers
    num_epochs (int): Maximum number of training data epochs on which to train
    export_format (str): One of 'JSON', 'CSV' or 'EXAMPLE'. The input format
      for the outputed saved_model binary.
  """

    # Calculate the number of hidden units
    hidden_units = [
        max(2, int(first_layer_size * scale_factor**i))
        for i in range(num_layers)
    ]

    # The chief server is the `master`.
    # In between-graph replication, the chief is the one node in
    # the cluster with extra responsibilities and by default
    # is worker task zero. Here the master is assigned as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on
    # distributed TensorFlow and motivation about chief.
    if is_chief:
        tf.logging.info("Created DNN hidden units {}".format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():

            # Features and label tensors
            features, labels = model.input_fn(
                eval_files,
                num_epochs=None if eval_steps else 1,
                batch_size=eval_batch_size,
                shuffle=False)
            # Accuracy and AUROC metrics.
            # model.model_fn returns the metric dict in EVAL mode.
            metric_dict = model.model_fn(model.EVAL,
                                         features.copy(),
                                         labels,
                                         hidden_units=hidden_units,
                                         learning_rate=learning_rate)

        hooks = [
            EvaluationRunHook(
                job_dir,
                metric_dict,
                evaluation_graph,
                eval_frequency,
                eval_steps=eval_steps,
            )
        ]
    else:
        hooks = []

    # Create a new graph and specify that as default
    with tf.Graph().as_default():
        # Placement of ops on devices using the replica device setter,
        # which automatically places the parameters on the `ps` servers
        # and the ops on the workers.
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):

            # Features and label tensors as read using filename queue
            features, labels = model.input_fn(train_files,
                                              num_epochs=num_epochs,
                                              batch_size=train_batch_size)

            # Returns the training graph and global step tensor
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        # Creates a MonitoredSession for training
        # MonitoredSession is a Session-like object that handles
        # initialization, recovery and hooks
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as session:
            # The global step keeps track of the total number of steps,
            # particularly in a distributed setting.
            step = global_step_tensor.eval(session=session)

            # Run the training graph, which returns the step number as tracked
            # by the global step tensor. When the training epochs are exhausted,
            # session.should_stop() will be True.
            while (train_steps is None
                   or step < train_steps) and not session.should_stop():
                step, _ = session.run([global_step_tensor, train_op])

        # Find the filename of the latest saved checkpoint file
        latest_checkpoint = tf.train.latest_checkpoint(job_dir)

        # Only perform this if chief
        if is_chief:
            build_and_run_exports(latest_checkpoint, job_dir,
                                  model.SERVING_INPUT_FUNCTIONS[export_format],
                                  hidden_units)
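
replica_device_setter can be observed with a toy cluster spec; the addresses below are invented, and no servers need to be running just to inspect device placement (TF 1.x):

import tensorflow as tf

cluster_spec = tf.train.ClusterSpec({
    'ps': ['localhost:2222'],
    'worker': ['localhost:2223', 'localhost:2224'],
})

with tf.Graph().as_default():
    with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
        # Variables are assigned round-robin to the `ps` tasks ...
        weights = tf.Variable(tf.zeros([10, 10]), name='weights')
        # ... while compute ops stay on the worker.
        logits = tf.matmul(tf.ones([1, 10]), weights)
    print(weights.device)  # /job:ps/task:0
    print(logits.device)   # /job:worker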
Exemplo n.º 23
def dispatch(train_prefix, validation_prefix, job_dir, learning_rate,
             num_epochs, checkpoint_epochs, lam, dropout, model_file):

    # Download of the train/validation data is disabled; the given
    # prefixes are used directly.
    # train_tmp_prefix, val_tmp_prefix = download_mats(train_prefix, validation_prefix)
    train_tmp_prefix = train_prefix
    val_tmp_prefix = validation_prefix
    print(train_tmp_prefix, val_tmp_prefix)

    # train_x, train_y, cv_x, cv_y, input_shape = create_data(train_tmp_prefix)

    logger = logging.getLogger()
    sh = StreamHandler(stdout)
    logger.addHandler(sh)
    logger.setLevel(logging.INFO)
    logger.info('learning_rate=%s' % learning_rate)
    if model_file is not None:
        if model_file.startswith('gs://'):
            cmd = 'gsutil cp %s /tmp' % model_file
            subprocess.check_call(cmd.split())
            real_model_file = '/tmp/%s' % model_file.split('/')[-1]
        else:
            real_model_file = model_file
        face_age_model = load_model(real_model_file, compile=False)
        face_age_model = model.compile_model(face_age_model, learning_rate)
    else:
        face_age_model = model.model_fn(learning_rate, lam, dropout)

    try:
        os.makedirs(job_dir)
    except Exception:
        pass

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to
    # GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)
        verbose = 1
        multi = False
        num_worker = 1
    else:
        verbose = 2
        multi = False
        num_worker = 1  #multiprocessing.cpu_count()

    # meta_data = get_meta(train_files)
    # indexes = [i for i in range(len(meta_data))]
    # random.shuffle(indexes)
    # meta_data = meta_data.loc[indexes].reset_index(drop=True)

    # Model checkpoint callback
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 monitor='val_loss',
                                                 verbose=1,
                                                 period=checkpoint_epochs,
                                                 mode='min')

    # Validation data sequence (the ContinuousEval callback is disabled).
    val_datasequence = FileDataSequence(val_tmp_prefix)
    # evaluation = ContinuousEval(eval_frequency,
    #                             val_datasequence,
    #                             learning_rate,
    #                             job_dir)

    # Tensorboard logs callback
    tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                        histogram_freq=0,
                                        write_graph=True,
                                        embeddings_freq=0)

    callbacks = [checkpoint, tblog]

    train_data_sequence = FileDataSequence(train_tmp_prefix)
    # x_train, y_train = train_data_sequence.__getitem__(0)
    # test_data_sequence = DataSequence(validation_tmp_prefix)

    face_age_model.fit_generator(
        train_data_sequence,
        validation_data=val_datasequence,
        validation_steps=val_datasequence.length,
        steps_per_epoch=train_data_sequence.length,
        verbose=verbose,
        epochs=num_epochs,
        callbacks=callbacks)

    # plot_history(history)
    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to
    # GCS.
    if job_dir.startswith("gs://"):
        face_age_model.save(FACE_AGE_MODEL)
        copy_file_to_gcs(job_dir, FACE_AGE_MODEL)
    else:
        face_age_model.save(os.path.join(job_dir, FACE_AGE_MODEL))

    # Convert the Keras model to TensorFlow SavedModel
    model.to_savedmodel(face_age_model, os.path.join(job_dir, 'export'))
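
copy_file_to_gcs is used by several of these examples but never defined here. Based on the file_io pattern the samples already use, a minimal sketch would be:

import os
from tensorflow.python.lib.io import file_io

def copy_file_to_gcs(job_dir, file_path):
    # Stream the locally saved file into the (possibly gs://) job directory;
    # file_io handles both local and GCS paths transparently.
    with file_io.FileIO(file_path, mode='rb') as input_f:
        with file_io.FileIO(os.path.join(job_dir, file_path),
                            mode='w+') as output_f:
            output_f.write(input_f.read())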