def predict(self, test_paths: Union[str, Iterable[str]]) -> None:
    """
    Generate predictions from plain text corpora at a given path or list of paths.
    :param test_paths: paths to documents for which to generate predictions
    """
    if isinstance(test_paths, str):
        # a str is itself Iterable, so check for it explicitly before wrapping
        test_paths = [test_paths]
    if not self._feature_extractor:
        self._init_feature_extractor()
    # initialize predictor from saved trained model
    predictor = from_job_dir(self._job_dir)
    for test_set in test_paths:
        prediction_path = os.path.join(
            self._job_dir, os.path.basename(test_set) + '.predictions.txt')
        logging.info('Writing predictions on %s to %s', test_set,
                     prediction_path)
        with file_io.FileIO(prediction_path, mode="w") as output:
            with file_io.FileIO(test_set, mode="r") as text_lines:
                for line in text_lines:
                    line = line.strip()
                    if not line:
                        continue
                    predictions = predictor.predict(line)
                    for prediction in predictions:
                        output.write(str(prediction) + '\n')
                    output.write('\n')
def main(_):
    label_map_dict = label_map_util.get_label_map_dict(FLAGS.label_map_path)
    examples_list = glob(os.path.join('images', '*.jpg'))
    logging.info('Found {} images'.format(len(examples_list)))

    # We automatically split into training and validation sets (70/30)
    random.seed(42)
    random.shuffle(examples_list)
    num_examples = len(examples_list)
    num_train = int(0.7 * num_examples)
    train_examples = examples_list[:num_train]
    val_examples = examples_list[num_train:]
    logging.info('%d training and %d validation examples.',
                 len(train_examples), len(val_examples))

    train_output_path = os.path.join(FLAGS.output_dir, 'qcards_train.record')
    val_output_path = os.path.join(FLAGS.output_dir, 'qcards_val.record')

    writer = tf.io.TFRecordWriter(train_output_path)
    for example in tqdm(train_examples):
        tf_example = create_tf_example(example, label_map_dict=label_map_dict)
        writer.write(tf_example.SerializeToString())
    writer.close()

    writer = tf.io.TFRecordWriter(val_output_path)
    for example in tqdm(val_examples):
        tf_example = create_tf_example(example, label_map_dict=label_map_dict)
        writer.write(tf_example.SerializeToString())
    writer.close()
def main(unused_argv):
    tf.compat.v1.logging.set_verbosity(logging.INFO)
    paths = tf.io.gfile.glob(FLAGS.input_data_pattern)
    logging.info("Found %s files.", len(paths))
    for path in paths:
        with tf.io.gfile.GFile(path, "rb") as f:
            first_read = True
            while True:
                length_raw = f.read(8)
                if not length_raw and first_read:
                    logging.fatal("File %s has no data.", path)
                    break
                elif not length_raw:
                    logging.info("File %s looks good.", path)
                    break
                else:
                    first_read = False
                    if len(length_raw) != 8:
                        logging.fatal("File ends when reading record length: " +
                                      path)
                        break
                    length, = struct.unpack("Q", length_raw)
                    # +8 to include the two 4-byte CRC values that frame each
                    # record (one for the length, one for the data)
                    record = f.read(length + 8)
                    if len(record) != length + 8:
                        logging.fatal("File ends in the middle of a record: " +
                                      path)
                        break
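# For reference, the on-disk TFRecord framing that the checker above walks
# through (this is TensorFlow's documented record format, not something
# defined in this file):
#
#   uint64 length
#   uint32 masked_crc32(length)
#   bytes  data[length]
#   uint32 masked_crc32(data)
#
# Reading 8 bytes yields the payload length; the subsequent read of
# length + 8 bytes covers the two 4-byte CRCs plus the data itself.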
def eval(self, test_paths: Union[str, Iterable[str]]) -> None:
    """
    Evaluate a trained model on a given corpus or list of corpora.
    :param test_paths: paths to test corpora
    """
    if isinstance(test_paths, str):
        # a str is itself Iterable, so check for it explicitly before wrapping
        test_paths = [test_paths]
    if not self._feature_extractor:
        self._init_feature_extractor()
    # initialize predictor from saved trained model
    predictor = from_job_dir(self._job_dir)
    for test_set in test_paths:
        logging.info('Evaluating on %s', test_set)
        output_path = os.path.join(self._job_dir,
                                   os.path.basename(test_set) + '.eval')
        # get the function used to evaluate predictions from the configuration
        evaluation_fn = get_evaluator(self._training_config.heads,
                                      self._feature_extractor, output_path,
                                      self._eval_script_path)
        # predict from parsed instances instead of raw text, so use
        # .predict_parsed; skip formatting since we need the raw predictions
        processed_examples = predictor.predict_parsed(
            self._extract_raw(test_set, True), formatted=False)
        # call the evaluation function on a fresh generator over the test file
        # plus the predictions (each _extract_raw call returns a new generator)
        evaluation_fn(self._extract_raw(test_set, True), processed_examples)
def _extract_and_write(self, path: str, test: bool = False):
    output_path = self._data_path_fn(path)
    if gfile.exists(output_path):
        logging.info("Using pre-existing features for %s from %s", path,
                     output_path)
        return
    examples = self._extract_features(path, test)
    write_features(examples, output_path)
def train_epoch(dataset):
    total_loss = 0.
    num_batches = 0
    for batch, inputs in enumerate(dataset):
        input_tr, target_tr = inputs
        total_loss += train_step(input_tr, target_tr, batch == 0)
        num_batches += 1
    logging.info("total batches: {}".format(num_batches))
    return total_loss / num_batches
def _train_vocab(self, train_path: str):
    logging.info("Creating new vocabulary using training data at %s",
                 train_path)
    self._feature_extractor.initialize(self._resources)
    self._feature_extractor.train(self._extract_raw(train_path))
    logging.info("Writing new feature/label vocabulary to %s",
                 self._vocab_path)
    self._feature_extractor.write_vocab(self._vocab_path,
                                        resources=self._resources,
                                        prune=True)
def restore_ckpt(sess, ckpt):
    saver = tf.train.Saver()
    logging.info(f"[LOAD] {ckpt}")
    try:
        saver.restore(sess, ckpt)
    except Exception:
        print("======LOAD ERROR======")
        print_variables()
        print_ckpt(sess, ckpt)
        raise  # re-raise the original error instead of masking it
    return saver
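# A minimal usage sketch for restore_ckpt; the variable, session setup, and
# checkpoint path below are assumptions for illustration, not from the source.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
w = tf.get_variable("w", shape=[10, 10])  # the model's graph must exist first
with tf.Session() as sess:
    saver = restore_ckpt(sess, "./ckpt/model.ckpt-1000")  # hypothetical path
    # on success, all graph variables now hold the checkpointed values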
def get_embedding_input(inputs, feature, training, weights=None):
    config = feature.config
    with variable_scope(feature.name):
        with variable_scope('embedding'):
            initializer = None
            if training:
                if feature.embedding is not None:
                    initializer = embedding_initializer(feature.embedding)
                elif config.initializer.zero_init:
                    logging.info("Zero init for feature embedding: %s",
                                 feature.name)
                    initializer = tf.zeros_initializer
                else:
                    logging.info("Xavier Uniform init for feature embedding: %s",
                                 feature.name)
                    initializer = tf.glorot_uniform_initializer

            embedding_matrix = get_variable(
                name='parameters',
                shape=[feature.vocab_size(), config.dim],
                initializer=initializer,
                trainable=config.trainable)

            if weights is None:
                feature_ids = string2index(inputs, feature)
                # wrapper of gather
                result = tf.nn.embedding_lookup(params=embedding_matrix,
                                                ids=feature_ids,
                                                name='lookup')
            else:
                result = tf.matmul(weights, embedding_matrix,
                                   name="weighted_lookup")

            if config.dropout > 0:
                result = tf.layers.dropout(result,
                                           rate=config.dropout,
                                           training=training,
                                           name='dropout')

        if 'func' in config:
            # reduce multiple vectors per token to a single vector
            with tf.name_scope('reduce'):
                result = config.func.apply(result)

        if config.word_dropout > 0 and training:
            # drop entire token embeddings rather than individual dimensions
            shape = tf.shape(result)
            result = tf.layers.dropout(result,
                                       rate=config.word_dropout,
                                       training=training,
                                       name='word_dropout',
                                       noise_shape=[shape[0], shape[1], 1])
    return result
def train_epoch(dataset):
    total_loss = 0.
    batch = 0
    for inputs in tqdm(loop_dataset(dataset, args.batch_size)):
        input_tr, targets_tr = inputs
        # normalize the regression targets before computing the loss
        new_target = (targets_tr.globals - target_mean) / target_scales
        targets_tr = targets_tr.replace(globals=new_target)
        total_loss += train_step(input_tr, targets_tr, batch == 0).numpy()
        batch += 1
    logging.info("total batches: {}".format(batch))
    return total_loss / batch, batch
def main(_):
    feat_dict = FeatureDictionary()
    print("feature_size: %d" % feat_dict.feature_size)
    print("field_size: %d" % feat_dict.field_size)
    print(feat_dict.col2feat_id.keys())
    dataparser = DataParser(feat_dict, FLAGS.label)
    train_ids, train_vals, train_labels = dataparser.parse(
        infile="%s\\train_sample.csv" % FLAGS.data_dir)
    print("len of train: %d" % len(train_ids))
    test_ids, test_vals, test_labels = dataparser.parse(
        infile="%s\\test_sample.csv" % FLAGS.data_dir)
    print("len of test: %d" % len(test_ids))

    # ------build model------
    model_params = {
        "field_size": feat_dict.field_size,
        "feature_size": feat_dict.feature_size,
        "embedding_size": FLAGS.embedding_size,
        "learning_rate": FLAGS.learning_rate,
        "l2_reg": FLAGS.l2_reg,
        "deep_layers": FLAGS.deep_layers,
        "dropout": FLAGS.dropout,
        "experts_num": 3,
        "experts_units": 32,
        "use_experts_bias": True,
        "use_gate_bias": True
    }
    print(model_params)
    DeepFM = build_model_estimator(model_params)
    # DeepFM = tf.contrib.estimator.add_metrics(DeepFM, my_auc)

    if FLAGS.task_type == 'train':
        train_spec = tf.estimator.TrainSpec(
            input_fn=lambda: input_fn(train_ids, train_vals, train_labels,
                                      num_epochs=FLAGS.num_epochs,
                                      batch_size=FLAGS.batch_size))
        eval_spec = tf.estimator.EvalSpec(
            input_fn=lambda: input_fn(test_ids, test_vals, test_labels,
                                      num_epochs=1,
                                      batch_size=FLAGS.batch_size),
            steps=None,
            start_delay_secs=1000,
            throttle_secs=1200)
        tf.estimator.train_and_evaluate(DeepFM, train_spec, eval_spec)
        results = DeepFM.evaluate(
            input_fn=lambda: input_fn(test_ids, test_vals, test_labels,
                                      num_epochs=1,
                                      batch_size=FLAGS.batch_size))
        for key in results:
            log.info("%s : %s" % (key, results[key]))
    elif FLAGS.task_type == 'eval':
        results = DeepFM.evaluate(
            input_fn=lambda: input_fn(test_ids, test_vals, test_labels,
                                      num_epochs=1,
                                      batch_size=FLAGS.batch_size))
        for key in results:
            log.info("%s : %s" % (key, results[key]))
    elif FLAGS.task_type == 'infer':
        preds = DeepFM.predict(
            input_fn=lambda: input_fn(test_ids, test_vals, test_labels,
                                      num_epochs=1,
                                      batch_size=FLAGS.batch_size),
            predict_keys="prob")
        with open(FLAGS.data_dir + "/pred.txt", "w") as fo:
            for prob in preds:
                fo.write("%f\n" % (prob['prob']))
def embedding(features, feature_config, training):
    if feature_config.name == constants.ELMO_KEY:
        logging.info("Using ELMo module at %s", ELMO_URL)
        elmo_module = hub.Module(ELMO_URL, trainable=True)
        elmo_embedding = elmo_module(
            inputs={
                'tokens': features[constants.ELMO_KEY],
                'sequence_len': tf.cast(features[constants.LENGTH_KEY],
                                        dtype=tf.int32)
            },
            signature="tokens",
            as_dict=True)['elmo']
        return elmo_embedding
    elif feature_config.name == constants.BERT_KEY:
        model = feature_config.options.get("model")
        logging.info("Using BERT module at %s", model)
        tags = set()
        if training:
            tags.add("train")
        bert_module = hub.Module(model, tags=tags, trainable=True)
        lens = features[constants.LENGTH_KEY]
        if constants.BERT_LENGTH_KEY in features:
            lens = features[constants.BERT_LENGTH_KEY]
        if constants.BERT_SEG_ID in features:
            segment_ids = tf.cast(features[constants.BERT_SEG_ID],
                                  dtype=tf.int32)
        else:
            segment_ids = tf.zeros(tf.shape(features[constants.BERT_KEY]),
                                   dtype=tf.int32)
        bert_inputs = dict(
            input_ids=tf.cast(features[constants.BERT_KEY], tf.int32),
            # mask over the sequence lengths, which extend over all BERT
            # tokens in input_ids for each sequence in the batch
            input_mask=tf.cast(tf.sequence_mask(lens), dtype=tf.int32),
            # segment_ids default to all zeros unless provided, since
            # sentence-pair tasks are not supported for now
            segment_ids=segment_ids)
        bert_outputs = bert_module(bert_inputs, signature="tokens",
                                   as_dict=True)
        output_type = feature_config.options.get("output_type")
        bert_embedding = bert_outputs[output_type]
        if output_type == "pooled_output":
            bert_embedding = tf.expand_dims(bert_embedding, axis=1)
        return bert_embedding
    elif feature_config.has_vocab():
        feature_embedding = get_embedding_input(features[feature_config.name],
                                                feature_config, training)
        return feature_embedding
def __init__(self, optimizer_config, **kwargs):
    super().__init__(**optimizer_config, **kwargs)
    self.name = optimizer_config.name
    self.params = optimizer_config.params if optimizer_config.get(
        'params') else {}
    clip = optimizer_config.get('clip')
    if not clip:
        clip = 5.0
        logging.info(
            "Using default gradient clipping threshold (global norm) of %f",
            clip)
    self.clip = clip
def _copy_file(src_dir: str, dest_dir: str, file_name: str):
    src_path = src_dir + '/' + file_name
    dest_path = dest_dir + '/' + file_name
    for retries in range(0, 10):
        try:
            gfile.copy(src_path, dest_path, overwrite=True)
            logging.info("copy %s->%s succeeded (retry %d)", src_path,
                         dest_path, retries)
            return
        except tf.errors.OpError as ex:
            logging.error("copy %s->%s (retry %d): %s", src_path, dest_path,
                          retries, ex)
            # exponential backoff before the next attempt
            time.sleep(1.5**retries)
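# Sketch of the retry arithmetic in _copy_file: attempt k sleeps 1.5**k
# seconds after a failure, so ten failed attempts back off for roughly
# 1 + 1.5 + 2.25 + ... + 1.5**9 ≈ 113 seconds in total before giving up.
# The paths below are hypothetical.
_copy_file("gs://my-bucket/job/checkpoints", "/tmp/checkpoints",
           "model.ckpt.index")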
def get_optimizer(network_config,
                  default_optimizer=train.AdadeltaOptimizer(learning_rate=1.0)):
    """
    Return the optimizer given by the input network configuration, or a default optimizer.
    :param network_config: network configuration
    :param default_optimizer: default optimization algorithm
    :return: configured optimizer
    """
    try:
        optimizer = network_config.optimizer
    except KeyError:
        logging.info("Using Adadelta as default optimizer.")
        return default_optimizer

    if isinstance(optimizer.lr, numbers.Number):
        lr = optimizer.lr
    else:
        optimizer.lr.num_train_steps = network_config.max_steps
        optimizer.lr.steps_per_epoch = network_config.steps_per_epoch
        lr = get_learning_rate(optimizer.lr, train.get_global_step())

    name = optimizer.name
    params = optimizer.params
    if "Adadelta" == name:
        opt = train.AdadeltaOptimizer(lr, **params)
    elif "Adam" == name:
        opt = train.AdamOptimizer(lr, **params)
    elif "LazyAdam" == name:
        opt = LazyAdamOptimizer(lr, **params)
    elif "LazyNadam" == name:
        opt = LazyNadamOptimizer(lr, **params)
    elif "SGD" == name:
        opt = train.GradientDescentOptimizer(lr)
    elif "Momentum" == name:
        opt = train.MomentumOptimizer(lr, **params)
    elif "Nadam" == name:
        opt = NadamOptimizerSparse(lr, **params)
    elif "bert" == name:
        opt = AdamWeightDecayOptimizer(
            lr,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    else:
        raise ValueError("Invalid optimizer name: {}".format(name))
    return opt
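# Hedged example of a configuration that get_optimizer accepts; the
# SimpleNamespace stand-in and the field values are assumptions chosen to
# match the attribute accesses above, not the project's real config class.
from types import SimpleNamespace

network_config = SimpleNamespace(
    optimizer=SimpleNamespace(name="Adam", lr=0.001,
                              params={"epsilon": 1e-8}),
    max_steps=100000,
    steps_per_epoch=1000)
opt = get_optimizer(network_config)  # -> train.AdamOptimizer(0.001, epsilon=1e-8)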
def load_model_py(model, model_py, is_train=True, feed_embedded_layer=False,
                  batch_size=None):
    pair = model_py.split(":")
    sys.path.append(os.getcwd())
    if len(pair) >= 2:
        # "module:Class" spec: import the module and instantiate the class
        logging.info(f"[LOAD] {pair[1]} from {pair[0]}")
        mod = importlib.import_module(pair[0])
        cls = getattr(mod, pair[1])
        obj = cls()
        if model:
            model.build(obj, is_train, feed_embedded_layer, batch_size)
        return obj
    else:
        # bare module spec: import and return the module itself
        logging.info(f"[LOAD] {pair[0]}")
        mod = importlib.import_module(pair[0])
        if model:
            model.build(mod, is_train, feed_embedded_layer, batch_size)
        return mod
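# Hedged usage sketch of the "module:Class" spec parsed above; the module and
# class names are hypothetical. Passing model=None skips the build step.
obj = load_model_py(None, "models.resnet:ResNet")  # imports models.resnet, instantiates ResNet
mod = load_model_py(None, "models.resnet")         # imports and returns the module itself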
def make_checkpoints(self):
    if self.ckpt_manager:
        return
    output_dir = self.output_dir
    model = self.model
    optimizer = self.optimizer
    ckpt_dir = os.path.join(output_dir, "checkpoints")
    self.checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    self.ckpt_manager = tf.train.CheckpointManager(
        self.checkpoint,
        directory=ckpt_dir,
        max_to_keep=20,
        keep_checkpoint_every_n_hours=1)
    logging.info("Loading latest checkpoint from: {}".format(ckpt_dir))
    _ = self.checkpoint.restore(self.ckpt_manager.latest_checkpoint)
    self.ckpt_dir = ckpt_dir
def inference(gan, test_in, test_truth, log_dir, xlabels):
    checkpoint_dir = os.path.join(log_dir, "checkpoints")
    checkpoint = tf.train.Checkpoint(generator=gan.generator,
                                     discriminator=gan.discriminator)
    ckpt_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir,
                                              max_to_keep=None)
    logging.info("Loading latest checkpoint from: {}".format(checkpoint_dir))
    _ = checkpoint.restore(ckpt_manager.latest_checkpoint).expect_partial()

    AUTO = tf.data.experimental.AUTOTUNE
    noise = np.random.normal(loc=0., scale=1.,
                             size=(test_truth.shape[0], gan.noise_dim))
    test_in = np.concatenate(
        [test_in, noise],
        axis=1).astype(np.float32) if test_in is not None else noise
    # NOTE: batch_size is a free variable here; it must be defined at module
    # level, since it is not a parameter of this function.
    testing_data = tf.data.Dataset.from_tensor_slices(
        (test_in, test_truth)).batch(batch_size,
                                     drop_remainder=True).prefetch(AUTO)

    summary_dir = os.path.join(log_dir, "logs_inference")
    summary_writer = tf.summary.create_file_writer(summary_dir)
    img_dir = os.path.join(log_dir, 'img_inference')
    os.makedirs(img_dir, exist_ok=True)
def get_l2_loss(network_config, variables):
    if not network_config.optimizer or not network_config.optimizer.get(
            'l2_loss'):
        return 0
    l2_loss = network_config.optimizer.get('l2_loss')
    if isinstance(l2_loss, numbers.Number):
        # single coefficient applied globally to all non-bias weights
        return tf.add_n([tf.nn.l2_loss(v) for v in variables
                         if 'bias' not in v.name]) * l2_loss
    if not isinstance(l2_loss, dict):
        raise ValueError(
            "'l2_loss' expects a dictionary from regular expressions matching "
            "variable names to L2 terms, e.g. {\".*scalar.*\": 0.001}, or a "
            "single L2 term to be applied globally to non-bias weights.")
    all_losses = []
    for var_pattern, alpha in l2_loss.items():
        for var in [v for v in variables if re.match(var_pattern, v.name)]:
            logging.info('Adding L2 regularization with alpha=%f to %s',
                         alpha, var.name)
            all_losses.append(alpha * tf.nn.l2_loss(var))
    return tf.add_n(all_losses)
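# Hedged examples of the two accepted 'l2_loss' forms; the values are made up,
# and the optimizer config is assumed to be dict-like, as its .get() use
# above suggests.
optimizer_config = {"l2_loss": 0.001}                 # one global coefficient for all non-bias weights
optimizer_config = {"l2_loss": {".*scalar.*": 0.001,  # per-group coefficients keyed by
                                ".*dense.*": 0.0001}} # variable-name regular expressions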
def _compute_steps(self, train, valid):
    train_count = sum(1 for _ in tf.python_io.tf_record_iterator(
        self._data_path_fn(train)))
    valid_count = sum(1 for _ in tf.python_io.tf_record_iterator(
        self._data_path_fn(valid)))
    steps_per_epoch = train_count // self._training_config.batch_size

    if not self._training_config.max_epochs:
        if not self._training_config.max_steps:
            self._training_config.max_epochs = 100
        else:
            self._training_config.max_epochs = (
                self._training_config.max_steps // steps_per_epoch)
    if not self._training_config.patience_epochs:
        self._training_config.patience_epochs = 5
    if not self._training_config.checkpoint_epochs:
        self._training_config.checkpoint_epochs = 1

    max_steps = self._training_config.max_epochs * steps_per_epoch
    patience = self._training_config.patience_epochs * steps_per_epoch
    checkpoint_steps = (self._training_config.checkpoint_epochs *
                        steps_per_epoch)

    logging.info(
        'Training on %d instances at %s, validating on %d instances at %s',
        train_count, train, valid_count, valid)
    logging.info(
        'Training for a maximum of %d epoch(s) (%d steps w/ batch_size=%d)',
        self._training_config.max_epochs, max_steps,
        self._training_config.batch_size)
    if patience < max_steps:
        logging.info(
            'Early stopping after %d epoch(s) (%d steps) with no improvement '
            'on validation set',
            self._training_config.patience_epochs, patience)
    logging.info('Evaluating every %d steps, %d epoch(s)', checkpoint_steps,
                 self._training_config.checkpoint_epochs)
    return max_steps, patience, checkpoint_steps, steps_per_epoch
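# Worked example of the epoch/step arithmetic above (counts are hypothetical):
# 10,000 training records with batch_size=32 and the defaults give
#   steps_per_epoch  = 10000 // 32       = 312
#   max_steps        = 100 epochs * 312  = 31200
#   patience         = 5 epochs * 312    = 1560
#   checkpoint_steps = 1 epoch * 312     = 312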
def _init_feature_extractor(self, train_path: str = None):
    self._feature_extractor = get_feature_extractor(
        self._training_config.features, self._training_config.heads)
    logging.info("Checking for pre-existing vocabulary at %s",
                 self._vocab_path)
    if self._feature_extractor.read_vocab(self._vocab_path):
        logging.info("Loaded pre-existing vocabulary at %s", self._vocab_path)
    elif train_path:
        logging.info(
            "No valid pre-existing vocabulary found at %s "
            "(this is normal when not loading from an existing model)",
            self._vocab_path)
        self._train_vocab(train_path)
    else:
        raise ValueError(
            'No feature vocabulary available at %s and unable to train a new '
            'vocabulary' % self._vocab_path)
""" This is a simple MLP-base conditional GAN. Same as gan.py except that the conditional input is given to the discriminator. """ import numpy as np import os import tensorflow as tf from tensorflow.compat.v1 import logging logging.info("TF Version:{}".format(tf.__version__)) gpus = tf.config.experimental.list_physical_devices("GPU") logging.info("found {} GPUs".format(len(gpus))) for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) from tensorflow import keras from tensorflow.keras import layers import tqdm cross_entropy = keras.losses.BinaryCrossentropy(from_logits=False) def discriminator_loss(real_output, fake_output): real_loss = cross_entropy(tf.ones_like(real_output), real_output) fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output) total_loss = real_loss + fake_loss return tf.reduce_mean(total_loss)
def create_tf_example(example, label_map_dict):
    mask_paths = get_mask_paths(example)
    with tf.io.gfile.GFile(example, 'rb') as fid:
        encoded_jpg = fid.read()
    encoded_jpg_io = io.BytesIO(encoded_jpg)
    image = PIL.Image.open(encoded_jpg_io)
    if image.format != 'JPEG':
        raise ValueError('Image format not JPEG')
    key = hashlib.sha256(encoded_jpg).hexdigest()

    classes = []
    masks = []
    bboxes = []
    for label, mp in mask_paths.items():
        mask = cv2.imread(mp, cv2.IMREAD_UNCHANGED)
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
        mask = cv2.threshold(mask, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
        ret, labels = cv2.connectedComponents(mask)
        # The first component is the background, so we skip it
        for l in range(1, ret):
            cardmask = np.zeros(labels.shape, dtype=np.uint8)
            cardmask[labels == l] = 1
            if np.sum(cardmask) > 2000:
                bbox = bounding_box(cardmask)
                classes.append(label)
                masks.append(cardmask)
                bboxes.append(bbox)
            else:
                logging.info("%s: object %s discarded, item too small. Size %d",
                             example, label, np.sum(cardmask))

    width, height = image.size
    filename = example
    encoded_image_data = encoded_jpg  # Encoded image bytes
    image_format = 'jpeg'  # b'jpeg' or b'png'

    # bounding_box() returns (ymin, ymax, xmin, xmax) in pixels; normalize x
    # by width and y by height. (The original normalized the y coordinates by
    # width as well, which looks like a bug for non-square images.)
    xmins = [bb[2] / width for bb in bboxes]   # normalized left x (1 per box)
    xmaxs = [bb[3] / width for bb in bboxes]   # normalized right x (1 per box)
    ymins = [bb[0] / height for bb in bboxes]  # normalized top y (1 per box)
    ymaxs = [bb[1] / height for bb in bboxes]  # normalized bottom y (1 per box)

    # string class names and integer class ids (1 per box)
    classes_text = map(lambda x: x.encode('utf8'), classes)
    classes = list(map(lambda x: label_map_dict[x], classes))

    encoded_mask_png_list = []
    for mask in masks:
        img = PIL.Image.fromarray(mask)
        output = io.BytesIO()
        img.save(output, format='PNG')
        encoded_mask_png_list.append(output.getvalue())

    tf_example = tf.train.Example(features=tf.train.Features(
        feature={
            'image/height': dataset_util.int64_feature(height),
            'image/width': dataset_util.int64_feature(width),
            'image/filename': dataset_util.bytes_feature(filename.encode('utf8')),
            'image/source_id': dataset_util.bytes_feature(filename.encode('utf8')),
            'image/encoded': dataset_util.bytes_feature(encoded_image_data),
            'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
            'image/format': dataset_util.bytes_feature(image_format.encode('utf8')),
            'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
            'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
            'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
            'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
            'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
            'image/object/class/label': dataset_util.int64_list_feature(classes),
            'image/object/mask': dataset_util.bytes_list_feature(encoded_mask_png_list)
        }))
    return tf_example
import random
import functools

import six
import numpy as np
import sklearn.metrics

from graph_nets import utils_tf
from graph_nets import utils_np
import sonnet as snt
from types import SimpleNamespace

import tensorflow as tf
from tensorflow.compat.v1 import logging
logging.info("TF Version: {}".format(tf.__version__))

import horovod.tensorflow as hvd

from root_gnn import model as all_models
from root_gnn.src.datasets import topreco
from root_gnn.src.datasets import graph
from root_gnn.utils import load_yaml

# per-component scale and mean used to normalize the regression targets
# (six target slots per top candidate, repeated n_max_tops times)
target_scales = np.array(
    [145.34593924, 145.57711889, 432.92148524, 281.44161905, 1, 1] *
    topreco.n_max_tops).T.reshape((-1, ))
target_mean = np.array(
    [6.74674671e-02, -6.17142186e-02, 4.18239305e-01, 4.24881531e+02, 0, 0] *
    topreco.n_max_tops).T.reshape((-1, ))
def train_and_evaluate(args):
    dist = init_workers(args.distributed)

    device = 'CPU'
    gpus = tf.config.experimental.list_physical_devices("GPU")
    logging.info("found {} GPUs".format(len(gpus)))
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if len(gpus) > 0:
        device = "{}GPUs".format(len(gpus))
    if gpus and args.distributed:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    output_dir = args.output_dir
    if dist.rank == 0:
        os.makedirs(output_dir, exist_ok=True)
    logging.info("Checkpoints and models saved at {}".format(output_dir))

    num_processing_steps_tr = args.num_iters  ## level of message-passing
    n_epochs = args.max_epochs
    logging.info("{} epochs with batch size {}".format(n_epochs,
                                                       args.batch_size))
    logging.info(
        "{} processing steps in the model".format(num_processing_steps_tr))
    logging.info("I am in hvd rank: {} of total {} ranks".format(
        dist.rank, dist.size))

    if dist.rank == 0:
        train_input_dir = os.path.join(args.input_dir, 'train')
        val_input_dir = os.path.join(args.input_dir, 'val')
        train_files = tf.io.gfile.glob(
            os.path.join(train_input_dir, args.patterns))
        eval_files = tf.io.gfile.glob(
            os.path.join(val_input_dir, args.patterns))
        ## split the files evenly across all ranks
        train_files = [
            x.tolist() for x in np.array_split(train_files, dist.size)
        ]
        eval_files = [
            x.tolist() for x in np.array_split(eval_files, dist.size)
        ]
    else:
        train_files = None
        eval_files = None

    if args.distributed:
        train_files = dist.comm.scatter(train_files, root=0)
        eval_files = dist.comm.scatter(eval_files, root=0)
    else:
        train_files = train_files[0]
        eval_files = eval_files[0]

    logging.info(
        "rank {} has {} training files and {} evaluation files".format(
            dist.rank, len(train_files), len(eval_files)))

    AUTO = tf.data.experimental.AUTOTUNE
    training_dataset, ngraphs_train = read_dataset(train_files)
    training_dataset = training_dataset.prefetch(AUTO)
    input_signature = get_input_signature(training_dataset, args.batch_size)

    learning_rate = args.learning_rate
    optimizer = snt.optimizers.Adam(learning_rate)
    model = getattr(all_models, 'FourTopPredictor')()

    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    ckpt_manager = tf.train.CheckpointManager(checkpoint,
                                              directory=output_dir,
                                              max_to_keep=5,
                                              keep_checkpoint_every_n_hours=8)
    logging.info("Loading latest checkpoint from: {}".format(output_dir))
    _ = checkpoint.restore(ckpt_manager.latest_checkpoint)

    target_scales = np.array(
        [145.34593924, 145.57711889, 432.92148524, 281.44161905, 1, 1] *
        topreco.n_max_tops).reshape((topreco.n_max_tops, -1)).T.reshape((-1, ))
    target_mean = np.array(
        [6.74674671e-02, -6.17142186e-02, 4.18239305e-01, 4.24881531e+02, 0, 0]
        * topreco.n_max_tops).reshape((topreco.n_max_tops, -1)).T.reshape((-1, ))

    # training loss: mean absolute error on the (normalized) 4-vector block of
    # the globals; earlier loss variants are kept below, commented out
    def loss_fcn(target_op, output_ops):
        # print("target size: ", target_op.nodes.shape)
        # print("output size: ", output_ops[0].nodes.shape)
        # output_op = output_ops[-1]
        # print("loss of 4-vect: ", tf.nn.l2_loss(target_op.nodes[:, :4] - output_op.nodes[:topreco.n_max_tops, :4]))
        # print("loss of charge: ", tf.math.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(tf.cast(target_op.nodes[:, 4:6], tf.int32), output_op.nodes[:topreco.n_max_tops, 4:6])))
        # print("loss of predictions: ", tf.compat.v1.losses.log_loss(tf.cast(target_op.nodes[:, 6], tf.int32), tf.math.sigmoid(output_op.nodes[:topreco.n_max_tops, 6])))

        # per-node variant: L2 on the 4-vector + cross-entropy on the charge
        # + log loss on the presence flag
        # loss_ops = [tf.nn.l2_loss(target_op.nodes[:, :4] - output_op.nodes[:topreco.n_max_tops, :4])
        #             + tf.math.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(
        #                 tf.cast(target_op.nodes[:, 4:6], tf.int32),
        #                 output_op.nodes[:topreco.n_max_tops, 4:6]))
        #             + tf.compat.v1.losses.log_loss(
        #                 tf.cast(target_op.nodes[:, 6], tf.int32),
        #                 tf.math.sigmoid(output_op.nodes[:topreco.n_max_tops, 6]))
        #             for output_op in output_ops]

        # global variant: L2 on the 4-vector block + log losses on the charge
        # and presence blocks
        # loss_ops = [tf.nn.l2_loss(target_op.globals[:, :topreco.n_max_tops*4] - output_op.globals[:, :topreco.n_max_tops*4]) / target_op.globals.shape[0]
        #             + tf.compat.v1.losses.log_loss(
        #                 tf.cast(target_op.globals[:, topreco.n_max_tops*4:topreco.n_max_tops*5], tf.int32),
        #                 tf.math.sigmoid(output_op.globals[:, topreco.n_max_tops*4:topreco.n_max_tops*5]))
        #             + tf.compat.v1.losses.log_loss(
        #                 tf.cast(target_op.globals[:, topreco.n_max_tops*5:], tf.int32),
        #                 tf.math.sigmoid(output_op.globals[:, topreco.n_max_tops*5:]))
        #             for output_op in output_ops]

        # alpha = tf.constant(1, dtype=tf.float32)
        # loss_ops = [alpha * tf.compat.v1.losses.mean_squared_error(target_op.globals[:, :topreco.n_max_tops*4], output_op.globals[:, :topreco.n_max_tops*4])
        #             + tf.compat.v1.losses.log_loss(
        #                 tf.cast(target_op.globals[:, topreco.n_max_tops*4:], tf.int32),
        #                 tf.math.sigmoid(output_op.globals[:, topreco.n_max_tops*4:]))
        #             for output_op in output_ops]

        # loss_ops = [tf.nn.l2_loss(target_op.globals[:, :topreco.n_max_tops*4] - output_op.globals[:, :topreco.n_max_tops*4])
        #             for output_op in output_ops]

        # loss_ops = [tf.compat.v1.losses.mean_squared_error(target_op.globals[:, :topreco.n_max_tops*4], output_op.globals[:, :topreco.n_max_tops*4])
        #             for output_op in output_ops]

        loss_ops = [
            tf.compat.v1.losses.absolute_difference(
                target_op.globals[:, :topreco.n_max_tops * 4],
                output_op.globals[:, :topreco.n_max_tops * 4])
            for output_op in output_ops
        ]
        return tf.stack(loss_ops)

    @functools.partial(tf.function, input_signature=input_signature)
    def train_step(inputs_tr, targets_tr, first_batch):
        print("Tracing update_step")
        print("inputs nodes", inputs_tr.nodes.shape)
        print("inputs edges", inputs_tr.edges.shape)
        print("input n_node", inputs_tr.n_node.shape)
        print(inputs_tr.nodes)
        with tf.GradientTape() as tape:
            outputs_tr = model(inputs_tr, num_processing_steps_tr,
                               is_training=True)
            loss_ops_tr = loss_fcn(targets_tr, outputs_tr)
            loss_op_tr = tf.math.reduce_sum(loss_ops_tr) / tf.constant(
                num_processing_steps_tr, dtype=tf.float32)

        # Horovod: add Horovod Distributed GradientTape.
        if args.distributed:
            tape = hvd.DistributedGradientTape(tape)

        gradients = tape.gradient(loss_op_tr, model.trainable_variables)
        optimizer.apply(gradients, model.trainable_variables)

        # Horovod: broadcast initial variable states from rank 0 to all other
        # processes. This is necessary to ensure consistent initialization of
        # all workers when training is started with random weights or restored
        # from a checkpoint.
        #
        # Note: broadcast should be done after the first gradient step to
        # ensure optimizer initialization.
        if args.distributed and first_batch:
            hvd.broadcast_variables(model.trainable_variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables, root_rank=0)

        return loss_op_tr

    def train_epoch(dataset):
        total_loss = 0.
        batch = 0
        for inputs in tqdm(loop_dataset(dataset, args.batch_size)):
            input_tr, targets_tr = inputs
            new_target = (targets_tr.globals - target_mean) / target_scales
            targets_tr = targets_tr.replace(globals=new_target)
            total_loss += train_step(input_tr, targets_tr, batch == 0).numpy()
            batch += 1
        logging.info("total batches: {}".format(batch))
        return total_loss / batch, batch
        # return total_loss/batch/args.batch_size, batch

    out_str = "Start training " + time.strftime('%d %b %Y %H:%M:%S',
                                                time.localtime())
    out_str += '\n'
    out_str += "Epoch, Time [mins], Loss\n"
    log_name = os.path.join(output_dir, "training_log.txt")
    if dist.rank == 0:
        with open(log_name, 'a') as f:
            f.write(out_str)

    now = time.time()
    for epoch in range(n_epochs):
        logging.info("start epoch {} on {}".format(epoch, device))

        # shuffle the dataset before training
        training_dataset = training_dataset.shuffle(
            args.shuffle_size, seed=12345, reshuffle_each_iteration=True)

        loss, batches = train_epoch(training_dataset)
        this_epoch = time.time()
        logging.info(
            "{} epoch takes {:.2f} mins with loss {:.4f} in {} batches".format(
                epoch, (this_epoch - now) / 60., loss, batches))
        out_str = "{}, {:.2f}, {:.4f}\n".format(epoch,
                                                (this_epoch - now) / 60., loss)
        now = this_epoch
        if dist.rank == 0:
            with open(log_name, 'a') as f:
                f.write(out_str)
            ckpt_manager.save()

    if dist.rank == 0:
        out_log = "End @ " + time.strftime('%d %b %Y %H:%M:%S',
                                           time.localtime()) + "\n"
        with open(log_name, 'a') as f:
            f.write(out_log)
def train_and_evaluate(args):
    dist = init_workers(args.distributed)

    device = 'CPU'
    global_batch_size = 1
    gpus = tf.config.experimental.list_physical_devices("GPU")
    logging.info("found {} GPUs".format(len(gpus)))
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if len(gpus) > 0:
        device = "{}GPUs".format(len(gpus))
    if gpus and args.distributed:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    output_dir = utils_dir.gnn_models if args.output_dir is None else args.output_dir
    if dist.rank == 0:
        os.makedirs(output_dir, exist_ok=True)
    logging.info("Checkpoints and models saved at {}".format(output_dir))

    num_processing_steps_tr = args.num_iters  ## level of message-passing
    n_epochs = args.max_epochs
    logging.info("{} epochs with batch size {}".format(n_epochs,
                                                       global_batch_size))
    logging.info(
        "{} processing steps in the model".format(num_processing_steps_tr))
    logging.info("I am in hvd rank: {} of total {} ranks".format(
        dist.rank, dist.size))

    if dist.rank == 0:
        train_input_dir = os.path.join(
            utils_dir.gnn_inputs,
            'train') if args.train_files is None else args.train_files
        val_input_dir = os.path.join(
            utils_dir.gnn_inputs,
            'val') if args.val_files is None else args.val_files
        train_files = tf.io.gfile.glob(os.path.join(train_input_dir, "*"))
        eval_files = tf.io.gfile.glob(os.path.join(val_input_dir, "*"))
        ## split the files evenly across all ranks
        train_files = [
            x.tolist() for x in np.array_split(train_files, dist.size)
        ]
        eval_files = [
            x.tolist() for x in np.array_split(eval_files, dist.size)
        ]
    else:
        train_files = None
        eval_files = None

    if args.distributed:
        train_files = dist.comm.scatter(train_files, root=0)
        eval_files = dist.comm.scatter(eval_files, root=0)
    else:
        train_files = train_files[0]
        eval_files = eval_files[0]

    logging.info(
        "rank {} has {} training files and {} evaluation files".format(
            dist.rank, len(train_files), len(eval_files)))

    raw_dataset = tf.data.TFRecordDataset(train_files)
    training_dataset = raw_dataset.map(graph.parse_tfrec_function)
    AUTO = tf.data.experimental.AUTOTUNE
    training_dataset = training_dataset.prefetch(AUTO)

    with_batch_dim = False
    inputs, targets = next(training_dataset.take(1).as_numpy_iterator())
    input_signature = (graph.specs_from_graphs_tuple(inputs, with_batch_dim),
                       graph.specs_from_graphs_tuple(targets, with_batch_dim),
                       tf.TensorSpec(shape=[], dtype=tf.bool))

    learning_rate = args.learning_rate
    optimizer = snt.optimizers.Adam(learning_rate)
    # optimizer = tf.optimizers.Adam(learning_rate)
    model = SegmentClassifier()

    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    ckpt_manager = tf.train.CheckpointManager(checkpoint,
                                              directory=output_dir,
                                              max_to_keep=5,
                                              keep_checkpoint_every_n_hours=8)
    logging.info("Loading latest checkpoint from: {}".format(output_dir))
    status = checkpoint.restore(ckpt_manager.latest_checkpoint)

    # training loss: log loss over edges, weighted separately for real and
    # fake edges
    real_weight = args.real_edge_weight
    fake_weight = args.fake_edge_weight

    def create_loss_ops(target_op, output_ops):
        weights = target_op.edges * real_weight + (
            1 - target_op.edges) * fake_weight
        loss_ops = [
            tf.compat.v1.losses.log_loss(target_op.edges,
                                         tf.squeeze(output_op.edges),
                                         weights=weights)
            for output_op in output_ops
        ]
        return tf.stack(loss_ops)

    @functools.partial(tf.function, input_signature=input_signature)
    def train_step(inputs_tr, targets_tr, first_batch):
        print("Tracing update_step")
        print("inputs nodes", inputs_tr.nodes.shape)
        print("inputs edges", inputs_tr.edges.shape)
        print("input n_node", inputs_tr.n_node.shape)
        print(inputs_tr.nodes)
        with tf.GradientTape() as tape:
            outputs_tr = model(inputs_tr, num_processing_steps_tr)
            loss_ops_tr = create_loss_ops(targets_tr, outputs_tr)
            loss_op_tr = tf.math.reduce_sum(loss_ops_tr) / tf.constant(
                num_processing_steps_tr, dtype=tf.float32)

        # Horovod: add Horovod Distributed GradientTape.
        if args.distributed:
            tape = hvd.DistributedGradientTape(tape)

        gradients = tape.gradient(loss_op_tr, model.trainable_variables)
        # optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        optimizer.apply(gradients, model.trainable_variables)

        # Horovod: broadcast initial variable states from rank 0 to all other
        # processes. This is necessary to ensure consistent initialization of
        # all workers when training is started with random weights or restored
        # from a checkpoint.
        #
        # Note: broadcast should be done after the first gradient step to
        # ensure optimizer initialization.
        if args.distributed and first_batch:
            hvd.broadcast_variables(model.trainable_variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables, root_rank=0)

        return loss_op_tr

    def train_epoch(dataset):
        total_loss = 0.
        num_batches = 0
        for batch, inputs in enumerate(dataset):
            input_tr, target_tr = inputs
            total_loss += train_step(input_tr, target_tr, batch == 0)
            num_batches += 1
        logging.info("total batches: {}".format(num_batches))
        return total_loss / num_batches

    out_str = "Start training " + time.strftime('%d %b %Y %H:%M:%S',
                                                time.localtime())
    out_str += '\n'
    out_str += "Epoch, Time [mins], Loss\n"
    log_name = os.path.join(output_dir, "training_log.txt")
    if dist.rank == 0:
        with open(log_name, 'a') as f:
            f.write(out_str)

    now = time.time()
    for epoch in range(n_epochs):
        logging.info("start epoch {} on {}".format(epoch, device))
        loss = train_epoch(training_dataset)
        this_epoch = time.time()
        logging.info("Training {} epoch, {:.2f} mins, Loss := {:.4f}".format(
            epoch, (this_epoch - now) / 60., loss / global_batch_size))
        out_str = "{}, {:.2f}, {:.4f}\n".format(epoch,
                                                (this_epoch - now) / 60.,
                                                loss / global_batch_size)
        now = this_epoch
        if dist.rank == 0:
            with open(log_name, 'a') as f:
                f.write(out_str)
            ckpt_manager.save()

    if dist.rank == 0:
        out_log = "End @ " + time.strftime('%d %b %Y %H:%M:%S',
                                           time.localtime()) + "\n"
        with open(log_name, 'a') as f:
            f.write(out_log)
def train(self, train_truth, epochs, batch_size, test_truth, log_dir,
          evaluate_samples_fn, train_in=None, test_in=None):
    # ======================================
    # construct testing data once for all
    # ======================================
    AUTO = tf.data.experimental.AUTOTUNE
    noise = np.random.normal(loc=0., scale=1.,
                             size=(test_truth.shape[0], self.noise_dim))
    test_in = np.concatenate(
        [test_in, noise],
        axis=1).astype(np.float32) if test_in is not None else noise
    testing_data = tf.data.Dataset.from_tensor_slices(
        (test_in, test_truth)).batch(batch_size,
                                     drop_remainder=True).prefetch(AUTO)

    # ====================================
    # Checkpoints and model summary
    # ====================================
    checkpoint_dir = os.path.join(log_dir, "checkpoints")
    checkpoint = tf.train.Checkpoint(generator=self.generator,
                                     discriminator=self.discriminator)
    ckpt_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir,
                                              max_to_keep=None)
    logging.info("Loading latest checkpoint from: {}".format(checkpoint_dir))
    _ = checkpoint.restore(ckpt_manager.latest_checkpoint).expect_partial()

    summary_dir = os.path.join(log_dir, "logs")
    summary_writer = tf.summary.create_file_writer(summary_dir)
    img_dir = os.path.join(log_dir, 'img')
    os.makedirs(img_dir, exist_ok=True)

    @tf.function
    def train_step(gen_in_4vec, cond_in, truth_4vec):
        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            gen_out_4vec = self.generator(gen_in_4vec, training=True)

            # =============================================================
            # add the conditional inputs to generated and truth information
            # =============================================================
            gen_out_4vec = tf.concat([cond_in, gen_out_4vec], axis=-1)
            truth_4vec = tf.concat([cond_in, truth_4vec], axis=-1)

            # apply discriminator
            real_output = self.discriminator(truth_4vec, training=True)
            fake_output = self.discriminator(gen_out_4vec, training=True)

            gen_loss = generator_loss(fake_output)
            disc_loss = discriminator_loss(real_output, fake_output)

        gradients_of_generator = gen_tape.gradient(
            gen_loss, self.generator.trainable_variables)
        gradients_of_discriminator = disc_tape.gradient(
            disc_loss, self.discriminator.trainable_variables)

        self.generator_optimizer.apply_gradients(
            zip(gradients_of_generator, self.generator.trainable_variables))
        self.discriminator_optimizer.apply_gradients(
            zip(gradients_of_discriminator,
                self.discriminator.trainable_variables))
        return disc_loss, gen_loss

    best_wdis = 9999
    best_epoch = -1
    with tqdm.trange(epochs, disable=self.disable_tqdm) as t0:
        for epoch in t0:
            # compose the training dataset by generating fresh noise for
            # each epoch
            noise = np.random.normal(loc=0., scale=1.,
                                     size=(train_truth.shape[0],
                                           self.noise_dim))
            train_inputs = np.concatenate(
                [train_in, noise],
                axis=1).astype(np.float32) if train_in is not None else noise
            dataset = tf.data.Dataset.from_tensor_slices(
                (train_inputs, train_in, train_truth)).shuffle(
                    2 * batch_size).batch(batch_size,
                                          drop_remainder=True).prefetch(AUTO)

            tot_loss = []
            for data_batch in dataset:
                tot_loss.append(list(train_step(*data_batch)))

            tot_loss = np.array(tot_loss)
            avg_loss = np.sum(tot_loss, axis=0) / tot_loss.shape[0]
            loss_dict = dict(D_loss=avg_loss[0], G_loss=avg_loss[1])

            tot_wdis = evaluate_samples_fn(self.generator, epoch, testing_data,
                                           summary_writer, img_dir,
                                           **loss_dict)
            if tot_wdis < best_wdis:
                ckpt_manager.save()
                self.generator.save("generator")
                best_wdis = tot_wdis
                best_epoch = epoch
            t0.set_postfix(**loss_dict, BestD=best_wdis, BestE=best_epoch)

    tmp_res = "Best Model in {} Epoch with a Wasserstein distance {:.4f}".format(
        best_epoch, best_wdis)
    logging.info(tmp_res)
    summary_logfile = os.path.join(summary_dir, 'results.txt')
    with open(summary_logfile, 'a') as f:
        f.write(tmp_res + "\n")
def train_and_evaluate(args):
    for key, value in vars(args).items():
        print("{} --> {}".format(key, value))

    use_tpu = args.tpu is not None
    device = 'CPU'
    global_batch_size = args.train_batch_size if not use_tpu else args.tpu_cores
    physical_gpus = tf.config.experimental.list_physical_devices("GPU")
    n_gpus = len(physical_gpus)
    if use_tpu:
        if args.tpu == 'colab':
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        else:
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                tpu=args.tpu, zone=args.zone)
        workers = resolver.cluster_spec().as_dict()['worker']
        n_tpus = len(workers)
        logging.info('Running on {} TPUs'.format(n_tpus))
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = snt.distribute.TpuReplicator(resolver)
        device = 'TPU'
    elif n_gpus > 1:
        logging.info("Using SNT Replicator with {} GPUs".format(n_gpus))
        strategy = snt.distribute.Replicator(
            ['/device:GPU:{}'.format(i) for i in range(n_gpus)],
            tf.distribute.ReductionToOneDevice("CPU:0"))
        device = "{}GPUs".format(n_gpus)
        for dd in physical_gpus:
            tf.config.experimental.set_memory_growth(dd, True)
    elif n_gpus > 0:
        strategy = tf.distribute.OneDeviceStrategy("/device:GPU:0")
        device = "1GPU"
    else:
        strategy = tf.distribute.OneDeviceStrategy("/device:CPU:0")

    if n_gpus > 0:
        assert n_gpus == global_batch_size, \
            "batch size {} does not equal the number of GPUs {}".format(
                global_batch_size, n_gpus)
    else:
        pass
        # assert global_batch_size == 1, "batch size {} does not equal 1".format(global_batch_size)

    output_dir = args.job_dir
    if not use_tpu:
        os.makedirs(output_dir, exist_ok=True)
    logging.info("Checkpoints and models saved at {}".format(output_dir))

    num_processing_steps_tr = args.num_iters  ## level of message-passing
    n_epochs = args.num_epochs
    logging.info("{} epochs with batch size {}".format(n_epochs,
                                                       global_batch_size))
    logging.info(
        "{} processing steps in the model".format(num_processing_steps_tr))

    # prepare graphs
    logging.info("{} Eta bins and {} Phi bins".format(args.num_eta_bins,
                                                      args.num_phi_bins))
    _, max_edges = graph.get_max_graph_size(args.num_eta_bins,
                                            args.num_phi_bins)

    # train_files = tf.io.gfile.glob(args.train_files)
    # eval_files = tf.io.gfile.glob(args.eval_files)
    file_names = tf.io.gfile.glob(args.input_files)
    n_files = len(file_names)
    n_train = int(0.9 * n_files)
    if n_train < 1:
        n_train = 1
    # logging.info("Input file names: ", file_names)
    logging.info("{} input files".format(n_files))
    # logging.info("{} training files and {} evaluation files".format(len(train_files), len(eval_files)))
    logging.info("{} training files".format(n_train))

    raw_dataset = tf.data.TFRecordDataset(file_names[:n_train])
    training_dataset = raw_dataset.map(graph.parse_tfrec_function)
    AUTO = tf.data.experimental.AUTOTUNE

    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = \
        tf.data.experimental.AutoShardPolicy.DATA
    training_dataset = training_dataset.with_options(options)
    training_dataset = training_dataset.batch(
        global_batch_size, drop_remainder=True).prefetch(AUTO)

    learning_rate = args.learning_rate
    with strategy.scope():
        optimizer = snt.optimizers.Adam(learning_rate)
        # model = SegmentClassifier()
        model = get_model(args.model_name)
        checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
        ckpt_manager = tf.train.CheckpointManager(checkpoint,
                                                  directory=output_dir,
                                                  max_to_keep=5)
        logging.info("Loading latest checkpoint from: {}".format(output_dir))
        _ = checkpoint.restore(ckpt_manager.latest_checkpoint)

    # training loss
    real_weight = args.real_edge_weight
    fake_weight = args.fake_edge_weight

    def log_loss(label, prediction, eps=1e-7, weights=1.):
        # tf.compat.v1.losses.log_loss is not supported on TPU, so the TF
        # source code is copied here
        loss = tf.negative(
            tf.add(tf.multiply(label, tf.math.log(prediction + eps)),
                   tf.multiply((1 - label),
                               tf.math.log(1 - prediction + eps))))
        loss = tf.multiply(loss, weights)
        present = tf.where(tf.math.equal(weights, 0.0),
                           tf.zeros_like(weights), tf.ones_like(weights))
        loss = tf.math.divide_no_nan(tf.math.reduce_sum(loss),
                                     tf.math.reduce_sum(present))
        return loss

    def create_loss_ops(target_op, output_ops):
        t_edges = tf.squeeze(target_op.edges)
        weights = t_edges * real_weight + (1 - t_edges) * fake_weight
        # mask out the padding edges beyond the number of valid edges
        row_index = tf.range(tf.constant(max_edges))
        n_valid_edges = target_op.n_edge[0]
        # NOTE: a tf.where-based implementation of this mask was very slow:
        # cond = (row_index < n_valid_edges)
        # zeros = tf.zeros_like(weights, dtype=weights.dtype)
        # weights = tf.where(cond, weights, zeros)
        mask = tf.cast(row_index < n_valid_edges, tf.float32)
        # mask = tf.expand_dims(mask, axis=1)
        weights = weights * mask
        loss_ops = [
            tf.compat.v1.losses.log_loss(t_edges,
                                         tf.squeeze(output_op.edges),
                                         weights=weights)
            for output_op in output_ops
        ]
        return tf.stack(loss_ops)

    # if use_tpu:
    #     cast_down_tensor = functools.partial(tf.cast, dtype=tf.float16)
    # else:
    #     cast_down_tensor = functools.partial(tf.cast, dtype=tf.bfloat16)

    @tf.function(autograph=False)
    def train_step(inputs_tr, targets_tr):
        print("Tracing train_step")
        print(inputs_tr)

        def update_step(inputs_tr, targets_tr):
            # logging.info("Tracing update_step")
            # logging.info("before concatenate:", inputs_tr.n_node.shape)
            inputs_tr = graph.concat_batch_dim(inputs_tr)
            targets_tr = graph.concat_batch_dim(targets_tr)
            # if args.mix_precision:
            #     inputs_tr = inputs_tr.map(cast_down_tensor)
            #     targets_tr = targets.map(cast_down_tensor)
            # logging.info("after concatenate:", inputs_tr.n_node.shape)
            with tf.GradientTape() as tape:
                outputs_tr = model(inputs_tr, num_processing_steps_tr)
                loss_ops_tr = create_loss_ops(targets_tr, outputs_tr)
                loss_op_tr = tf.math.reduce_sum(loss_ops_tr) / tf.constant(
                    num_processing_steps_tr, dtype=tf.float32)

            gradients = tape.gradient(loss_op_tr, model.trainable_variables)
            # aggregate the gradients from the full batch;
            # this is not done automatically as it is for MirroredStrategy
            replica_ctx = tf.distribute.get_replica_context()
            gradients = replica_ctx.all_reduce("mean", gradients)
            optimizer.apply(gradients, model.trainable_variables)
            return loss_op_tr

        per_example_losses = strategy.run(update_step,
                                          args=(inputs_tr, targets_tr))
        mean_loss = strategy.reduce("sum", per_example_losses, axis=None)
        return mean_loss

    def train_epoch(dataset):
        total_loss = 0.
        num_batches = 0
        for inputs in dataset:
            input_tr, target_tr = inputs
            total_loss += train_step(input_tr, target_tr)
            num_batches += 1
        logging.info("total batches: {}".format(num_batches))
        return total_loss / num_batches

    # this_time = time.strftime('%d %b %Y %H:%M:%S', time.localtime())
    out_str = "Start training " + time.strftime('%d %b %Y %H:%M:%S',
                                                time.localtime())
    out_str += '\n'
    out_str += "Epoch, Time [mins], Loss\n"
    log_name = os.path.join(output_dir, "training_log.txt")
    print(out_str)
    # with open(log_name, 'a') as f:
    #     f.write(out_str)

    now = time.time()
    # writer = tf.summary.create_file_writer(os.path.join(output_dir, this_time))
    dist_training_dataset = strategy.experimental_distribute_dataset(
        training_dataset)
    for epoch in range(n_epochs):
        logging.info("start epoch {} on {}".format(epoch, device))
        # training_dataset = training_dataset.shuffle(global_batch_size*2, reshuffle_each_iteration=True)
        loss = train_epoch(dist_training_dataset)
        # loss = train_epoch(training_dataset)
        this_epoch = time.time()
        logging.info("Training {} epoch, {:.2f} mins, Loss := {:.4f}".format(
            epoch, (this_epoch - now) / 60., loss / global_batch_size))
        out_str = "{}, {:.2f}, {:.4f}\n".format(epoch,
                                                (this_epoch - now) / 60.,
                                                loss / global_batch_size)
        # with open(log_name, 'a') as f:
        #     f.write(out_str)
        # with writer.as_default():
        #     tf.summary.scalar("")
        now = this_epoch
        ckpt_manager.save()

    out_log = "End @ " + time.strftime('%d %b %Y %H:%M:%S',
                                       time.localtime()) + "\n"
    print(out_log)
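# Hedged sanity check that the hand-rolled, TPU-friendly log_loss above agrees
# with tf.compat.v1.losses.log_loss on CPU; the tensor values are made up, and
# the third example is masked out by a zero weight.
labels = tf.constant([1.0, 0.0, 1.0])
preds = tf.constant([0.9, 0.2, 0.6])
weights = tf.constant([1.0, 1.0, 0.0])
# both evaluate to (-log(0.9) - log(0.8)) / 2 ≈ 0.1643, since the weighted sum
# is divided by the number of nonzero weights
ref = tf.compat.v1.losses.log_loss(labels, preds, weights=weights)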
def train(args):
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)

    # Distribution Strategy
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    # TODO Implement on multi-node SLURM
    strategy = tf.distribute.MirroredStrategy()
    session_config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True  # Allow full memory usage of GPU.

    warm_start = None
    if args.load_model:
        # Load an existing model
        model_path = paths['save'] + '/' + args.load_model
        eval_path = model_path + '/eval'
    else:
        trial = 0
        while os.path.exists(paths['save'] +
                             '/{}_trial_{}'.format(args.modality, trial)):
            trial += 1
        model_path = paths['save'] + '/{}_trial_{}'.format(
            args.modality, trial)
        eval_path = model_path + '/eval'

    train_input_fn = DatasetHandler('train', args)
    eval_input_fn = DatasetHandler('eval', args)
    train_size = len(train_input_fn)
    eval_size = len(eval_input_fn)
    if args.mode == 'test':
        train_size = 20
        eval_size = 10

    steps_per_epoch = np.ceil(train_size / args.batch_size)
    max_training_steps = args.epochs * steps_per_epoch

    model_fn_params = {
        'batch_norm': args.no_bn,
        'dropout': args.dropout,
        'classes': args.classes,
        'lr': args.lr,
        'decay_rate': args.decay_rate,
        'decay_steps': np.ceil(args.epochs * steps_per_epoch /
                               (args.decays_per_train + 1)),
        'eval_path': eval_path,
        'eval_steps': eval_size
    }

    configuration = tf.estimator.RunConfig(
        tf_random_seed=args.seed,
        save_summary_steps=steps_per_epoch,
        keep_checkpoint_max=args.early_stop + 2,
        save_checkpoints_steps=steps_per_epoch,
        log_step_count_steps=np.ceil(steps_per_epoch / 2),
        train_distribute=strategy,
        session_config=session_config)

    liver_seg = tf.estimator.Estimator(model_fn=unet_model_fn,
                                       model_dir=model_path,
                                       params=model_fn_params,
                                       config=configuration,
                                       warm_start_from=warm_start)

    es_steps = steps_per_epoch * args.early_stop
    early_stopping = stop_if_no_decrease_hook(
        liver_seg, metric_name='loss', max_steps_without_decrease=es_steps)
    profiler_hook = tf.estimator.ProfilerHook(
        save_steps=int(max_training_steps / 5),
        show_memory=True,
        output_dir=model_path)

    log_data = {
        'train_size': train_size,
        'steps_per_epoch': steps_per_epoch,
        'max_training_steps': max_training_steps,
        'eval_size': eval_size,
        'eval_steps': eval_size,
        'model_path': model_path
    }
    save_logs(args, log_data)

    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: train_input_fn.input_fn(),
        hooks=[profiler_hook, early_stopping],
        max_steps=max_training_steps)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: eval_input_fn.input_fn(),
        steps=eval_size,
        start_delay_secs=0,
        throttle_secs=0)
    tf.estimator.train_and_evaluate(liver_seg,
                                    train_spec=train_spec,
                                    eval_spec=eval_spec)
    info('Train and Evaluation Mode Finished!\n'
         ' Metrics and checkpoints are saved at:\n {}\n ----------'.format(
             model_path))
def _log_data_augmentation(data_augmentation, name):
    """Logs the given data augmentation parameters for diagnostic purposes."""
    if not data_augmentation:
        logging.info('No data augmentation provided for %s', name)
    else:
        logging.info('%s augmentations:', name)
        logging.info('enable_jitter: %s', data_augmentation.enable_jitter)
        logging.info('jitter_amount: %d', data_augmentation.jitter_amount)
        logging.info('enable_gaussian_noise: %s',
                     data_augmentation.enable_gaussian_noise)
        logging.info('gaussian_noise_std: %s',
                     data_augmentation.gaussian_noise_std)
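# Hedged usage sketch; SimpleNamespace is a stand-in assumption for whatever
# data-augmentation config object callers actually pass in.
from types import SimpleNamespace

aug = SimpleNamespace(enable_jitter=True, jitter_amount=8,
                      enable_gaussian_noise=False, gaussian_noise_std=0.0)
_log_data_augmentation(aug, 'train')  # logs each field
_log_data_augmentation(None, 'eval')  # logs that no augmentation was provided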