Example #1
    def predict(self, test_paths: Union[str, Iterable[str]]) -> None:
        """
        Generate predictions from plain text corpora at a given path or list of paths.
        :param test_paths: paths to documents for which to generate predictions
        """
        if isinstance(test_paths, str):  # a single path, not a collection of paths
            test_paths = [test_paths]

        if not self._feature_extractor:
            self._init_feature_extractor()

        # initialize predictor from saved trained model
        predictor = from_job_dir(self._job_dir)

        for test_set in test_paths:
            prediction_path = os.path.join(
                self._job_dir,
                os.path.basename(test_set) + '.predictions.txt')
            logging.info('Writing predictions on %s to %s' %
                         (test_set, prediction_path))
            with file_io.FileIO(prediction_path, mode="w") as output:
                with file_io.FileIO(test_set, mode="r") as text_lines:
                    for line in text_lines:
                        line = line.strip()
                        if not line:
                            continue
                        predictions = predictor.predict(line)
                        for prediction in predictions:
                            output.write(str(prediction) + '\n')
                        output.write('\n')
Example #2
def main(_):
    label_map_dict = label_map_util.get_label_map_dict(FLAGS.label_map_path)

    examples_list = glob(os.path.join('images', '*.jpg'))

    logging.info('Found {} images'.format(len(examples_list)))

    # We automatically split into training and validation sets
    random.seed(42)
    random.shuffle(examples_list)
    num_examples = len(examples_list)
    num_train = int(0.7 * num_examples)
    train_examples = examples_list[:num_train]
    val_examples = examples_list[num_train:]

    logging.info('%d training and %d validation examples.',
                 len(train_examples), len(val_examples))

    train_output_path = os.path.join(FLAGS.output_dir, 'qcards_train.record')
    val_output_path = os.path.join(FLAGS.output_dir, 'qcards_val.record')

    writer = tf.io.TFRecordWriter(train_output_path)
    for example in tqdm(train_examples):
        tf_example = create_tf_example(example, label_map_dict=label_map_dict)
        writer.write(tf_example.SerializeToString())

    writer.close()

    writer = tf.io.TFRecordWriter(val_output_path)
    for example in tqdm(val_examples):
        tf_example = create_tf_example(example, label_map_dict=label_map_dict)
        writer.write(tf_example.SerializeToString())

    writer.close()
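The training and validation writer loops above are identical; a small helper like the sketch below (reusing the create_tf_example helper shown later in this listing and the label_map_dict from this example) would remove the duplication.

import tensorflow as tf
from tqdm import tqdm

def write_tfrecord(examples, output_path, label_map_dict):
    # Serialize every example with create_tf_example and write the results
    # into a single TFRecord file.
    writer = tf.io.TFRecordWriter(output_path)
    for example in tqdm(examples):
        tf_example = create_tf_example(example, label_map_dict=label_map_dict)
        writer.write(tf_example.SerializeToString())
    writer.close()

# write_tfrecord(train_examples, train_output_path, label_map_dict)
# write_tfrecord(val_examples, val_output_path, label_map_dict)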
Example #3
def main(unused_argv):
    tf.compat.v1.logging.set_verbosity(logging.INFO)
    paths = tf.io.gfile.glob(FLAGS.input_data_pattern)
    logging.info("Found %s files.", len(paths))
    for path in paths:
        with tf.io.gfile.GFile(path, "rb") as f:
            first_read = True
            while True:
                length_raw = f.read(8)
                if not length_raw and first_read:
                    logging.fatal("File %s has no data.", path)
                    break
                elif not length_raw:
                    logging.info("File %s looks good.", path)
                    break
                else:
                    first_read = False
                if len(length_raw) != 8:
                    logging.fatal("File ends when reading record length: " +
                                  path)
                    break
                length, = struct.unpack("Q", length_raw)
                # +8 to include the crc values.
                record = f.read(length + 8)
                if len(record) != length + 8:
                    logging.fatal("File ends in the middle of a record: " +
                                  path)
                    break
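The "+8" in the read above accounts for the two 4-byte masked CRC fields that frame every TFRecord payload. A minimal sketch of the per-record layout this loop relies on (CRCs are read but not verified here):

import struct

def iter_tfrecord_payloads(f):
    # Per-record layout of a TFRecord file (little-endian):
    #   uint64 length | uint32 masked CRC of length | payload[length] | uint32 masked CRC of payload
    while True:
        header = f.read(8 + 4)
        if not header:
            return
        length, _length_crc = struct.unpack("<QI", header)
        payload = f.read(length)
        _payload_crc = f.read(4)  # not verified in this sketch
        yield payload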
Example #4
    def eval(self, test_paths: Union[str, Iterable[str]]) -> None:
        """
        Evaluate a trained model on a given corpus or list of corpora.
        :param test_paths: paths to test corpora
        """
        if isinstance(test_paths, str):  # a single path, not a collection of paths
            test_paths = [test_paths]

        if not self._feature_extractor:
            self._init_feature_extractor()

        # initialize predictor from saved trained model
        predictor = from_job_dir(self._job_dir)
        # get function that is used to evaluate predictions from configuration

        for test_set in test_paths:
            logging.info('Evaluating on %s' % test_set)
            output_path = os.path.join(self._job_dir,
                                       os.path.basename(test_set) + '.eval')

            evaluation_fn = get_evaluator(self._training_config.heads,
                                          self._feature_extractor, output_path,
                                          self._eval_script_path)

            # extract instances from the test file at the given path (this is a generator)
            # predict from parsed instances rather than raw text via .predict_parsed; skip formatting since we need the raw predictions
            processed_examples = predictor.predict_parsed(self._extract_raw(
                test_set, True),
                                                          formatted=False)
            # call evaluation function on predictions
            evaluation_fn(self._extract_raw(test_set, True),
                          processed_examples)
Example #5
 def _extract_and_write(self, path: str, test: bool = False):
     output_path = self._data_path_fn(path)
     if gfile.exists(output_path):
         logging.info("Using pre-existing features for %s from %s", path,
                      output_path)
         return
     examples = self._extract_features(path, test)
     write_features(examples, output_path)
Example #6
 def train_epoch(dataset):
     total_loss = 0.
     num_batches = 0
     for batch, inputs in enumerate(dataset):
         input_tr, target_tr = inputs
         total_loss += train_step(input_tr, target_tr, batch == 0)
         num_batches += 1
     logging.info("total batches: {}".format(num_batches))
     return total_loss / num_batches
Example #7
 def _train_vocab(self, train_path: str):
     logging.info("Creating new vocabulary using training data at %s",
                  train_path)
     self._feature_extractor.initialize(self._resources)
     self._feature_extractor.train(self._extract_raw(train_path))
     logging.info("Writing new feature/label vocabulary to %s",
                  self._vocab_path)
     self._feature_extractor.write_vocab(self._vocab_path,
                                         resources=self._resources,
                                         prune=True)
Example #8
def restore_ckpt(sess, ckpt):
    saver = tf.train.Saver()
    logging.info(f"[LOAD]{ckpt}")
    try:
        saver.restore(sess, ckpt)
    except Exception:
        print("======LOAD ERROR======")
        print_variables()
        print_ckpt(sess, ckpt)
        raise
    return saver
Example #9
def get_embedding_input(inputs, feature, training, weights=None):
    config = feature.config

    with variable_scope(feature.name):
        with variable_scope('embedding'):
            initializer = None
            if training:
                if feature.embedding is not None:
                    initializer = embedding_initializer(feature.embedding)
                elif config.initializer.zero_init:
                    logging.info("Zero init for feature embedding: %s",
                                 feature.name)
                    initializer = tf.zeros_initializer
                else:
                    logging.info(
                        "Xavier Uniform init for feature embedding: %s",
                        feature.name)
                    initializer = tf.glorot_uniform_initializer

            embedding_matrix = get_variable(
                name='parameters',
                shape=[feature.vocab_size(), config.dim],
                initializer=initializer,
                trainable=config.trainable)

            if weights is None:
                feature_ids = string2index(inputs, feature)
                result = tf.nn.embedding_lookup(
                    params=embedding_matrix, ids=feature_ids,
                    name='lookup')  # wrapper of gather
            else:
                result = tf.matmul(weights,
                                   embedding_matrix,
                                   name="weighted_lookup")

            if config.dropout > 0:
                result = tf.layers.dropout(result,
                                           rate=config.dropout,
                                           training=training,
                                           name='dropout')

        if 'func' in config:  # reduce multiple vectors per token to a single vector
            with tf.name_scope('reduce'):
                result = config.func.apply(result)

        if config.word_dropout > 0 and training:
            shape = tf.shape(result)
            result = tf.layers.dropout(result,
                                       rate=config.word_dropout,
                                       training=training,
                                       name='word_dropout',
                                       noise_shape=[shape[0], shape[1], 1])

        return result
Example #10
 def train_epoch(dataset):
     total_loss = 0.
     batch = 0
     for inputs in tqdm(loop_dataset(dataset, args.batch_size)):
         input_tr, targets_tr = inputs
         new_target = (targets_tr.globals - target_mean) / target_scales
         targets_tr = targets_tr.replace(globals=new_target)
         total_loss += train_step(input_tr, targets_tr, batch == 0).numpy()
         batch += 1
     logging.info("total batches: {}".format(batch))
     return total_loss / batch, batch
Example #11
def main(_):
    feat_dict = FeatureDictionary()
    print("feature_size: %d" % feat_dict.feature_size)
    print("field_size: %d" % feat_dict.field_size)
    print(feat_dict.col2feat_id.keys())
    dataparser = DataParser(feat_dict, FLAGS.label)
    train_ids, train_vals, train_labels = dataparser.parse(infile="%s\\train_sample.csv" % FLAGS.data_dir)
    print("len of train: %d" % len(train_ids))
    test_ids, test_vals, test_labels = dataparser.parse(infile="%s\\test_sample.csv" % FLAGS.data_dir)
    print("len of test: %d" % len(test_ids))

    # ------build Tasks------
    model_params = {
        "field_size": feat_dict.field_size,
        "feature_size": feat_dict.feature_size,
        "embedding_size": FLAGS.embedding_size,
        "learning_rate": FLAGS.learning_rate,
        "l2_reg": FLAGS.l2_reg,
        "deep_layers": FLAGS.deep_layers,
        "dropout": FLAGS.dropout,
        "experts_num": 3,
        "experts_units": 32,
        "use_experts_bias": True,
        "use_gate_bias": True
    }
    print(model_params)
    DeepFM = build_model_estimator(model_params)
    # DeepFM = tf.contrib.estimator.add_metrics(DeepFM, my_auc)

    if FLAGS.task_type == 'train':
        train_spec = tf.estimator.TrainSpec(input_fn=lambda: input_fn(train_ids, train_vals, train_labels,
                                                                      num_epochs=FLAGS.num_epochs,
                                                                      batch_size=FLAGS.batch_size))
        eval_spec = tf.estimator.EvalSpec(input_fn=lambda: input_fn(test_ids, test_vals, test_labels,
                                                                    num_epochs=1,
                                                                    batch_size=FLAGS.batch_size),
                                          steps=None, start_delay_secs=1000, throttle_secs=1200)
        tf.estimator.train_and_evaluate(DeepFM, train_spec, eval_spec)
        results = DeepFM.evaluate(
            input_fn=lambda: input_fn(test_ids, test_vals, test_labels, num_epochs=1, batch_size=FLAGS.batch_size))
        for key in results:
            log.info("%s : %s" % (key, results[key]))
    elif FLAGS.task_type == 'eval':
        results = DeepFM.evaluate(input_fn=lambda: input_fn(test_ids, test_vals, test_labels,
                                                            num_epochs=1, batch_size=FLAGS.batch_size))
        for key in results:
            log.info("%s : %s" % (key, results[key]))
    elif FLAGS.task_type == 'infer':
        preds = DeepFM.predict(input_fn=lambda: input_fn(test_ids, test_vals, test_labels,
                                                         num_epochs=1, batch_size=FLAGS.batch_size),
                               predict_keys="prob")
        with open(FLAGS.data_dir+"/pred.txt", "w") as fo:
            for prob in preds:
                fo.write("%f\n" % (prob['prob']))
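The input_fn used by the estimator specs above is not part of this snippet. A minimal sketch of one possible implementation with tf.data (the feature keys "feat_ids" and "feat_vals" are assumptions, not taken from the original code):

import numpy as np
import tensorflow as tf

def input_fn(ids, vals, labels, num_epochs=1, batch_size=256):
    # Build a dataset of (features dict, label) pairs as expected by an Estimator.
    dataset = tf.data.Dataset.from_tensor_slices((
        {"feat_ids": np.asarray(ids, dtype=np.int64),
         "feat_vals": np.asarray(vals, dtype=np.float32)},
        np.asarray(labels, dtype=np.float32)))
    return dataset.shuffle(10000).repeat(num_epochs).batch(batch_size)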
Example #12
def embedding(features, feature_config, training):
    if feature_config.name == constants.ELMO_KEY:
        logging.info("Using ELMo module at %s", ELMO_URL)
        elmo_module = hub.Module(ELMO_URL, trainable=True)
        elmo_embedding = elmo_module(inputs={
            'tokens':
            features[constants.ELMO_KEY],
            'sequence_len':
            tf.cast(features[constants.LENGTH_KEY], dtype=tf.int32)
        },
                                     signature="tokens",
                                     as_dict=True)['elmo']
        return elmo_embedding
    elif feature_config.name == constants.BERT_KEY:
        model = feature_config.options.get("model")
        logging.info("Using BERT module at %s", model)
        tags = set()
        if training:
            tags.add("train")
        bert_module = hub.Module(model, tags=tags, trainable=True)

        lens = features[constants.LENGTH_KEY]
        if constants.BERT_LENGTH_KEY in features:
            lens = features[constants.BERT_LENGTH_KEY]

        if constants.BERT_SEG_ID in features:
            segment_ids = tf.cast(features[constants.BERT_SEG_ID],
                                  dtype=tf.int32)
        else:
            segment_ids = tf.zeros(tf.shape(features[constants.BERT_KEY]),
                                   dtype=tf.int32)

        bert_inputs = dict(
            input_ids=tf.cast(features[constants.BERT_KEY], tf.int32),
            # mask over the sequence lengths, which extend over all BERT tokens in input_ids for each seq in the batch
            input_mask=tf.cast(tf.sequence_mask(lens), dtype=tf.int32),
            # we don't care about segment_ids since we're not supporting sentence pair tasks for now
            segment_ids=segment_ids)

        bert_outputs = bert_module(bert_inputs,
                                   signature="tokens",
                                   as_dict=True)
        output_type = feature_config.options.get("output_type")
        bert_embedding = bert_outputs[output_type]
        if output_type == "pooled_output":
            bert_embedding = tf.expand_dims(bert_embedding, axis=1)
        return bert_embedding

    elif feature_config.has_vocab():
        feature_embedding = get_embedding_input(features[feature_config.name],
                                                feature_config, training)
        return feature_embedding
Example #13
    def __init__(self, optimizer_config, **kwargs):
        super().__init__(**optimizer_config, **kwargs)
        self.name = optimizer_config.name
        self.params = optimizer_config.params if optimizer_config.get(
            'params') else {}

        clip = optimizer_config.get('clip')
        if not clip:
            clip = 5.0
            logging.info(
                "Using default global-norm gradient clipping threshold of %f",
                clip)
        self.clip = clip
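The stored clip value is presumably applied as a global-norm threshold when gradients are applied; a hedged sketch of that use (the helper name and surrounding training-loop variables are illustrative, not from the original code):

import tensorflow as tf

def apply_clipped_gradients(optimizer, tape, loss, variables, clip_norm=5.0):
    # Clip the global norm of the gradients to the configured threshold
    # (self.clip above) before handing them to the optimizer.
    gradients = tape.gradient(loss, variables)
    clipped, _global_norm = tf.clip_by_global_norm(gradients, clip_norm)
    optimizer.apply_gradients(zip(clipped, variables))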
Example #14
def _copy_file(src_dir: str, dest_dir: str, file_name: str):
    src_path = src_dir + '/' + file_name
    dest_path = dest_dir + '/' + file_name
    for retries in range(0, 10):
        try:
            gfile.copy(src_path, dest_path, overwrite=True)
            logging.info("copy %s->%s succeeded (retry %d)", src_path,
                         dest_path, retries)
            return
        except tf.errors.OpError as ex:
            logging.error("copy %s->%s (retry %d): %s", src_path, dest_path,
                          retries, ex)
            time.sleep(1.5**retries)
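The sleep between attempts grows geometrically, so the ten retries above wait roughly:

# Sleep after attempt k is 1.5 ** k seconds:
# 1.0, 1.5, 2.25, 3.4, 5.1, 7.6, 11.4, 17.1, 25.6, 38.4 (about 113 s in total)
delays = [1.5 ** k for k in range(10)]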
Example #15
def get_optimizer(
    network_config,
    default_optimizer=train.AdadeltaOptimizer(learning_rate=1.0)):
    """
    Return the optimizer given by the input network configuration, or a default optimizer.
    :param network_config: network configuration
    :param default_optimizer: default optimization algorithm
    :return: configured optimizer
    """
    try:
        optimizer = network_config.optimizer
    except KeyError:
        logging.info("Using Adadelta as default optimizer.")
        return default_optimizer
    if isinstance(optimizer.lr, numbers.Number):
        lr = optimizer.lr
    else:
        optimizer.lr.num_train_steps = network_config.max_steps
        optimizer.lr.steps_per_epoch = network_config.steps_per_epoch
        lr = get_learning_rate(optimizer.lr, train.get_global_step())

    name = optimizer.name
    params = optimizer.params
    if "Adadelta" == name:
        opt = train.AdadeltaOptimizer(lr, **params)
    elif "Adam" == name:
        opt = train.AdamOptimizer(lr, **params)
    elif "LazyAdam" == name:
        opt = LazyAdamOptimizer(lr, **params)
    elif "LazyNadam" == name:
        opt = LazyNadamOptimizer(lr, **params)
    elif "SGD" == name:
        opt = train.GradientDescentOptimizer(lr)
    elif "Momentum" == name:
        opt = train.MomentumOptimizer(lr, **params)
    elif "Nadam" == name:
        opt = NadamOptimizerSparse(lr, **params)
    elif "bert" == name:
        opt = AdamWeightDecayOptimizer(
            lr,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    else:
        raise ValueError("Invalid optimizer name: {}".format(name))
    return opt
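For reference, a hypothetical configuration showing the fields get_optimizer reads; the concrete config class is not shown in this snippet, so the dict below is illustrative only (the real object uses attribute access).

network_config = {
    "max_steps": 100000,
    "steps_per_epoch": 1000,
    "optimizer": {
        "name": "Adam",            # Adadelta, Adam, LazyAdam, LazyNadam, SGD, Momentum, Nadam, or bert
        "lr": 0.001,               # a number, or a learning-rate schedule configuration
        "params": {"epsilon": 1e-8},
    },
}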
Example #16
def load_model_py(model, model_py, is_train=True, feed_embedded_layer=False, batch_size=None):
    pair = model_py.split(":")
    sys.path.append(os.getcwd())
    if len(pair) >= 2:
        logging.info(f"[LOAD] {pair[1]} from {pair[0]}")
        mod = importlib.import_module(pair[0])
        cls = getattr(mod, pair[1])
        obj = cls()
        if model:
            model.build(obj, is_train, feed_embedded_layer, batch_size)
        return obj
    else:
        logging.info(f"[LOAD] {pair[0]}")
        mod = importlib.import_module(pair[0])
        if model:
            model.build(mod, is_train, feed_embedded_layer, batch_size)
        return mod
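Usage follows the "module:Class" convention split on the colon; the module and class names below are placeholders:

# Import my_models.py from the working directory, instantiate MyModel,
# and build the graph for training.
obj = load_model_py(model, "my_models:MyModel", is_train=True)

# Or pass a bare module name to build from the module itself.
mod = load_model_py(model, "my_models", is_train=False)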
Example #17
    def make_checkpoints(self):
        if self.ckpt_manager:
            return

        output_dir = self.output_dir
        model = self.model
        optimizer = self.optimizer
        ckpt_dir = os.path.join(output_dir, "checkpoints")
        self.checkpoint = tf.train.Checkpoint(
            optimizer=optimizer,
            model=model
            )
        self.ckpt_manager = tf.train.CheckpointManager(
            self.checkpoint, directory=ckpt_dir,
            max_to_keep=20, keep_checkpoint_every_n_hours=1)
        logging.info("Loading latest checkpoint from: {}".format(ckpt_dir))
        _ = self.checkpoint.restore(self.ckpt_manager.latest_checkpoint)
        self.ckpt_dir = ckpt_dir
Example #18
def inference(gan, test_in, test_truth, log_dir, xlabels):
    checkpoint_dir = os.path.join(log_dir, "checkpoints")
    checkpoint = tf.train.Checkpoint(
        generator=gan.generator,
        discriminator=gan.discriminator)
    ckpt_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=None)
    logging.info("Loading latest checkpoint from: {}".format(checkpoint_dir))
    _ = checkpoint.restore(ckpt_manager.latest_checkpoint).expect_partial()

    AUTO = tf.data.experimental.AUTOTUNE
    noise = np.random.normal(loc=0., scale=1., size=(test_truth.shape[0], gan.noise_dim))
    test_in = np.concatenate(
        [test_in, noise], axis=1).astype(np.float32) if test_in is not None else noise
    testing_data = tf.data.Dataset.from_tensor_slices(
        (test_in, test_truth)).batch(batch_size, drop_remainder=True).prefetch(AUTO)

    summary_dir = os.path.join(log_dir, "logs_inference")
    summary_writer = tf.summary.create_file_writer(summary_dir)

    img_dir = os.path.join(log_dir, 'img_inference')
    os.makedirs(img_dir, exist_ok=True)
Example #19
def get_l2_loss(network_config, variables):
    if not network_config.optimizer or not network_config.optimizer.get(
            'l2_loss'):
        return 0
    l2_loss = network_config.optimizer.get('l2_loss')

    if isinstance(l2_loss, numbers.Number):
        return tf.add_n(
            [tf.nn.l2_loss(v)
             for v in variables if 'bias' not in v.name]) * l2_loss
    if not isinstance(l2_loss, dict):
        raise ValueError(
            "'l2_loss' expects a dictionary from regular expressions matching variable names to L2 terms,"
            " e.g. {\".*scalar.*\": 0.001}, or a single L2 term to be applied globally to non-bias weights."
        )

    all_losses = []
    for var_pattern, alpha in l2_loss.items():
        for var in [v for v in variables if re.match(var_pattern, v.name)]:
            logging.info('Adding L2 regularization with alpha=%f to %s' %
                         (alpha, var.name))
            all_losses.append(alpha * tf.nn.l2_loss(var))

    return tf.add_n(all_losses)
Example #20
    def _compute_steps(self, train, valid):
        train_count = sum(1 for _ in tf.python_io.tf_record_iterator(
            self._data_path_fn(train)))
        valid_count = sum(1 for _ in tf.python_io.tf_record_iterator(
            self._data_path_fn(valid)))

        steps_per_epoch = train_count // self._training_config.batch_size
        if not self._training_config.max_epochs:
            if not self._training_config.max_steps:
                self._training_config.max_epochs = 100
            else:
                self._training_config.max_epochs = self._training_config.max_steps // steps_per_epoch
        if not self._training_config.patience_epochs:
            self._training_config.patience_epochs = 5
        if not self._training_config.checkpoint_epochs:
            self._training_config.checkpoint_epochs = 1

        max_steps = self._training_config.max_epochs * steps_per_epoch
        patience = self._training_config.patience_epochs * steps_per_epoch
        checkpoint_steps = self._training_config.checkpoint_epochs * steps_per_epoch

        logging.info(
            'Training on %d instances at %s, validating on %d instances at %s'
            % (train_count, train, valid_count, valid))
        logging.info(
            'Training for a maximum of %d epoch(s) (%d steps w/ batch_size=%d)'
            % (self._training_config.max_epochs, max_steps,
               self._training_config.batch_size))
        if patience < max_steps:
            logging.info(
                'Early stopping after %d epoch(s) (%d steps) with no improvement on validation set'
                % (self._training_config.patience_epochs, patience))
        logging.info(
            'Evaluating every %d steps, %d epoch(s)' %
            (checkpoint_steps, self._training_config.checkpoint_epochs))

        return max_steps, patience, checkpoint_steps, steps_per_epoch
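A small worked example of the step arithmetic above, with illustrative counts:

# 10,000 training records, batch_size=32, max_epochs=100,
# patience_epochs=5, checkpoint_epochs=1:
steps_per_epoch = 10000 // 32           # 312
max_steps = 100 * steps_per_epoch       # 31,200
patience = 5 * steps_per_epoch          # 1,560
checkpoint_steps = 1 * steps_per_epoch  # 312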
Example #21
 def _init_feature_extractor(self, train_path: str = None):
     self._feature_extractor = get_feature_extractor(
         self._training_config.features, self._training_config.heads)
     logging.info("Checking for pre-existing vocabulary at %s",
                  self._vocab_path)
     if self._feature_extractor.read_vocab(self._vocab_path):
         logging.info("Loaded pre-existing vocabulary at %s",
                      self._vocab_path)
     elif train_path:
         logging.info(
             "No valid pre-existing vocabulary found at %s "
             "(this is normal when not loading from an existing model)",
             self._vocab_path)
         self._train_vocab(train_path)
     else:
         raise ValueError(
             'No feature vocabulary available at %s and unable to train new vocabulary'
             % self._vocab_path)
Example #22
"""
This is a simple MLP-based conditional GAN.
Same as gan.py except that the conditional input is
given to the discriminator.
"""
import numpy as np
import os


import tensorflow as tf
from tensorflow.compat.v1 import logging
logging.info("TF Version:{}".format(tf.__version__))
gpus = tf.config.experimental.list_physical_devices("GPU")
logging.info("found {} GPUs".format(len(gpus)))
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)


from tensorflow import keras
from tensorflow.keras import layers

import tqdm


cross_entropy = keras.losses.BinaryCrossentropy(from_logits=False)
def discriminator_loss(real_output, fake_output):
    real_loss = cross_entropy(tf.ones_like(real_output), real_output)
    fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
    total_loss = real_loss + fake_loss
    return tf.reduce_mean(total_loss)
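generator_loss, which the training loop in Example #27 calls alongside this discriminator_loss, is not included in the excerpt; its conventional counterpart would be a sketch like:

def generator_loss(fake_output):
    # The generator is scored on how convincingly its samples are labeled as real.
    return tf.reduce_mean(cross_entropy(tf.ones_like(fake_output), fake_output))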
Example #23
def create_tf_example(example, label_map_dict):
    mask_paths = get_mask_paths(example)

    with tf.io.gfile.GFile(example, 'rb') as fid:
        encoded_jpg = fid.read()

    encoded_jpg_io = io.BytesIO(encoded_jpg)

    image = PIL.Image.open(encoded_jpg_io)

    if image.format != 'JPEG':
        raise ValueError('Image format not JPEG')

    key = hashlib.sha256(encoded_jpg).hexdigest()

    classes = []
    masks = []
    bboxes = []

    for label, mp in mask_paths.items():
        mask = cv2.imread(mp, cv2.IMREAD_UNCHANGED)
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
        mask = cv2.threshold(mask, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

        ret, labels = cv2.connectedComponents(mask)

        # The first component is the background, so we skip it
        for l in range(1, ret):
            cardmask = np.zeros(labels.shape, dtype=np.uint8)
            cardmask[labels == l] = 1

            if np.sum(cardmask) > 2000:
                bbox = bounding_box(cardmask)
                classes.append(label)
                masks.append(cardmask)
                bboxes.append(bbox)
            else:
                logging.info(
                    "%s: object %s discarded, item too small. Size %d",
                    example, label, np.sum(cardmask))

    #height = image.shape[1] # Image height
    #width = image.shape[0] # Image width
    width, height = image.size

    filename = example
    encoded_image_data = encoded_jpg  # Encoded image bytes
    image_format = 'jpeg'  # b'jpeg' or b'png'

    xmins = [
        bb[2] / width for bb in bboxes
    ]  # List of normalized left x coordinates in bounding box (1 per box)
    xmaxs = [bb[3] / width for bb in bboxes
             ]  # List of normalized right x coordinates in bounding box
    # (1 per box)
    ymins = [
        bb[0] / height for bb in bboxes
    ]  # List of normalized top y coordinates in bounding box (1 per box)
    ymaxs = [bb[1] / height for bb in bboxes
             ]  # List of normalized bottom y coordinates in bounding box
    # (1 per box)
    classes_text = map(
        lambda x: x.encode('utf8'),
        classes)  # List of string class name of bounding box (1 per box)
    classes = list(
        map(lambda x: label_map_dict[x],
            classes))  # List of integer class id of bounding box (1 per box)

    encoded_mask_png_list = []
    for mask in masks:
        img = PIL.Image.fromarray(mask)
        output = io.BytesIO()
        img.save(output, format='PNG')
        encoded_mask_png_list.append(output.getvalue())

    tf_example = tf.train.Example(features=tf.train.Features(
        feature={
            'image/height':
            dataset_util.int64_feature(height),
            'image/width':
            dataset_util.int64_feature(width),
            'image/filename':
            dataset_util.bytes_feature(filename.encode('utf8')),
            'image/source_id':
            dataset_util.bytes_feature(filename.encode('utf8')),
            'image/encoded':
            dataset_util.bytes_feature(encoded_image_data),
            'image/key/sha256':
            dataset_util.bytes_feature(key.encode('utf8')),
            'image/format':
            dataset_util.bytes_feature(image_format.encode('utf8')),
            'image/object/bbox/xmin':
            dataset_util.float_list_feature(xmins),
            'image/object/bbox/xmax':
            dataset_util.float_list_feature(xmaxs),
            'image/object/bbox/ymin':
            dataset_util.float_list_feature(ymins),
            'image/object/bbox/ymax':
            dataset_util.float_list_feature(ymaxs),
            'image/object/class/text':
            dataset_util.bytes_list_feature(classes_text),
            'image/object/class/label':
            dataset_util.int64_list_feature(classes),
            'image/object/mask':
            dataset_util.bytes_list_feature(encoded_mask_png_list)
        }))
    return tf_example
Example #24
import random
import functools
import six

import numpy as np
import sklearn.metrics

from graph_nets import utils_tf
from graph_nets import utils_np
import sonnet as snt

from types import SimpleNamespace
import tensorflow as tf
from tensorflow.compat.v1 import logging

logging.info("TF Version:{}".format(tf.__version__))
import horovod.tensorflow as hvd

from root_gnn import model as all_models
from root_gnn.src.datasets import topreco
from root_gnn.src.datasets import graph
from root_gnn.utils import load_yaml

target_scales = np.array(
    [145.34593924, 145.57711889, 432.92148524, 281.44161905, 1, 1] *
    topreco.n_max_tops).T.reshape((-1, ))
target_mean = np.array(
    [6.74674671e-02, -6.17142186e-02, 4.18239305e-01, 4.24881531e+02, 0, 0] *
    topreco.n_max_tops).T.reshape((-1, ))

Example #25
def train_and_evaluate(args):
    dist = init_workers(args.distributed)

    device = 'CPU'
    gpus = tf.config.experimental.list_physical_devices("GPU")
    logging.info("found {} GPUs".format(len(gpus)))

    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    if len(gpus) > 0:
        device = "{}GPUs".format(len(gpus))
    if gpus and args.distributed:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    output_dir = args.output_dir
    if dist.rank == 0:
        os.makedirs(output_dir, exist_ok=True)
    logging.info("Checkpoints and models saved at {}".format(output_dir))

    num_processing_steps_tr = args.num_iters  ## level of message-passing
    n_epochs = args.max_epochs
    logging.info("{} epochs with batch size {}".format(n_epochs,
                                                       args.batch_size))
    logging.info(
        "{} processing steps in the model".format(num_processing_steps_tr))
    logging.info("I am in hvd rank: {} of  total {} ranks".format(
        dist.rank, dist.size))

    if dist.rank == 0:
        train_input_dir = os.path.join(args.input_dir, 'train')
        val_input_dir = os.path.join(args.input_dir, 'val')
        train_files = tf.io.gfile.glob(
            os.path.join(train_input_dir, args.patterns))
        eval_files = tf.io.gfile.glob(
            os.path.join(val_input_dir, args.patterns))
        ## split the number of files evenly to all ranks
        train_files = [
            x.tolist() for x in np.array_split(train_files, dist.size)
        ]
        eval_files = [
            x.tolist() for x in np.array_split(eval_files, dist.size)
        ]
    else:
        train_files = None
        eval_files = None

    if args.distributed:
        train_files = dist.comm.scatter(train_files, root=0)
        eval_files = dist.comm.scatter(eval_files, root=0)
    else:
        train_files = train_files[0]
        eval_files = eval_files[0]

    logging.info(
        "rank {} has {} training files and {} evaluation files".format(
            dist.rank, len(train_files), len(eval_files)))

    AUTO = tf.data.experimental.AUTOTUNE
    training_dataset, ngraphs_train = read_dataset(train_files)
    training_dataset = training_dataset.prefetch(AUTO)

    input_signature = get_input_signature(training_dataset, args.batch_size)

    learning_rate = args.learning_rate
    optimizer = snt.optimizers.Adam(learning_rate)
    model = getattr(all_models, 'FourTopPredictor')()

    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    ckpt_manager = tf.train.CheckpointManager(checkpoint,
                                              directory=output_dir,
                                              max_to_keep=5,
                                              keep_checkpoint_every_n_hours=8)
    logging.info("Loading latest checkpoint from: {}".format(output_dir))
    _ = checkpoint.restore(ckpt_manager.latest_checkpoint)

    target_scales = np.array(
        [145.34593924, 145.57711889, 432.92148524, 281.44161905, 1, 1] *
        topreco.n_max_tops).reshape((topreco.n_max_tops, -1)).T.reshape((-1, ))
    target_mean = np.array([
        6.74674671e-02, -6.17142186e-02, 4.18239305e-01, 4.24881531e+02, 0, 0
    ] * topreco.n_max_tops).reshape((topreco.n_max_tops, -1)).T.reshape((-1, ))

    # training loss
    def loss_fcn(target_op, output_ops):
        # print("target size: ", target_op.nodes.shape)
        # print("output size: ", output_ops[0].nodes.shape)
        # output_op = output_ops[-1]
        # print("loss of 4-vect: ", tf.nn.l2_loss((target_op.nodes[:, :4] - output_op.nodes[:topreco.n_max_tops, :4])))
        # print("loss of charge: ", tf.math.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(tf.cast(target_op.nodes[:, 4:6], tf.int32),  output_op.nodes[:topreco.n_max_tops, 4:6])))
        # print("loss of predictions: ", tf.compat.v1.losses.log_loss(tf.cast(target_op.nodes[:, 6], tf.int32),  tf.math.sigmoid(output_op.nodes[:topreco.n_max_tops, 6])))

        # loss_ops = [tf.nn.l2_loss((target_op.nodes[:, :4] - output_op.nodes[:topreco.n_max_tops, :4]))
        #     + tf.math.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(tf.cast(target_op.nodes[:, 4:6], tf.int32),  output_op.nodes[:topreco.n_max_tops, 4:6]))
        #     + tf.compat.v1.losses.log_loss(tf.cast(target_op.nodes[:, 6], tf.int32),  tf.math.sigmoid(output_op.nodes[:topreco.n_max_tops, 6]))
        #     for output_op in output_ops
        # ]

        # loss_ops = [tf.nn.l2_loss((target_op.globals[:, :topreco.n_max_tops*4] - output_op.globals[:, :topreco.n_max_tops*4])) / target_op.globals.shape[0]
        #     + tf.compat.v1.losses.log_loss(
        #         tf.cast(target_op.globals[:, topreco.n_max_tops*4:topreco.n_max_tops*5], tf.int32),\
        #         tf.math.sigmoid(output_op.globals[:, topreco.n_max_tops*4:topreco.n_max_tops*5]))
        #     + tf.compat.v1.losses.log_loss(
        #         tf.cast(target_op.globals[:, topreco.n_max_tops*5:], tf.int32),\
        #         tf.math.sigmoid(output_op.globals[:, topreco.n_max_tops*5:]))
        #     for output_op in output_ops
        # ]
        # alpha = tf.constant(1, dtype=tf.float32)
        # loss_ops = [alpha * tf.compat.v1.losses.mean_squared_error(target_op.globals[:, :topreco.n_max_tops*4], output_op.globals[:, :topreco.n_max_tops*4])
        #     + tf.compat.v1.losses.log_loss(
        #         tf.cast(target_op.globals[:, topreco.n_max_tops*4:], tf.int32),\
        #         tf.math.sigmoid(output_op.globals[:, topreco.n_max_tops*4:]))
        #     for output_op in output_ops
        # ]

        # loss_ops = [ tf.nn.l2_loss((target_op.globals[:, :topreco.n_max_tops*4] - output_op.globals[:, :topreco.n_max_tops*4]))
        #     for output_op in output_ops
        # ]
        loss_ops = [ tf.compat.v1.losses.absolute_difference(
                            target_op.globals[:, :topreco.n_max_tops*4],\
                            output_op.globals[:, :topreco.n_max_tops*4])
            for output_op in output_ops
        ]

        # loss_ops = [tf.compat.v1.losses.mean_squared_error(target_op.globals[:, :topreco.n_max_tops*4], output_op.globals[:, :topreco.n_max_tops*4])
        #     for output_op in output_ops
        # ]

        return tf.stack(loss_ops)

    @functools.partial(tf.function, input_signature=input_signature)
    def train_step(inputs_tr, targets_tr, first_batch):
        print("Tracing update_step")
        print("inputs nodes", inputs_tr.nodes.shape)
        print("inputs edges", inputs_tr.edges.shape)
        print("input n_node", inputs_tr.n_node.shape)
        print(inputs_tr.nodes)
        with tf.GradientTape() as tape:
            outputs_tr = model(inputs_tr,
                               num_processing_steps_tr,
                               is_training=True)
            loss_ops_tr = loss_fcn(targets_tr, outputs_tr)
            loss_op_tr = tf.math.reduce_sum(loss_ops_tr) / tf.constant(
                num_processing_steps_tr, dtype=tf.float32)

        # Horovod: add Horovod Distributed GradientTape.
        if args.distributed:
            tape = hvd.DistributedGradientTape(tape)

        gradients = tape.gradient(loss_op_tr, model.trainable_variables)
        optimizer.apply(gradients, model.trainable_variables)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        #
        # Note: broadcast should be done after the first gradient step to ensure optimizer
        # initialization.
        if args.distributed and first_batch:
            hvd.broadcast_variables(model.trainable_variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables, root_rank=0)

        return loss_op_tr

    def train_epoch(dataset):
        total_loss = 0.
        batch = 0
        for inputs in tqdm(loop_dataset(dataset, args.batch_size)):
            input_tr, targets_tr = inputs
            new_target = (targets_tr.globals - target_mean) / target_scales
            targets_tr = targets_tr.replace(globals=new_target)
            total_loss += train_step(input_tr, targets_tr, batch == 0).numpy()
            batch += 1
        logging.info("total batches: {}".format(batch))
        return total_loss / batch, batch
        # return total_loss/batch/args.batch_size, batch

    out_str = "Start training " + time.strftime('%d %b %Y %H:%M:%S',
                                                time.localtime())
    out_str += '\n'
    out_str += "Epoch, Time [mins], Loss\n"
    log_name = os.path.join(output_dir, "training_log.txt")
    if dist.rank == 0:
        with open(log_name, 'a') as f:
            f.write(out_str)
    now = time.time()

    for epoch in range(n_epochs):
        logging.info("start epoch {} on {}".format(epoch, device))

        # shuffle the dataset before training
        training_dataset = training_dataset.shuffle(
            args.shuffle_size, seed=12345, reshuffle_each_iteration=True)
        loss, batches = train_epoch(training_dataset)
        this_epoch = time.time()

        logging.info(
            "{} epoch takes {:.2f} mins with loss {:.4f} in {} batches".format(
                epoch, (this_epoch - now) / 60., loss, batches))
        out_str = "{}, {:.2f}, {:.4f}\n".format(epoch,
                                                (this_epoch - now) / 60., loss)

        now = this_epoch
        if dist.rank == 0:
            with open(log_name, 'a') as f:
                f.write(out_str)
            ckpt_manager.save()

    if dist.rank == 0:
        out_log = "End @ " + time.strftime('%d %b %Y %H:%M:%S',
                                           time.localtime()) + "\n"
        with open(log_name, 'a') as f:
            f.write(out_log)
Example #26
def train_and_evaluate(args):
    dist = init_workers(args.distributed)

    device = 'CPU'
    global_batch_size = 1
    gpus = tf.config.experimental.list_physical_devices("GPU")
    logging.info("found {} GPUs".format(len(gpus)))

    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    if len(gpus) > 0:
        device = "{}GPUs".format(len(gpus))
    if gpus and args.distributed:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    output_dir = utils_dir.gnn_models if args.output_dir is None else args.output_dir
    if dist.rank == 0:
        os.makedirs(output_dir, exist_ok=True)
    logging.info("Checkpoints and models saved at {}".format(output_dir))

    num_processing_steps_tr = args.num_iters  ## level of message-passing
    n_epochs = args.max_epochs
    logging.info("{} epochs with batch size {}".format(n_epochs,
                                                       global_batch_size))
    logging.info(
        "{} processing steps in the model".format(num_processing_steps_tr))
    logging.info("I am in hvd rank: {} of  total {} ranks".format(
        dist.rank, dist.size))

    if dist.rank == 0:
        train_input_dir = os.path.join(
            utils_dir.gnn_inputs,
            'train') if args.train_files is None else args.train_files
        val_input_dir = os.path.join(
            utils_dir.gnn_inputs,
            'val') if args.val_files is None else args.val_files
        train_files = tf.io.gfile.glob(os.path.join(train_input_dir, "*"))
        eval_files = tf.io.gfile.glob(os.path.join(val_input_dir, "*"))
        ## split the number of files evenly to all ranks
        train_files = [
            x.tolist() for x in np.array_split(train_files, dist.size)
        ]
        eval_files = [
            x.tolist() for x in np.array_split(eval_files, dist.size)
        ]
    else:
        train_files = None
        eval_files = None

    if args.distributed:
        train_files = dist.comm.scatter(train_files, root=0)
        eval_files = dist.comm.scatter(eval_files, root=0)
    else:
        train_files = train_files[0]
        eval_files = eval_files[0]

    logging.info(
        "rank {} has {} training files and {} evaluation files".format(
            dist.rank, len(train_files), len(eval_files)))

    raw_dataset = tf.data.TFRecordDataset(train_files)
    training_dataset = raw_dataset.map(graph.parse_tfrec_function)

    AUTO = tf.data.experimental.AUTOTUNE
    training_dataset = training_dataset.prefetch(AUTO)

    with_batch_dim = False
    inputs, targets = next(training_dataset.take(1).as_numpy_iterator())
    input_signature = (graph.specs_from_graphs_tuple(inputs, with_batch_dim),
                       graph.specs_from_graphs_tuple(targets, with_batch_dim),
                       tf.TensorSpec(shape=[], dtype=tf.bool))

    learning_rate = args.learning_rate
    optimizer = snt.optimizers.Adam(learning_rate)
    # optimizer = tf.optimizers.Adam(learning_rate)
    model = SegmentClassifier()

    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    ckpt_manager = tf.train.CheckpointManager(checkpoint,
                                              directory=output_dir,
                                              max_to_keep=5,
                                              keep_checkpoint_every_n_hours=8)
    logging.info("Loading latest checkpoint from: {}".format(output_dir))
    status = checkpoint.restore(ckpt_manager.latest_checkpoint)

    # training loss
    real_weight = args.real_edge_weight
    fake_weight = args.fake_edge_weight

    def create_loss_ops(target_op, output_ops):
        weights = target_op.edges * real_weight + (
            1 - target_op.edges) * fake_weight
        loss_ops = [
            tf.compat.v1.losses.log_loss(target_op.edges,
                                         tf.squeeze(output_op.edges),
                                         weights=weights)
            for output_op in output_ops
        ]
        return tf.stack(loss_ops)

    @functools.partial(tf.function, input_signature=input_signature)
    def train_step(inputs_tr, targets_tr, first_batch):
        print("Tracing update_step")
        print("inputs nodes", inputs_tr.nodes.shape)
        print("inputs edges", inputs_tr.edges.shape)
        print("input n_node", inputs_tr.n_node.shape)
        print(inputs_tr.nodes)
        with tf.GradientTape() as tape:
            outputs_tr = model(inputs_tr, num_processing_steps_tr)
            loss_ops_tr = create_loss_ops(targets_tr, outputs_tr)
            loss_op_tr = tf.math.reduce_sum(loss_ops_tr) / tf.constant(
                num_processing_steps_tr, dtype=tf.float32)

        # Horovod: add Horovod Distributed GradientTape.
        if args.distributed:
            tape = hvd.DistributedGradientTape(tape)

        gradients = tape.gradient(loss_op_tr, model.trainable_variables)
        # optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        optimizer.apply(gradients, model.trainable_variables)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        #
        # Note: broadcast should be done after the first gradient step to ensure optimizer
        # initialization.
        if args.distributed and first_batch:
            hvd.broadcast_variables(model.trainable_variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables, root_rank=0)

        return loss_op_tr

    def train_epoch(dataset):
        total_loss = 0.
        num_batches = 0
        for batch, inputs in enumerate(dataset):
            input_tr, target_tr = inputs
            total_loss += train_step(input_tr, target_tr, batch == 0)
            num_batches += 1
        logging.info("total batches: {}".format(num_batches))
        return total_loss / num_batches

    out_str = "Start training " + time.strftime('%d %b %Y %H:%M:%S',
                                                time.localtime())
    out_str += '\n'
    out_str += "Epoch, Time [mins], Loss\n"
    log_name = os.path.join(output_dir, "training_log.txt")
    if dist.rank == 0:
        with open(log_name, 'a') as f:
            f.write(out_str)
    now = time.time()

    for epoch in range(n_epochs):
        logging.info("start epoch {} on {}".format(epoch, device))

        loss = train_epoch(training_dataset)
        this_epoch = time.time()

        logging.info("Training {} epoch, {:.2f} mins, Loss := {:.4f}".format(
            epoch, (this_epoch - now) / 60., loss / global_batch_size))
        out_str = "{}, {:.2f}, {:.4f}\n".format(epoch,
                                                (this_epoch - now) / 60.,
                                                loss / global_batch_size)

        now = this_epoch
        if dist.rank == 0:
            with open(log_name, 'a') as f:
                f.write(out_str)
            ckpt_manager.save()

    if dist.rank == 0:
        out_log = "End @ " + time.strftime('%d %b %Y %H:%M:%S',
                                           time.localtime()) + "\n"
        with open(log_name, 'a') as f:
            f.write(out_log)
Example #27
    def train(self, train_truth, epochs, batch_size, test_truth, log_dir, evaluate_samples_fn,
        train_in=None, test_in=None):
        # ======================================
        # construct testing data once for all
        # ======================================
        AUTO = tf.data.experimental.AUTOTUNE
        noise = np.random.normal(loc=0., scale=1., size=(test_truth.shape[0], self.noise_dim))
        test_in = np.concatenate(
            [test_in, noise], axis=1).astype(np.float32) if test_in is not None else noise


        testing_data = tf.data.Dataset.from_tensor_slices(
            (test_in, test_truth)).batch(batch_size, drop_remainder=True).prefetch(AUTO)

        # ====================================
        # Checkpoints and model summary
        # ====================================
        checkpoint_dir = os.path.join(log_dir, "checkpoints")
        checkpoint = tf.train.Checkpoint(
            generator=self.generator,
            discriminator=self.discriminator)
        ckpt_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=None)
        logging.info("Loading latest checkpoint from: {}".format(checkpoint_dir))
        _ = checkpoint.restore(ckpt_manager.latest_checkpoint).expect_partial()

        summary_dir = os.path.join(log_dir, "logs")
        summary_writer = tf.summary.create_file_writer(summary_dir)

        img_dir = os.path.join(log_dir, 'img')
        os.makedirs(img_dir, exist_ok=True)

        @tf.function
        def train_step(gen_in_4vec, cond_in, truth_4vec):
            with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
                gen_out_4vec = self.generator(gen_in_4vec, training=True)

                # =============================================================    
                # add the conditional inputs to generated and truth information
                # =============================================================
                gen_out_4vec = tf.concat([cond_in, gen_out_4vec], axis=-1)
                truth_4vec = tf.concat([cond_in, truth_4vec], axis=-1)

                # apply discriminator
                real_output = self.discriminator(truth_4vec, training=True)
                fake_output = self.discriminator(gen_out_4vec, training=True)

                gen_loss = generator_loss(fake_output)
                disc_loss = discriminator_loss(real_output, fake_output)

            gradients_of_generator = gen_tape.gradient(gen_loss, self.generator.trainable_variables)
            gradients_of_discriminator = disc_tape.gradient(disc_loss, self.discriminator.trainable_variables)

            self.generator_optimizer.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables))
            self.discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, self.discriminator.trainable_variables))

            return disc_loss, gen_loss

        best_wdis = 9999
        best_epoch = -1
        with tqdm.trange(epochs, disable=self.disable_tqdm) as t0:
            for epoch in t0:

                # compose the training dataset by generating different noises for each epochs
                noise = np.random.normal(loc=0., scale=1., size=(train_truth.shape[0], self.noise_dim))
                train_inputs = np.concatenate(
                    [train_in, noise], axis=1).astype(np.float32) if train_in is not None else noise


                dataset = tf.data.Dataset.from_tensor_slices(
                    (train_inputs, train_in, train_truth)
                    ).shuffle(2*batch_size).batch(batch_size, drop_remainder=True).prefetch(AUTO)

                tot_loss = []
                for data_batch in dataset:
                    tot_loss.append(list(train_step(*data_batch)))

                tot_loss = np.array(tot_loss)
                avg_loss = np.sum(tot_loss, axis=0)/tot_loss.shape[0]
                loss_dict = dict(D_loss=avg_loss[0], G_loss=avg_loss[1])

                tot_wdis = evaluate_samples_fn(self.generator, epoch, testing_data, summary_writer, img_dir, **loss_dict)
                if tot_wdis < best_wdis:
                    ckpt_manager.save()
                    self.generator.save("generator")
                    best_wdis = tot_wdis
                    best_epoch = epoch
                t0.set_postfix(**loss_dict, BestD=best_wdis, BestE=best_epoch)
        tmp_res = "Best Model in {} Epoch with a Wasserstein distance {:.4f}".format(best_epoch, best_wdis)
        logging.info(tmp_res)
        summary_logfile = os.path.join(summary_dir, 'results.txt')
        with open(summary_logfile, 'a') as f:
            f.write(tmp_res + "\n")
Example #28
def train_and_evaluate(args):
    for key, value in vars(args).items():
        print("{} --> {}".format(key, value))
    use_tpu = args.tpu is not None

    device = 'CPU'
    global_batch_size = args.train_batch_size if not use_tpu else args.tpu_cores
    physical_gpus = tf.config.experimental.list_physical_devices("GPU")
    n_gpus = len(physical_gpus)

    if use_tpu:
        if args.tpu == 'colab':
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        else:
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                tpu=args.tpu, zone=args.zone)
        workers = resolver.cluster_spec().as_dict()['worker']
        n_tpus = len(workers)
        logging.info('Running on {} TPUs '.format(n_tpus))
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = snt.distribute.TpuReplicator(resolver)
        device = 'TPU'
    elif n_gpus > 1:
        logging.info("Useing SNT Replicator with {} GPUs".format(n_gpus))
        strategy = snt.distribute.Replicator(['/device:GPU:{}'.format(i) for i in range(n_gpus)],\
            tf.distribute.ReductionToOneDevice("CPU:0"))
        device = "{}GPUs".format(n_gpus)
        for dd in physical_gpus:
            tf.config.experimental.set_memory_growth(dd, True)
    elif n_gpus > 0:
        strategy = tf.distribute.OneDeviceStrategy("/device:GPU:0")
        device = "1GPU"
    else:
        strategy = tf.distribute.OneDeviceStrategy("/device:CPU:0")

    if n_gpus > 0:
        assert n_gpus == global_batch_size, "batch size {} does not equal the number of GPUs ({})".format(
            global_batch_size, n_gpus)
    else:
        pass
        # assert global_batch_size == 1, "batch size {} does not equal 1".format(global_batch_size)

    output_dir = args.job_dir
    if not use_tpu:
        os.makedirs(output_dir, exist_ok=True)
    logging.info("Checkpoints and models saved at {}".format(output_dir))

    num_processing_steps_tr = args.num_iters  ## level of message-passing
    n_epochs = args.num_epochs
    logging.info("{} epochs with batch size {}".format(n_epochs,
                                                       global_batch_size))
    logging.info(
        "{} processing steps in the model".format(num_processing_steps_tr))
    # prepare graphs
    logging.info("{} Eta bins and {} Phi bins".format(args.num_eta_bins,
                                                      args.num_phi_bins))
    _, max_edges = graph.get_max_graph_size(args.num_eta_bins,
                                            args.num_phi_bins)

    # train_files = tf.io.gfile.glob(args.train_files)
    # eval_files = tf.io.gfile.glob(args.eval_files)
    file_names = tf.io.gfile.glob(args.input_files)
    n_files = len(file_names)
    n_train = int(0.9 * n_files)
    if n_train < 1: n_train = 1

    # logging.info("Input file names: ", file_names)
    logging.info("{} input files".format(n_files))
    # logging.info("{} training files and {} evaluation files".format(len(train_files), len(eval_files)))
    logging.info("{} training files".format(n_train))
    raw_dataset = tf.data.TFRecordDataset(file_names[:n_train])
    training_dataset = raw_dataset.map(graph.parse_tfrec_function)

    AUTO = tf.data.experimental.AUTOTUNE
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
    training_dataset = training_dataset.with_options(options)
    training_dataset = training_dataset.batch(
        global_batch_size, drop_remainder=True).prefetch(AUTO)

    learning_rate = args.learning_rate
    with strategy.scope():
        optimizer = snt.optimizers.Adam(learning_rate)
        # model = SegmentClassifier()
        model = get_model(args.model_name)

    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    ckpt_manager = tf.train.CheckpointManager(checkpoint,
                                              directory=output_dir,
                                              max_to_keep=5)
    logging.info("Loading latest checkpoint from: {}".format(output_dir))
    _ = checkpoint.restore(ckpt_manager.latest_checkpoint)

    # training loss
    real_weight = args.real_edge_weight
    fake_weight = args.fake_edge_weight

    def log_loss(label, prediction, eps=1e-7, weights=1.):
        # tf.compat.v1.losses.log_loss, not supported by TPU
        # copy the TF source code here
        loss = tf.negative(
            tf.add(tf.multiply(label, tf.math.log(prediction + eps)),
                   tf.multiply((1 - label),
                               tf.math.log(1 - prediction + eps))))
        loss = tf.multiply(loss, weights)
        present = tf.where(tf.math.equal(weights, 0.0), tf.zeros_like(weights),
                           tf.ones_like(weights))
        loss = tf.math.divide_no_nan(tf.math.reduce_sum(loss),
                                     tf.math.reduce_sum(present))
        return loss
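    # Illustrative check: with label=1, prediction=0.8 and weight=1 the
    # per-element loss is -log(0.8) ~= 0.223; zero-weight (padded) elements
    # are excluded from the average via divide_no_nan. Note that
    # create_loss_ops below still calls tf.compat.v1.losses.log_loss; this
    # local reimplementation appears intended as the TPU-compatible substitute.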

    def create_loss_ops(target_op, output_ops):
        t_edges = tf.squeeze(target_op.edges)
        weights = t_edges * real_weight + (1 - t_edges) * fake_weight
        row_index = tf.range(tf.constant(max_edges))
        n_valid_edges = target_op.n_edge[0]

        # # NOTE: this implementation is very slow
        # # cond = (row_index < n_valid_edges)
        # # zeros = tf.zeros_like(weights, dtype=weights.dtype)
        # # weights = tf.where(cond, weights, zeros)

        mask = tf.cast(row_index < n_valid_edges, tf.float32)
        # mask = tf.expand_dims(mask, axis=1)
        weights = weights * mask

        loss_ops = [
            tf.compat.v1.losses.log_loss(t_edges,
                                         tf.squeeze(output_op.edges),
                                         weights=weights)
            for output_op in output_ops
        ]

        return tf.stack(loss_ops)

    # if use_tpu:
    #     cast_down_tensor = functools.partial(tf.cast, dtype=tf.float16)
    # else:
    #     cast_down_tensor = functools.partial(tf.cast, dtype=tf.bfloat16)

    @tf.function(autograph=False)
    def train_step(inputs_tr, targets_tr):
        print("Tracing train_step")
        print(inputs_tr)

        def update_step(inputs_tr, targets_tr):
            # logging.info("Tracing update_step")
            # logging.info("before contatenate:", inputs_tr.n_node.shape)
            inputs_tr = graph.concat_batch_dim(inputs_tr)
            targets_tr = graph.concat_batch_dim(targets_tr)
            # if args.mix_precision:
            #     inputs_tr = inputs_tr.map(cast_down_tensor)
            #     targets_tr = targets.map(cast_down_tensor)

            # logging.info("after concatenate:", inputs_tr.n_node.shape)

            with tf.GradientTape() as tape:
                outputs_tr = model(inputs_tr, num_processing_steps_tr)
                loss_ops_tr = create_loss_ops(targets_tr, outputs_tr)
                loss_op_tr = tf.math.reduce_sum(loss_ops_tr) / tf.constant(
                    num_processing_steps_tr, dtype=tf.float32)

            gradients = tape.gradient(loss_op_tr, model.trainable_variables)
            # Aggregate the gradients across replicas; unlike MirroredStrategy,
            # the Replicator does not do this automatically.
            replica_ctx = tf.distribute.get_replica_context()
            gradients = replica_ctx.all_reduce("mean", gradients)

            optimizer.apply(gradients, model.trainable_variables)
            return loss_op_tr

        per_example_losses = strategy.run(update_step,
                                          args=(inputs_tr, targets_tr))
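        # Per-replica losses are summed here; the training loop later divides
        # the accumulated loss by the global batch size when logging.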
        mean_loss = strategy.reduce("sum", per_example_losses, axis=None)
        return mean_loss

    def train_epoch(dataset):
        total_loss = 0.
        num_batches = 0
        for inputs in dataset:
            input_tr, target_tr = inputs

            total_loss += train_step(input_tr, target_tr)
            num_batches += 1
        logging.info("total batches: {}".format(num_batches))
        return total_loss / num_batches

    # this_time =  time.strftime('%d %b %Y %H:%M:%S', time.localtime())
    out_str = "Start training " + time.strftime('%d %b %Y %H:%M:%S',
                                                time.localtime())
    out_str += '\n'
    out_str += "Epoch, Time [mins], Loss\n"
    log_name = os.path.join(output_dir, "training_log.txt")
    print(out_str)
    # with open(log_name, 'a') as f:
    #     f.write(out_str)
    now = time.time()
    # writer = tf.summary.create_file_writer(os.path.join(output_dir, this_time))

    dist_training_dataset = strategy.experimental_distribute_dataset(
        training_dataset)
    for epoch in range(n_epochs):
        logging.info("start epoch {} on {}".format(epoch, device))
        # training_dataset = training_dataset.shuffle(global_batch_size*2, reshuffle_each_iteration=True)

        loss = train_epoch(dist_training_dataset)
        # loss = train_epoch(training_dataset)
        this_epoch = time.time()

        logging.info("Training {} epoch, {:.2f} mins, Loss := {:.4f}".format(
            epoch, (this_epoch - now) / 60., loss / global_batch_size))
        out_str = "{}, {:.2f}, {:.4f}\n".format(epoch,
                                                (this_epoch - now) / 60.,
                                                loss / global_batch_size)
        # with open(log_name, 'a') as f:
        #     f.write(out_str)
        # with writer.as_default():
        #     tf.summary.scalar("")

        now = this_epoch
        ckpt_manager.save()

    out_log = "End @ " + time.strftime('%d %b %Y %H:%M:%S',
                                       time.localtime()) + "\n"
    print(out_log)
Example #29
0
def train(args):
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)
    # Distribution Strategy
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    # TODO: implement multi-node training via SLURM
    strategy = tf.distribute.MirroredStrategy()
    session_config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True  # Allocate GPU memory on demand instead of reserving it all up front.
    warm_start = None
    if args.load_model:  # Load model
        model_path = paths['save'] + '/' + args.load_model
        eval_path = model_path + '/eval'
    else:
        trial = 0
        while os.path.exists(paths['save'] +
                             '/{}_trial_{}'.format(args.modality, trial)):
            trial += 1
        model_path = paths['save'] + '/{}_trial_{}'.format(
            args.modality, trial)
        eval_path = model_path + '/eval'

    train_input_fn = DatasetHandler('train', args)
    eval_input_fn = DatasetHandler('eval', args)

    train_size = len(train_input_fn)
    eval_size = len(eval_input_fn)
    if args.mode == 'test':
        train_size = 20
        eval_size = 10
    steps_per_epoch = np.ceil(train_size / args.batch_size)
    max_training_steps = args.epochs * steps_per_epoch
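    # Illustrative (hypothetical) numbers: train_size=1000 with batch_size=8
    # gives steps_per_epoch=125, so 50 epochs correspond to 6250 training steps.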

    model_fn_params = {
        'batch_norm': args.no_bn,
        'dropout': args.dropout,
        'classes': args.classes,
        'lr': args.lr,
        'decay_rate': args.decay_rate,
        'decay_steps': np.ceil(args.epochs * steps_per_epoch /
                               (args.decays_per_train + 1)),
        'eval_path': eval_path,
        'eval_steps': eval_size
    }
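    # The 'decay_steps' entry spreads args.decays_per_train learning-rate decays
    # evenly over training, e.g. (hypothetical) 30 epochs of 100 steps with
    # decays_per_train=2 decays the rate every 1000 steps.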

    configuration = tf.estimator.RunConfig(
        tf_random_seed=args.seed,
        save_summary_steps=steps_per_epoch,
        keep_checkpoint_max=args.early_stop + 2,
        save_checkpoints_steps=steps_per_epoch,
        log_step_count_steps=np.ceil(steps_per_epoch / 2),
        train_distribute=strategy,
        session_config=session_config)
    liver_seg = tf.estimator.Estimator(model_fn=unet_model_fn,
                                       model_dir=model_path,
                                       params=model_fn_params,
                                       config=configuration,
                                       warm_start_from=warm_start)

    es_steps = steps_per_epoch * args.early_stop
    early_stopping = stop_if_no_decrease_hook(
        liver_seg, metric_name='loss', max_steps_without_decrease=es_steps)
    profiler_hook = tf.estimator.ProfilerHook(
        save_steps=int(max_training_steps / 5),
        show_memory=True,
        output_dir=model_path)

    log_data = {
        'train_size': train_size,
        'steps_per_epoch': steps_per_epoch,
        'max_training_steps': max_training_steps,
        'eval_size': eval_size,
        'eval_steps': eval_size,
        'model_path': model_path
    }

    save_logs(args, log_data)
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: train_input_fn.input_fn(),
        hooks=[profiler_hook, early_stopping],
        max_steps=max_training_steps)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: eval_input_fn.input_fn(),
        steps=eval_size,
        start_delay_secs=0,
        throttle_secs=0)
    tf.estimator.train_and_evaluate(liver_seg,
                                    train_spec=train_spec,
                                    eval_spec=eval_spec)
    info(
        'Train and Evaluation Mode Finished!\n Metrics and checkpoints are saved at:'
        '\n {}\n ----------'.format(model_path))
Example #30
0
def _log_data_augmentation(data_augmentation, name):
    """Logs the given data augmentation parameters for diagnostic purposes."""
    if not data_augmentation:
        logging.info('No data augmentation provided for %s', name)
    else:
        logging.info('%s augmentations:', name)
        logging.info('enable_jitter: %s', data_augmentation.enable_jitter)
        logging.info('jitter_amount: %d', data_augmentation.jitter_amount)
        logging.info('enable_gaussian_noise: %s',
                     data_augmentation.enable_gaussian_noise)
        logging.info('gaussian_noise_std: %s',
                     data_augmentation.gaussian_noise_std)
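# Usage sketch (hypothetical augmentation object): anything exposing the four
# attributes read above can be logged, e.g.
#
#   from types import SimpleNamespace
#   aug = SimpleNamespace(enable_jitter=True, jitter_amount=3,
#                         enable_gaussian_noise=True, gaussian_noise_std=0.1)
#   _log_data_augmentation(aug, 'train')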