Example #1
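# Note: this excerpt assumes `tracking` (the Polyaxon tracking module), `args`
# (an argparse namespace providing `steps` and `validate_every`), `logger`, `time`,
# and the helpers used below (get_loss, get_accuracy, get_dist, get_text, get_html,
# get_audio, log_images, train_network, and the plot_*/log_* plotting functions)
# are imported or defined elsewhere in the original script.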
def main():
    tracking.init()

    for i in range(args.steps):
        logger.info('Step %s', i)
        # Scalars
        loss = get_loss(i)
        accuracy = get_accuracy(loss)
        # Training metrics; more values are logged for the same step below.
        tracking.log_metrics(step=i, loss=loss, accuracy=accuracy)
        # Validation metrics, which could be reported in another part of the code.
        if i % args.validate_every == 0:
            tracking.log_metric(name='val_acc', value=accuracy - 0.05, step=i)

        # Distribution (histogram)
        tracking.log_histogram('distribution', get_dist(i), 'auto', step=i)

        # Text
        tracking.log_text('text-ex', text=get_text(i), step=i)

        # Images
        log_images(i)
        # HTML
        tracking.log_html('html-ex', html=get_html(i), step=i)

        # Generate sin wave as audio
        tracking.log_audio(data=get_audio(i), name='audio', step=i)

        time.sleep(0.25)

    plot_scatter(100)
    get_sin_plot(100)
    plot_mpl_figure(100)
    log_bokeh(100)
    log_altair(100)
    log_curves(100)
    log_plotly(100)

    train_network()
Example #2
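 # Excerpt of a lazily-initializing accessor from a wrapper class: the `self._*`
 # attributes passed to tracking.init() are assumed to be set in the class'
 # __init__ elsewhere in the original code.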
 def experiment(self) -> tracking.Run:
     if self._experiment:
         return self._experiment
     tracking.init(
         owner=self._owner,
         project=self._project,
         run_uuid=self._run_uuid,
         client=self._client,
         track_code=self._track_code,
         track_env=self._track_env,
         refresh_data=self._refresh_data,
         artifacts_path=self._artifacts_path,
         collect_artifacts=self._collect_artifacts,
         collect_resources=self._collect_resources,
         is_offline=self._is_offline,
         is_new=self._is_new,
         name=self._name,
         description=self._description,
         tags=self._tags,
     )
     self._experiment = tracking.TRACKING_RUN
     return self._experiment
Example #3
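    # Excerpt that begins partway through the argparse setup: the add_argument call
    # closed by the `default='adam'` line below, plus `parser`, `logger`, `imdb`
    # (keras.datasets.imdb), and the Polyaxon `tracking` import, are defined earlier
    # in the original script.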
        default='adam'
    )
    parser.add_argument(
        '--log_learning_rate',
        type=int,
        default=-3
    )
    parser.add_argument(
        '--epochs',
        type=int,
        default=1
    )
    args = parser.parse_args()

    # Polyaxon
    tracking.init()

    logger.info('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=args.max_features,
                                                          skip_top=args.skip_top)

    logger.info('train sequences %s', len(x_train))
    logger.info('test sequences %s', len(x_test))

    # Polyaxon
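    # log_data_ref attaches a reference to each dataset split to the run, so the
    # data used for training can be tracked alongside the other results.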
    tracking.log_data_ref(content=x_train, name='x_train')
    tracking.log_data_ref(content=y_train, name='y_train')
    tracking.log_data_ref(content=x_test, name='x_test')
    tracking.log_data_ref(content=y_test, name='y_test')

    logger.info('Transforming data...')
Example #4
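# Note: this excerpt assumes `TrainingConfig`, `get_dataset`, `trainInpaintingWgan`,
# `eval_model`, `generator`, `criticWgan`, and the imports used below (os, logging,
# glob, and pathlib's Path) are defined elsewhere in the original project.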
def main():
    """ Runs dataLayer processing scripts to turn raw dataLayer from (../raw) into
        cleaned dataLayer ready to be analyzed (saved in ../processed).
    """
    ## Talk to Rune about how dataLayer is handled.
    config = TrainingConfig()
    # config = update_config(args,config)
    ## For polyaxon

    config.epochs = 501
    config.run_polyaxon = True
    config.batch_size = 8
    config.lr = 0.0002
    config.save_model_step = 100
    config.n_critic = 2
    config.model_name = 'PartialConvolutionsWgan'

    # Test parameters we run with, which are normally set in experiments
    if config.run_polyaxon:
        # The POLYAXON_NO_OP env variable had to be set before any Polyaxon imports were allowed to happen
        from polyaxon import tracking
        tracking.init()
        input_root_path = Path(
            r'/data/inpainting/data_landset8/Test_dataset/Betaset')
        cache_path = Path('/cache')
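        # tracking.get_outputs_path() returns the run's outputs directory, which
        # Polyaxon manages for this experiment.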
        output_root_path = Path(tracking.get_outputs_path())
        pathToData = input_root_path  ## Delete later HACK
        inpainting_data_path = input_root_path / 'inpainting'
        # Point PyTorch at the shared cache directory for pre-trained models. If this is not done,
        # each experiment will download the pre-trained model and store it in its own experiment
        # container, thereby wasting large amounts of disk space.
        # Code is from here: https://stackoverflow.com/a/52784628
        os.environ['TORCH_HOME'] = str(
            cache_path / 'pytorch_cache')  # setting the environment variable

        config.output_path = Path(os.getcwd()).joinpath('outputs')
        config.data_path = Path(r'/data/inpainting/')
        config.polyaxon_tracking = tracking
    if not config.run_polyaxon:
        # Disable Polyaxon entirely: POLYAXON_NO_OP must be set before any
        # Polyaxon import is done inside Python.
        os.environ['POLYAXON_NO_OP'] = 'true'

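    # NOTE: pathToData is only assigned inside the run_polyaxon branch above; with
    # config.run_polyaxon set to False the next line would raise a NameError.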
    beta_test_path_list = glob(str(pathToData) + "/*/")

    # S1A_20201005_034656_DSC_109_RGBsar_cog.tif
    # S2B_MSIL2A_20201002T090719_N0214_R050_T35TMH_20201002T113443_B02_cog
    # S2B_MSIL2A_20201002T090719_N0214_R050_T35TMH_20201002T113443_B03_cog.tif
    # S2B_MSIL2A_20201002T090719_N0214_R050_T35TMH_20201002T113443_B04_cog.tif

    logger = logging.getLogger(__name__)
    logger.info('making final dataLayer set from raw dataLayer')

    logger.info(pathToData)

    ImageDict = get_dataset(beta_test_path_list, batch_size=config.batch_size)
    train = ImageDict['train_dataloader']
    test = ImageDict['test_dataloader']

    # Runs with WGAN-GP
    if config.model_name == 'PartialConvolutions':
        curtraingModel = trainInpaintingWgan(train, test, generator,
                                             criticWgan, config)
        local_model_path = curtraingModel.trainGAN()
    elif config.model_name == 'PartialConvolutionsWgan':
        curtraingModel = trainInpaintingWgan(train, test, generator,
                                             criticWgan, config)
        local_model_path = curtraingModel.trainGAN()

    # local_model_path = Path(r"C:\Users\panda\PycharmProjects\Image_Inpainting_Sat\Master_Satelite_Image_Inpainting\OutputModels\PartialConvolutionsWgan_200.pt")
    if config.run_polyaxon:
        model_path = inpainting_data_path / 'models'
        modelOutputPath = Path.joinpath(model_path, 'OutputModels')
        stores_output_path = config.output_path / 'data' / 'storedData'
    else:
        localdir = Path().absolute().parent
        modelOutputPath = Path.joinpath(localdir, 'OutputModels')
        stores_output_path = localdir / 'data' / 'storedData'

    curevalModel = eval_model(config)
    curevalModel.run_eval(modelOutputPath,
                          stores_output_path,
                          model_path=local_model_path,
                          test_dataloader=test)
Example #5
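# Note: this excerpt assumes the imports of the Horovod TF1 MNIST example (os, errno,
# numpy as np, tensorflow as tf using the TF1 graph API, keras, horovod.tensorflow as
# hvd, and the Polyaxon `tracking` module), plus `args`, `conv_model`, and
# `train_input_generator` defined elsewhere in the original script.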
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The downloaded data has shape (-1, 28, 28), so we reshape it to (-1, 784)
    # to feed it into our network, and we also normalize the features to the
    # range [0, 1].
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    lr_scaler = hvd.size()
    # By default, Adasum doesn't need scaling when increasing batch size. If used with NCCL,
    # scale lr by local_size
    if args.use_adasum:
        lr_scaler = hvd.local_size() if hvd.nccl_built() else 1

    # Horovod: adjust learning rate based on lr_scaler.
    opt = tf.train.AdamOptimizer(0.001 * lr_scaler)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(
        opt, op=hvd.Adasum if args.use_adasum else hvd.Average)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = None
    if hvd.rank() == 0:
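        # Polyaxon: initialize tracking and resolve the outputs path on rank 0 only,
        # matching the checkpoint-only-on-worker-0 pattern described above.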
        tracking.init()
        checkpoint_dir = tracking.get_outputs_path()
    training_batch_generator = train_input_generator(x_train,
                                                     y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})