Example #1
import time

import keras
import deepkit


# Excerpt: the original listing omits the enclosing class statement; a Keras
# Callback subclass is assumed here so the snippet parses on its own.
class KerasCallback(keras.callbacks.Callback):
    def __init__(self, debug_model_input=None):
        super(KerasCallback, self).__init__()

        self.experiment = deepkit.experiment()

        self.debug_model_input = debug_model_input

        self.data_validation = None
        self.data_validation_size = None

        self.current = {}
        self.last_batch_time = time.time()
        self.start_time = time.time()
        self.accuracy_metric = None
        self.all_losses = None
        self.loss_metric = None
        self.learning_rate_metric = None
        self.learning_rate_start = 0
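
# Usage sketch (not in the original listing): the callback registers like any
# other Keras callback. The toy model and random data below are placeholders
# just to show the wiring.
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

model = Sequential([Dense(1, input_shape=(4,))])
model.compile(optimizer='adam', loss='mse')
model.fit(np.random.rand(32, 4), np.random.rand(32, 1),
          epochs=2, callbacks=[KerasCallback()])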
Example #2
#os.environ["PLAIDML_NATIVE_PATH"] = "/usr/local/lib/libplaidml.dylib"

import keras
from keras.layers import Dense, Conv2D, BatchNormalization, Activation
from keras.layers import AveragePooling2D, Input, Flatten
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, LearningRateScheduler
from keras.callbacks import ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator
from keras.regularizers import l2
from keras.models import Model
from keras.datasets import cifar10
import numpy as np
import deepkit

experiment = deepkit.experiment()

experiment.add_label('resnet', 'keras')

# Training parameters
batch_size = 128  # orig paper trained all networks with batch_size=128
epochs = 200
data_augmentation = False
num_classes = 10

# Subtracting pixel mean improves accuracy
subtract_pixel_mean = True

# Model parameters
# ----------------------------------------------------------------------------
#           |      | 200-epoch | Orig Paper| 200-epoch | Orig Paper| sec/epoch
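
# The listing is truncated at this point. In the stock Keras ResNet/CIFAR-10
# example, the LearningRateScheduler imported above is wired to a step-decay
# function along these lines (boundaries and factors are assumptions taken
# from that example, not from this listing):
def lr_schedule(epoch):
    lr = 1e-3
    if epoch > 180:
        lr *= 0.5e-3
    elif epoch > 160:
        lr *= 1e-3
    elif epoch > 120:
        lr *= 1e-2
    elif epoch > 80:
        lr *= 1e-1
    return lr


lr_scheduler = LearningRateScheduler(lr_schedule)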
Example #3
def train(args, cfg) -> Learner:
    if torch.cuda.is_available():
        n_gpu = torch.cuda.device_count()
        if args.gpu is None:
            args.gpu = list(range(n_gpu))[0]
        torch.cuda.set_device(args.gpu)
    else:
        n_gpu = None
        args.gpu = -1

    use_deepkit = args.deepkit and rank_distrib() == 0
    if use_deepkit:
        experiment = deepkit.experiment()

        for key, val in vars(args).items():
            set_config(experiment, key, val)

        for key, val in dict(cfg.vocab).items():
            set_config(experiment, f"vocab.{key}", val)
        for key, val in dict(cfg.training).items():
            set_config(experiment, f"training.{key}", val)
        for key, val in dict(cfg.model).items():
            set_config(experiment, f"model.{key}", val)

        args.experiment = experiment

    run_tags = [
        cfg.model.name,
        "uncased" if cfg.vocab.lowercase else "cased",
        f"sl{cfg.training.max_seq_length}",
    ]

    model_name = "-".join(run_tags[0:3])

    args.tokenizer_path = normalize_path(args.tokenizer_path)
    args.train_path = normalize_path(args.train_path)
    args.valid_path = normalize_path(args.valid_path)

    tokenizer = load_tokenizer(cfg, args.tokenizer_path)

    model = initialize_model(cfg, args, tokenizer=tokenizer)

    dls = dataloaders(args, cfg, tokenizer=tokenizer, max_items=args.max_items)
    dls.to(default_device())

    learn = get_learner(args, cfg, dataloaders=dls, model=model, tokenizer=tokenizer, use_deepkit=use_deepkit)

    if args.fp16:
        learn = learn.to_fp16()

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    log.info(f"Model device is {model.device}, loader device is {dls[0].device}")

    if rank_distrib() == 0:
        log.info(f"Pretraining ALBERT: {args}")
        log.info(f"Configuration: {cfg.pretty()}")

        if args.max_items:
            log.info(f"Sentence pairs limited to {args.max_items}")
        else:
            log.info("Processing all sentence pairs")
        log.info(
            "GPUs: %s, 16-bits training: %s", torch.cuda.device_count(), args.fp16,
        )

    if num_distrib() > 1:
        DistributedTrainer.fup = True

    with learn.distrib_ctx(
        cuda_id=args.gpu
    ):  # distributed training requires "-m fastai2.launch"
        log.info(f"Training in distributed data parallel context on GPU {args.gpu}")
        learn.fit_one_cycle(args.epochs, lr_max=cfg.training.learning_rate)

    learn.model.eval()

    if args.export_path:
        args.export_path = normalize_path(args.export_path)
        args.export_path.mkdir(parents=True, exist_ok=True)
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.__class__ = AlbertForMaskedLM
        torch.save(model_to_save.state_dict(), args.export_path / "pytorch_model.bin")
        model_to_save.config.to_json_file(args.export_path / "config.json")
        tokenizer.save_pretrained(args.export_path)
        if use_deepkit:
            for file in args.export_path.glob("*"):
                args.experiment.add_output_file(str(file))

    return learn
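
# `set_config` is called at the top of train() but not defined in this
# excerpt. A hypothetical helper (an assumption, not the original code),
# assuming deepkit's Experiment exposes set_config(name, value):
def set_config(experiment, key, value):
    if isinstance(value, (bool, int, float, str)):
        experiment.set_config(key, value)
    else:
        # Fall back to a string form for nested or non-primitive values.
        experiment.set_config(key, str(value))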
Example #4
# this script starts multiple experiments
import random
import threading
from time import sleep

import deepkit

experiment_optimization_id = '1'

hyper_parameters_base = {
    'lr': 0.1,
    'optimizer': 'adam',
}

root_experiment = deepkit.experiment(project='threaded')
experiments = 10


class ExperimentExecutor(threading.Thread):
    def __init__(self, id: int, root_experiment: deepkit.Experiment,
                 hyper_parameters: dict):
        super().__init__()
        self.daemon = True
        self.id = id
        self.root_experiment = root_experiment
        self.hyper_parameters = hyper_parameters

    def run(self):
        experiment = self.root_experiment.create_sub_experiment()
        experiment.set_info('id', self.id)
        experiment.set_info('optimization_id', experiment_optimization_id)
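
# The excerpt ends before the threads are started. A hypothetical driver
# (not in the original) would launch one executor per sub-experiment and,
# because the threads are daemons, join them before the process exits:
executors = [
    ExperimentExecutor(i, root_experiment, dict(hyper_parameters_base))
    for i in range(experiments)
]
for executor in executors:
    executor.start()
for executor in executors:
    executor.join()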
Example #5
import deepkit
import ray
from ray.rllib.agents import dqn

# Note: Ray overwrites sys.path[0] (unclear why), which breaks deepkit's lookup of the project link.
experiment = deepkit.experiment(account='localhost',
                                project='deepkit-python-sdk')

# Initialize Ray with host that makes docker happy
ray.init(webui_host='127.0.0.1')

# Initialize DQN Trainer with default config and built-in gym cart-pole environment.
trainer = dqn.DQNTrainer(config=dqn.DEFAULT_CONFIG, env="CartPole-v0")

# Extract several layers of models
ray_policy = trainer.get_policy()
ray_model = ray_policy.model
# This is the one I think we should "watch"
keras_model = ray_model.base_model

experiment.watch_keras_model(keras_model)

experiment.log('lets go')

# Manually train for a couple of iterations
for i in range(20):
    result = trainer.train()

experiment.log('Done')
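
# `result` above is discarded each iteration. A variant (an assumption, not
# part of the original) could surface RLlib's mean episode reward through the
# same experiment.log API this example already uses:
for i in range(20):
    result = trainer.train()
    experiment.log(f"iter {i}: episode_reward_mean={result['episode_reward_mean']:.2f}")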
Example #6
from time import sleep

import deepkit

experiment = deepkit.experiment(project='sub-experiments')
print('root job', experiment.id)

experiments = 10

for i in range(experiments):
    sub_experiment = experiment.create_sub_experiment()
    print('sub job', sub_experiment.id)

    sub_experiment.done()

sleep(5)