import logging
import os

from detectron2.evaluation import COCOEvaluator
from detectron2.modeling import build_model
from detectron2.solver import build_lr_scheduler, build_optimizer
from detectron2.utils.events import (
    CommonMetricPrinter,
    EventStorage,
    JSONWriter,
    TensorboardXWriter,
)
import pytorch_warmup as warmup

import pfa

import wandb

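# sync_tensorboard=True mirrors anything logged through TensorBoard (e.g. via the TensorboardXWriter imported above) into this W&B run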
wandb.init(project="deepscribe-detectron", sync_tensorboard=True)

logger = logging.getLogger("detectron2")


def get_evaluator(cfg, dataset_name, output_folder=None):
    """
    Create evaluator(s) for a given dataset.
    This uses the special metadata "evaluator_type" associated with each builtin dataset.
    For your own dataset, you can simply create an evaluator manually in your
    script and do not have to worry about the hacky if-else logic here.
    """
    if output_folder is None:
        output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")

    return COCOEvaluator(dataset_name, cfg, True, output_folder)
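
The example ends at the evaluator factory; a minimal training-loop sketch (not part of the original snippet, assuming a standard detectron2 cfg) showing how the imported builders, writers, and EventStorage fit together:

def do_train_sketch(cfg):
    # hypothetical helper, not from the original example
    from detectron2.data import build_detection_train_loader

    model = build_model(cfg)
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)
    max_iter = cfg.SOLVER.MAX_ITER

    writers = [
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),  # picked up by W&B via sync_tensorboard
    ]

    model.train()
    with EventStorage(0) as storage:
        for data, iteration in zip(build_detection_train_loader(cfg), range(max_iter)):
            loss_dict = model(data)  # in training mode the model returns a dict of losses
            losses = sum(loss_dict.values())
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            scheduler.step()

            storage.put_scalars(total_loss=losses, **loss_dict)
            if iteration % 20 == 0:
                for writer in writers:
                    writer.write()
            storage.step()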
Example #2
import os

import numpy as np
import pandas as pd
from utils import args_util, plmodel_util, dataloading
import argparse
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from pytorch_lightning.core.lightning import LightningModule
import pytorch_lightning as pl
os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"
# os.environ['CUDA_VISIBLE_DEVICES'] = "1,2"
from pytorch_lightning.callbacks import Callback, ModelCheckpoint
from pytorch_lightning.utilities.distributed import rank_zero_only

import wandb
wandb.init(project="fp_lightning")


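# rank_zero_only ensures the config/file upload happens only once (from rank 0) under distributed training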
@rank_zero_only
def wandb_save(wandb_logger, config):
    wandb_logger.log_hyperparams(config)
    wandb_logger.experiment.save('./pl_fingerprint.py', policy="now")


def main():
    arg_parser = args_util.add_general_args()
    arg_parser = args_util.add_train_args(arg_parser)
    arg_parser = args_util.add_model_args(arg_parser)
    args = arg_parser.parse_args()

    # TODO (kishore): update parameters
Example #3
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
from keras.utils import np_utils
from keras.callbacks import Callback
import json

from wandb.keras import WandbCallback
import wandb

run = wandb.init()
config = run.config
config.optimizer = "adam"
config.epochs = 50
config.dropout = 10
config.hidden_nodes = 100

# load data
(X_train, y_train), (X_test, y_test) = mnist.load_data()
img_width = X_train.shape[1]
img_height = X_train.shape[2]

X_train = X_train.astype('float32')
X_train /= 255.
X_test = X_test.astype('float32')
X_test /= 255.

# one hot encode outputs
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)
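
The snippet is truncated here; a minimal continuation sketch (not part of the original, assuming config.dropout is meant as a percentage) using the layers and WandbCallback imported above:

model = Sequential()
model.add(Flatten(input_shape=(img_width, img_height)))
model.add(Dense(config.hidden_nodes, activation='relu'))
model.add(Dropout(config.dropout / 100.0))  # config.dropout = 10 reads as a percentage
model.add(Dense(y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=config.optimizer,
              metrics=['accuracy'])
model.fit(X_train, y_train,
          epochs=config.epochs,
          validation_data=(X_test, y_test),
          callbacks=[WandbCallback()])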
Example #4
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default='O2',
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--warmup_steps",
                        default=1000,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    args = parser.parse_args()
    if use_wandb:
        wandb.init(project='grammar', name=args.exp_name, config=args)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
Example #5
def main(argv):
    args = parser.parse_args()
    print('Load test starting')

    project_name = args.project
    if project_name is None:
        project_name = 'artifacts-load-test-%s' % str(datetime.now()).replace(
            ' ', '-').replace(':', '-').replace('.', '-')

    env_project = os.environ.get('WANDB_PROJECT')

    sweep_id = os.environ.get('WANDB_SWEEP_ID')
    if sweep_id:
        del os.environ['WANDB_SWEEP_ID']
    wandb_config_paths = os.environ.get('WANDB_CONFIG_PATHS')
    if wandb_config_paths:
        del os.environ['WANDB_CONFIG_PATHS']
    wandb_run_id = os.environ.get('WANDB_RUN_ID')
    if wandb_run_id:
        del os.environ['WANDB_RUN_ID']

    # set global entity and project before chdir'ing
    from wandb.apis import InternalApi
    api = InternalApi()
    settings_entity = api.settings('entity')
    settings_base_url = api.settings('base_url')
    os.environ['WANDB_ENTITY'] = (os.environ.get('LOAD_TEST_ENTITY')
                                  or settings_entity)
    os.environ['WANDB_PROJECT'] = project_name
    os.environ['WANDB_BASE_URL'] = (os.environ.get('LOAD_TEST_BASE_URL')
                                    or settings_base_url)

    # Change dir to avoid littering the code directory
    pwd = os.getcwd()
    tempdir = tempfile.TemporaryDirectory()
    os.chdir(tempdir.name)

    artifact_name = 'load-artifact-' + ''.join(
        random.choices(string.ascii_lowercase + string.digits, k=10))

    print('Generating source data')
    source_file_names = gen_files(args.gen_n_files, args.gen_max_small_size,
                                  args.gen_max_large_size)
    print('Done generating source data')

    procs = []
    stop_queue = multiprocessing.Queue()
    stats_queue = multiprocessing.Queue()

    # start all processes

    # writers
    for i in range(args.num_writers):
        file_names = source_file_names
        if args.non_overlapping_writers:
            chunk_size = int(len(source_file_names) / args.num_writers)
            file_names = source_file_names[i * chunk_size:(i + 1) * chunk_size]
        p = multiprocessing.Process(
            target=proc_version_writer,
            args=(stop_queue, stats_queue, project_name, file_names,
                  artifact_name, args.files_per_version_min,
                  args.files_per_version_max))
        p.start()
        procs.append(p)

    # readers
    for i in range(args.num_readers):
        p = multiprocessing.Process(target=proc_version_reader,
                                    args=(stop_queue, stats_queue,
                                          project_name, artifact_name, i))
        p.start()
        procs.append(p)

    # deleters
    for i in range(args.num_deleters):
        p = multiprocessing.Process(
            target=proc_version_deleter,
            args=(stop_queue, stats_queue, artifact_name,
                  args.min_versions_before_delete, args.delete_period_max))
        p.start()
        procs.append(p)

    # cache garbage collector
    if args.cache_gc_period_max is None:
        print('Testing cache GC process not enabled!')
    else:
        p = multiprocessing.Process(target=proc_cache_garbage_collector,
                                    args=(stop_queue,
                                          args.cache_gc_period_max))
        p.start()
        procs.append(p)

    # reset environment
    os.environ['WANDB_ENTITY'] = settings_entity
    os.environ['WANDB_BASE_URL'] = settings_base_url
    if env_project is None:
        del os.environ['WANDB_PROJECT']
    else:
        os.environ['WANDB_PROJECT'] = env_project
    if sweep_id:
        os.environ['WANDB_SWEEP_ID'] = sweep_id
    if wandb_config_paths:
        os.environ['WANDB_CONFIG_PATHS'] = wandb_config_paths
    if wandb_run_id:
        os.environ['WANDB_RUN_ID'] = wandb_run_id
    # go back to original dir
    os.chdir(pwd)

    # test phase
    start_time = time.time()
    stats = defaultdict(int)

    run = wandb.init(job_type='main-test-phase')
    run.config.update(args)
    while time.time() - start_time < args.test_phase_seconds:
        stat_update = None
        try:
            stat_update = stats_queue.get(True, 5000)
        except queue.Empty:
            pass
        print('** Test time: %s' % (time.time() - start_time))
        if stat_update:
            for k, v in stat_update.items():
                stats[k] += v
        wandb.log(stats)

    print('Test phase time expired')
    # stop all processes and wait until all are done
    for i in range(len(procs)):
        stop_queue.put(True)
    print('Waiting for processes to stop')
    fail = False
    for proc in procs:
        proc.join()
        if proc.exitcode != 0:
            fail = True

    # drain remaining stats
    while True:
        try:
            stat_update = stats_queue.get_nowait()
        except queue.Empty:
            break
        for k, v in stat_update.items():
            stats[k] += v

    print('Stats')
    import pprint
    pprint.pprint(dict(stats))

    if fail:
        print('FAIL! Test phase failed')
        sys.exit(1)
    else:
        print('Test phase successfully completed')

    print('Starting verification phase')

    os.environ['WANDB_ENTITY'] = (os.environ.get('LOAD_TEST_ENTITY')
                                  or settings_entity)
    os.environ['WANDB_PROJECT'] = project_name
    os.environ['WANDB_BASE_URL'] = (os.environ.get('LOAD_TEST_BASE_URL')
                                    or settings_base_url)
    data_api = wandb.Api()
    # we need to list artifacts by walking runs, since accessing them via
    # project.artifactType.artifacts only returns committed artifacts
    for run in data_api.runs('%s/%s' % (api.settings('entity'), project_name)):
        for v in run.logged_artifacts():
            # TODO: allow deleted once we build deletion support
            if v.state not in ('COMMITTED', 'DELETED'):
                print('FAIL! Artifact version not committed or deleted: %s' % v)
                sys.exit(1)

    print('Verification succeeded')
Example #6
def main(args):
    parser = get_config()
    all_args = parse_args(args, parser)

    if all_args.algorithm_name == "rmappo" or all_args.algorithm_name == "rmappg":
        assert (all_args.use_recurrent_policy
                or all_args.use_naive_recurrent_policy), (
                    "check recurrent policy!")
    elif all_args.algorithm_name == "mappo" or all_args.algorithm_name == "mappg":
        assert (all_args.use_recurrent_policy == False
                and all_args.use_naive_recurrent_policy
                == False), ("check recurrent policy!")
    else:
        raise NotImplementedError

    # cuda
    if all_args.cuda and torch.cuda.is_available():
        print("choose to use gpu...")
        device = torch.device("cuda:0")
        torch.set_num_threads(all_args.n_training_threads)
        if all_args.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
    else:
        print("choose to use cpu...")
        device = torch.device("cpu")
        torch.set_num_threads(all_args.n_training_threads)

    # run dir
    run_dir = Path(
        os.path.split(os.path.dirname(os.path.abspath(__file__)))[0] +
        "/results"
    ) / all_args.env_name / all_args.hanabi_name / all_args.algorithm_name / all_args.experiment_name
    if not run_dir.exists():
        os.makedirs(str(run_dir))

    # wandb
    if all_args.use_wandb:
        run = wandb.init(config=all_args,
                         project=all_args.env_name,
                         entity=all_args.user_name,
                         notes=socket.gethostname(),
                         name=str(all_args.algorithm_name) + "_" +
                         str(all_args.experiment_name) + "_seed" +
                         str(all_args.seed),
                         group=all_args.hanabi_name,
                         dir=str(run_dir),
                         job_type="training",
                         reinit=True)
    else:
        if not run_dir.exists():
            curr_run = 'run1'
        else:
            exst_run_nums = [
                int(str(folder.name).split('run')[1])
                for folder in run_dir.iterdir()
                if str(folder.name).startswith('run')
            ]
            if len(exst_run_nums) == 0:
                curr_run = 'run1'
            else:
                curr_run = 'run%i' % (max(exst_run_nums) + 1)
        run_dir = run_dir / curr_run
        if not run_dir.exists():
            os.makedirs(str(run_dir))

    setproctitle.setproctitle(
        str(all_args.algorithm_name) + "-" + str(all_args.env_name) + "-" +
        str(all_args.experiment_name) + "@" + str(all_args.user_name))

    # seed
    torch.manual_seed(all_args.seed)
    torch.cuda.manual_seed_all(all_args.seed)
    np.random.seed(all_args.seed)

    # env init
    envs = make_train_env(all_args)
    eval_envs = make_eval_env(all_args) if all_args.use_eval else None
    num_agents = all_args.num_agents

    config = {
        "all_args": all_args,
        "envs": envs,
        "eval_envs": eval_envs,
        "num_agents": num_agents,
        "device": device,
        "run_dir": run_dir
    }

    # run experiments
    if all_args.share_policy:
        from onpolicy.runner.shared.hanabi_runner_backward import HanabiRunner as Runner
    else:
        from onpolicy.runner.separated.hanabi_runner_backward import HanabiRunner as Runner

    runner = Runner(config)
    runner.run()

    # post process
    envs.close()
    if all_args.use_eval and eval_envs is not envs:
        eval_envs.close()

    if all_args.use_wandb:
        run.finish()
    else:
        runner.writter.export_scalars_to_json(
            str(runner.log_dir + '/summary.json'))
        runner.writter.close()
Example #7
def run_training(args):
    print('---------- Initialize W&B run for experiment tracking----------\n')
    run = wandb.init(entity=args.wandb_entity,
                     project=args.wandb_project,
                     job_type='train')
    wandb.config.update(args)

    print('---------- Perform Training ----------')

    savedir = args.savepath
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    head_tail = os.path.split(args.dataset)
    savedir = os.path.join(savedir, head_tail[1])

    if not os.path.exists(savedir):
        os.mkdir(savedir)

    if not os.path.exists(os.path.join(savedir, "trained model")):
        os.mkdir(os.path.join(savedir, "trained model"))
        print('creating directory %s' %
              (os.path.join(savedir, "trained model")))

    if not os.path.exists(os.path.join(savedir, "saved training")):
        os.mkdir(os.path.join(savedir, "saved training"))
        print('creating directory %s' %
              (os.path.join(savedir, "saved training")))

    print('XField type: %s' % (args.type))
    print('Dimension of input xfield: %s' % (args.dim))

    #loading images
    images, coordinates, all_pairs, h_res, w_res = load_imgs(args)

    dims = args.dim
    num_n = args.num_n  # number of neighbors
    min_ = np.min(coordinates)
    max_ = np.max(coordinates)

    print('\n ------- Creating the model -------')

    # batch size is num_n + 1 (number of neighbors + target)
    inputs = tf.placeholder(tf.float32, shape=[num_n + 1, 1, 1, len(dims)])

    # Jacobian network
    num_output = len(args.type) * 2

    with tf.variable_scope("gen_flows"):
        flows = Flow(inputs, h_res, w_res, num_output, args.nfg, min_, max_)

    nparams_decoder = np.sum([
        np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
        if v.name.startswith("gen_flows")
    ])
    print('Number of learnable parameters (decoder): %d' % (nparams_decoder))

    # learnt albedo
    # The albedos are initialized with constant 1.0
    if args.type == ['light', 'view', 'time']:

        with tf.variable_scope("gen_flows"):

            # For light-view-time interpolation, we consider num_views*num_times albedos
            albedos = tf.Variable(tf.constant(
                1.0, shape=[dims[1] * dims[2], h_res, w_res, 3]),
                                  name='albedo')
            index_albedo = tf.placeholder(tf.int32, shape=(1, ))
            albedo = tf.gather(albedos, index_albedo, 0)

        nparams = np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
            if v.name.startswith("gen_flows")
        ])
        print(
            'Number of learnable parameters (%d albedos with res %d x %d ): %d'
            % (dims[1] * dims[2], h_res, w_res, nparams - nparams_decoder))

    elif args.type == ['light']:

        with tf.variable_scope("gen_flows"):
            # For light interpolation, we consider just one albedo
            albedo = tf.Variable(tf.constant(1.0, shape=[1, h_res, w_res, 3]),
                                 name='albedo')

        nparams = np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
            if v.name.startswith("gen_flows")
        ])
        print(
            'Number of learnable parameters (%d albedos with res %d x %d ): %d'
            % (1, h_res, w_res, nparams - nparams_decoder))

    else:
        # For view and time interpolation, we do not train an albedo; it is treated as a constant, non-learnable parameter
        albedo = tf.constant(1.0, shape=[1, h_res, w_res, 3])

    Neighbors = tf.placeholder(tf.float32, shape=[num_n, h_res, w_res, 3])

    # soft blending
    interpolated = Blending_train(inputs, Neighbors, flows, albedo, h_res,
                                  w_res, args)

    Reference = tf.placeholder(tf.float32, shape=[1, h_res, w_res, 3])

    # L1 loss
    loss = tf.reduce_mean((tf.abs(interpolated - Reference)))

    gen_tvars = [
        var for var in tf.trainable_variables()
        if var.name.startswith("gen_flows")
    ]
    learning_rate = tf.placeholder(tf.float32, shape=())
    gen_optim = tf.train.AdamOptimizer(learning_rate)
    gen_grads = gen_optim.compute_gradients(loss, var_list=gen_tvars)
    gen_train = gen_optim.apply_gradients(gen_grads)

    saver = tf.train.Saver(max_to_keep=1000)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    if args.load_pretrained:
        ckpt = tf.train.get_checkpoint_state("%s/trained model" % (savedir))
        if ckpt:
            print('\n loading pretrained model  ' + ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

    print('------------ Start Training ------------')

    lr = args.lr
    print('Starting learning rate with %0.4f' % (lr))

    stop_l1_thr = 0.01

    iter_end = 100000  # total number of iterations

    indices = np.array([i for i in range(len(all_pairs))])
    # we considered around 500 iterations per epoch
    if len(indices) < 500:
        indices = np.repeat(indices, 500 // len(indices))

    epoch_size = len(indices)
    epoch_end = iter_end // epoch_size  # total number of epochs

    if args.type == ['light', 'view', 'time']:

        st = time.time()
        min_loss = 1000
        l1_loss_t = 1
        epoch = 0

        while l1_loss_t > stop_l1_thr and epoch <= epoch_end:

            l1_loss_t = 0
            np.random.shuffle(indices)

            for id in range(epoch_size):

                pair = all_pairs[indices[id], ::]

                input_coords = coordinates[pair[:num_n + 1], ::]
                reference_img = images[pair[:1], ::]
                Neighbors_img = images[pair[1:num_n + 1], ::]
                _index = [pair[-1]]

                _, l1loss = sess.run(
                    [gen_train, loss],
                    feed_dict={
                        inputs: input_coords,
                        Reference: reference_img,
                        Neighbors: Neighbors_img,
                        learning_rate: lr,
                        index_albedo: _index
                    })
                l1_loss_t = l1_loss_t + l1loss

                print(
                    '\r Epoch %3.0d  Iteration %3.0d of %3.0d   Cumulative L1 loss = %3.3f'
                    % (epoch, id + 1, epoch_size, l1_loss_t),
                    end=" ")
                wandb.log({'Cumulative L1 loss': l1_loss_t})

            l1_loss_t = l1_loss_t / epoch_size
            print(" elapsed time %3.1f m  Averaged L1 loss = %3.5f " %
                  ((time.time() - st) / 60, l1_loss_t))
            wandb.log({'epoch': epoch, 'Averaged L1 loss': l1_loss_t})

            if l1_loss_t < min_loss:
                saver.save(sess, "%s/trained model/model.ckpt" % (savedir))
                min_loss = l1_loss_t

            center = np.prod(dims) // 2
            cv2.imwrite("%s/saved training/reference.png" % (savedir),
                        np.uint8(images[center, ::] * 255))

            pair = all_pairs[3 * center + 0, ::]

            out_img, flows_out = sess.run(
                [interpolated, flows],
                feed_dict={
                    inputs: coordinates[pair[:num_n + 1], ::],
                    Neighbors: images[pair[1:num_n + 1], ::],
                    index_albedo: [pair[-1]]
                })

            out_img = np.minimum(np.maximum(out_img, 0.0), 1.0)
            cv2.imwrite("%s/saved training/recons_light.png" % (savedir),
                        np.uint8(out_img[0, ::] * 255))
            wandb.log({
                'Reconstructed Light': [
                    wandb.Image("%s/saved training/recons_light.png" %
                                (savedir))
                ]
            })

            flow_color = flow_vis.flow_to_color(flows_out[0, :, :, 0:2],
                                                convert_to_bgr=False)
            cv2.imwrite("%s/saved training/flow_light.png" % (savedir),
                        np.uint8(flow_color))
            wandb.log({
                'Flow Light':
                [wandb.Image("%s/saved training/flow_light.png" % (savedir))]
            })

            flow_color = flow_vis.flow_to_color(flows_out[0, :, :, 2:4],
                                                convert_to_bgr=False)
            cv2.imwrite("%s/saved training/flow_view.png" % (savedir),
                        np.uint8(flow_color))
            wandb.log({
                'Flow View':
                [wandb.Image("%s/saved training/flow_view.png" % (savedir))]
            })

            flow_color = flow_vis.flow_to_color(flows_out[0, :, :, 4:6],
                                                convert_to_bgr=False)
            cv2.imwrite("%s/saved training/flow_time.png" % (savedir),
                        np.uint8(flow_color))
            wandb.log({
                'Flow Time':
                [wandb.Image("%s/saved training/flow_time.png" % (savedir))]
            })

            pair = all_pairs[3 * center + 1, ::]
            out_img = sess.run(interpolated,
                               feed_dict={
                                   inputs: coordinates[pair[:num_n + 1], ::],
                                   Neighbors: images[pair[1:num_n + 1], ::],
                                   index_albedo: [pair[-1]]
                               })

            out_img = np.minimum(np.maximum(out_img, 0.0), 1.0)
            cv2.imwrite("%s/saved training/recons_view.png" % (savedir),
                        np.uint8(out_img[0, ::] * 255))
            wandb.log({
                'Reconstructed View':
                [wandb.Image("%s/saved training/recons_view.png" % (savedir))]
            })

            pair = all_pairs[3 * center + 2, ::]
            out_img = sess.run(interpolated,
                               feed_dict={
                                   inputs: coordinates[pair[:num_n + 1], ::],
                                   Neighbors: images[pair[1:num_n + 1], ::],
                                   index_albedo: [pair[-1]]
                               })

            out_img = np.minimum(np.maximum(out_img, 0.0), 1.0)
            cv2.imwrite("%s/saved training/recons_time.png" % (savedir),
                        np.uint8(out_img[0, ::] * 255))
            wandb.log({
                'Reconstructed Time':
                [wandb.Image("%s/saved training/recons_time.png" % (savedir))]
            })
            epoch = epoch + 1

            if epoch == epoch_end // 2:
                lr = 0.00005

    if args.type in (['view'], ['time'], ['light']):

        st = time.time()
        img_mov = cv2.VideoWriter(
            '%s/saved training/epoch_recons.mp4' % (savedir),
            cv2.VideoWriter_fourcc(*'mp4v'), 10, (w_res, h_res))
        flow_mov = cv2.VideoWriter(
            '%s/saved training/epoch_flows.mp4' % (savedir),
            cv2.VideoWriter_fourcc(*'mp4v'), 10, (w_res, h_res))

        min_loss = 1000
        l1_loss_t = 1
        epoch = 0

        while l1_loss_t > stop_l1_thr and epoch <= epoch_end:

            l1_loss_t = 0
            np.random.shuffle(indices)

            for id in range(epoch_size):

                pair = all_pairs[indices[id], ::]
                input_coords = coordinates[pair[:num_n + 1], ::]
                reference_img = images[pair[:1], ::]
                Neighbors_img = images[pair[1:num_n + 1], ::]

                _, l1loss = sess.run(
                    [gen_train, loss],
                    feed_dict={
                        inputs: input_coords,
                        Reference: reference_img,
                        Neighbors: Neighbors_img,
                        learning_rate: lr,
                    })

                l1_loss_t = l1_loss_t + l1loss
                print(
                    '\r Epoch %3.0d  Iteration %3.0d of %3.0d   Cumulative L1 loss = %3.3f'
                    % (epoch, id + 1, epoch_size, l1_loss_t),
                    end=" ")
                wandb.log({'Cumulative L1 loss': l1_loss_t})

            l1_loss_t = l1_loss_t / epoch_size
            print(" elapsed time %3.1f m  Averaged L1 loss = %3.5f" %
                  ((time.time() - st) / 60, l1_loss_t))
            wandb.log({'epoch': epoch, 'Averaged L1 loss': l1_loss_t})

            if l1_loss_t < min_loss:
                saver.save(sess, "%s/trained model/model.ckpt" % (savedir))
                min_loss = l1_loss_t

            if args.type == ['light']:

                albedo_out = np.minimum(np.maximum(sess.run(albedo), 0.0), 1.0)
                cv2.imwrite("%s/saved training/albedo.png" % (savedir),
                            np.uint8(albedo_out[0, :, :, :] * 255))
                wandb.log({
                    'Albedo':
                    [wandb.Image("%s/saved training/albedo.png" % (savedir))]
                })

            center = np.prod(dims) // 2
            cv2.imwrite("%s/saved training/reference.png" % (savedir),
                        np.uint8(images[center, ::] * 255))
            wandb.log({
                'Reference':
                [wandb.Image("%s/saved training/reference.png" % (savedir))]
            })

            pair = all_pairs[(len(all_pairs) // len(images)) * center, ::]

            out_img, flows_out = sess.run(
                [interpolated, flows],
                feed_dict={
                    inputs: coordinates[pair[:num_n + 1], ::],
                    Neighbors: images[pair[1:num_n + 1], ::]
                })

            out_img = np.minimum(np.maximum(out_img, 0.0), 1.0)
            cv2.imwrite("%s/saved training/recons.png" % (savedir),
                        np.uint8(out_img[0, ::] * 255))
            wandb.log({
                'Reconstruction':
                [wandb.Image("%s/saved training/recons.png" % (savedir))]
            })

            flow_color = flow_vis.flow_to_color(flows_out[0, :, :, 0:2],
                                                convert_to_bgr=False)
            cv2.imwrite("%s/saved training/flow.png" % (savedir),
                        np.uint8(flow_color))
            wandb.log({
                'Flow':
                [wandb.Image("%s/saved training/flow.png" % (savedir))]
            })

            img_mov.write(np.uint8(out_img[0, ::] * 255))
            flow_mov.write(np.uint8(flow_color))
            epoch = epoch + 1

            if epoch == epoch_end // 2:
                lr = 0.00005

        img_mov.release()
        flow_mov.release()

        wandb.log({
            "epoch_recons":
            wandb.Video('%s/saved training/epoch_recons.mp4' % (savedir),
                        fps=4,
                        format="gif")
        })
        wandb.log({
            "epoch_flows":
            wandb.Video('%s/saved training/epoch_flows.mp4' % (savedir),
                        fps=4,
                        format="gif")
        })
Example #8
    # Commence training
    model = transfer_utils.train_model(model,
                                       dataloaders,
                                       dataset_sizes,
                                       class_names,
                                       criterion,
                                       optimizer,
                                       scheduler,
                                       num_epochs=args.epochs,
                                       curr_epoch=curr_epoch,
                                       checkpoint_dir=args.checkpoint_dir)


if __name__ == "__main__":
    wandb.init(project="tm-poverty-prediction")
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser(
        description='Philippine Poverty Prediction')
    parser.add_argument('--batch-size',
                        type=int,
                        default=32,
                        metavar='N',
                        help='input batch size for training (default: 32)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-6,
                        metavar='LR',
                        help='learning rate (default: 1e-6)')
    parser.add_argument('--epochs',
Example #9
def run_training(command_history: utils.CommandHistory,
                 game_params: GameParams,
                 model_params: ModelParams,
                 optim_params: OptimParams,
                 simulation_params: SimulationParams,
                 execution_params: ExecutionParams,
                 run_group="Default Group") -> None:
    wandb.init(project="thesis-az", group=run_group)
    cfg = {
        **asdict(game_params),
        **asdict(model_params),
        **asdict(optim_params),
        **asdict(simulation_params),
        **asdict(execution_params),
    }

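    # log the merged parameter dataclasses so the full run configuration is tracked in W&B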
    wandb.config.update(cfg)
    start_time = time.time()
    logger_path = os.path.join(execution_params.checkpoint_dir, "train.log")
    sys.stdout = utils.Logger(logger_path)

    print("#" * 70)
    print("#" + "TRAINING".center(68) + "#")
    print("#" * 70)

    print("setting-up pseudo-random generator...")
    seed_generator = utils.generate_random_seeds(seed=execution_params.seed)

    # checkpoint, resume from where it stops
    epoch = 0
    ckpts = list(
        utils.gen_checkpoints(checkpoint_dir=execution_params.checkpoint_dir,
                              only_last=True,
                              real_time=False))
    checkpoint = {}
    if ckpts:
        checkpoint = ckpts[0]
        former_command_history = checkpoint["command_history"]
        command_history.build_history(former_command_history)
        optim_params = command_history.update_params_from_checkpoint(
            checkpoint_params=checkpoint["optim_params"],
            resume_params=optim_params)
        simulation_params = command_history.update_params_from_checkpoint(
            checkpoint_params=checkpoint["simulation_params"],
            resume_params=simulation_params,
        )
        execution_params = command_history.update_params_from_checkpoint(
            checkpoint_params=checkpoint["execution_params"],
            resume_params=execution_params,
        )
    if command_history.last_command_contains("init_checkpoint"):
        if ckpts:
            raise RuntimeError(
                "Cannot restart from init_checkpoint, already restarting from non-empty checkpoint_dir"
            )
        # pretrained model, consider new training from epoch zero
        print("loading pretrained model from checkpoint...")
        checkpoint = utils.load_checkpoint(
            checkpoint_path=model_params.init_checkpoint)
    if checkpoint:
        # game_params and model_params cannot change on a checkpoint
        # either write the same, or don't specify them
        ignored = {"init_checkpoint", "game_name"}  # this one can change
        current_params = dict(game_params=game_params,
                              model_params=model_params)
        for params_name, params in current_params.items():
            for attr, val in asdict(params).items():
                if command_history.last_command_contains(
                        attr) and attr not in ignored:
                    ckpt_val = getattr(checkpoint[params_name], attr)
                    assert val == ckpt_val, f"When resuming, got '{val}' for {attr} but cannot override from past run with '{ckpt_val}'."
        specified_game_name = game_params.game_name
        game_params = checkpoint["game_params"]
        if specified_game_name is not None:
            game_params.game_name = specified_game_name
        model_params = checkpoint["model_params"]
        epoch = checkpoint["epoch"]
        print("reconstructing the model...")
    else:
        print("creating and saving the model...")
    train_device = execution_params.device[0]
    game_generation_devices = ([train_device]
                               if len(execution_params.device) == 1
                               else execution_params.device[1:])
    train_device = torch.device(train_device)
    model = create_model(
        game_params=game_params,
        model_params=model_params,
        resume_training=bool(checkpoint),
        model_state_dict=checkpoint["model_state_dict"]
        if checkpoint else None,
    ).to(train_device)
    model_path = execution_params.checkpoint_dir / "model.pt"
    model.save(str(model_path))

    ddpmodel = None

    if execution_params.ddp:
        torch.distributed.init_process_group(backend="nccl")
        ddpmodel = nn.parallel.DistributedDataParallel(
            ModelWrapperForDDP(model))

    print("creating optimizer...")
    optim = create_optimizer(
        model=model,
        optim_params=optim_params,
        optim_state_dict=checkpoint.get("optim_state_dict", None),
    )

    print("creating training environment...")
    context, assembler, get_train_reward = create_training_environment(
        seed_generator=seed_generator,
        model_path=model_path,
        game_generation_devices=game_generation_devices,
        game_params=game_params,
        simulation_params=simulation_params,
        execution_params=execution_params)
    assembler.update_model(model.state_dict())
    assembler.add_tournament_model("init", model.state_dict())
    context.start()

    print("warming-up replay buffer...")
    warm_up_replay_buffer(
        assembler=assembler,
        replay_warmup=simulation_params.replay_warmup,
        replay_buffer=checkpoint.get("replay_buffer", None),
    )

    print("training model...")
    train_model(command_history=command_history,
                start_time=start_time,
                train_device=train_device,
                model=model,
                ddpmodel=ddpmodel,
                model_path=model_path,
                optim=optim,
                context=context,
                assembler=assembler,
                get_train_reward=get_train_reward,
                game_params=game_params,
                model_params=model_params,
                optim_params=optim_params,
                simulation_params=simulation_params,
                execution_params=execution_params,
                epoch=epoch)

    elapsed_time = time.time() - start_time
    print(f"total time: {elapsed_time} s")
Example #10
import tensorflow.keras.layers as tfkl
import tensorflow.keras.backend as K
from keras.layers import GlobalAveragePooling2D, Dense, Flatten

from sklearn.preprocessing import LabelBinarizer
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, InputLayer, Activation
from keras.models import Sequential
from keras import optimizers
from postprocess import Postprocess
import params
import requests
import pandas as pd
import pickle
#Import wandb libraries
import wandb
wandb.init(project="vgg_training_03")
from wandb.keras import WandbCallback


def telegram_bot_sendtext(bot_message):

    bot_token = '1153335989:AAE4v1w9FD_vCUaG2qcq-WmuPwh_MBYWWho'
    bot_chatID = '675791133'
    send_text = 'https://api.telegram.org/bot' + bot_token + '/sendMessage?chat_id=' + bot_chatID + '&parse_mode=Markdown&text=' + bot_message

    response = requests.get(send_text)

    return response.json()

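# Hypothetical usage (not in the original): call telegram_bot_sendtext('VGG training finished')
# after training completes to push a status message to the configured Telegram chat.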

def VGGish(pump=None,
Example #11
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )
    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets:

    train_dataset = datasets.load_dataset("corpora/com_voice_sex_corpus",
                                          split="train",
                                          cache_dir=model_args.cache_dir)
    eval_dataset = datasets.load_dataset("corpora/com_voice_sex_corpus",
                                         split="test",
                                         cache_dir=model_args.cache_dir)

    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                                 sampling_rate=16_000,
                                                 padding_value=0.0,
                                                 do_normalize=True,
                                                 return_attention_mask=True)
    processor = CustomWav2Vec2Processor(feature_extractor=feature_extractor)
    model = Wav2Vec2CommVoiceGenderModel.from_pretrained(
        "facebook/wav2vec2-large-xlsr-53",
        attention_dropout=0.01,
        hidden_dropout=0.01,
        feat_proj_dropout=0.0,
        mask_time_prob=0.05,
        layerdrop=0.01,
        gradient_checkpointing=True,
    )

    if model_args.freeze_feature_extractor:
        model.freeze_feature_extractor()

    if data_args.max_train_samples is not None:
        train_dataset = train_dataset.select(range(
            data_args.max_train_samples))

    if data_args.max_val_samples is not None:
        eval_dataset = eval_dataset.select(range(data_args.max_val_samples))

    # Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    def speech_file_to_array_fn(batch):
        start = 0
        stop = 10
        srate = 16_000
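        # keep only the first 10 seconds of audio, then resample to 16 kHz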
        speech_array, sampling_rate = torchaudio.load(batch["file"])
        speech_array = speech_array[0].numpy()[:stop * sampling_rate]
        batch["speech"] = librosa.resample(np.asarray(speech_array),
                                           sampling_rate, srate)
        batch["sampling_rate"] = srate
        batch["parent"] = batch["label"]
        return batch

    train_dataset = train_dataset.map(
        speech_file_to_array_fn,
        remove_columns=train_dataset.column_names,
        num_proc=data_args.preprocessing_num_workers,
    )
    eval_dataset = eval_dataset.map(
        speech_file_to_array_fn,
        remove_columns=eval_dataset.column_names,
        num_proc=data_args.preprocessing_num_workers,
    )

    def prepare_dataset(batch):
        # check that all files have the correct sampling rate
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
        batch["input_values"] = processor(
            batch["speech"],
            sampling_rate=batch["sampling_rate"][0]).input_values
        batch["labels"] = batch["parent"]
        return batch

    train_dataset = train_dataset.map(
        prepare_dataset,
        remove_columns=train_dataset.column_names,
        batch_size=training_args.per_device_train_batch_size,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
    )
    eval_dataset = eval_dataset.map(
        prepare_dataset,
        remove_columns=eval_dataset.column_names,
        batch_size=training_args.per_device_train_batch_size,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
    )

    from sklearn.metrics import classification_report, confusion_matrix

    def compute_metrics(pred):
        label_idx = [0, 1]
        label_names = ['female', 'male']
        labels = pred.label_ids.argmax(-1)
        preds = pred.predictions.argmax(-1)
        acc = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds, average='macro')
        report = classification_report(y_true=labels,
                                       y_pred=preds,
                                       labels=label_idx,
                                       target_names=label_names)
        matrix = confusion_matrix(y_true=labels, y_pred=preds)
        print(report)
        print(matrix)

        wandb.log({
            "conf_mat":
            wandb.plot.confusion_matrix(probs=None,
                                        y_true=labels,
                                        preds=preds,
                                        class_names=label_names)
        })

        wandb.log({
            "precision_recall":
            wandb.plot.pr_curve(y_true=labels,
                                y_probas=pred.predictions,
                                labels=label_names)
        })

        return {"accuracy": acc, "f1_score": f1}

    wandb.init(name=training_args.output_dir, config=training_args)

    # Data collator
    data_collator = DataCollatorCTCWithPadding(processor=processor,
                                               padding=True)

    # Initialize our Trainer
    trainer = CTCTrainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=processor.feature_extractor,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()

        # save the feature_extractor and the tokenizer
        if is_main_process(training_args.local_rank):
            processor.save_pretrained(training_args.output_dir)

        metrics = train_result.metrics
        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(
            eval_dataset)
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    return results
Example #12
#WGAN
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from torchvision.utils import save_image

import wandb

wandb.init(job_type='train', project='WGAN', name='WGAN')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

bs = 500

# transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))])

train_dataset = datasets.FashionMNIST(root='./fashion_mnist_data',
                                      train=True,
                                      transform=transforms.ToTensor(),
                                      download=True)
test_dataset = datasets.FashionMNIST(root='./fashion_mnist_data',
                                     train=False,
                                     transform=transforms.ToTensor(),
                                     download=False)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=bs,
Example #13
        dX_train.append(
            tf.concat([train_inputs['X'], train_inputs['X2']], axis=2))
        dT_train.append(train_inputs['target'])
        dX_test.append(test_inputs)
        dX_scaler.append(y_scaler)
    global_inputs_X = tf.concat(dX_train, 0)
    global_inputs_T = tf.concat(dT_train, 0)
    print('done with data')
    working = '.models/' + dset + '_models/global/trials'

    # 1️⃣ Start a new run, tracking config metadata
    run = wandb.init(project="3days_forcast",
                     config={
                         'layers': LAYERS,
                         'dropout': DROPOUT,
                         'neurons': NEURONS,
                         'learning rate': LR,
                         'batch_size': BATCHSIZE,
                         "architecture": "RNN with forward lags for temporal",
                         "dataset": "Columbia",
                         "epochs": MAX_EPOCHS,
                         'patience': PATIENCE
                     })
    config = wandb.config

    # full data LSTM MIMO compilation and fit
    LSTMIMO = build_model(l=LAYERS, drop=DROPOUT, n=NEURONS, lr=LR)

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=PATIENCE,
                                                      mode='min')

    history = LSTMIMO.fit(global_inputs_X,
Example #14
    def train(
        self,
        train_dataloader,
        output_dir,
        show_running_loss=True,
        eval_dataloader=None,
        verbose=True,
        **kwargs,
    ):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        device = self.device
        model = self.model
        args = self.args

        tb_writer = SummaryWriter(logdir=args["tensorboard_dir"])

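        # total number of optimizer updates over all epochs, accounting for gradient accumulation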
        t_total = (len(train_dataloader) //
                   args["gradient_accumulation_steps"] *
                   args["num_train_epochs"])

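        # parameters matching no_decay (biases and LayerNorm weights) are excluded from weight decay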
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                args["weight_decay"],
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]

        warmup_steps = math.ceil(t_total * args["warmup_ratio"])
        args["warmup_steps"] = warmup_steps if args[
            "warmup_steps"] == 0 else args["warmup_steps"]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args["learning_rate"],
                          eps=args["adam_epsilon"])
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args["warmup_steps"],
            num_training_steps=t_total)

        if args["fp16"]:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )

            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args["fp16_opt_level"])

        if args["n_gpu"] > 1:
            model = torch.nn.DataParallel(model)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args["num_train_epochs"]),
                                desc="Epoch",
                                disable=args["silent"])
        epoch_number = 0
        best_eval_metric = None
        early_stopping_counter = 0

        if args["evaluate_during_training"]:
            training_progress_scores = self._create_training_progress_scores(
                **kwargs)

        if args["wandb_project"]:
            wandb.init(project=args["wandb_project"],
                       config={**args},
                       **args["wandb_kwargs"])
            wandb.watch(self.model)

        model.train()
        for _ in train_iterator:
            # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Current iteration",
                         disable=args["silent"])):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch

                (lm_loss), (mc_loss), *_ = model(
                    input_ids,
                    token_type_ids=token_type_ids,
                    mc_token_ids=mc_token_ids,
                    mc_labels=mc_labels,
                    lm_labels=lm_labels,
                )
                # model outputs are always tuple in pytorch-transformers (see doc)
                loss = lm_loss * args["lm_coef"] + mc_loss * args["mc_coef"]

                if args["n_gpu"] > 1:
                    loss = loss.mean()  # average on multi-GPU parallel training

                current_loss = loss.item()

                if show_running_loss:
                    print("\rRunning loss: %f" % current_loss, end="")

                if args["gradient_accumulation_steps"] > 1:
                    loss = loss / args["gradient_accumulation_steps"]

                if args["fp16"]:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    # torch.nn.utils.clip_grad_norm_(
                    #     amp.master_params(optimizer), args["max_grad_norm"]
                    # )
                else:
                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(
                    #     model.parameters(), args["max_grad_norm"]
                    # )

                tr_loss += loss.item()
                if (step + 1) % args["gradient_accumulation_steps"] == 0:
                    if args["fp16"]:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer),
                            args["max_grad_norm"])
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args["max_grad_norm"])

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args["logging_steps"] > 0 and global_step % args[
                            "logging_steps"] == 0:
                        # Log metrics
                        tb_writer.add_scalar("lr",
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                             args["logging_steps"],
                                             global_step)
                        logging_loss = tr_loss
                        if args["wandb_project"]:
                            wandb.log({
                                "Training loss": current_loss,
                                "lr": scheduler.get_lr()[0],
                                "global_step": global_step,
                            })

                    if args["save_steps"] > 0 and global_step % args[
                            "save_steps"] == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        self._save_model(output_dir_current, model=model)

                    if args["evaluate_during_training"] and (
                            args["evaluate_during_training_steps"] > 0
                            and global_step %
                            args["evaluate_during_training_steps"] == 0):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, _, _ = self.eval_model(
                            eval_dataloader,
                            verbose=verbose
                            and args["evaluate_during_training_verbose"],
                            silent=True,
                            **kwargs,
                        )
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)

                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        if args["save_eval_checkpoints"]:
                            self._save_model(output_dir_current,
                                             model=model,
                                             results=results)

                        training_progress_scores["global_step"].append(
                            global_step)
                        training_progress_scores["train_loss"].append(
                            current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            os.path.join(args["output_dir"],
                                         "training_progress_scores.csv"),
                            index=False,
                        )

                        if args["wandb_project"]:
                            wandb.log(
                                self._get_last_metrics(
                                    training_progress_scores))

                        if not best_eval_metric:
                            best_eval_metric = results[
                                args["early_stopping_metric"]]
                            self._save_model(args["best_model_dir"],
                                             model=model,
                                             results=results)
                        if best_eval_metric and args[
                                "early_stopping_metric_minimize"]:
                            if (results[args["early_stopping_metric"]] -
                                    best_eval_metric <
                                    args["early_stopping_delta"]):
                                best_eval_metric = results[
                                    args["early_stopping_metric"]]
                                self._save_model(args["best_model_dir"],
                                                 model=model,
                                                 results=results)
                                early_stopping_counter = 0
                            else:
                                if args["use_early_stopping"]:
                                    if early_stopping_counter < args[
                                            "early_stopping_patience"]:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args['early_stopping_metric']}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args['early_stopping_patience']}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args['early_stopping_patience']} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return global_step, tr_loss / global_step
                        else:
                            if (results[args["early_stopping_metric"]] -
                                    best_eval_metric >
                                    args["early_stopping_delta"]):
                                best_eval_metric = results[
                                    args["early_stopping_metric"]]
                                self._save_model(args["best_model_dir"],
                                                 model=model,
                                                 results=results)
                                early_stopping_counter = 0
                            else:
                                if args["use_early_stopping"]:
                                    if early_stopping_counter < args[
                                            "early_stopping_patience"]:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args['early_stopping_metric']}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args['early_stopping_patience']}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args['early_stopping_patience']} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return global_step, tr_loss / global_step

            epoch_number += 1
            output_dir_current = os.path.join(
                output_dir,
                "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

            if args["save_model_every_epoch"] or args[
                    "evaluate_during_training"]:
                os.makedirs(output_dir_current, exist_ok=True)

            if args["save_model_every_epoch"]:
                self._save_model(output_dir_current, model=model)

            if args["evaluate_during_training"]:
                results, _, _ = self.eval_model(
                    eval_dataloader,
                    verbose=verbose
                    and args["evaluate_during_training_verbose"],
                    silent=True,
                    **kwargs,
                )

                self._save_model(output_dir_current, results=results)

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(os.path.join(args["output_dir"],
                                           "training_progress_scores.csv"),
                              index=False)

                if args["wandb_project"]:
                    wandb.log(self._get_last_metrics(training_progress_scores))

                if not best_eval_metric:
                    best_eval_metric = results[args["early_stopping_metric"]]
                    self._save_model(args["best_model_dir"],
                                     model=model,
                                     results=results)
                if best_eval_metric and args["early_stopping_metric_minimize"]:
                    if results[args[
                            "early_stopping_metric"]] - best_eval_metric < args[
                                "early_stopping_delta"]:
                        best_eval_metric = results[
                            args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"],
                                         model=model,
                                         results=results)
                        early_stopping_counter = 0
                else:
                    if results[args[
                            "early_stopping_metric"]] - best_eval_metric > args[
                                "early_stopping_delta"]:
                        best_eval_metric = results[
                            args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"],
                                         model=model,
                                         results=results)
                        early_stopping_counter = 0

        return global_step, tr_loss / global_step
Example #15
        )

    if args.mirror_augment:
        transform = transforms.Compose(
            [
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True),
            ]
        )
    else:
        transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True),
            ]
        )

    dataset = MultiResolutionDataset(args.path, transform, args.size, args.use_label, metadata, categories)
    loader = data.DataLoader(
        dataset,
        batch_size=args.batch,
        sampler=data_sampler(dataset, shuffle=True, distributed=args.distributed),
        drop_last=True,
    )

    if get_rank() == 0 and wandb is not None and args.wandb:
        wandb.init(project='stylegan 2')

    train(args, loader, generator, discriminator, g_optim, d_optim, g_ema, device)
Example #16
    def __init__(self, gpu, world_size, dataset, batch_size, lr,
                 mom, lambd, model, max_epoch, client_epoch, seed, exp_id,
                 early_stop_round, early_stop_metric):
        super().__init__()

        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.backends.cudnn.deterministic = True

        self.iid = False

        logger.add(f"logs/asyncfed/{world_size}_{dataset}_{batch_size}_{lr}"
                   f"_{mom}_{lambd}_{model}_{max_epoch}_{client_epoch}_PS{exp_id}.log")
        if wandb_enable:
            wandb.init(project="Async_FedAvg",
                       name=f"async_{world_size}_{batch_size}_{max_epoch}_{client_epoch}_{lr}_{lambd}"
                            f"_{mom}_{dataset}_{model}_{'iid' if self.iid else 'noniid'}",
                       config={
                "method": "async",
                "world size": world_size,
                "dataset": dataset,
                "iid": self.iid,
                "model": model,
                "batch size": batch_size,
                "learning rate": lr,
                "momentum": mom,
                "lambda": lambd,
                "global epoch": max_epoch,
                "client epoch": client_epoch,
                "seed": seed,
                "mom_metho": "normal",
            })

        self.max_epoch = max_epoch * client_epoch
        self.client_epoch = client_epoch
        self.world_size = world_size
        self.mom = mom
        self.device = f"cuda:{gpu}" if torch.cuda.is_available() else "cpu"
        if dataset == "cifar100":
            class_num = 100
        elif dataset == "emnist":
            class_num = 62
        else:
            class_num = 10
        self.model_name = model
        self.model = load_model(model, class_num=class_num).to(self.device)
        self.lr = lr
        self.lambd = lambd
        self.aggregation = [DataAggregation(r) for r in range(1, world_size)]
        self.embedding_list = []

        self.dyn_task = np.array([0. for _ in range(self.world_size - 1)])
        self.dyn_timer = np.array([0. for _ in range(self.world_size - 1)])

        self.client_counter = 0
        self.wtminus1 = {}
        self.mom_buffer = {}
        self.gminus1 = {}
        self.broadcast_fut_all = None
        self.cluster_is_ready = True

        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr, momentum=0.)
        _, self.test_loader = partition_dataset(dataset, world_size - 1, 0, batch_size, seed, iid=self.iid)

        self.acc_list = []
        self.early_stop_round = early_stop_round
        self.early_stop_metric = early_stop_metric
Example #17
def train(
    run_name: str,
    # Data
    train_filepath: str,
    eval_filepath: str,
    type_vocab_filepath: str,
    spm_filepath: str,
    num_workers=1,
    max_seq_len=1024,
    max_eval_seq_len=1024,
    # Model
    resume_path: str = "",
    pretrain_resume_path: str = "",
    pretrain_resume_encoder_name: str = "encoder_q",  # encoder_q, encoder_k, encoder
    pretrain_resume_project: bool = False,
    no_output_attention: bool = False,
    encoder_type: str = "transformer",
    n_encoder_layers: int = 6,
    d_model: int = 512,
    # Optimization
    num_epochs: int = 100,
    save_every: int = 2,
    batch_size: int = 256,
    lr: float = 8e-4,
    adam_beta1: float = 0.9,
    adam_beta2: float = 0.98,
    adam_eps: float = 1e-6,
    weight_decay: float = 0,
    warmup_steps: int = 5000,
    num_steps: int = 200000,
    # Loss
    subword_regularization_alpha: float = 0,
    ignore_any_loss: bool = False,
    # Computational
    use_cuda: bool = True,
    seed: int = 1,
):
    """Train model"""
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    run_dir = RUN_DIR / run_name
    run_dir.mkdir(exist_ok=True, parents=True)
    logger.add(str((run_dir / "train.log").resolve()))
    logger.info(f"Saving logs, model checkpoints to {run_dir}")
    config = locals()
    logger.info(f"Config: {config}")
    wandb.init(name=run_name, config=config, job_type="training", project="type_prediction", entity="ml4code")

    if use_cuda:
        assert torch.cuda.is_available(), "CUDA not available. Check env configuration, or pass --use_cuda False"

    sp = spm.SentencePieceProcessor()
    sp.Load(spm_filepath)
    pad_id = sp.PieceToId("[PAD]")

    id_to_target, target_to_id = load_type_vocab(type_vocab_filepath)
    no_type_id = target_to_id["O"]
    assert no_type_id == 0  # Just a sanity check since O is the first line in the vocab file
    any_id = target_to_id["$any$"]

    collate_fn = get_collate_fn(pad_id, no_type_id)

    # Create training dataset and dataloader
    logger.info(f"Training data path {train_filepath}")
    train_dataset = DeepTyperDataset(
        train_filepath, type_vocab_filepath, spm_filepath, max_length=max_seq_len, subword_regularization_alpha=subword_regularization_alpha
    )
    logger.info(f"Training dataset size: {len(train_dataset)}")
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True, collate_fn=collate_fn
    )

    # Create eval dataset and dataloader
    logger.info(f"Eval data path {eval_filepath}")
    eval_dataset = DeepTyperDataset(
        eval_filepath,
        type_vocab_filepath,
        spm_filepath,
        max_length=max_eval_seq_len,
        subword_regularization_alpha=subword_regularization_alpha,
        split_source_targets_by_tab=eval_filepath.endswith(".json")
    )
    logger.info(f"Eval dataset size: {len(eval_dataset)}")
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate_fn
    )

    # Create model
    model = TypeTransformer(n_tokens=sp.GetPieceSize(), n_output_tokens=len(id_to_target), pad_id=pad_id,
        encoder_type=encoder_type, n_encoder_layers=n_encoder_layers, d_model=d_model)
    logger.info(f"Created TypeTransformer {encoder_type} with {count_parameters(model)} params")

    # Load pretrained checkpoint
    if pretrain_resume_path:
        assert not resume_path
        logger.info(f"Resuming training from pretraining checkpoint {pretrain_resume_path}, pretrain_resume_encoder_name={pretrain_resume_encoder_name}")
        checkpoint = torch.load(pretrain_resume_path)
        pretrained_state_dict = checkpoint["model_state_dict"]
        encoder_state_dict = {}
        output_state_dict = {}
        assert pretrain_resume_encoder_name in ["encoder_k", "encoder_q", "encoder"]

        for key, value in pretrained_state_dict.items():
            if key.startswith(pretrain_resume_encoder_name + ".") and "project_layer" not in key:
                remapped_key = key[len(pretrain_resume_encoder_name + ".") :]
                logger.debug(f"Remapping checkpoint key {key} to {remapped_key}. Value mean: {value.mean().item()}")
                encoder_state_dict[remapped_key] = value
            if key.startswith(pretrain_resume_encoder_name + ".") and "project_layer.0." in key and pretrain_resume_project:
                remapped_key = key[len(pretrain_resume_encoder_name + ".project_layer.") :]
                logger.debug(f"Remapping checkpoint project key {key} to output key {remapped_key}. Value mean: {value.mean().item()}")
                output_state_dict[remapped_key] = value
        model.encoder.load_state_dict(encoder_state_dict)
        # TODO: check for head key rather than output for MLM
        model.output.load_state_dict(output_state_dict, strict=False)
        logger.info(f"Loaded state dict from {pretrain_resume_path}")

    # Set up optimizer
    model = nn.DataParallel(model)
    model = model.cuda() if use_cuda else model
    wandb.watch(model, log="all")
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(adam_beta1, adam_beta2), eps=adam_eps, weight_decay=weight_decay)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, num_steps)
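    # assumed linear schedule: lr ramps from 0 to the base lr over warmup_steps, then decays linearly toward 0 by num_steps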
    epoch = 0
    global_step = 0
    min_eval_metric = float("inf")

    if resume_path:
        assert not pretrain_resume_path
        logger.info(f"Resuming training from checkpoint {resume_path}")
        checkpoint = torch.load(resume_path)
        model.module.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        epoch = checkpoint["epoch"]
        global_step = checkpoint["global_step"]
        min_eval_metric = checkpoint["min_eval_metric"]

    # Evaluate initial metrics
    logger.info(f"Evaluating model after epoch {epoch} ({global_step} steps)...")
    eval_metric, eval_metrics = _evaluate(model, eval_loader, sp, target_to_id=target_to_id, use_cuda=use_cuda, no_output_attention=no_output_attention)
    for metric, value in eval_metrics.items():
        logger.info(f"Evaluation {metric} after epoch {epoch} ({global_step} steps): {value:.4f}")
    eval_metrics["epoch"] = epoch
    wandb.log(eval_metrics, step=global_step)

    for epoch in tqdm.trange(epoch + 1, num_epochs + 1, desc="training", unit="epoch", leave=False):
        logger.info(f"Starting epoch {epoch}\n")
        model.train()
        pbar = tqdm.tqdm(train_loader, desc=f"epoch {epoch}")
        for X, lengths, output_attn, labels in pbar:
            if use_cuda:
                X, lengths, output_attn, labels = X.cuda(), lengths.cuda(), output_attn.cuda(), labels.cuda()
            optimizer.zero_grad()
            if no_output_attention:
                logits = model(X, lengths, None)  # BxLxVocab
            else:
                logits = model(X, lengths, output_attn)  # BxLxVocab
            if ignore_any_loss:
                # Don't train with $any$ type
                labels_ignore_any = labels.clone()
                labels_ignore_any[labels_ignore_any == any_id] = no_type_id
                loss = F.cross_entropy(logits.transpose(1, 2), labels_ignore_any, ignore_index=no_type_id)
            else:
                loss = F.cross_entropy(logits.transpose(1, 2), labels, ignore_index=no_type_id)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Compute accuracy in training batch
            (corr1_any, corr5_any), num_labels_any = accuracy(logits, labels, topk=(1, 5), ignore_idx=(no_type_id,))
            acc1_any, acc5_any = corr1_any / num_labels_any * 100, corr5_any / num_labels_any * 100
            (corr1, corr5), num_labels = accuracy(logits, labels, topk=(1, 5), ignore_idx=(no_type_id, any_id))
            acc1, acc5 = corr1 / num_labels * 100, corr5 / num_labels * 100

            # Log loss
            global_step += 1
            wandb.log(
                {
                    "epoch": epoch,
                    "train/loss": loss.item(),
                    "train/acc@1": acc1,
                    "train/acc@5": acc5,
                    "train/acc@1_any": acc1_any,
                    "train/acc@5_any": acc5_any,
                    "lr": scheduler.get_last_lr()[0],
                },
                step=global_step,
            )
            pbar.set_description(f"epoch {epoch} loss {loss.item():.4f}")

        # Evaluate
        logger.info(f"Evaluating model after epoch {epoch} ({global_step} steps)...")
        eval_metric, eval_metrics = _evaluate(
            model, eval_loader, sp, target_to_id=target_to_id, use_cuda=use_cuda, no_output_attention=no_output_attention)
        for metric, value in eval_metrics.items():
            logger.info(f"Evaluation {metric} after epoch {epoch} ({global_step} steps): {value:.4f}")
        eval_metrics["epoch"] = epoch
        wandb.log(eval_metrics, step=global_step)

        # Save checkpoint
        if (save_every and epoch % save_every == 0) or eval_metric < min_eval_metric:
            checkpoint = {
                "model_state_dict": model.module.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "epoch": epoch,
                "global_step": global_step,
                "config": config,
                "eval_metric": eval_metric,
                "min_eval_metric": min_eval_metric
            }
            if eval_metric < min_eval_metric:
                logger.info(f"New best evaluation metric: prev {min_eval_metric:.4f} > new {eval_metric:.4f}")
                min_eval_metric = eval_metric
                model_file = run_dir / "ckpt_best.pth"
            else:
                model_file = run_dir / f"ckpt_ep{epoch:04d}.pth"
            logger.info(f"Saving checkpoint to {model_file}...")
            torch.save(checkpoint, str(model_file.resolve()))
            logger.info("Done.")
Example #18
    parser.add_argument('--policy-frequency', type=int, default=2,
                        help="the frequency of training policy (delayed)")
    parser.add_argument('--noise-clip', type=float, default=0.5,
                         help='noise clip parameter of the Target Policy Smoothing Regularization')
    args = parser.parse_args()
    if not args.seed:
        args.seed = int(time.time())

# TRY NOT TO MODIFY: setup the environment
experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
writer = SummaryWriter(f"runs/{experiment_name}")
writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % (
        '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))
if args.prod_mode:
    import wandb
    wandb.init(project=args.wandb_project_name, entity=args.wandb_entity, sync_tensorboard=True, config=vars(args), name=experiment_name, monitor_gym=True, save_code=True)
    writer = SummaryWriter(f"/tmp/{experiment_name}")


# TRY NOT TO MODIFY: seeding
device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
env = gym.make(args.gym_id)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
env.seed(args.seed)
env.action_space.seed(args.seed)
env.observation_space.seed(args.seed)
# respect the default timelimit
assert isinstance(env.action_space, Box), "only continuous action space is supported"
Example #19
if n_gpu == 1:
    data_path = "./data"
elif n_gpu == 4:
    data_path = "./data"

save_im_path = "./g_z/" + run_name
if n_gpu == 1:
    save_checkpoints_path = "./checkpoints/" + run_name
elif n_gpu == 4:
    save_checkpoints_path = "/hpf/largeprojects/agoldenb/lechang/" + run_name

# load_checkpoint = "/hpf/largeprojects/agoldenb/lechang/trained-1600.pth"
load_checkpoint = "no"  # restart

wandb.init(project="mri_gan_cancer", name=run_name)

parser = argparse.ArgumentParser()
parser.add_argument('--batch-size',
                    type=str,
                    default=str(batch_size),
                    metavar='N',
                    help='')
parser.add_argument('--lr',
                    type=str,
                    default=str(learning_rate),
                    metavar='N',
                    help='')
parser.add_argument('--data_path',
                    type=str,
                    default=data_path,
Example #20
def train(hyp, opt, device, tb_writer=None, wandb=None):
    logger.info(f'Hyperparameters {hyp}')
    save_dir, epochs, batch_size, total_batch_size, weights, rank = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # Directories
    wdir = save_dir / 'weights'
    wdir.mkdir(parents=True, exist_ok=True)  # make dir
    last = wdir / 'last.pt'
    best = wdir / 'best.pt'
    results_file = save_dir / 'results.txt'

    # Save run settings
    with open(save_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(save_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    plots = not opt.evolve  # create plots
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc = 1 if opt.single_cls else int(data_dict['nc'])  # number of classes
    names = ['item'] if opt.single_cls and len(
        data_dict['names']) != 1 else data_dict['names']  # class names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (
        len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        # with torch_distributed_zero_first(rank):
        #     attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        if hyp.get('anchors'):
            ckpt['model'].yaml['anchors'] = round(
                hyp['anchors'])  # force autoanchor
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3,
                      nc=nc).to(device)  # create
        exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [
        ]  # exclude keys
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict,
                                     model.state_dict(),
                                     exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        logger.info(
            'Transferred %g/%g items from %s' %
            (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = []  # parameter names to freeze (full or partial)
    for k, v in model.named_parameters():
        v.requires_grad = True  # train all layers
        if any(x in k for x in freeze):
            print('freezing %s' % k)
            v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay
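    # e.g. total_batch_size=16 -> accumulate=4, so weight_decay is scaled by 16 * 4 / 64 = 1.0 (unchanged)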
    logger.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
            pg2.append(v.bias)  # biases
        if isinstance(v, nn.BatchNorm2d):
            pg0.append(v.weight)  # no decay
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
            pg1.append(v.weight)  # apply decay

    if opt.adam:
        optimizer = optim.Adam(pg0,
                               lr=hyp['lr0'],
                               betas=(hyp['momentum'],
                                      0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)

    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' %
                (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
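    # one_cycle(y1, y2, steps) is assumed to return a cosine ramp from y1 to y2,
    # roughly lambda x: ((1 - cos(x * pi / steps)) / 2) * (y2 - y1) + y1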
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Logging
    if rank in [-1, 0] and wandb and wandb.run is None:
        opt.hyp = hyp  # add hyperparameters
        wandb_run = wandb.init(
            config=opt,
            resume="allow",
            project='YOLOv5'
            if opt.project == 'runs/train' else Path(opt.project).stem,
            name=save_dir.stem,
            id=ckpt.get('wandb_id') if 'ckpt' in locals() else None)
    loggers = {'wandb': wandb}  # loggers dict

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (
                weights, epochs)
        if epochs < start_epoch:
            logger.info(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(model.stride.max())  # grid size (max stride)
    nl = model.model[
        -1].nl  # number of detection layers (used for scaling hyp['obj'])
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size
                         ]  # verify imgsz are gs-multiples

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # EMA
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model,
                    device_ids=[opt.local_rank],
                    output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path,
                                            imgsz,
                                            batch_size,
                                            gs,
                                            opt,
                                            hyp=hyp,
                                            augment=True,
                                            cache=opt.cache_images,
                                            rect=opt.rect,
                                            rank=rank,
                                            world_size=opt.world_size,
                                            workers=opt.workers,
                                            image_weights=opt.image_weights,
                                            quad=opt.quad)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (
        mlc, nc, opt.data, nc - 1)

    # Process 0
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(
            test_path,
            imgsz_test,
            total_batch_size,
            gs,
            opt,  # testloader
            hyp=hyp,
            cache=opt.cache_images and not opt.notest,
            rect=True,
            rank=-1,
            world_size=opt.world_size,
            workers=opt.workers,
            pad=0.5)[0]

        if not opt.resume:
            labels = np.concatenate(dataset.labels, 0)
            c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:
                plot_labels(labels, save_dir, loggers)
                if tb_writer:
                    tb_writer.add_histogram('classes', c, 0)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset,
                              model=model,
                              thr=hyp['anchor_t'],
                              imgsz=imgsz)

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale hyp['cls'] to class count
    hyp['obj'] *= imgsz**2 / 640.**2 * 3. / nl  # scale hyp['obj'] to image size and output layers
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
    model.class_weights = labels_to_class_weights(
        dataset.labels, nc).to(device) * nc  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb),
             1000)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0
               )  # P, R, mAP@0.5, mAP@0.5:0.95, val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    logger.info('Image sizes %g train, %g test\n'
                'Using %g dataloader workers\nLogging results to %s\n'
                'Starting training for %g epochs...' %
                (imgsz, imgsz_test, dataloader.num_workers, save_dir, epochs))
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (
                    1 - maps)**2 / nc  # class weights
                iw = labels_to_image_weights(dataset.labels,
                                             nc=nc,
                                             class_weights=cw)  # image weights
                dataset.indices = random.choices(
                    range(dataset.n), weights=iw,
                    k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = (torch.tensor(dataset.indices)
                           if rank == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(
            ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls',
                                   'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float(
            ) / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
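                # ramp gradient accumulation from 1 up to the nominal-batch equivalent (nbs / total_batch_size) over the warmup window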
                accumulate = max(
                    1,
                    np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [
                        hyp['warmup_bias_lr'] if j == 2 else 0.0,
                        x['initial_lr'] * lf(epoch)
                    ])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(
                            ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(int(imgsz * 0.5),
                                      int(imgsz * 1.5 + gs)) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                # loss, loss_items = compute_loss(pred, targets.to(device), model)  # loss scaled by batch_size
                loss, loss_items = compute_loss_eiou(
                    pred, targets.to(device),
                    model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode
                if opt.quad:
                    loss *= 4.

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1
                                                    )  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9
                                 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 +
                     '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem,
                                      *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if plots and ni < 3:
                    f = save_dir / f'train_batch{ni}.jpg'  # filename
                    Thread(target=plot_images,
                           args=(imgs, targets, paths, f),
                           daemon=True).start()
                    # if tb_writer:
                    #     tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                    #     tb_writer.add_graph(model, imgs)  # add model to tensorboard
                elif plots and ni == 3 and wandb:
                    wandb.log({
                        "Mosaics": [
                            wandb.Image(str(x), caption=x.name)
                            for x in save_dir.glob('train*.jpg')
                        ]
                    })

            # end batch ------------------------------------------------------------------------------------------------
        # end epoch ----------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(model,
                                include=[
                                    'yaml', 'nc', 'hyp', 'gr', 'names',
                                    'stride', 'class_weights'
                                ])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(
                    opt.data,
                    batch_size=total_batch_size,
                    imgsz=imgsz_test,
                    model=ema.ema,
                    single_cls=opt.single_cls,
                    dataloader=testloader,
                    save_dir=save_dir,
                    plots=plots and final_epoch,
                    log_imgs=opt.log_imgs if wandb else 0)

            # Write
            with open(results_file, 'a') as f:
                f.write(
                    s + '%10.4g' * 7 % results +
                    '\n')  # P, R, mAP@0.5, mAP@0.5:0.95, val_loss(box, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' %
                          (results_file, opt.bucket, opt.name))

            # Log
            tags = [
                'train/box_loss',
                'train/obj_loss',
                'train/cls_loss',  # train loss
                'metrics/precision',
                'metrics/recall',
                'metrics/mAP_0.5',
                'metrics/mAP_0.5:0.95',
                'val/box_loss',
                'val/obj_loss',
                'val/cls_loss',  # val loss
                'x/lr0',
                'x/lr1',
                'x/lr2'
            ]  # params
            for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
                if tb_writer:
                    tb_writer.add_scalar(tag, x, epoch)  # tensorboard
                if wandb:
                    wandb.log({tag: x})  # W&B

            # Update best mAP
            fi = fitness(np.array(results).reshape(
                1, -1))  # weighted combination of [P, R, mAP@0.5, mAP@0.5:0.95]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {
                        'epoch':
                        epoch,
                        'best_fitness':
                        best_fitness,
                        'training_results':
                        f.read(),
                        'model':
                        ema.ema,
                        'optimizer':
                        None if final_epoch else optimizer.state_dict(),
                        'wandb_id':
                        wandb_run.id if wandb else None
                    }

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        final = best if best.exists() else last  # final model
        for f in [last, best]:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
        if opt.bucket:
            os.system(f'gsutil cp {final} gs://{opt.bucket}/weights')  # upload

        # Plots
        if plots:
            plot_results(save_dir=save_dir)  # save as results.png
            if wandb:
                files = [
                    'results.png', 'precision_recall_curve.png',
                    'confusion_matrix.png'
                ]
                wandb.log({
                    "Results": [
                        wandb.Image(str(save_dir / f), caption=f)
                        for f in files if (save_dir / f).exists()
                    ]
                })
                if opt.log_artifacts:
                    wandb.log_artifact(artifact_or_path=str(final),
                                       type='model',
                                       name=save_dir.stem)

        # Test best.pt
        logger.info('%g epochs completed in %.3f hours.\n' %
                    (epoch - start_epoch + 1, (time.time() - t0) / 3600))
        if opt.data.endswith('coco.yaml') and nc == 80:  # if COCO
            for conf, iou, save_json in ([0.25, 0.45,
                                          False], [0.001, 0.65,
                                                   True]):  # speed, mAP tests
                results, _, _ = test.test(opt.data,
                                          batch_size=total_batch_size,
                                          imgsz=imgsz_test,
                                          conf_thres=conf,
                                          iou_thres=iou,
                                          model=attempt_load(final,
                                                             device).half(),
                                          single_cls=opt.single_cls,
                                          dataloader=testloader,
                                          save_dir=save_dir,
                                          save_json=save_json,
                                          plots=False)

    else:
        dist.destroy_process_group()

    wandb.run.finish() if wandb and wandb.run else None
    torch.cuda.empty_cache()
    return results
Example #21
print('cropSize: ' + str(params['cropSize']))
params['imgSize'] = params['cropSize']

# Use CUDA
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
use_cuda = torch.cuda.is_available()
# args.gpu_id = os.getenv('CUDA_VISIBLE_DEVICES')
print(args.gpu_id)

import visdom
vis = None
if args.visdom:
    vis = visdom.Visdom(server=args.server, port=8095, env='main_davis_viz1')
    vis.close()
    import wandb
    wandb.init(project='palindromes')
    vis.close()

# Random seed
if args.manualSeed is None:
    args.manualSeed = random.randint(1, 10000)
random.seed(args.manualSeed)
torch.manual_seed(args.manualSeed)
if use_cuda:
    torch.cuda.manual_seed_all(args.manualSeed)


class Wrap(nn.Module):
    def __init__(self, model):
        super(Wrap, self).__init__()
        self.model = model
Example #22
        name = run_name
    artifact = wandb.Artifact(f'{name}-model', 'model')
    for f in os.listdir(path):
        if f.startswith('wandb-'):
            continue  # noqa: 701
        if f == 'output.log':
            continue  # noqa: 701
        if f == 'requirements.txt':
            continue  # noqa: 701
        if f.startswith('events.'):
            continue  # noqa: 701
        if os.path.isdir(os.path.join(path, f)):
            continue  # noqa: 701
        artifact.add_file(os.path.join(path, f), f)
    wandb.run.log_artifact(artifact, aliases=['latest', run_name])


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('run')
    parser.add_argument('--name', default=None, help='artifact name')
    args = parser.parse_args()
    run = args.run
    root = pull_model(run)
    setup_logging()
    logger = logging.getLogger()
    logger.info('publishing artifact')
    wandb.init(resume=run)
    publish_model(root, args.name)
    logger.info('model published')
Example #23
def test_resume_auto_success(live_mock_server, test_settings):
    run = wandb.init(reinit=True, resume=True, settings=test_settings)
    run.join()
    assert not os.path.exists(test_settings.resume_fname)
Example #24
import tensorflow as tf  # pylint: disable=no-name-in-module
from tensorflow.keras.layers import Conv2D, Dense, Dropout, Flatten, Lambda, MaxPooling2D, Reshape, Input, CuDNNGRU, TimeDistributed
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow.keras.backend as K
from datasets import LinesDataset, Generator
from util import ctc_decode, format_batch_ctc, slide_window, ExampleLogger
import wandb

wandb.init()
wandb.config.model = "cnn"
wandb.config.window_width = 14
wandb.config.window_stride = 7

# Load our dataset
dataset = LinesDataset(subsample_fraction=1)
dataset.load_or_generate_data()
image_height, image_width = dataset.input_shape
output_length, num_classes = dataset.output_shape

model = Sequential()
model.add(
    Reshape((image_height, image_width, 1), input_shape=dataset.input_shape))
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D())
model.add(Dropout(0.3))

# We are going to use a Conv2D to slide over these outputs with window_width and window_stride,
# and output softmax activations of shape (output_length, num_classes).
# In your calculation of the necessary filter size,
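# A hedged sketch of one way this sliding-window head could be finished (the
# filter count, kernel sizes, and the squeeze step below are illustrative
# assumptions, not the original solution):
new_height = image_height // 2 - 2  # height after two 3x3 convs and 2x2 max-pooling
new_window_width = wandb.config.window_width // 2 - 2
new_window_stride = wandb.config.window_stride // 2
model.add(Conv2D(128, (new_height, new_window_width),
                 strides=(1, new_window_stride), activation='relu'))
model.add(Dropout(0.3))
model.add(Conv2D(num_classes, (1, 1), activation='softmax'))
# drop the singleton height axis: (batch, 1, sequence_length, num_classes) -> (batch, sequence_length, num_classes)
model.add(Lambda(lambda x: tf.squeeze(x, axis=1)))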
Example #25
def test_resume_must_failure(live_mock_server, test_settings):
    with pytest.raises(wandb.Error) as e:
        wandb.init(reinit=True, resume="must", settings=test_settings)
    assert "resume='must' but run" in e.value.message
Example #26
import gym.wrappers

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import Callback
import random
import wandb

ENV_NAME = 'LunarLander-v2'

wandb.init(project="KerasDQN", name="Performance")

env = gym.make(ENV_NAME)
# To get repeatable results.
sd = 16
np.random.seed(sd)
random.seed(sd)
env.seed(sd)
nb_actions = env.action_space.n

env = gym.wrappers.Monitor(env, './monitor', force=True)

model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(40))
model.add(Activation('relu'))
Example #27
            os.makedirs('./results/' + dset + '/global/3days')
        if not os.path.exists('./models/' + dset + '_models'):
            os.makedirs('./models/' + dset + '_models')
    if HORIZON == 24:
        proj_name = 'dayahead'
        if not os.path.exists('./results/' + dset + '/global/dayahead'):
            os.makedirs('./results/' + dset + '/global/dayahead')
        if not os.path.exists('./models/' + dset + '_models'):
            os.makedirs('./models/' + dset + '_models')
    # 1️⃣ Start a new run, tracking config metadata
    run = wandb.init(project=proj_name,
                     config={
                         'layers': LAYERS,
                         'dropout': DROPOUT,
                         'neurons': NEURONS,
                         'learning rate': LR,
                         'batch_size': BATCHSIZE,
                         "architecture": "global",
                         "dataset": dset,
                         "epochs": MAX_EPOCHS,
                         'patience': PATIENCE
                     })
    config = wandb.config
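    # wandb.config mirrors the dict passed above and also picks up any values injected by a wandb sweep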

    # full data LSTM MIMO compilation and fit
    LSTMIMO = build_model(l=LAYERS, drop=DROPOUT, n=NEURONS, lr=LR)

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=PATIENCE,
                                                      mode='min')

    history = LSTMIMO.fit(global_inputs_X,
Example #28
    def __init__(
            self,
            up_model: nn.Module,
            down_layer: nn.Module = None,
            train_dataset=None,
            dev_dataset=None,
            dev_evaluator=None,
            epochs: int = 1,
            visiable_device: str = "0",
            scheduler: str = 'warmuplinear',
            warmup_ratio: float = 0.1,
            optimizer_class: Type[Optimizer] = transformers.AdamW,
            optimizer_params: Dict[str, object] = {
                'lr': 5e-5,
                'eps': 1e-6,
                'correct_bias': False
            },
            weight_decay: float = 0.01,
            early_stop: int = 20,
            # 20 evaluation steps without improving on the early_stop_on metric as specified in dev_evaluator
            evaluation_steps: int = 500,
            output_path: str = None,
            save_best_model: bool = True,
            max_grad_norm: float = 1,
            fp16: bool = False,
            accumulation_steps=1,
            fp16_opt_level: str = 'O1',
            seed: int = 122,
            data_loader_shuffle=True,
            device: str = None,
            dev_batch_size: int = -1,  # the same as train_batch_size
            n_gpu: int = None,
            report_model: bool = True,
            per_gpu_train_batch_size: int = 8,
            restore_training: bool = False,
            local_rank: int = -1,
            wandb_config=None):
        """
        This trainer is written for training a sequential model that contains an upstream layer (usually a transformer)
        and a downstream layer (usually a task-specific head such as an FF, RNN or CNN that encodes the output of the upstream layer).

        :param up_model: a transformers model such as transformers.GPT2LMHeadModel or transformers.BertModel
        :param down_layer: None if up_model already wraps an output head (such as the LM head in GPT2LMHeadModel), otherwise an nn.Module that encodes the output of up_model
        :param train_dataset: training dataset; either an instance of torch.data.Dataset or IterableDataset (defined in data.py)
        :param dev_dataset: development dataset; either an instance of torch.data.Dataset or IterableDataset
        :param dev_evaluator: evaluator on dev_dataset, used for early stopping and performance tracking during training (defined in evaluate.py)
        :param epochs: number of training epochs
        :param visiable_device: devices chosen to perform training on
        :param scheduler: scheduler from transformers; see the options in self._get_scheduler
        :param warmup_ratio: ratio of warm-up steps over the total number of training steps
        :param optimizer_class: transformers.AdamW by default
        :param optimizer_params: optimizer params
        :param weight_decay: weight decay
        :param early_stop: early-stop patience, measured in evaluation steps
        :param evaluation_steps: number of training steps between evaluations (also the logging interval)
        :param output_path: path to save the checkpoint with the best performance on the early_stop_on metric of the dev_evaluator instance
        :param save_best_model: save the best checkpoint rather than the latest one
        :param max_grad_norm: max gradient norm for clipping
        :param fp16: whether to use fp16 training
        :param accumulation_steps: gradient accumulation steps
        :param fp16_opt_level: fp16 opt level for apex amp
        :param seed: random seed for reproducibility
        :param data_loader_shuffle: whether to shuffle the training and dev data loaders after each epoch ends
        :param device: device for training; None or "cuda" for GPU training, "cpu" for CPU training
        :param dev_batch_size: development batch size, usually larger than the training batch size because no gradients are computed and hence less memory is needed
        :param n_gpu: number of GPUs for training
        :param report_model: whether to log the model's structure and its number of trainable parameters
        :param per_gpu_train_batch_size: training batch size per GPU
        :param restore_training: whether to resume training from a saved checkpoint after the training process was interrupted
        :param local_rank: local rank for distributed training
        :param wandb_config: enables wandb logging if not None; otherwise wandb logging is disabled
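
        Example (a minimal, hypothetical usage sketch; "Trainer" stands for this class, and
        MyDataset / MyEvaluator are placeholder names for user-defined dataset and evaluator
        classes, so adjust the names to your code base):

            import transformers

            up_model = transformers.GPT2LMHeadModel.from_pretrained("gpt2")
            trainer = Trainer(
                up_model=up_model,
                down_layer=None,  # GPT2LMHeadModel already wraps its own LM head
                train_dataset=MyDataset("train"),
                dev_dataset=MyDataset("dev"),
                dev_evaluator=MyEvaluator(early_stop_on="loss"),
                epochs=3,
                evaluation_steps=500,
                output_path="gpt2_finetune",
                per_gpu_train_batch_size=8,
            )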
        """

        self.up_model = up_model
        if down_layer is None:
            # In this example, the upstream layer already integrates the downstream head
            # (namely, the simple LM head in transformers.GPT2LMHeadModel);
            # EmptyHeads is created here only as a placeholder
            down_layer = EmptyHeads()

        self.down_layer = down_layer
        assert output_path is not None
        output_path = os.path.join("tmp", output_path)
        # os.makedirs(output_path,exist_ok=True)
        if restore_training:
            if not os.listdir(output_path):
                raise ValueError(f"no checkpoint found in {output_path}")
            else:
                logger.info(
                    "   loading embedding weights from saved checkpoint")
                self.up_model = self.up_model.reload(
                    output_path
                )  # for other transformers (apart from bert), the load_saved function has not been added

                logger.info(
                    "   loading downstream weights from saved checkpoint")
                self.down_layer.load_saved(output_path)
                with open(output_path + "/ck_report.json") as f:
                    self.ck_report = json.load(f)

        self.model = torch.nn.Sequential(self.up_model, self.down_layer)

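        # Optional Weights & Biases tracking: wandb.watch logs the model topology and gradients
        # at most once every max(100, evaluation_steps) steps.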
        if is_wandb_available() and wandb_config is not None:
            # track model topology and gradients when wandb is available and a wandb_config is provided
            wandb.init(project=wandb_config.wandb_project_name,
                       config=wandb_config,
                       name=wandb_config.wandb_run_name)
            wandb.watch((self.up_model, self.down_layer),
                        log_freq=max(100, evaluation_steps))
        self.wandb_config = wandb_config

        self._restore_training = restore_training
        self.early_stop = early_stop

        self._dev_evaluator = dev_evaluator

        self._evaluation_steps = evaluation_steps
        self._save_best_model = save_best_model
        self._max_grad_norm = max_grad_norm

        os.makedirs(output_path, exist_ok=True)
        if os.listdir(output_path) and not restore_training:
            out = input(
                "Output directory ({}) already exists and is not empty. Do you want to remove it before starting? (y/n) "
                .format(output_path))
            if out == "y":
                shutil.rmtree(output_path)
                os.makedirs(output_path, exist_ok=True)
            else:
                raise ValueError(
                    "Output directory ({}) already exists and is not empty".
                    format(output_path))

        logFormatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        fileHandler = logging.FileHandler(os.path.join(output_path, "log.out"),
                                          mode="a")
        fileHandler.setFormatter(logFormatter)
        logger.addHandler(fileHandler)
        self._dev_evaluator.reset_logger(output_path)

        self.output_path = output_path

        if device is None or device == "cuda":
            if torch.cuda.is_available():
                device = torch.device("cuda")
                n_gpu = 1 if n_gpu == 1 else torch.cuda.device_count()
            else:
                logger.warning("no cuda is found in your machine, now use cpu")
                device = torch.device("cpu")
                n_gpu = 0
        elif device == "cpu":
            device = torch.device("cpu")
            n_gpu = 0
        else:
            raise ValueError("set device to be None, cuda or cpu")
        assert n_gpu <= torch.cuda.device_count()

        logger.info("Use pytorch device: {}, with gpu_number={}".format(
            device, n_gpu))

        self._train_batch_size = per_gpu_train_batch_size * max(1, n_gpu)
        self._dev_batch_size = dev_batch_size if dev_batch_size != -1 else self._train_batch_size

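        # IterableDataset instances (defined in data.py) are expected to yield ready-made batches,
        # so their DataLoader is built with batch_size=None; map-style datasets are batched
        # (and optionally shuffled) with the per-GPU-scaled batch size computed above.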
        if isinstance(train_dataset, data.IterableDataset):
            self._train_dataloader = DataLoader(train_dataset, batch_size=None)
            self._steps_per_epoch = len(self._train_dataloader.dataset)
        else:
            self._train_dataloader = DataLoader(
                train_dataset,
                shuffle=data_loader_shuffle,
                batch_size=self._train_batch_size)
            self._steps_per_epoch = len(self._train_dataloader)

        if isinstance(dev_dataset, data.IterableDataset):
            dev_dataloader = DataLoader(dev_dataset, batch_size=None)
        else:
            dev_dataloader = DataLoader(dev_dataset,
                                        shuffle=data_loader_shuffle,
                                        batch_size=self._dev_batch_size)

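        # With gradient accumulation, one optimizer step consumes accumulation_steps batches,
        # so the number of optimizer steps per epoch shrinks accordingly.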
        if accumulation_steps > 1:
            self._steps_per_epoch = self._steps_per_epoch // accumulation_steps

        self._dev_data = dev_dataset
        self._dev_evaluator.reset_dataloader(dev_dataloader)

        self.collate_fn = CollateFunction(self.up_model)
        # use a custom collate function tied to the upstream model for batching the training data
        self._train_dataloader.collate_fn = self.collate_fn

        self._train_data = train_dataset
        self._per_gpu_train_batch_size = per_gpu_train_batch_size

        set_seed(seed, n_gpu)

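        # Multi-GPU training: DataParallel replicates the model on the listed devices and splits
        # each batch across them; outputs are gathered on the first device in the list.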
        if n_gpu > 1:
            self.model = torch.nn.DataParallel(
                self.model,
                device_ids=[int(i) for i in visiable_device.split(',')])
            self.model = self.model.to(f'cuda:{self.model.device_ids[0]}')

        elif n_gpu == 1:
            self.model = self.model.to(device)

        self._device = device
        self._n_gpu = n_gpu

        self._total_train_steps = int(self._steps_per_epoch * epochs)
        self._epochs = epochs

        if report_model:
            count_params(self.model, print_details=True)

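        # Group the parameters so that biases and LayerNorm weights are excluded from weight decay,
        # following the usual recipe for fine-tuning transformer models.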
        param_optimizer = list(self.model.named_parameters())

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        if local_rank != -1:
            self._total_train_steps = self._total_train_steps // torch.distributed.get_world_size(
            )

        self._optimizer = optimizer_class(optimizer_grouped_parameters,
                                          **optimizer_params)

        warmup_steps = math.ceil(
            self._total_train_steps *
            warmup_ratio)  # by default 10% of the total training steps are used for warm-up
        logger.info(f"   Warmup-steps: {warmup_steps}")

        self._scheduler = self._get_scheduler(
            self._optimizer,
            scheduler=scheduler,
            warmup_steps=warmup_steps,
            num_total=self._total_train_steps)

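        # Mixed-precision training with NVIDIA apex: amp.initialize wraps both the model and the
        # optimizer so that parts of the forward/backward pass run in fp16, as controlled by
        # fp16_opt_level.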
        if fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )

            model, optimizer = amp.initialize(self.model,
                                              self._optimizer,
                                              opt_level=fp16_opt_level)
            self.model = model
            self._optimizer = optimizer

        self._fp16 = fp16
        tb_writer = None
        if local_rank in [-1, 0]:
            tb_writer = SummaryWriter()
        self._tb_writer = tb_writer
        self._local_rank = local_rank
        self._best_score = -float("inf")
        self._early_stop_count = 0
        self.last_time = datetime.now()
        self.accumulation_steps = accumulation_steps
Пример #29
        fps=29)
    optimize('/tmp/current_gif.gif')

    return visited_pos, visited_vel, acts, means, stds, vals


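# Build the list of layer sizes for the policy network:
# observation dimension -> hidden layer sizes -> action dimension
# (the number of discrete actions, or the dimensionality of a continuous action space).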
def net_layers(hidden):
    if env_type == 'DISCRETE':
        act_space = env.action_space.n
    else:
        act_space = env.action_space.shape[0]
    obs_space = env.observation_space.shape[0]
    return [obs_space] + hidden + [act_space]


wandb.init(entity="agkhalil", project="pytorch-ac-mountaincarcont")
wandb.watch_called = False

config = wandb.config
config.batch_size = 50
config.episodes = 10000
config.lr_ac = 0.005
config.lr_cr = 0.00005
config.seed = 42
config.gamma = 0.99
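# eps is the smallest representable float32 increment; it is typically added to denominators
# (for example when normalizing returns) to avoid division by zero.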
eps = np.finfo(np.float32).eps.item()

device = torch.device('cpu')
torch.manual_seed(config.seed)
lr_ac = config.lr_ac
lr_cr = config.lr_cr
Пример #30
    def __init__(self, **kwargs):
        """
        First run `wandb init` in the terminal.
        """
        super(WandbReporting, self).__init__(**kwargs)
        wandb.init()
Пример #31
    def _init(self):
        self._config = None
        wandb.init(**self.config.get("env_config", {}).get("wandb", {}))