Example #1
def data_sender(id, name, *args):
    context = zmq.Context()
    sender = context.socket(zmq.PUSH)
    sender.connect('ipc://@{}'.format(name))

    print('start data provider {}-{}'.format(name, id))
    while True:
        data_iter = dataset.train_dataset(id + 1)
        for msg in data_iter:
            # print(id)
            sender.send(dumps([id, msg]))
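
Example #1 pushes pickled [id, msg] pairs through a ZeroMQ PUSH socket. A minimal sketch of the matching consumer, assuming dumps comes from pickle and that the PULL side binds the same abstract ipc endpoint:

import pickle
import zmq

def data_receiver(name):
    context = zmq.Context()
    receiver = context.socket(zmq.PULL)
    # The PULL side binds; each data_sender() PUSH socket connects to it.
    receiver.bind('ipc://@{}'.format(name))
    while True:
        sender_id, msg = pickle.loads(receiver.recv())
        yield sender_id, msg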
Example #2
def train(params):
    total_nr_iters = config.train_base_iters
    batch_per_gpu = config.train_batch_per_gpu
    base_lr = config.base_lr
    line = 'network.base_lr.{}.train_iter.{}'.format(base_lr, total_nr_iters)
    print(line)
    # set model save path and log save path
    saveDir = config.model_dir
    misc_utils.ensure_dir(saveDir)
    fpath = os.path.join(config.output_dir, line+'.log')
    fid_log = open(fpath, 'a')
    # set data input pipe
    program_name = config.program_name
    # check gpus
    torch.set_default_tensor_type('torch.FloatTensor')
    if not torch.cuda.is_available():
        print('No GPU exists!')
        return
    else:
        num_gpus = torch.cuda.device_count()
        train_iter = total_nr_iters // (num_gpus * batch_per_gpu)
        train_lr_decay = np.array(config.lr_decay) // (num_gpus * batch_per_gpu)
        train_dump_interval = config.model_dump_interval // (num_gpus * batch_per_gpu)
    train_lr = base_lr * num_gpus
    bt_size = num_gpus * batch_per_gpu
    line = ('Num of GPUs:{}, learning rate:{:.5f}, batch size:{}, '
            'train_iter:{}, decay_iter:{}, dump_interval:{}'.format(
                num_gpus, train_lr, bt_size, train_iter, train_lr_decay,
                train_dump_interval))
    print(line)
    print("Building netowrk.")
    net = network.Network()
    # Moves all model parameters and buffers to the GPU.
    net.cuda()
    if params.resume_weights:
        model_file = os.path.join(saveDir, 'dump-{}.pth'.format(params.resume_weights))
        check_point = torch.load(model_file)
        net.load_state_dict(check_point['state_dict'])
    net = nn.DataParallel(net)
    # set the optimizer with momentum and weight decay
    optimizer = optim.SGD(net.parameters(), lr=train_lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay)
    # set up the training data pipeline
    training_data = train_dataset()

    net.train()

    if params.progressbar:
        tqdm.monitor_interval = 0
        pbar = tqdm(total=train_iter, leave=False, ascii=True)

    dump_num = 1
    start_iter = 0
    if params.resume_weights:
        start_iter = int(params.resume_weights) * train_dump_interval
        if start_iter >= train_lr_decay[0]:
            optimizer.param_groups[0]['lr'] = train_lr / 10
        if start_iter >= train_lr_decay[1]:
            optimizer.param_groups[0]['lr'] = train_lr / 100
        dump_num = int(params.resume_weights) + 1

    for step in range(start_iter, train_iter):
        # warm up
        if step < config.warm_iter:
            alpha = step / config.warm_iter
            lr_new = 0.1 * train_lr + 0.9 * alpha * train_lr
            optimizer.param_groups[0]['lr'] = lr_new
        elif step == config.warm_iter:
            optimizer.param_groups[0]['lr'] = train_lr
        if step == train_lr_decay[0]:
            optimizer.param_groups[0]['lr'] = train_lr / 10
        elif step == train_lr_decay[1]:
            optimizer.param_groups[0]['lr'] = train_lr / 100
        # get training data
        images, gt_boxes, img_info = process(training_data, num_gpus)
        optimizer.zero_grad()
        # forward pass
        outputs = net(images, img_info, gt_boxes)
        # collect the loss
        total_loss = sum(loss.mean() for loss in outputs.values())
        total_loss.backward()
        optimizer.step()
        if params.progressbar:
            pbar.update(1)
        # log statistics
        if step % config.log_dump_interval == 0:
            loss_value = total_loss.item()
            line = 'Iter {}: lr:{:.5f}, loss is {:.4f}.'.format(
                step, optimizer.param_groups[0]['lr'], loss_value)
            print(outputs)
            print(line)
            fid_log.write(line + '\n')
            fid_log.flush()
        # save the model
        if (step + 1) % train_dump_interval == 0:
            fpath = os.path.join(saveDir, 'dump-{}.pth'.format(dump_num))
            dump_num += 1
            model = dict(epoch=step,
                         state_dict=net.module.state_dict(),
                         optimizer=optimizer.state_dict())
            torch.save(model, fpath)

    if params.progressbar:
        pbar.close()

    fid_log.close()
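
The loop above implements a linear warm-up from 0.1 × train_lr to train_lr over config.warm_iter steps, followed by two step decays (÷10, ÷100) at the points in train_lr_decay. A standalone sketch of the same schedule, with hypothetical numbers used only to show its shape:

def lr_at_step(step, base_lr, warm_iter, decay_steps):
    # Linear warm-up: ramps from 0.1 * base_lr up to base_lr.
    if step < warm_iter:
        alpha = step / warm_iter
        return 0.1 * base_lr + 0.9 * alpha * base_lr
    # Step decays after warm-up.
    if step >= decay_steps[1]:
        return base_lr / 100
    if step >= decay_steps[0]:
        return base_lr / 10
    return base_lr

for s in (0, 250, 500, 50000, 65000):  # hypothetical iteration counts
    print(s, lr_at_step(s, base_lr=0.02, warm_iter=500, decay_steps=(50000, 65000)))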
Example #3
def worker(rank, gpu_num, args):
    # using sublinear
    os.environ[
        "MGB_COMP_GRAPH_OPT"] = "enable_sublinear_memory_opt=1;seq_opt.enable_seq_comp_node_opt=0"
    os.environ["MGB_SUBLINEAR_MEMORY_GENETIC_NR_ITER"] = '10'
    os.environ['MGB_CUDA_RESERVE_MEMORY'] = '1'
    # start the distributed server if this process is the master

    dist_port = args.port
    if rank == 0:
        dist.Server(port=dist_port)
    if gpu_num > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=dist_port,
            world_size=gpu_num,
            rank=rank,
            device=rank,
        )
        logger.info("Init process group for gpu%d done", rank)

    model = network.Network()
    params = model.parameters(requires_grad=True)
    model.train()

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=allreduce_cb,
    )

    opt = optim.SGD(
        params,
        lr=cfg.basic_lr * gpu_num * cfg.batch_per_gpu,
        momentum=cfg.momentum,
        weight_decay=cfg.weight_decay,
    )

    if cfg.pretrain_weight is not None:
        weights = mge.load(cfg.pretrain_weight)
        del weights['fc.weight']
        del weights['fc.bias']
        model.resnet50.load_state_dict(weights)

    start_epoch = 0
    if args.resume_weights is not None:
        assert osp.exists(args.resume_weights)
        model_file = args.resume_weights
        model_dict = mge.load(model_file)
        start_epoch = model_dict['epoch'] + 1
        weights = model_dict['state_dict']
        model.load_state_dict(weights, strict=False)

    logger.info("Prepare dataset")
    train_loader = dataset.train_dataset(rank)
    logger.info("Training...")
    for epoch_id in range(start_epoch, cfg.max_epoch):
        for param_group in opt.param_groups:
            param_group["lr"] = (cfg.basic_lr * gpu_num * cfg.batch_per_gpu *
                                 (cfg.lr_decay_rate**bisect.bisect_right(
                                     cfg.lr_decay_sates, epoch_id)))

        max_steps = cfg.nr_images_epoch // (cfg.batch_per_gpu * gpu_num)
        train_one_epoch(model, gm, train_loader, opt, max_steps, rank,
                        epoch_id, gpu_num)
        if rank == 0:
            save_path = osp.join(cfg.model_dir,
                                 'epoch-{}.pkl'.format(epoch_id + 1))
            state_dict = model.state_dict()
            names = [k for k, _ in state_dict.items()]
            for name in names:
                if name.startswith('inputs.'):
                    del state_dict[name]

            mge.save(
                {
                    "epoch": epoch_id,
                    "state_dict": state_dict
                },
                save_path,
            )
            logger.info("dump weights to %s", save_path)
Example #4
)  # Binary Cross-Entropy Loss with sigmoid attached in front.
""" 
==============================================
Begin Training
==============================================
"""
lr = args.lr
for epoch in range(args.start_epoch, args.epochs):
    # I will assume the graphs are shuffled somehow when calling train_dataset().

    batch_index = 0  # Batch counter: counts how many distinct (conjecture, statement) pairs have been stored.
    batch_number = 0  # Number of batches iterated.
    conjecture_state_batch = []
    label_batch = []

    for datapoint in train_dataset():
        # Collect datapoints for inter-graph batching.

        if epoch == args.start_epoch and batch_number < args.start_batch - 1:
            # If resuming from a saved <start_epoch> and <start_batch>, skip datapoints that were already consumed.
            batch_index += 1
            if batch_index < args.batch_size:
                continue
            else:
                batch_index = 0
                batch_number += 1
            continue

        conjecture_graph = datapoint.conjecture
        statement_graph = datapoint.statement
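
The skip logic above fast-forwards the dataset until it reaches <start_batch>, consuming batch_size datapoints per skipped batch. An equivalent compact sketch using itertools.islice, assuming the dataset yields one datapoint at a time:

import itertools

def resume_iter(datapoints, start_batch, batch_size):
    # Skip the datapoints that belong to the first (start_batch - 1) batches.
    return itertools.islice(datapoints, (start_batch - 1) * batch_size, None)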
Example #5
pre_trained = False

torch.cuda.set_device(0)

try:
    os.makedirs(out_file)
    os.makedirs(out_file + '/model/')
except OSError:
    pass

manual_seed = random.randint(1, 10000)
random.seed(manual_seed)
torch.manual_seed(manual_seed)
cudnn.benchmark = True

train_dataset_ = train_dataset(data_path, size_w, size_h, flip, time_series)
val_dataset_ = train_dataset(val_path, size_w, size_h, 0, time_series)


def weights_init(m):
    # DCGAN-style init: N(0, 0.02) for conv weights, N(1, 0.02) for batch-norm scales.
    class_name = m.__class__.__name__
    if 'Conv' in class_name:
        m.weight.data.normal_(0.0, 0.02)
        if m.bias is not None:  # conv layers followed by BN often have bias=None
            m.bias.data.fill_(0)
    elif 'BatchNorm' in class_name:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)
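
weights_init is meant to be applied recursively through Module.apply. A short usage sketch; the nn.Sequential here is only a stand-in for whatever model these scripts build:

import torch.nn as nn

net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8))  # stand-in model
net.apply(weights_init)  # calls weights_init on every submodule recursively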

Example #6
num_GPU = 1
index = 100
torch.cuda.set_device(0)

try:
    import os
    os.makedirs(out_file)
except OSError:
    pass

manual_seed = random.randint(1, 10000)
random.seed(manual_seed)
torch.manual_seed(manual_seed)
cudnn.benchmark = True

train_dataset_ = train_dataset(train_path, size_w, size_h, flip, band,
                               batch_size)
val_dataset_ = train_dataset(val_path, size_w, size_h, 0, band)


def weights_init(m):
    # DCGAN-style init: N(0, 0.02) for conv weights, N(1, 0.02) for batch-norm scales.
    class_name = m.__class__.__name__
    if 'Conv' in class_name:
        m.weight.data.normal_(0.0, 0.02)
        if m.bias is not None:  # conv layers followed by BN often have bias=None
            m.bias.data.fill_(0)
    elif 'BatchNorm' in class_name:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)


Example #7
def train(args):
    if isinstance(config.train_source, list):
        training_data = multi_train_dataset(args)
    else:
        training_data = train_dataset(args)
    number_of_training_instances = next(training_data)
    val_data = eval_dataset(args)
    number_of_val_instances = next(val_data)

    total_nr_iters = args.epochs * number_of_training_instances
    batch_per_gpu = config.train_batch_per_gpu

    base_lr = config.base_lr
    line = 'network.base_lr.{}.train_iter.{}'.format(base_lr, total_nr_iters)

    print(line)

    # set model save path and log save path
    saveDir = config.model_dir
    misc_utils.ensure_dir(saveDir)

    # set data input pipe
    program_name = config.program_name
    # check gpus
    torch.set_default_tensor_type('torch.FloatTensor')
    if not torch.cuda.is_available():
        print('No GPU exists!')
        return
    else:
        num_gpus = torch.cuda.device_count()

        train_iter = total_nr_iters // (num_gpus * batch_per_gpu)

        print('[-]', num_gpus, batch_per_gpu, total_nr_iters)

        # rescale the decay points (apparently defined for a 450k-iteration schedule) to this run's length
        new_decay = (np.array(config.lr_decay) / 450000) * total_nr_iters

        train_lr_decay = new_decay // (num_gpus * batch_per_gpu)

        train_dump_interval = number_of_training_instances // (num_gpus *
                                                               batch_per_gpu)

    train_lr = base_lr * num_gpus
    bt_size = num_gpus * batch_per_gpu

    line = ('Num of GPUs:{}, learning rate:{:.5f}, batch size:{}, '
            'train_iter:{}, decay_iter:{}, dump_interval:{}'.format(
                num_gpus, train_lr, bt_size, train_iter, train_lr_decay,
                train_dump_interval))
    print(line)

    print("[-]Building netowrk.")
    net = network.Network(args)
    net.cuda()

    best = 10e10
    epoch = 0
    if args.resume:
        print("Load base model from :",
              os.path.join(args.save_dir, args.output_name, 'dump_last.pth'))
        check_point = torch.load(
            os.path.join(args.save_dir, args.output_name, 'dump_last.pth'))
        net.load_state_dict(check_point['state_dict'])
        start_iter = check_point['step']
        if 'val_loss' in check_point:
            best = check_point['val_loss']
        epoch = start_iter // train_dump_interval + 1
    elif args.base_model:
        print("Load base model from :", args.base_model)
        check_point = torch.load(args.base_model)
        net.load_state_dict(check_point['state_dict'], strict=False)
        start_iter = 0
    else:
        start_iter = 0

    net = nn.DataParallel(net)
    # set the optimizer with momentum and weight decay
    optimizer = optim.SGD(net.parameters(), lr=train_lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay)

    if start_iter >= train_lr_decay[0]:
        optimizer.param_groups[0]['lr'] = train_lr / 10
    if start_iter >= train_lr_decay[1]:
        optimizer.param_groups[0]['lr'] = train_lr / 100

    # switch the network to training mode
    net.train()
    logger = Logger(args)

    iter_tqdm = None
    val_tqdm = None
    for step in range(start_iter, train_iter):
        # warm up
        if step < config.warm_iter:
            alpha = step / config.warm_iter
            lr_new = 0.1 * train_lr + 0.9 * alpha * train_lr
            optimizer.param_groups[0]['lr'] = lr_new
        elif step == config.warm_iter:
            optimizer.param_groups[0]['lr'] = train_lr
        if step == train_lr_decay[0]:
            optimizer.param_groups[0]['lr'] = train_lr / 10
        elif step == train_lr_decay[1]:
            optimizer.param_groups[0]['lr'] = train_lr / 100
        # get training data
        images, gt_boxes, img_info, done_an_epoch, extra = process(
            args, training_data, num_gpus)
        if done_an_epoch:
            epoch += 1
        optimizer.zero_grad()
        # forward
        outputs = net(images, img_info, gt_boxes, extra=extra)
        # collect the loss
        total_loss = sum(loss.mean() for loss in outputs.values())
        total_loss.backward()
        optimizer.step()

        # statistics
        loss_value = total_loss.item()
        line = '[*]Epoch:{} iter<{}> lr:{:.5f}, loss:{:.4f}'.format(
            epoch, step, optimizer.param_groups[0]['lr'], loss_value)

        if step % config.log_dump_interval == 0:
            logger.scalar_summary('lr', optimizer.param_groups[0]['lr'], step)
            for k, v in outputs.items():
                v = float(np.mean(v.cpu().data.numpy()))
                logger.scalar_summary(k, v, step)
                line += ', ' + k + ':{:.4}'.format(v)
            logger.scalar_summary('total_loss', loss_value, step)
        else:
            for k, v in outputs.items():
                v = float(np.mean(v.cpu().data.numpy()))
                line += ', ' + k + ':{:.4}'.format(v)
        if iter_tqdm is None:
            iter_tqdm = tqdm(total=train_iter, desc='Iteration')
            iter_tqdm.update(start_iter)
        iter_tqdm.set_description("[-] " + line)
        iter_tqdm.refresh()
        # save the best model
        if done_an_epoch:
            if args.save_per_epoch > 0:
                if (epoch + 1) % args.save_per_epoch == 0:
                    fpath = os.path.join(saveDir, 'dump_{}.pth'.format(epoch))
                    print('[.] Saving :', fpath)
                    model = dict(epoch=epoch,
                                 step=step,
                                 state_dict=net.module.state_dict(),
                                 optimizer=optimizer.state_dict())
                    torch.save(model, fpath)

            fpath = os.path.join(saveDir, 'dump_last.pth')
            print('[.] Saving :', fpath)
            model = dict(epoch=epoch,
                         step=step,
                         state_dict=net.module.state_dict(),
                         optimizer=optimizer.state_dict())
            torch.save(model, fpath)

        net.train()

        iter_tqdm.update(1)
    iter_tqdm.close()

    fpath = os.path.join(saveDir, 'dump_last.pth')
    print('[.] Saving :', fpath)
    model = dict(step=step,
                 state_dict=net.module.state_dict(),
                 optimizer=optimizer.state_dict())
    torch.save(model, fpath)
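
Since the checkpoints store net.module.state_dict() (the model unwrapped from nn.DataParallel), they can be restored into a bare Network without the wrapper. A minimal reload sketch, reusing the names from this example:

net = network.Network(args)
check_point = torch.load(os.path.join(saveDir, 'dump_last.pth'))
net.load_state_dict(check_point['state_dict'])
net.cuda().eval()  # move to GPU and switch to inference mode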
Example #8
def load_weights(name, model):
    path = '/models/{}/weights.h5'.format(name)
    if os.path.exists(path):
        # Run one batch through the model first so that the (subclassed)
        # Keras model builds its variables; weights cannot be loaded into
        # an unbuilt model.
        images, _ = next(iter(train_dataset().batch(10)))
        model(images)
        model.load_weights(path)
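
A usage sketch for load_weights, assuming a hypothetical subclassed tf.keras model standing in for the real network and a placeholder checkpoint name:

import tensorflow as tf

class MyModel(tf.keras.Model):  # hypothetical stand-in for the real model
    def __init__(self):
        super().__init__()
        self.dense = tf.keras.layers.Dense(10)

    def call(self, x):
        return self.dense(x)

model = MyModel()
load_weights('baseline', model)  # 'baseline' is a placeholder run name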