示例#1
0
文件: __init__.py 项目: Agnon1573/qb
class Tensorboard(Callback):
    """Callback that forwards end-of-epoch metrics to a Crayon experiment.

    When nothing answers on ``hostname:port`` the callback logs a message and
    turns itself into a no-op (``client``, ``experiment_name`` and
    ``experiment`` are all ``None``).
    """

    def __init__(self,
                 experiment_name: str,
                 hostname=QB_TB_HOSTNAME,
                 port=QB_TB_PORT):
        if not host_is_up(hostname, port):
            log.info(
                f'Tensorboard not found on http://{hostname}:{port}, experiment logging disabled'
            )
            self.client = None
            self.experiment_name = None
            self.experiment = None
            return

        # Imported lazily so pycrayon is only needed when a server is reachable.
        from pycrayon import CrayonClient
        self.client = CrayonClient(hostname=hostname, port=port)
        self.experiment_name = experiment_name
        try:
            # Drop any earlier run stored under the same name.
            self.client.remove_experiment(experiment_name)
        except ValueError:
            pass
        self.experiment = self.client.create_experiment(experiment_name)

    def on_epoch_end(self, logs):
        """Send the latest value of each tracked metric in ``logs``."""
        if self.client is None:
            return
        for metric in ('train_loss', 'train_acc', 'test_loss', 'test_acc',
                       'train_time'):
            self.experiment.add_scalar_value(metric, logs[metric][-1])
示例#2
0
 def test_remove_experiment(self):
     """Removing an experiment frees its name; removing it twice is an error."""
     client = CrayonClient(port=self.test_server_port)
     # "foo" does not exist yet, so it cannot be opened.
     with self.assertRaises(ValueError):
         client.open_experiment("foo")
     experiment = client.create_experiment("foo")
     experiment.add_scalar_value("bar", 1, step=2, wall_time=0)
     # While it exists the name is taken, but it can be opened.
     with self.assertRaises(ValueError):
         client.create_experiment("foo")
     client.open_experiment("foo")
     client.remove_experiment(experiment.xp_name)
     # A second removal of the same name must fail.
     with self.assertRaises(ValueError):
         client.remove_experiment(experiment.xp_name)
     # The name is free again after removal.
     experiment = client.create_experiment("foo")
示例#3
0
def tensorboard():
    '''
    Reset the Crayon experiment ``EXP_NAME`` on ``TENSORBOARD_SERVER``.

    Any experiment already stored under that name is removed and a fresh one
    is created in its place.

    :return: the newly created experiment
    '''
    from pycrayon import CrayonClient
    cc = CrayonClient(hostname=TENSORBOARD_SERVER)
    try:
        cc.remove_experiment(EXP_NAME)
    except ValueError:
        # Raised when there is no experiment with that name yet; nothing to
        # clean up. Other errors (e.g. interrupts) are no longer swallowed.
        pass
    return cc.create_experiment(EXP_NAME)
示例#4
0
def setup_tensorboard(exp_id, cur_t, hostname, port):
    """Create a Crayon experiment named ``'<cur_t>_<exp_id>'``.

    If the name is already taken on the server, the old experiment is removed
    and a fresh one is created.

    Returns:
        tuple: ``(experiment, client)``.
    """
    exp_filename = '{}_{}'.format(cur_t, exp_id)
    tb = CrayonClient(hostname=hostname, port=port)
    try:
        tb_experiment = tb.create_experiment(exp_filename)
    except ValueError:
        # The name already exists on the server: flush the data anew.
        # Only ValueError is handled so unrelated failures are not retried.
        tb.remove_experiment(exp_filename)
        tb_experiment = tb.create_experiment(exp_filename)
    return tb_experiment, tb
示例#5
0
def parse_args():
    """Parse the command-line options and apply their global side effects.

    Besides parsing, this seeds the ``random`` and ``torch`` generators,
    selects the CUDA device when ``opt.gpuid`` is set, exits the process when
    more than one GPU id is given, and removes an existing Crayon experiment
    named ``opt.exp`` when ``opt.exp_host`` is non-empty.

    Returns the parsed options object, with the derived fields
    (``src_word_vec_size``, ``tgt_word_vec_size``, ``enc_layers``,
    ``dec_layers``, ``brnn``) filled in.
    """
    parser = argparse.ArgumentParser(
        description='umt.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Each opts.* helper registers its own group of arguments on the parser.
    opts.add_md_help_argument(parser)
    opts.model_opts(parser)
    opts.preprocess_opts(parser)
    opts.train_opts(parser)

    opt = parser.parse_args()
    # NOTE(review): torch is seeded again further down with the same value.
    torch.manual_seed(opt.seed)

    # -1 means "not set"; otherwise one value overrides both sides.
    if opt.word_vec_size != -1:
        opt.src_word_vec_size = opt.word_vec_size
        opt.tgt_word_vec_size = opt.word_vec_size

    if opt.layers != -1:
        opt.enc_layers = opt.layers
        opt.dec_layers = opt.layers

    opt.brnn = (opt.encoder_type == "brnn")

    # The seed guard below is disabled, so seeding happens for every seed value.
    # if opt.seed > 0:
    random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    if torch.cuda.is_available() and not opt.gpuid:
        print("WARNING: You have a CUDA device, should run with -gpuid 0")

    if opt.gpuid:
        cuda.set_device(opt.gpuid[0])
        # Unlike the CPU generators, the CUDA generator is only seeded for positive seeds.
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    if len(opt.gpuid) > 1:
        sys.stderr.write("Sorry, multigpu isn't supported yet, coming soon!\n")
        sys.exit(1)

    # Set up the Crayon logging server.
    if opt.exp_host != "":
        from pycrayon import CrayonClient

        cc = CrayonClient(hostname=opt.exp_host)

        experiments = cc.get_experiment_names()
        print(experiments)
        # Only the stale experiment is removed here; nothing is created in this function.
        if opt.exp in experiments:
            cc.remove_experiment(opt.exp)

    return opt
示例#6
0
class Tensorboard(Callback):
    """Callback that pushes end-of-epoch metrics to a Crayon server on port 6007.

    ``log_dir`` is accepted but not used by this implementation.
    """

    def __init__(self, experiment_name: str, log_dir=None):
        from pycrayon import CrayonClient
        self.client = CrayonClient(port=6007)
        self.experiment_name = experiment_name
        try:
            # Start from a clean slate if the name was used before.
            self.client.remove_experiment(experiment_name)
        except ValueError:
            pass
        self.experiment = self.client.create_experiment(experiment_name)

    def on_epoch_end(self, logs):
        """Send the most recent value of every tracked metric in ``logs``."""
        for metric in ('train_loss', 'train_acc', 'test_loss', 'test_acc',
                       'train_time'):
            self.experiment.add_scalar_value(metric, logs[metric][-1])
示例#7
0
def create_crayon_logger(exp_name, port=8889):
    """
    Return a freshly created Crayon experiment called ``exp_name``.

    Connects to the Crayon server on localhost at ``port``, removes a previous
    experiment with the same name if there is one, and creates a new one.
    """
    client = CrayonClient(hostname="localhost", port=port)
    try:
        client.remove_experiment(exp_name)
    except ValueError:
        # No earlier experiment under this name; just report it.
        message = "Experiment '{}' didn't exist already (nothing to be done)."
        print(message.format(exp_name))
    return client.create_experiment(exp_name)
    
示例#8
0
class Monitor(object):
    """Small wrapper around a Crayon client and one active experiment."""

    def __init__(self, address, port):
        self.cc = CrayonClient(hostname=address, port=port)

    def start_experiment(self, name, clean=True):
        """Make ``self.exp`` point at the experiment called ``name``.

        An existing experiment is wiped and recreated when ``clean`` is true,
        and reopened otherwise. A missing experiment is always created.
        """
        exps = self.cc.get_experiment_names()
        if name in exps:
            if clean:
                self.cc.remove_experiment(name)
                self.exp = self.cc.create_experiment(name)
                # Call form: valid on Python 3 and prints the same text on Python 2.
                print('clean and creat a new one')
            else:
                self.exp = self.cc.open_experiment(name)
        else:
            self.exp = self.cc.create_experiment(name)

    def push(self, data, wall_time=-1, step=-1):
        """Forward a dict of scalars to the active experiment."""
        self.exp.add_scalar_dict(data, wall_time, step)
示例#9
0
    def test_backup(self):
        """An experiment restored from its zip backup keeps its scalar data.

        The backup is restored twice: once under the original name and once
        under a new name.
        """
        cc = CrayonClient(port=self.test_server_port)
        foo = cc.create_experiment("foo")
        foo.add_scalar_value("bar", 2, wall_time=time.time(), step=1)
        foo.add_scalar_value("bar", 2, wall_time=time.time(), step=2)
        foo_data = foo.get_scalar_values("bar")
        filename = foo.to_zip()

        try:
            cc.remove_experiment("foo")

            foo = cc.create_experiment("foo", zip_file=filename)
            new_data = foo.get_scalar_values("bar")
            self.assertEqual(foo_data, new_data)

            new = cc.create_experiment("new", zip_file=filename)
            new_data = new.get_scalar_values("bar")
            self.assertEqual(foo_data, new_data)
        finally:
            # Delete the backup archive even when a step above fails, so a
            # failing run does not leave the zip file behind.
            os.remove(filename)
示例#10
0
def get_crayon_experiment(exp_name, hostname='127.0.0.1', overwrite=True):
    """Return a Crayon experiment called ``exp_name`` from the server at ``hostname``.

    An existing experiment is reopened when ``overwrite`` is false; otherwise
    it is removed and recreated. A missing experiment is created, with one
    remove-and-retry if creation reports a ``ValueError``.
    """
    client = CrayonClient(hostname=hostname)

    already_exists = exp_name in client.get_experiment_names()
    if already_exists and not overwrite:
        return client.open_experiment(exp_name)

    if not already_exists:
        try:
            return client.create_experiment(exp_name)
        except ValueError:
            # The server rejected the name after all: clear it and retry once.
            client.remove_experiment(exp_name)
            return client.create_experiment(exp_name)

    # Existing experiment and overwrite requested.
    client.remove_experiment(exp_name)
    return client.create_experiment(exp_name)
示例#11
0
def make_crayon_experiments(experiment_name, new=True):
    """Return the ``(train, valid)`` Crayon experiments for ``experiment_name``.

    The experiments are named ``<experiment_name>_train`` and
    ``<experiment_name>_valid``. With ``new`` set, stale ones are removed and
    both are created from scratch; otherwise the existing ones are opened.
    """
    client = CrayonClient(hostname=config.CRAYON_SERVER_HOSTNAME)
    train_name = f'{experiment_name}_train'
    valid_name = f'{experiment_name}_valid'

    if not new:
        reopened_train = client.open_experiment(train_name)
        reopened_valid = client.open_experiment(valid_name)
        return reopened_train, reopened_valid

    for stale_name in (train_name, valid_name):
        try:
            client.remove_experiment(stale_name)
        except ValueError:
            pass
    created_train = client.create_experiment(train_name)
    created_train.scalar_steps['lr'] = 1
    created_valid = client.create_experiment(valid_name)
    return created_train, created_valid
def crayon_create_experiment(exp_name: str, cclient: CrayonClient,
                             overwrite: bool = True) -> CrayonExperiment:
    """
    Create an experiment on the crayon server, optionally replacing an old one.
    :param exp_name: name of experiment
    :param cclient: client used to talk to crayon
    :param overwrite: when creation fails with ValueError, delete the
        experiment and create it again instead of propagating the error
    :return: the created experiment
    """
    try:
        return cclient.create_experiment(exp_name)
    except ValueError as verr:
        if not overwrite:
            raise verr
        cclient.remove_experiment(exp_name)
        return cclient.create_experiment(exp_name)
示例#13
0
def main():
    """Build the datasets, model and optimizer from ``opt`` and start training.

    Reads every setting from the ``opt`` object, which is not a parameter of
    this function. Steps, in order: optional Crayon experiment reset, dataset
    loading (Redis-backed or from the file at ``opt.data``), encoder / decoder /
    generator construction, optional restore from a checkpoint, optimizer
    setup, parameter-count report, then ``trainModel``.
    """
    # Set up the Crayon logging server.
    if opt.log_server != "":
        from pycrayon import CrayonClient
        cc = CrayonClient(hostname=opt.log_server)

        experiments = cc.get_experiment_names()
        print(experiments)
        if opt.experiment_name in experiments:
            cc.remove_experiment(opt.experiment_name)
        # NOTE(review): opt.experiment_name is rebound from the name to the
        # value returned by create_experiment.
        opt.experiment_name = cc.create_experiment(opt.experiment_name)

    print("Loading data from '%s'" % opt.data)

    dataset = torch.load(opt.data)
    # train_from takes precedence over train_from_state_dict when both are set.
    dict_checkpoint = (opt.train_from if opt.train_from
                       else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        # map_location returns the storage unchanged regardless of `loc`.
        checkpoint = torch.load(dict_checkpoint,
                                map_location=lambda storage, loc: storage)
        #dataset['dicts'] = checkpoint['dicts']

    if opt.redis:
        trainData = onmt.RedisDataset("train", opt.batch_size, False, reverse=opt.reverse, port=opt.port, db=opt.db,
                                      r2l=opt.r2l)
        validData = onmt.RedisDataset('valid', opt.batch_size, False, volatile=True, reverse=opt.reverse, port=opt.port,
                                      r2l=opt.r2l, db=opt.db)
    else:
        trainData = onmt.Dataset(dataset['train']['src'],
                             dataset['train']['tgt'], opt.batch_size, False,
                             data_type=dataset.get("type", "text"),
                             srcFeatures=dataset['train'].get('src_features'),
                             tgtFeatures=dataset['train'].get('tgt_features'),
                             alignment=dataset['train'].get('alignments'))
        validData = onmt.Dataset(dataset['valid']['src'],
                             dataset['valid']['tgt'], opt.batch_size, False,
                             volatile=True,
                             data_type=dataset.get("type", "text"),
                             srcFeatures=dataset['valid'].get('src_features'),
                             tgtFeatures=dataset['valid'].get('tgt_features'),
                             alignment=dataset['valid'].get('alignments'))

    # The dictionaries always come from the dataset file (the checkpoint
    # override above is commented out).
    dicts = dataset['dicts']
    if opt.reverse:
        # Swap source and target vocabularies and feature dictionaries.
        dicts['src'], dicts['tgt'] = dicts['tgt'], dicts['src']
        dicts['src_features'], dicts['tgt_features'] = dicts['tgt_features'], dicts['src_features']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    #if 'src_features' in dicts:
    #    for j in range(len(dicts['src_features'])):
    #        print(' * src feature %d size = %d' %
    #              (j, dicts['src_features'][j].size()))

    #print(' * number of training sentences. %d' %
          #len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    if opt.encoder_type == "text":
        encoder = onmt.Models.Encoder(opt, dicts['src'],
                                      dicts.get('src_features', None))
    elif opt.encoder_type == "img":
        encoder = onmt.modules.ImageEncoder(opt)
        assert("type" not in dataset or dataset["type"] == "img")
    else:
        # NOTE(review): this branch only prints; `encoder` is left unbound, so
        # the NMTModel construction below would fail with a NameError.
        print("Unsupported encoder type %s" % (opt.encoder_type))

    decoder = onmt.Models.Decoder(opt, dicts['tgt'])

    if opt.copy_attn:
        generator = onmt.modules.CopyGenerator(opt, dicts['src'], dicts['tgt'])
    else:
        generator = nn.Sequential(
            nn.Linear(opt.rnn_size, dicts['tgt'].size()),
            nn.LogSoftmax())
        if opt.share_decoder_embeddings:
            # Tie the output projection to the decoder's word embedding weights.
            generator[0].weight = decoder.embeddings.word_lut.weight

    model = onmt.Models.NMTModel(encoder, decoder, len(opt.gpus) > 1)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        # Here the checkpoint holds a model object; generator weights are
        # split off and loaded separately from the rest.
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        model_state_dict = {k: v for k, v in chk_model.state_dict().items()
                            if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s'
              % opt.train_from_state_dict)
        # Here the checkpoint holds state dicts directly.
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    model.cpu()
    generator.cpu()

    model.generator = generator

    if not opt.train_from_state_dict and not opt.train_from:
        # Fresh run: initialise parameters and build a new optimizer.
        if opt.param_init != 0.0:
            print('Intializing params')
            for p in model.parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)

        encoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_enc)
        decoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_dec)

        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            opt=opt
        )
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)


    if opt.train_from or opt.train_from_state_dict:
        # NOTE(review): `optim` is already checkpoint['optim'] at this point,
        # so this loads the optimizer's state dict back into itself.
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    print('Multi gpu training ', opt.gpus)
    trainer = MultiprocessingTrainer(opt, model, optim, device_ids=opt.gpus)

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)
    # Break the count down by name; parameters matching neither 'encoder'
    # nor 'decoder' are printed individually.
    enc = 0
    dec = 0
    for name, param in model.named_parameters():
        if 'encoder' in name:
            enc += param.nelement()
        elif 'decoder' in name:
            dec += param.nelement()
        else:
            print(name, param.nelement())
    print('encoder: ', enc)
    print('decoder: ', dec)

    trainModel(trainer, trainData, validData, dataset)
示例#14
0
class DDPGOptimizer(object):
    """Runs DDPG updates for an agent exposing actor/critic networks and their targets.

    Owns the replay memory, one Adam optimizer each for the critic and the
    actor, and optionally a Crayon experiment ('stats') for the critic loss.
    """

    def __init__(self, agent, capacity, batch_size, gamma, tau, init_lr, weight_decay, crayon_vis):
        super(DDPGOptimizer, self).__init__()
        self.agent = agent
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayMemory(capacity, batch_size)
        self.critic_criterion = nn.MSELoss()
        # init_lr is indexed by 'critic' and 'actor'; weight decay applies to the critic only.
        self.critic_optimizer = optim.Adam(self.agent.critic.parameters(), lr=init_lr[
                                           'critic'], weight_decay=weight_decay)
        self.actor_optimizer = optim.Adam(
            self.agent.actor.parameters(), lr=init_lr['actor'])

        self.crayon_vis = crayon_vis
        if self.crayon_vis:
            self.cc = CrayonClient()
            try:
                self.stats = self.cc.create_experiment('stats')
            except ValueError:
                # 'stats' is left over from an earlier run: replace it.
                self.cc.remove_experiment('stats')
                self.stats = self.cc.create_experiment('stats')

    @staticmethod
    def _soft_update(source, target, tau):
        """Blend ``source`` parameters into ``target`` in place.

        Each target parameter becomes ``tau * source + (1 - tau) * target``.
        """
        for param, param_target in zip(source.parameters(), target.parameters()):
            param_target.data.copy_(
                tau * param.data + (1 - tau) * param_target.data)

    def step(self):
        """Run one critic update, one actor update and one target soft update.

        Returns the critic loss value for the sampled batch.
        """
        samples = self.memory.sample()
        states, actions, rewards, next_states = map(
            lambda x: np.asarray(x), zip(*samples))

        # Update critic network
        self.critic_optimizer.zero_grad()

        outputs = self.agent.critic(Variable(torch.from_numpy(
            states)), Variable(torch.from_numpy(actions)))

        # Entries of next_states that are None are excluded from the target
        # network evaluation; their target stays at zero before the reward is added.
        no_final_states = np.array(
            [ns for ns in next_states if ns is not None])
        no_final_targets = self.agent.critic_target(Variable(torch.from_numpy(no_final_states), volatile=True),
                                                    self.agent.actor_target(Variable(torch.from_numpy(no_final_states), volatile=True)))
        targets = Variable(torch.zeros(self.memory.batch_size, 1))
        mask = Variable(torch.ByteTensor(
            [ns is not None for ns in next_states]).view(-1, 1))
        targets.masked_copy_(mask, no_final_targets)
        targets = self.gamma * targets + \
            Variable(torch.from_numpy(rewards).unsqueeze(1))
        targets = targets.detach()

        loss = self.critic_criterion(outputs, targets)

        if self.crayon_vis:
            self.stats.add_scalar_value('critic loss', loss.data[0])

        loss.backward()

        # critic_visualizer = pytorch_net_visualizer(loss)
        # critic_visualizer.view()
        # input("Visualizing critic networks...")

        '''
        # gradient clamping in case of gradient explosion
        for param in self.agent.critic.parameters():
            param.grad.data.clamp_(-1, 1)
        '''

        self.critic_optimizer.step()

        # Update actor network
        self.critic_optimizer.zero_grad()
        self.actor_optimizer.zero_grad()

        outputs = self.agent.critic(Variable(torch.from_numpy(states), requires_grad=False),
                                    self.agent.actor(Variable(torch.from_numpy(states), requires_grad=True)))
        # negated because the actor optimizer minimises, and we want the policy
        # to increase the critic's value
        outputs = -torch.mean(outputs)
        outputs.backward()

        # actor_visualizer = pytorch_net_visualizer(outputs)
        # actor_visualizer.view()
        # input("Visualizing actor networks...")

        self.actor_optimizer.step()

        # Update target networks. The blend must be written into the target
        # parameters in place; assigning to the loop variable would leave the
        # target networks unchanged.
        self._soft_update(self.agent.critic, self.agent.critic_target, self.tau)
        self._soft_update(self.agent.actor, self.agent.actor_target, self.tau)

        return loss.data[0]
示例#15
0
    val_loader = LoaderFactory.create_dataloader(LoaderMode.VAL,
                                                 args,
                                                 do_use_gpu=args.cuda)
if args.do_test:
    test_loader = LoaderFactory.create_dataloader(LoaderMode.TEST,
                                                  args,
                                                  do_use_gpu=args.cuda)

#%% Create logger
# The experiment handle `tb_log` is only created when training is requested.
if args.do_train:
    # Connect Crayon Logger (TensorBoard "wrapper") to the server
    cc = CrayonClient(hostname="localhost", port=8889)
    tb_log_exp_name = args.exp_name
    # Remove previous experiment
    try:
        cc.remove_experiment(tb_log_exp_name)
    except ValueError:
        # experiment doesn't already exist - nothing to be done here
        print("Experiment '{}' didn't exist already (nothing to be done).".format(\
            tb_log_exp_name))
    # Create a new experiment under the same name, so each training run
    # starts with empty plots
    tb_log = cc.create_experiment(tb_log_exp_name)

#%% Train (Load) model
# Create and init predictor
model = NetFactory.create_net(net_type=args.net_type,
                              params=args,
                              num_prior_dims=args.num_embedding_dims,
                              num_cond_dims=args.num_cond_dims,
                              num_joints=args.num_joints,
                              num_features=args.num_features)
示例#16
0
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from RAN import restorator, discirminator
from patch_wise import patch

from pycrayon import CrayonClient
import time

cc = CrayonClient(hostname="localhost", port=8889)

# Start every run with empty plots. Removing a name the server does not know
# raises ValueError, which is the normal situation on a first run, so each
# removal is attempted on its own and that error is ignored.
for _xp_name in ('d_real_error', 'd_fake_error', 'g_error'):
    try:
        cc.remove_experiment(_xp_name)
    except ValueError:
        pass
d_real_errorC = cc.create_experiment('d_real_error')
d_fake_errorC = cc.create_experiment('d_fake_error')
g_errorC = cc.create_experiment('g_error')


def extract(v):
    """Return the contents of ``v``'s underlying storage as a flat Python list."""
    storage = v.data.storage()
    return storage.tolist()

# Call form of print: valid on Python 3 and prints the same text on Python 2.
print('Starting my Restoration Adversarial Net...')

# Fixed seeds for the CPU and CUDA generators.
torch.manual_seed(123)
torch.cuda.manual_seed(123)

patchSize = 64
patches = patch()
示例#17
0
print('lr', lookup_lr(cfg, start_epoch))
print('-------------------------------')

# tensorboard
# Enabled only when the config asks for it and CrayonClient is not None.
use_tensorboard = cfg.use_tensorboard and CrayonClient is not None

# NOTE(review): this unconditional override discards the value computed above,
# so the block below never runs and `exp` is never defined. Looks like a
# leftover debugging switch; confirm before relying on tensorboard output.
use_tensorboard = False
remove_all_log = True
if use_tensorboard:
    cc = CrayonClient(hostname='127.0.0.1')
    if remove_all_log:
        print('remove all experiments')
        cc.remove_all_experiments()
    if start_epoch == 0:
        # Fresh training run: replace any experiment stored under this name.
        try:
            cc.remove_experiment(cfg.exp_name)
        except ValueError:
            pass
        exp = cc.create_experiment(cfg.exp_name)
    else:
        # Resuming: keep writing to the existing experiment.
        exp = cc.open_experiment(cfg.exp_name)

train_loss = 0
bbox_loss, iou_loss, cls_loss = 0., 0., 0.
cnt = 0

timer = Timer()

# default input size
network_size = cfg.inp_size
示例#18
0
    print("WARNING: You have a CUDA device, should run with -gpus 0")

if opt.gpus:
    cuda.set_device(opt.gpus[0])
    if opt.seed > 0:
        torch.cuda.manual_seed(opt.seed)

# Set up the Crayon logging server.
# An empty opt.log_server disables Crayon logging; `experiment` is then never defined.
if opt.log_server != "":
    from pycrayon import CrayonClient
    cc = CrayonClient(hostname=opt.log_server)

    experiments = cc.get_experiment_names()
    print(experiments)
    # Replace an experiment left over under the same name.
    if opt.experiment_name in experiments:
        cc.remove_experiment(opt.experiment_name)
    experiment = cc.create_experiment(opt.experiment_name)


def eval(model, criterion, data):
    stats = onmt.Loss.Statistics()
    model.eval()
    loss = onmt.Loss.MemoryEfficientLoss(opt,
                                         model.generator,
                                         criterion,
                                         eval=True,
                                         copy_loss=opt.copy_attn)
    for i in range(len(data)):
        batch = data[i]
        outputs, attn, dec_hidden = model(batch.src, batch.tgt, batch.lengths)
        batch_stats, _, _ = loss.loss(batch, outputs, attn)
示例#19
0
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable


from pycrayon import CrayonClient
import time

cc = CrayonClient(hostname="10.150.6.120")
# Replace the experiment from a previous run. Removing a name the server does
# not know raises ValueError (e.g. on the very first run), which is fine here.
try:
    cc.remove_experiment("OMIE_2")
except ValueError:
    pass
OMIE = cc.create_experiment("OMIE_2")
###
### noise level one
### dimension 2
### z is extracted separately
###
input_size = 2
hidden_size = 5
hidden_size_ = 3
num_classes = 1

num_epochs = 9

learning_rate = 0.00001
debug_mode = True

示例#20
0
# Create a new experiment
foo = cc.create_experiment("foo")

# Send some scalar values to the server
foo.add_scalar_value("accuracy", 0, wall_time=11.3)
foo.add_scalar_value("accuracy", 4, wall_time=12.3)
# You can force the time and step values
foo.add_scalar_value("accuracy", 6, wall_time=13.3, step=4)

# Get the data sent to the server
foo.get_scalar_values("accuracy")
# >> [[11.3, 0, 0.0], [12.3, 1, 4.0], [13.3, 4, 6.0]])

# backup this experiment as a zip file
filename = foo.to_zip()

# delete this experiment from the server
cc.remove_experiment("foo")
# using the `foo` object from now on will result in an error

# Create a new experiment based on foo's backup
bar = cc.create_experiment("bar", zip_file=filename)

# Get the name of all scalar plots in this experiment
bar.get_scalar_names()
# >> ["accuracy"]

# Get the data for this experiment
bar.get_scalar_values("accuracy")
# >> [[11.3, 0, 0.0], [12.3, 1, 4.0], [13.3, 4, 6.0]])
cc.remove_experiment("bar")
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import copy
import os
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
from fine_tuning_config_file import *

## If you want to keep a track of your network on tensorboard, set USE_TENSORBOARD TO 1 in config file.

if USE_TENSORBOARD:
    from pycrayon import CrayonClient
    cc = CrayonClient(hostname=TENSORBOARD_SERVER)
    try:
        cc.remove_experiment(EXP_NAME)
    except ValueError:
        # No experiment with this name yet: nothing to remove. Only this
        # error is ignored, so other failures are no longer hidden.
        pass
    foo = cc.create_experiment(EXP_NAME)


## If you want to use the GPU, set GPU_MODE TO 1 in config file

use_gpu = GPU_MODE
if use_gpu:
    torch.cuda.set_device(CUDA_DEVICE)

count=0

### SECTION 2 - data loading and shuffling/augmentation/normalization : all handled by torch automatically.
示例#22
0
class CrayonWrapper:
    """
    Wraps PyCrayon (https://github.com/torrvision/crayon), a language-agnostic interface to TensorBoard.
    """
    def __init__(self,
                 name,
                 runs_distributed,
                 runs_cluster,
                 chief_handle,
                 path_log_storage=None,
                 crayon_server_address="localhost"):
        self._name = name
        self._path_log_storage = path_log_storage
        if path_log_storage is not None:
            create_dir_if_not_exist(path_log_storage)

        self._chief_handle = chief_handle
        self._crayon = CrayonClient(hostname=crayon_server_address)
        self._experiments = {}
        self.clear()
        # Mapping: experiment name -> graph name -> list of {step: value} dicts.
        self._custom_logs = {}

        self._ray = MaybeRay(runs_distributed=runs_distributed,
                             runs_cluster=runs_cluster)

    @property
    def name(self):
        return self._name

    @property
    def path_log_storage(self):
        return self._path_log_storage

    def clear(self):
        """
        Forgets the locally tracked experiments. Does NOT clear crayon's internal experiment logs and files.
        """
        self._experiments = {}

    def export_all(self, iter_nr):
        """
        Exports all logs of the current run in Tensorboard's format and as json files. Does nothing when no
        log storage path was configured.
        """
        if self._path_log_storage is None:
            return

        run_parts = (self._path_log_storage, str(self._name), str(iter_nr))
        path_crayon = ospj(*run_parts, "crayon")
        path_json = ospj(*run_parts, "as_json")
        create_dir_if_not_exist(path=path_crayon)
        create_dir_if_not_exist(path=path_json)
        for experiment in self._experiments.values():
            zip_path = ospj(path_crayon, experiment.xp_name + ".zip")
            experiment.to_zip(filename=zip_path)
            write_dict_to_file_json(dictionary=self._custom_logs,
                                    _dir=path_json,
                                    file_name="logs")

    def update_from_log_buffer(self):
        """
        Pulls newly added logs from the chief and forwards every data point to the matching crayon experiment,
        creating experiments that are not tracked yet. Each point is also recorded in the custom logs.
        """
        new_v, exp_names = self._get_new_vals()

        for exp_name in exp_names:
            if exp_name in self._experiments:
                continue
            self._custom_logs[exp_name] = {}
            try:
                created = self._crayon.create_experiment(xp_name=exp_name)
            except ValueError:
                # Name already taken on the server: replace it.
                self._crayon.remove_experiment(xp_name=exp_name)
                created = self._crayon.create_experiment(xp_name=exp_name)
            self._experiments[exp_name] = created

        for name, vals_dict in new_v.items():
            for graph_name, data_points in vals_dict.items():
                for data_point in data_points:
                    step = int(data_point[0])
                    val = data_point[1]

                    self._experiments[name].add_scalar_value(name=graph_name,
                                                             step=step,
                                                             value=val)
                    graph_log = self._custom_logs[name].setdefault(graph_name, [])
                    graph_log.append({step: val})

    def _get_new_vals(self):
        """
        Returns:
            The result of calling the chief's ``get_new_values`` through the ray wrapper.
        """
        pending = self._ray.remote(self._chief_handle.get_new_values)
        return self._ray.get(pending)
示例#23
0
        return y


if __name__ == '__main__':
    # GPU flag
    gpu_fg = util.gpuCheck(sys.argv)
    if gpu_fg >= 0:
        cuda.check_cuda_available()
    xp = cuda.cupy if gpu_fg >= 0 else np

    # Initialise pycrayon
    cc = CrayonClient(hostname="192.168.1.198", port=8889)
    # Delete the experiments of a previous run from the server. Each removal is
    # attempted on its own so that a missing train experiment does not leave
    # the test experiment in place; only "no such experiment" is ignored.
    for xp_name in ("AlexNet train (Adam)", "AlexNet test (Adam)"):
        try:
            cc.remove_experiment(xp_name)
        except ValueError:
            pass

    # Create new experiments, falling back to the existing one when the name
    # is still taken.
    try:
        tb_alex_train = cc.create_experiment("AlexNet train (Adam)")
    except ValueError:
        tb_alex_train = cc.open_experiment("AlexNet train (Adam)")
    try:
        tb_alex_test = cc.create_experiment("AlexNet test (Adam)")
    except ValueError:
        tb_alex_test = cc.open_experiment("AlexNet test (Adam)")

    # x_train: 32*32*3
    train, test = get_cifar10()
    x_train, t_train = train._datasets
示例#24
0
        torch.cuda.manual_seed(opt.seed)

if len(opt.gpuid) > 1:
    sys.stderr.write("Sorry, multigpu isn't supported yet, coming soon!\n")
    sys.exit(1)

# Set up the Crayon logging server.
# An empty opt.exp_host disables Crayon logging; `experiment` is then never defined.
if opt.exp_host != "":
    from pycrayon import CrayonClient

    cc = CrayonClient(hostname=opt.exp_host)

    experiments = cc.get_experiment_names()
    print(experiments)
    # Replace an experiment left over under the same name.
    if opt.exp in experiments:
        cc.remove_experiment(opt.exp)
    experiment = cc.create_experiment(opt.exp)

if opt.tensorboard:
    from tensorboardX import SummaryWriter
    writer = SummaryWriter(
        opt.tensorboard_log_dir + datetime.now().strftime("/%b-%d_%H-%M-%S"),
        comment="Onmt")

progress_step = 0


def report_func(epoch, batch, num_batches,
                progress_step,
                start_time, lr, report_stats):
    """
示例#25
0
                             lr=lr)

if not os.path.exists(output_dir):
    os.mkdir(output_dir)

# tensorboard
# Enabled only when requested and CrayonClient is not None.
use_tensorboard = use_tensorboard and CrayonClient is not None
if use_tensorboard:
    cc = CrayonClient(hostname='127.0.0.1')
    if remove_all_log:
        cc.remove_all_experiments()
    if exp_name is None:
        # NOTE(review): the timestamped name is immediately overwritten by
        # save_exp_name on the next line, so it is never used; confirm which
        # of the two is intended.
        exp_name = datetime.now().strftime('vgg16_%m-%d_%H-%M')
        exp_name = save_exp_name
        if exp_name in cc.get_experiment_names():
            cc.remove_experiment(exp_name)
        exp = cc.create_experiment(exp_name)
    else:
        # An explicit experiment name means: continue the existing experiment.
        exp = cc.open_experiment(exp_name)

# training
train_loss = 0
step_cnt = 0
re_cnt = False
t = Timer()
t.tic()

best_mae = sys.maxsize

for epoch in range(start_step, end_step + 1):
    step = -1
示例#26
0
if __name__ == '__main__':
    # GPU flag
    gpu_fg = util.gpuCheck(sys.argv)
    if gpu_fg >= 0:
        cuda.check_cuda_available()
    xp = cuda.cupy if gpu_fg >= 0 else np

    # Initialise pycrayon
    # NOTE(review): this resolves a leftover merge conflict by keeping the HEAD
    # side. The other side used hostname="" - confirm which server address is
    # intended.
    cc = CrayonClient(hostname="192.168.1.201", port=8889)
    # Delete the experiments of a previous run from the server. Each removal is
    # attempted on its own so that a missing generator experiment does not
    # leave the discriminator one in place; only "no such experiment" is ignored.
    for xp_name in ("MNIST_DCGAN_GEN", "MNIST_DCGAN_DIS"):
        try:
            cc.remove_experiment(xp_name)
        except ValueError:
            pass

    # Create new experiments, falling back to the existing one when the name
    # is still taken.
    try:
        tb_gen = cc.create_experiment("MNIST_DCGAN_GEN")
    except ValueError:
        tb_gen = cc.open_experiment("MNIST_DCGAN_GEN")
    try:
        tb_dis = cc.create_experiment("MNIST_DCGAN_DIS")
    except ValueError:
        tb_dis = cc.open_experiment("MNIST_DCGAN_DIS")
        

    # Training Data
    train, test = chainer.datasets.get_mnist()
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import numpy as np
from pycrayon import CrayonClient
import time

cc = CrayonClient(hostname="10.150.6.120")

# Drop any previous run of this experiment; it is fine if there is none.
try:
    cc.remove_experiment("AnalyzeConv3")
except ValueError:
    pass

# Create the experiment. If the name is still taken (creation refused), reuse
# the existing experiment so that OMIE is always bound afterwards instead of
# failing later with a NameError.
try:
    OMIE = cc.create_experiment("AnalyzeConv3")
except ValueError:
    OMIE = cc.open_experiment("AnalyzeConv3")

##
## noise level one
## dimension 2
### z is extracted separately
###
input_size = 64
hidden_size = 128
示例#28
0
文件: train.py 项目: taguka/atlas
def main():
    """Train a classifier end to end, driven by ``DefaultConfigs``.

    Steps, in order: resolve the output directory, build the train/eval
    datasets and loaders, build the model, optimizer, LR scheduler and loss,
    optionally restore a checkpoint, optionally connect to a Crayon
    (TensorBoard) server, optionally fine-tune only the final classifier,
    then run the main train/validate loop. Each epoch appends a row to
    ``summary.csv`` in the output directory and saves a checkpoint.

    Raises:
        ValueError: if ``config.opt`` or ``config.loss`` names an unsupported
            optimizer or loss function.
    """
    config = DefaultConfigs()
    train_input_root = os.path.join(config.data)
    train_labels_file = 'labels.csv'

    # Fall back to a local directory when no output location is configured.
    # The previous fallback branch tried to create the empty path itself.
    output_base = config.output or './output'
    if not os.path.exists(output_base):
        os.makedirs(output_base)

    exp_name = '-'.join([
        datetime.now().strftime("%Y%m%d-%H%M%S"), config.model,
        str(config.img_size), 'f' + str(config.fold)
    ])
    # Earlier runs of the same model/size/fold share this name suffix.
    mask_exp_name = '-'.join(
        [config.model,
         str(config.img_size), 'f' + str(config.fold)])
    previous_runs = sorted(
        glob.glob(os.path.join(output_base, 'train', '*' + mask_exp_name)))
    if config.resume and previous_runs:
        # glob returns a list; directory names start with a timestamp, so the
        # last entry in sorted order is the most recent matching run.
        output_dir = previous_runs[-1]
    else:
        output_dir = get_outdir(output_base, 'train', exp_name)

    batch_size = config.batch_size
    test_batch_size = config.test_batch_size
    num_epochs = config.epochs
    img_type = config.image_type
    img_size = (config.img_size, config.img_size)
    num_classes = get_tags_size(config.labels)

    torch.manual_seed(config.seed)

    dataset_train = HumanDataset(
        train_input_root,
        train_labels_file,
        train=True,
        multi_label=config.multi_label,
        img_type=img_type,
        img_size=img_size,
        fold=config.fold,
    )

    loader_train = data.DataLoader(
        dataset_train,
        batch_size=batch_size,
        shuffle=True,
        num_workers=config.num_processes)

    dataset_eval = HumanDataset(
        train_input_root,
        train_labels_file,
        train=False,
        multi_label=config.multi_label,
        img_type=img_type,
        img_size=img_size,
        test_aug=config.tta,
        fold=config.fold,
    )

    loader_eval = data.DataLoader(dataset_eval,
                                  batch_size=test_batch_size,
                                  shuffle=False,
                                  num_workers=config.num_processes)

    model = get_net(config.model, num_classes, config.drop, config.channels)

    if not config.no_cuda:
        if config.num_gpu > 1:
            model = torch.nn.DataParallel(model,
                                          device_ids=list(range(
                                              config.num_gpu))).cuda()
        else:
            model.cuda()

    opt_name = config.opt.lower()
    if opt_name == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=config.lr,
                              momentum=config.momentum,
                              weight_decay=config.weight_decay)
    elif opt_name == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=config.lr,
                               weight_decay=config.weight_decay)
    elif opt_name == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=config.lr,
                                   weight_decay=config.weight_decay)
    elif opt_name == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=config.lr,
                                  alpha=0.9,
                                  momentum=config.momentum,
                                  weight_decay=config.weight_decay)
    elif opt_name == 'yellowfin':
        optimizer = YFOptimizer(model.parameters(),
                                lr=config.lr,
                                weight_decay=config.weight_decay,
                                clip_thresh=2)
    else:
        # An explicit exception survives ``python -O`` and keeps its message.
        raise ValueError("Invalid optimizer: %r" % (config.opt,))

    # Plateau-based scheduling is used only when no fixed decay schedule is
    # configured; otherwise the LR is adjusted per epoch in the main loop.
    if not config.decay_epochs:
        lr_scheduler = ReduceLROnPlateau(optimizer, patience=8)
    else:
        lr_scheduler = None

    if config.class_weights:
        class_weights = torch.from_numpy(
            dataset_train.get_class_weights()).float()
        class_weights_norm = class_weights / class_weights.sum()
        if not config.no_cuda:
            class_weights = class_weights.cuda()
            class_weights_norm = class_weights_norm.cuda()
    else:
        class_weights = None
        class_weights_norm = None

    loss_name = config.loss.lower()
    if loss_name == 'nll':
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
    elif loss_name == 'mlsm':
        assert config.multi_label
        loss_fn = torch.nn.MultiLabelSoftMarginLoss(weight=class_weights)
    else:
        # The previous assert here was always true, so an unknown loss name
        # slipped through and failed later with an unbound ``loss_fn``.
        raise ValueError("Invalid loss function: %r" % (config.loss,))

    if not config.no_cuda:
        loss_fn = loss_fn.cuda()

    # optionally resume from a checkpoint
    start_epoch = 1
    if config.resume:
        if os.path.isfile(config.resume):
            print("=> loading checkpoint '{}'".format(config.resume))
            checkpoint = torch.load(config.resume)
            config.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                config.resume, checkpoint['epoch']))
            # Checkpoints store ``epoch + 1``, i.e. the next epoch to run.
            start_epoch = checkpoint['epoch']
        else:
            print("=> no checkpoint found at '{}'".format(config.resume))
            exit(-1)

    use_tensorboard = not config.no_tb and CrayonClient is not None
    if use_tensorboard:
        hostname = '127.0.0.1'
        port = 8889
        # ``config.tbh`` is "host" or "host:port"; the defaults above apply
        # to whichever part is missing.
        host_port = config.tbh.split(':')[:2]
        if len(host_port) == 1:
            hostname = host_port[0]
        elif len(host_port) >= 2:
            hostname, port = host_port[:2]
        try:
            cc = CrayonClient(hostname=hostname, port=port)
            try:
                cc.remove_experiment(exp_name)
            except ValueError:
                pass
            exp = cc.create_experiment(exp_name)
        except Exception as e:
            # Logging is best-effort: training continues without it.
            exp = None
            print(
                "Error (%s) connecting to Tensoboard/Crayon server. Giving up..."
                % str(e))
    else:
        exp = None

    # Optional fine-tune of only the final classifier weights for specified number of epochs (or part of)
    if not config.resume and config.ft_epochs > 0.:
        if opt_name == 'adam':
            finetune_optimizer = optim.Adam(model.get_fc().parameters(),
                                            lr=config.ft_lr,
                                            weight_decay=config.weight_decay)
        else:
            finetune_optimizer = optim.SGD(model.get_fc().parameters(),
                                           lr=config.ft_lr,
                                           momentum=config.momentum,
                                           weight_decay=config.weight_decay)

        # ``ft_epochs`` may be fractional: run whole epochs, and cap the last
        # one at the number of batches covering the fractional part.
        finetune_epochs_int = int(np.ceil(config.ft_epochs))
        finetune_final_batches = int(
            np.ceil((1 - (finetune_epochs_int - config.ft_epochs)) *
                    len(loader_train)))
        print(finetune_epochs_int, finetune_final_batches)
        for fepoch in range(1, finetune_epochs_int + 1):
            if fepoch == finetune_epochs_int and finetune_final_batches:
                batch_limit = finetune_final_batches
            else:
                batch_limit = 0
            train_epoch(fepoch,
                        model,
                        loader_train,
                        finetune_optimizer,
                        loss_fn,
                        config,
                        class_weights_norm,
                        output_dir,
                        batch_limit=batch_limit)
            step = fepoch * len(loader_train)
            validate(step, model, loader_eval, loss_fn, config, 0.3,
                     output_dir)

    score_metric = 'f2'
    best_loss = None
    best_f2 = None
    threshold = 0.2
    try:
        for epoch in range(start_epoch, num_epochs + 1):
            if config.decay_epochs:
                adjust_learning_rate(optimizer,
                                     epoch,
                                     initial_lr=config.lr,
                                     decay_epochs=config.decay_epochs)

            train_metrics = train_epoch(epoch,
                                        model,
                                        loader_train,
                                        optimizer,
                                        loss_fn,
                                        config,
                                        class_weights_norm,
                                        output_dir,
                                        exp=exp)

            step = epoch * len(loader_train)
            eval_metrics, latest_threshold = validate(step,
                                                      model,
                                                      loader_eval,
                                                      loss_fn,
                                                      config,
                                                      threshold,
                                                      output_dir,
                                                      exp=exp)

            if lr_scheduler is not None:
                lr_scheduler.step(eval_metrics['eval_loss'])

            rowd = OrderedDict(epoch=epoch)
            rowd.update(train_metrics)
            rowd.update(eval_metrics)
            with open(os.path.join(output_dir, 'summary.csv'), mode='a') as cf:
                dw = csv.DictWriter(cf, fieldnames=rowd.keys())
                if best_loss is None:  # first iteration (epoch == 1 can't be used)
                    dw.writeheader()
                dw.writerow(rowd)

            best = False
            if best_loss is None or eval_metrics['eval_loss'] < best_loss[1]:
                best_loss = (epoch, eval_metrics['eval_loss'])
                if score_metric == 'loss':
                    best = True
            if best_f2 is None or eval_metrics['eval_f2'] > best_f2[1]:
                best_f2 = (epoch, eval_metrics['eval_f2'])
                if score_metric == 'f2':
                    best = True

            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': config.model,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'threshold': latest_threshold,
                    'config': config
                },
                is_best=best,
                filename=os.path.join(config.checkpoint_path,
                                      'checkpoint-%d.pth.tar' % epoch),
                output_dir=output_dir)

    except KeyboardInterrupt:
        pass

    # Both trackers stay None when no epoch finished (interrupted during the
    # first epoch, or start_epoch already past num_epochs).
    if best_loss is not None:
        print('*** Best loss: {0} (epoch {1})'.format(best_loss[1],
                                                      best_loss[0]))
    if best_f2 is not None:
        print('*** Best f2: {0} (epoch {1})'.format(best_f2[1], best_f2[0]))
    if best_loss is None and best_f2 is None:
        print('*** No epoch completed; no best scores to report')
from torch.utils import data as D
from albumentations import (
    HorizontalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Cutout, Transpose, ShiftScaleRotate, Blur, OpticalDistortion,
    GridDistortion, HueSaturationValue, IAAAdditiveGaussianNoise, GaussNoise,
    MotionBlur, MedianBlur, IAAPiecewiseAffine, IAASharpen, IAAEmboss,
    RandomContrast, RandomBrightness, Flip, OneOf, Compose, ToGray)
from patchwise import Quadrant

## If you want to keep a track of your network on tensorboard, set USE_TENSORBOARD TO 1 in config file.

if USE_TENSORBOARD:
    from pycrayon import CrayonClient
    cc = CrayonClient(hostname=TENSORBOARD_SERVER)
    # Start from a clean experiment: drop a previous run with the same name.
    # Only the client's ValueError (request refused, e.g. no such experiment)
    # is ignored; anything else is a real problem and should surface.
    try:
        cc.remove_experiment(EXP_NAME)
    except ValueError:
        pass
    foo = cc.create_experiment(EXP_NAME)

## If you want to use the GPU, set GPU_MODE TO 1 in config file

use_gpu = GPU_MODE
if use_gpu:
    torch.cuda.set_device(CUDA_DEVICE)

count = 0


def strong_aug(p=.5):
    return Compose([
示例#30
0
def eval_loop(counter, args, shared_model, model_eval):
    """Repeatedly evaluate a copy of ``shared_model`` until training ends.

    Each round syncs the local copy with ``shared_model``, runs
    ``args.n_eval`` evaluation episodes through ``model_eval``, prints
    progress with an estimated time to finish, logs one reward per episode to
    a Crayon experiment, and saves weights (best model, periodic snapshots,
    latest). The loop ends once ``counter.value`` reaches ``args.n_steps``.

    Args:
        counter: shared step counter; only its ``value`` attribute is read.
        args: run options (``type``, ``name``, ``n_eval``, ``n_steps``,
            ``save_intervel``, ``model_path``, ``save_path``).
        shared_model: model being trained elsewhere; its state is copied in
            at the start of every round.
        model_eval: callable ``model_eval(model, env, vis=...)`` returning
            the fields of one ``EvalResult``.

    Raises:
        KeyboardInterrupt: re-raised after saving the shared model's weights.
    """
    try:
        SEC_PER_DAY = 24 * 60 * 60

        env = build_env(args.type,
                        args,
                        treat_life_lost_as_terminal=False,
                        max_time=5 * 60)
        model = copy.deepcopy(shared_model)
        model.eval()

        # Create a new experiment
        vis = visdom.Visdom(env='A3C:' + args.name)

        # One Crayon experiment per evaluation slot, recreated from scratch.
        cc = CrayonClient()
        names = cc.get_experiment_names()
        summaries = []
        for idx in range(args.n_eval):
            name = "{} [{}]".format(args.name, idx + 1)
            if name in names:
                cc.remove_experiment(name)
            summaries.append(cc.create_experiment(name))

        max_reward = None
        save_condition = args.save_intervel

        rewards = []
        start_time = time.time()
        while True:
            # Sync with the shared model
            model.load_state_dict(shared_model.state_dict())

            restart = False
            eval_start_time = time.time()
            eval_start_step = counter.value
            results = []
            for i in range(args.n_eval):
                model.reset_state()
                results.append(model_eval(model, env, vis=(vis, i + 1, 60)))
                if env.exceed_max:
                    # Episode hit the time limit: discard this round entirely.
                    restart = True
                    env.reset()
                    break
                env.reset()

            if restart:
                continue

            eval_end_time, eval_end_step = time.time(), counter.value
            results = EvalResult(*zip(*results))
            rewards.append((counter.value, results.reward))

            local_max_reward = np.max(results.reward)
            if max_reward is None or max_reward < local_max_reward:
                max_reward = local_max_reward

            # True for a new best and also for a tie with the best so far.
            if local_max_reward >= max_reward:
                # Save model
                torch.save(model.state_dict(),
                           os.path.join(args.model_path, 'best_model.pth'))

            time_since_start = eval_end_time - start_time
            day = time_since_start // SEC_PER_DAY
            time_since_start %= SEC_PER_DAY

            # Estimate the remaining time from the step rate observed during
            # this round. The rate is undefined when the counter did not
            # advance (would divide by zero), and the remaining step count is
            # clamped so an overshoot past n_steps does not go negative.
            steps_done = eval_end_step - eval_start_step
            steps_left = max(args.n_steps - eval_end_step, 0)
            if steps_done > 0:
                seconds_to_finish = steps_left / steps_done * (
                    eval_end_time - eval_start_time)
                days_to_finish = seconds_to_finish // SEC_PER_DAY
                seconds_to_finish %= SEC_PER_DAY
                finish_in = '{}d {}'.format(
                    '%02d' % days_to_finish,
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(seconds_to_finish)))
            else:
                finish_in = 'unknown'
            print("STEP:[{}|{}], Time: {}d {}, Finish in {}".format(
                counter.value, args.n_steps, '%02d' % day,
                time.strftime("%Hh %Mm %Ss", time.gmtime(time_since_start)),
                finish_in))
            print(
                '\tMax reward: {}, avg_reward: {}, std_reward: {}, min_reward: {}, max_reward: {}'
                .format(max_reward, np.mean(results.reward),
                        np.std(results.reward), np.min(results.reward),
                        local_max_reward))

            # Plot
            for summary, reward in zip(summaries, results.reward):
                summary.add_scalar_value('reward',
                                         reward,
                                         step=eval_start_step)

            if counter.value > save_condition or counter.value >= args.n_steps:
                save_condition += args.save_intervel
                torch.save(
                    model.state_dict(),
                    os.path.join(args.model_path,
                                 'model_iter_{}.pth'.format(counter.value)))
                torch.save(model.state_dict(),
                           os.path.join(args.model_path, 'model_latest.pth'))

                # Flush the buffered reward history to disk, then clear it.
                with open(os.path.join(args.save_path, 'rewards'), 'a+') as f:
                    for record in rewards:
                        f.write('{}: {}\n'.format(record[0], record[1]))
                del rewards[:]

            if counter.value >= args.n_steps:
                print('Evaluator Finished !!!')
                break
    except KeyboardInterrupt:
        # Keep the latest shared weights before propagating the interrupt.
        torch.save(shared_model.state_dict(),
                   os.path.join(args.model_path, 'model_latest.pth'))
        raise
示例#31
0
        torch.cuda.manual_seed(opt.seed)

# Abort early: this script refuses to run with more than one GPU id.
if len(opt.gpuid) > 1:
    sys.stderr.write("Sorry, multigpu isn't supported yet, coming soon!\n")
    sys.exit(1)

# Set up the Crayon logging server.
# Skipped entirely when opt.exp_host is the empty string.
if opt.exp_host != "":
    from pycrayon import CrayonClient

    cc = CrayonClient(hostname=opt.exp_host)

    experiments = cc.get_experiment_names()
    print(experiments)
    # Remove an existing experiment with the same name so that the create
    # call below starts from an empty experiment.
    if opt.exp in experiments:
        cc.remove_experiment(opt.exp)
    experiment = cc.create_experiment(opt.exp)

# Optional tensorboardX writer; the log directory is opt.tensorboard_log_dir
# plus a sub-directory named after the current date and time.
if opt.tensorboard:
    from tensorboardX import SummaryWriter
    writer = SummaryWriter(
        opt.tensorboard_log_dir + datetime.now().strftime("/%b-%d_%H-%M-%S"),
        comment="Onmt")

# Module-level step counter, initialised to zero.
progress_step = 0


def report_func(epoch, batch, num_batches,
                progress_step,
                start_time, lr, report_stats):
    """
示例#32
0
def main(args):
    """Train the CNN encoder / RNN decoder pair and save their weights.

    Sets up optional Crayon (TensorBoard) logging, creates the model
    directory and writes the run parameters into it, loads or builds the
    vocabulary, builds the data loaders and models, then trains for
    ``args.num_epochs`` epochs. Weights are saved every ``args.save_step``
    batches and once more when training finishes.

    Args:
        args: parsed command-line options (for example ``name``,
            ``model_path``, ``image_dir``, ``vocab_path``, ``num_epochs``,
            ``tensorboard``).
    """
    #setup tensorboard
    if args.tensorboard:
        cc = CrayonClient(hostname="localhost")
        print(cc.get_experiment_names())
        # Remove a previous experiment of the same name; the client raises
        # ValueError when the server refuses (e.g. it does not exist).
        try:
            cc.remove_experiment(args.name)
        except ValueError:
            print("experiment didnt exist")
        cc_server = cc.create_experiment(args.name)

    # Create model directory
    full_model_path = args.model_path + "/" + args.name
    if not os.path.exists(full_model_path):
        os.makedirs(full_model_path)
    with open(full_model_path + "/parameters.json", 'w') as f:
        f.write((json.dumps(vars(args))))

    # Image preprocessing

    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    mini_transform = transforms.Compose(
        [transforms.ToPILImage(),
         transforms.Scale(20),
         transforms.ToTensor()])

    # Load vocabulary wrapper.
    if args.vocab_path is not None:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point vocab_path at files you trust.
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)
    else:
        print("building new vocab")
        vocab = build_vocab(args.image_dir, 1, None)
        with open((full_model_path + "/vocab.pkl"), 'wb') as f:
            pickle.dump(vocab, f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)
    code_data_set = ProcessingDataset(root=args.image_dir,
                                      vocab=vocab,
                                      transform=transform)
    train_ds, val_ds = validation_split(code_data_set)
    train_loader = torch.utils.data.DataLoader(train_ds, collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(val_ds, collate_fn=collate_fn)
    train_size = len(train_loader)
    test_size = len(test_loader)

    # Build the models
    encoder = EncoderCNN(args.embed_size, args.train_cnn)
    print(encoder)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    print(decoder)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    # Only the decoder and the encoder's linear + batch-norm layers are
    # handed to the optimizer.
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    start_time = time.time()
    add_log_entry(args.name, start_time, vars(args))

    # Train the Models
    total_step = len(data_loader)
    # Pre-bind the loop variables: the final save below uses them, and they
    # would be undefined with zero epochs or an empty data loader.
    epoch = i = -1
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            decoder.train()
            encoder.train()
            # Set mini-batch dataset
            image_ts = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_ts)
            outputs = decoder(features, captions, lengths)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Token-level accuracy: share of targets whose highest-scoring
            # output matches.
            total = targets.size(0)
            _, predicted = torch.max(outputs.data, 1)
            correct = predicted.eq(targets.data).cpu().sum()
            accuracy = 100. * correct / total

            if args.tensorboard:
                cc_server.add_scalar_value("train_loss", loss.data[0])
                cc_server.add_scalar_value("perplexity", np.exp(loss.data[0]))
                cc_server.add_scalar_value("accuracy", accuracy)

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, accuracy: %2.2f Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       accuracy, np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(full_model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(full_model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
            # In-loop validation pass, deliberately switched off by the
            # always-false ``1 == 2`` guard; kept for reference.
            if 1 == 2 and i % int(train_size / 10) == 0:
                encoder.eval()
                correct = 0
                for ti, (timages, tcaptions,
                         tlengths) in enumerate(test_loader):
                    timage_ts = to_var(timages, volatile=True)
                    tcaptions = to_var(tcaptions)
                    ttargets = pack_padded_sequence(tcaptions,
                                                    tlengths,
                                                    batch_first=True)[0]
                    tfeatures = encoder(timage_ts)
                    toutputs = decoder(tfeatures, tcaptions, tlengths)
                    print(ttargets)
                    print(toutputs)
                    print(ttargets.size())
                    print(toutputs.size())

                accuracy = 100 * correct / test_size
                print('accuracy: %.4f' % (accuracy))
                if args.tensorboard:
                    cc_server.add_scalar_value("accuracy", accuracy)

    # Final snapshot, named after the last epoch/batch that ran.
    torch.save(
        decoder.state_dict(),
        os.path.join(full_model_path,
                     'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
    torch.save(
        encoder.state_dict(),
        os.path.join(full_model_path,
                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
    end_time = time.time()
    # Format the runtime into the message; the old call printed the raw
    # format string followed by a list.
    print("finished training, runtime: %d" % (end_time - start_time))