Example #1
class Logger(object):
    def __init__(self, opt):
        self.exp_name = opt['name']
        self.use_tb_logger = opt['use_tb_logger']
        self.opt = opt['logger']
        self.log_dir = opt['path']['log']
        # loss log file
        self.loss_log_path = os.path.join(self.log_dir, 'loss_log.txt')
        with open(self.loss_log_path, 'a') as log_file:
            log_file.write('=============== Time: ' + get_timestamp() +
                           ' =============\n')
            log_file.write(
                '================ Training Losses ================\n')
        # val results log file
        self.val_log_path = os.path.join(self.log_dir, 'val_log.txt')
        with open(self.val_log_path, 'a') as log_file:
            log_file.write('================ Time: ' + get_timestamp() +
                           ' ===============\n')
            log_file.write(
                '================ Validation Results ================\n')
        if self.use_tb_logger and 'debug' not in self.exp_name:
            from tensorboard_logger import Logger as TensorboardLogger
            self.tb_logger = TensorboardLogger('../tb_logger/' + self.exp_name)

    def print_format_results(self, mode, rlt):
        epoch = rlt.pop('epoch')
        iters = rlt.pop('iters')
        time = rlt.pop('time')
        model = rlt.pop('model')
        if 'lr' in rlt:
            lr = rlt.pop('lr')
            message = '<epoch:{:3d}, iter:{:8,d}, time:{:.2f}, lr:{:.1e}> '.format(
                epoch, iters, time, lr)
        else:
            message = '<epoch:{:3d}, iter:{:8,d}, time:{:.2f}> '.format(
                epoch, iters, time)

        for label, value in rlt.items():
            if mode == 'train':
                message += '{:s}: {:.2e} '.format(label, value)
            elif mode == 'val':
                message += '{:s}: {:.4e} '.format(label, value)
            # tensorboard logger
            if self.use_tb_logger and 'debug' not in self.exp_name:
                self.tb_logger.log_value(label, value, iters)

        # print in console
        print(message)
        # write in log file
        if mode == 'train':
            with open(self.loss_log_path, 'a') as log_file:
                log_file.write(message + '\n')
        elif mode == 'val':
            with open(self.val_log_path, 'a') as log_file:
                log_file.write(message + '\n')

    def log_message(self, rlt):
        iters = rlt.pop('iters')
        for label, value in rlt.items():
            self.tb_logger.log_value(label, value, iters)
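This Logger is driven by a nested options dict rather than a plain path. A minimal usage sketch follows; the concrete option values and the 'l_pix' loss key are illustrative assumptions, and only the keys read in __init__ ('name', 'use_tb_logger', 'logger', 'path'/'log') come from the code above.

# Hypothetical usage of the options-driven Logger above (values are placeholders).
opt = {
    'name': 'sr_experiment',                        # experiment name (assumed)
    'use_tb_logger': True,
    'logger': {'print_freq': 100},                  # contents of the 'logger' sub-dict (assumed)
    'path': {'log': './experiments/sr_experiment'},
}
logger = Logger(opt)
logger.print_format_results('train', {
    'epoch': 1, 'iters': 100, 'time': 0.35, 'model': 'sr', 'lr': 1e-4,
    'l_pix': 2.3e-3,                                # remaining keys are formatted as losses/metrics
})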
Example #2
def test_smoke_logger(tmpdir):
    logger = Logger(str(tmpdir), flush_secs=0.1)
    for step in range(10):
        logger.log_value('v1', step * 1.5, step)
        logger.log_value('v2', step**1.5 - 2)
    time.sleep(0.5)
    tf_log, = tmpdir.listdir()
    assert tf_log.basename.startswith('events.out.tfevents.')
Example #3
def test_dummy():
    logger = Logger(None, is_dummy=True)
    for step in range(3):
        logger.log_value('A v/1', step, step)
        logger.log_value('A v/2', step * 2, step)
    assert dict(logger.dummy_log) == {
        'A_v/1': [(0, 0), (1, 1), (2, 2)],
        'A_v/2': [(0, 0), (1, 2), (2, 4)],
    }
Example #4
def test_unique():
    logger = Logger(None, is_dummy=True)
    for step in range(1, 3):
        # names that normalize to the same valid name
        logger.log_value('A v/1', step, step)
        logger.log_value('A\tv/1', step * 2, step)
        logger.log_value('A  v/1', step * 3, step)
    assert dict(logger.dummy_log) == {
        'A_v/1': [(1, 1), (2, 2)],
        'A_v/1/1': [(1, 2), (2, 4)],
        'A_v/1/2': [(1, 3), (2, 6)],
    }
Example #5
def test_serialization(tmpdir):
    logger = Logger(str(tmpdir), flush_secs=0.1, dummy_time=256.5)
    logger.log_value('v/1', 1.5, 1)
    logger.log_value('v/22', 16.0, 2)
    time.sleep(0.5)
    tf_log, = tmpdir.listdir()
    assert tf_log.read_binary() == (
        # step = 0, initial record
        b'\x18\x00\x00\x00\x00\x00\x00\x00\xa3\x7fK"\t\x00\x00\x00\x00\x00\x08p@\x1a\rbrain.Event:2\xbc\x98!+'
        # v/1
        b'\x19\x00\x00\x00\x00\x00\x00\x00\x8b\xf1\x08(\t\x00\x00\x00\x00\x00\x08p@\x10\x01*\x0c\n\n\n\x03v/1\x15\x00\x00\xc0?,\xec\xc0\x87'
        # v/22
        b'\x1a\x00\x00\x00\x00\x00\x00\x00\x12\x9b\xd8-\t\x00\x00\x00\x00\x00\x08p@\x10\x02*\r\n\x0b\n\x04v/22\x15\x00\x00\x80A\x8f\xa3\xb6\x88'
    )
Example #6
class Visualizer():
    def __init__(self, log_dir='runs/', **kwargs):
        self.tenbd = Logger(log_dir, flush_secs=10)

        self.index = {}

        self.log_text = ''

    def plot(self, name, y):
        x = self.index.get(name, 0)
        self.tenbd.log_value(name, y, x)
        self.index[name] = x + 1

    def plotMany(self, data):
        for k, v in data.items():
            self.plot(k, v)
Example #7
class Visualizer():
    '''
    Wraps visdom and tensorboard_logger to make it easier to record losses
    '''
    def __init__(self, env='default', log_dir='runs/BiGRU', **kwargs):
        # self.vis = visdom.Visdom(env=env, **kwargs)
        self.tenbd = Logger(log_dir, flush_secs=2)

        # x-axis step index for each plotted quantity, e.g. {'img': 2, 'loss': 12}
        self.index = {}
        # buffer for accumulated log text
        self.log_text = ''

    # def reinit(self, env='default', **kwargs):
    #     '''
    #     Reconfigure visdom
    #     '''
    #     self.vis =  visdom.Visdom(env=env, **kwargs)

    # return vis

    def plot(self, name, y):
        '''
        self.plot('loss',0.23)
        '''
        x = self.index.get(name, 0)
        # self.vis.line(Y=np.array([y]),
        #         X=np.array([x]),
        #         win=name,
        #         opts=dict(title=name),
        #         update=None if x==0 else 'append')
        self.tenbd.log_value(name, y, x)

        self.index[name] = x + 1

    def plotMany(self, data):
        '''
        Plot several values at once
        '''
        for k, v in data.items():
            self.plot(k, v)

    def log(self, info, win='log_text'):
        '''
        Record a line of free-form log text, e.g. self.log('loss: 0.23')
        '''
        # Minimal completion (assumed; the original body is truncated):
        # append the message to the in-memory text buffer.
        self.log_text += '{}\n'.format(info)

Example #8
class Logger:
    """
    Deals with writing tensorboard summaries.
    And logging metric history to a pickle file
    """
    def __init__(self, outdir):
        self.outdir = outdir
        self.tf_logger = TFLogger(os.path.join(outdir, 'run'), flush_secs=2)
        self.metric_history: Dict = defaultdict(list)

    def log_metrics(self, phase, metrics, global_step):
        """ Logs scalar values as tf summaries.
            Don't bother with true_mean, it stays the same
            and doesn't really work as a graph. """
        for name, value in metrics.items():
            if name != "true_mean":
                self.tf_logger.log_value(f"{phase} {name}", value, global_step)

        # save a standard pickle object for easy matplotlib plots of performance over epochs
        self.metric_history[phase].append(metrics)
        with open(os.path.join(self.outdir, "metric_history.pkl"),
                  "wb") as metric_file:
            pickle.dump(self.metric_history, metric_file)
Example #9
# -*- coding: utf-8 -*-
# @TIME : 2021/3/26 12:37
# @AUTHOR : Xu Bai
# @FILE : 5-3-1.TensorBoard.py
# @DESCRIPTION :
from tensorboard_logger import Logger

# tensorboard --logdir experimient_cnn
# Build the Logger object; logdir specifies the directory for the log files
# flush_secs specifies the flush/sync interval in seconds
logger = Logger(logdir='experimient_cnn', flush_secs=2)
for ii in range(100):
    logger.log_value('loss', 10 - ii * .5, step=ii)
    logger.log_value('accuracy', ii * .5 / 10)
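tensorboard_logger also exposes a module-level API (configure/log_value), which avoids passing a Logger object around; a minimal sketch of the same loop in that style (the log directory name here is arbitrary):

from tensorboard_logger import configure, log_value

configure('experimient_cnn_functional', flush_secs=2)  # arbitrary log directory
for ii in range(100):
    log_value('loss', 10 - ii * .5, step=ii)
    log_value('accuracy', ii * .5 / 10, step=ii)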
Example #10
File: main.py  Project: yyyu200/ecg_pytorch
def train(args):
    # model
    model = getattr(models, config.model_name)()
    if args.ckpt and not args.resume:
        state = torch.load(args.ckpt, map_location='cpu')
        model.load_state_dict(state['state_dict'])
        print('train with pretrained weight val_f1', state['f1'])
    model = model.to(device)
    # data
    train_dataset = ECGDataset(data_path=config.train_data, train=True)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=6)
    val_dataset = ECGDataset(data_path=config.train_data, train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=4)
    print("train_datasize", len(train_dataset), "val_datasize",
          len(val_dataset))
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    criterion = utils.WeightedMultilabel(w)
    # directory for saving model checkpoints
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name,
                                   time.strftime("%Y%m%d%H%M"))
    if args.ex: model_save_dir += args.ex
    best_f1 = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    # resume training from the last checkpoint
    if args.resume:
        if os.path.exists(args.ckpt):  # this is the directory that holds the saved weights
            model_save_dir = args.ckpt
            current_w = torch.load(os.path.join(args.ckpt, config.current_w))
            best_w = torch.load(os.path.join(model_save_dir, config.best_w))
            best_f1 = best_w['f1']  # the checkpoint stores val_f1 under the 'f1' key
            start_epoch = current_w['epoch'] + 1
            lr = current_w['lr']
            stage = current_w['stage']
            model.load_state_dict(current_w['state_dict'])
            # if the interruption point happens to be exactly a stage-transition epoch
            if start_epoch - 1 in config.stage_epoch:
                stage += 1
                lr /= config.lr_decay
                utils.adjust_learning_rate(optimizer, lr)
                model.load_state_dict(best_w['state_dict'])
            print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
    logger = Logger(logdir=model_save_dir, flush_secs=2)
    # =========> start training <=========
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_f1 = train_epoch(model,
                                           optimizer,
                                           criterion,
                                           train_dataloader,
                                           show_interval=100)
        val_loss, val_f1 = val_epoch(model, criterion, val_dataloader)
        print(
            '#epoch:%02d stage:%d train_loss:%.3e train_f1:%.3f  val_loss:%0.3e val_f1:%.3f time:%s\n'
            % (epoch, stage, train_loss, train_f1, val_loss, val_f1,
               utils.print_time_cost(since)))
        logger.log_value('train_loss', train_loss, step=epoch)
        logger.log_value('train_f1', train_f1, step=epoch)
        logger.log_value('val_loss', val_loss, step=epoch)
        logger.log_value('val_f1', val_f1, step=epoch)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage
        }
        save_ckpt(state, best_f1 < val_f1, model_save_dir)
        best_f1 = max(best_f1, val_f1)
        if epoch in config.stage_epoch:
            stage += 1
            lr /= config.lr_decay
            best_w = os.path.join(model_save_dir, config.best_w)
            model.load_state_dict(torch.load(best_w)['state_dict'])
            print("*" * 10, "step into stage%02d lr %.3ef" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)
Example #11
def main():

    DIR = args.DIR
    embedding_file = args.embedding_dir

    best_network_file = "./model/network_model_pretrain.best.top"
    print >> sys.stderr, "Read model from ", best_network_file
    best_network_model = torch.load(best_network_file)

    embedding_matrix = numpy.load(embedding_file)
    "Building torch model"
    worker = network.Network(
        nnargs["pair_feature_dimention"], nnargs["mention_feature_dimention"],
        nnargs["word_embedding_dimention"], nnargs["span_dimention"], 1000,
        nnargs["embedding_size"], nnargs["embedding_dimention"],
        embedding_matrix).cuda()
    net_copy(worker, best_network_model)

    best_network_file = "./model/network_model_pretrain.best.top"
    print >> sys.stderr, "Read model from ", best_network_file
    best_network_model = torch.load(best_network_file)

    manager = network.Network(
        nnargs["pair_feature_dimention"], nnargs["mention_feature_dimention"],
        nnargs["word_embedding_dimention"], nnargs["span_dimention"], 1000,
        nnargs["embedding_size"], nnargs["embedding_dimention"],
        embedding_matrix).cuda()
    net_copy(manager, best_network_model)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    #train_docs_iter = DataReader.DataGnerater("train"+reduced)
    train_docs_iter = DataReader.DataGnerater("dev" + reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs_iter = DataReader.DataGnerater("dev" + reduced)
    test_docs_iter = DataReader.DataGnerater("test" + reduced)

    print "Performance after pretraining..."
    print "DEV"
    metric = performance.performance(dev_docs_iter, worker, manager)
    print "Average:", metric["average"]
    print "TEST"
    metric = performance.performance(test_docs_iter, worker, manager)
    print "Average:", metric["average"]
    print "***"
    print
    sys.stdout.flush()

    lr = nnargs["lr"]
    top_k = nnargs["top_k"]

    model_save_dir = "./model/reinforce/"
    utils.mkdir(model_save_dir)

    score_softmax = nn.Softmax()

    optimizer_manager = optim.RMSprop(manager.parameters(), lr=lr, eps=1e-6)
    optimizer_worker = optim.RMSprop(worker.parameters(), lr=lr, eps=1e-6)

    MAX_AVE = 2048

    for echo in range(nnargs["epoch"]):

        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo

        reward_log = Logger(Tensorboard + args.tb +
                            "/acl2018/%d/reward/" % echo,
                            flush_secs=3)
        entropy_log_manager = Logger(Tensorboard + args.tb +
                                     "/acl2018/%d/entropy/manager" % echo,
                                     flush_secs=3)
        entropy_log_worker = Logger(Tensorboard + args.tb +
                                    "/acl2018/%d/entropy/worker" % echo,
                                    flush_secs=3)

        #train_docs = utils.load_pickle(args.DOCUMENT + 'train_docs.pkl')
        train_docs = utils.load_pickle(args.DOCUMENT + 'dev_docs.pkl')
        docs_by_id = {doc.did: doc for doc in train_docs}

        ave_reward = []
        ave_manager_entropy = []
        ave_worker_entropy = []

        print >> sys.stderr, "Link docs ..."
        tmp_data = []
        cluster_info = {0: [0]}
        cluster_list = [0]
        current_new_cluster = 1
        predict_action_embedding = []
        choose_action = []
        mid = 1

        step = 0

        statistic = {
            "worker_hits": 0,
            "manager_hits": 0,
            "total": 0,
            "manager_predict_last": 0,
            "worker_predict_last": 0
        }

        for data in train_docs_iter.rl_case_generater(shuffle=True):

            rl = data["rl"]

            scores_manager, representations_manager = get_score_representations(
                manager, data)

            for s, e in zip(rl["starts"], rl["ends"]):
                action_embeddings = representations_manager[s:e]

                probs = F.softmax(torch.transpose(scores_manager[s:e], 0, 1))

                m = Categorical(probs)
                this_action = m.sample()
                index = this_action.data.cpu().numpy()[0]

                if index == (e - s - 1):
                    should_cluster = current_new_cluster
                    cluster_info[should_cluster] = []
                    current_new_cluster += 1
                else:
                    should_cluster = cluster_list[index]

                choose_action.append(index)
                cluster_info[should_cluster].append(mid)
                cluster_list.append(should_cluster)
                mid += 1

                cluster_indexs = torch.cuda.LongTensor(
                    cluster_info[should_cluster])
                action_embedding_predict = torch.mean(
                    action_embeddings[cluster_indexs], 0, keepdim=True)
                predict_action_embedding.append(action_embedding_predict)

            tmp_data.append(data)

            if rl["end"] == True:

                inside_index = 0
                manager_path = []
                worker_path = []

                doc = docs_by_id[rl["did"]]

                for data in tmp_data:

                    rl = data["rl"]
                    pair_target = data["pair_target"]
                    anaphoricity_target = 1 - data["anaphoricity_target"]
                    target = numpy.concatenate(
                        (pair_target, anaphoricity_target))[rl["reindex"]]

                    scores_worker, representations_worker = get_score_representations(
                        worker, data)

                    for s, e in zip(rl["starts"], rl["ends"]):
                        action_embeddings = representations_worker[s:e]
                        score = score_softmax(
                            torch.transpose(scores_worker[s:e], 0,
                                            1)).data.cpu().numpy()[0]

                        action_embedding_choose = predict_action_embedding[
                            inside_index]
                        similarities = torch.sum(
                            torch.abs(action_embeddings -
                                      action_embedding_choose), 1)
                        similarities = similarities.data.cpu().numpy()

                        action_probabilities = []
                        action_list = []
                        action_candidates = heapq.nlargest(
                            top_k, -similarities)
                        for action in action_candidates:
                            action_index = numpy.argwhere(
                                similarities == -action)[0][0]
                            action_probabilities.append(score[action_index])
                            action_list.append(action_index)

                        manager_action = choose_action[inside_index]
                        if not manager_action in action_list:
                            action_list.append(manager_action)
                            action_probabilities.append(score[manager_action])

                        this_target = target[s:e]
                        manager_action = choose_action[inside_index]

                        sample_action = utils.sample_action(
                            numpy.array(action_probabilities))
                        worker_action = action_list[sample_action]

                        if this_target[worker_action] == 1:
                            statistic["worker_hits"] += 1
                        if this_target[manager_action] == 1:
                            statistic["manager_hits"] += 1
                        if worker_action == (e - s - 1):
                            statistic["worker_predict_last"] += 1
                        if manager_action == (e - s - 1):
                            statistic["manager_predict_last"] += 1
                        statistic["total"] += 1

                        inside_index += 1

                        #link = manager_action
                        link = worker_action
                        m1, m2 = rl['ids'][s + link]
                        doc.link(m1, m2)

                        manager_path.append(manager_action)
                        worker_path.append(worker_action)

                reward = doc.get_f1()
                for data in tmp_data:
                    for s, e in zip(rl["starts"], rl["ends"]):
                        ids = rl['ids'][s:e]
                        ana = ids[0, 1]
                        old_ant = doc.ana_to_ant[ana]
                        doc.unlink(ana)
                        costs = rl['costs'][s:e]
                        for ant_ind in range(e - s):
                            costs[ant_ind] = doc.link(ids[ant_ind, 0],
                                                      ana,
                                                      hypothetical=True,
                                                      beta=1)
                        doc.link(old_ant, ana)
                        #costs = autograd.Variable(torch.from_numpy(costs).type(torch.cuda.FloatTensor))

                inside_index = 0
                worker_entropy = 0.0

                for data in tmp_data:
                    new_step = step
                    # worker
                    scores_worker, representations_worker = get_score_representations(
                        worker, data, dropout=nnargs["dropout_rate"])
                    optimizer_worker.zero_grad()
                    worker_loss = None
                    for s, e in zip(rl["starts"], rl["ends"]):
                        costs = rl['costs'][s:e]
                        costs = autograd.Variable(
                            torch.from_numpy(costs).type(
                                torch.cuda.FloatTensor))
                        action = worker_path[inside_index]
                        score = F.softmax(
                            torch.transpose(scores_worker[s:e], 0, 1))
                        if not score.size()[1] == costs.size()[0]:
                            continue
                        score = torch.squeeze(score)

                        baseline = torch.sum(costs * score)
                        this_cost = torch.log(
                            score[action]) * -1.0 * (reward - baseline)

                        if worker_loss is None:
                            worker_loss = this_cost
                        else:
                            worker_loss += this_cost
                        worker_entropy += torch.sum(
                            score * torch.log(score + 1e-7)
                        ).data.cpu().numpy()[
                            0]  #+ 0.001*torch.sum(score*torch.log(score+1e-7))
                        inside_index += 1

                    worker_loss.backward()
                    torch.nn.utils.clip_grad_norm(worker.parameters(),
                                                  nnargs["clip"])
                    optimizer_worker.step()

                    ave_worker_entropy.append(worker_entropy)
                    if len(ave_worker_entropy) >= MAX_AVE:
                        ave_worker_entropy = ave_worker_entropy[1:]
                    entropy_log_worker.log_value(
                        'entropy',
                        float(sum(ave_worker_entropy)) /
                        float(len(ave_worker_entropy)), new_step)
                    new_step += 1

                inside_index = 0
                manager_entropy = 0.0
                for data in tmp_data:
                    new_step = step
                    rl = data["rl"]

                    ave_reward.append(reward)
                    if len(ave_reward) >= MAX_AVE:
                        ave_reward = ave_reward[1:]
                    reward_log.log_value(
                        'reward',
                        float(sum(ave_reward)) / float(len(ave_reward)),
                        new_step)

                    scores_manager, representations_manager = get_score_representations(
                        manager, data, dropout=nnargs["dropout_rate"])

                    optimizer_manager.zero_grad()
                    manager_loss = None
                    for s, e in zip(rl["starts"], rl["ends"]):
                        score = F.softmax(
                            torch.transpose(scores_manager[s:e], 0, 1))
                        costs = rl['costs'][s:e]
                        costs = autograd.Variable(
                            torch.from_numpy(costs).type(
                                torch.cuda.FloatTensor))
                        if not score.size()[1] == costs.size()[0]:
                            continue

                        action = manager_path[inside_index]
                        score = torch.squeeze(score)

                        baseline = torch.sum(costs * score)
                        this_cost = torch.log(score[action]) * -1.0 * (
                            reward - baseline
                        )  # + 0.001*torch.sum(score*torch.log(score+1e-7))

                        #this_cost = torch.sum(score*costs) + 0.001*torch.sum(score*torch.log(score+1e-7))

                        if manager_loss is None:
                            manager_loss = this_cost
                        else:
                            manager_loss += this_cost

                        manager_entropy += torch.sum(
                            score *
                            torch.log(score + 1e-7)).data.cpu().numpy()[0]
                        inside_index += 1

                    manager_loss.backward()
                    torch.nn.utils.clip_grad_norm(manager.parameters(),
                                                  nnargs["clip"])
                    optimizer_manager.step()

                    ave_manager_entropy.append(manager_entropy)
                    if len(ave_manager_entropy) >= MAX_AVE:
                        ave_manager_entropy = ave_manager_entropy[1:]
                    entropy_log_manager.log_value(
                        'entropy',
                        float(sum(ave_manager_entropy)) /
                        float(len(ave_manager_entropy)), new_step)
                    new_step += 1

                step = new_step
                tmp_data = []
                cluster_info = {0: [0]}
                cluster_list = [0]
                current_new_cluster = 1
                mid = 1
                predict_action_embedding = []
                choose_action = []

        end_time = timeit.default_timer()
        print >> sys.stderr, "TRAINING Use %.3f seconds" % (end_time -
                                                            start_time)
        print >> sys.stderr, "save model ..."
        #print "Top k",top_k
        print "Worker Hits", statistic[
            "worker_hits"], "Manager Hits", statistic[
                "manager_hits"], "Total", statistic["total"]
        print "Worker predict last", statistic[
            "worker_predict_last"], "Manager predict last", statistic[
                "manager_predict_last"]
        #torch.save(network_model, model_save_dir+"network_model_rl_worker.%d"%echo)
        #torch.save(ana_network, model_save_dir+"network_model_rl_manager.%d"%echo)

        print "DEV"
        metric = performance.performance(dev_docs_iter, worker, manager)
        print "Average:", metric["average"]
        print "DEV manager"
        metric = performance_manager.performance(dev_docs_iter, worker,
                                                 manager)
        print "Average:", metric["average"]
        print "TEST"
        metric = performance.performance(test_docs_iter, worker, manager)
        print "Average:", metric["average"]
        print
        sys.stdout.flush()
Example #12
if not args.predict:
	if args.set == 'train':
		train_batch_idx = 0
		for epoch in range(args.epochs):
			for batch_idx, sample in enumerate(DataLoaderDict['train']):
				model.zero_grad()
				model.hidden = model.init_hidden(args.batch_size)
				x = sample['data'].transpose(0,1)
				y = sample['label'].transpose(0,1)
				if use_gpu:
					x, y = x.cuda(), y.cuda()
				pred = model(x)
				loss = loss_function(pred.transpose(1,2),y)
				loss.backward()
				optimizer.step()
				logger.log_value('train_loss', loss, train_batch_idx)
				# scheduler.step()
				pred = pred.argmax(2)
				correct = y.eq(pred.long()).sum()
				# Tensor elements always return tensors? 
				# Had to use tolist to return as int
				acc = 100*correct.tolist()/pred.nelement()
				logger.log_value('train_accuracy', acc, train_batch_idx)
				print(
					'Train:[{}|{}]\tloss: {:.4f}\taccuracy: {:.4f}'.format(
					epoch, batch_idx, loss, acc))
				if train_batch_idx % args.test_every == 0:
					''' Training will periodically test on val set or test set 
					    if validation doesn't exist
					'''
					if len(DataLoaderDict['val']) == 1:
Example #13
class Logger(object):
    def __init__(self, opt, tb_logger_suffix=''):
        self.exp_name = opt['name']
        self.use_tb_logger = opt['use_tb_logger']
        self.opt = opt['logger']
        self.log_dir = opt['path']['log']
        if not os.path.isdir(self.log_dir):
            os.mkdir(self.log_dir)
        # loss log file
        self.loss_log_path = os.path.join(self.log_dir, 'loss_log.txt')
        with open(self.loss_log_path, 'a') as log_file:
            log_file.write('=============== Time: ' + get_timestamp() +
                           ' =============\n')
            log_file.write(
                '================ Training Losses ================\n')
        # val results log file
        self.val_log_path = os.path.join(self.log_dir, 'val_log.txt')
        with open(self.val_log_path, 'a') as log_file:
            log_file.write('================ Time: ' + get_timestamp() +
                           ' ===============\n')
            log_file.write(
                '================ Validation Results ================\n')
        if self.use_tb_logger:  # and 'debug' not in self.exp_name:
            from tensorboard_logger import Logger as TensorboardLogger
            logger_dir_num = 0
            tb_logger_dir = self.log_dir.replace('experiments', 'logs')
            if not os.path.isdir(tb_logger_dir):
                os.mkdir(tb_logger_dir)
            existing_dirs = sorted([
                dir.split('_')[0] for dir in os.listdir(tb_logger_dir)
                if os.path.isdir(os.path.join(tb_logger_dir, dir))
            ],
                                   key=lambda x: int(x.split('_')[0]))
            if len(existing_dirs) > 0:
                logger_dir_num = int(existing_dirs[-1]) + 1
            self.tb_logger = TensorboardLogger(
                os.path.join(tb_logger_dir,
                             str(logger_dir_num) + tb_logger_suffix))

    def print_format_results(self,
                             mode,
                             rlt,
                             dont_print=False,
                             keys_ignore_list=[]):
        epoch = rlt.pop('epoch')
        iters = rlt.pop('iters')
        time = rlt.pop('time')
        model = rlt.pop('model')
        if 'lr' in rlt:
            lr = rlt.pop('lr')
            message = '<epoch:{:3d}, iter:{:8,d}, time:{:.2f}, lr:{:.1e}> '.format(
                epoch, iters, time, lr)
        else:
            message = '<epoch:{:3d}, iter:{:8,d}, time:{:.2f}> '.format(
                epoch, iters, time)

        for label, value in rlt.items():
            if label in keys_ignore_list or '_baseline' in label:
                continue
            if mode == 'train':
                message += '{:s}: {:.4e} '.format(label, value)
            elif mode == 'val':
                message += '{:s}: {:.4e} '.format(label, value)
            # tensorboard logger
            if self.use_tb_logger:  # and 'debug' not in self.exp_name:
                self.tb_logger.log_value(label, value, iters)

        # print in console
        if not dont_print:
            print(message)
        # write in log file
        if mode == 'train':
            with open(self.loss_log_path, 'a') as log_file:
                log_file.write(message + '\n')
        elif mode == 'val':
            with open(self.val_log_path, 'a') as log_file:
                log_file.write(message + '\n')
Example #14
def train(args):
    # model
    print(args.model_name)
    config.train_data = config.train_data + str(args.fold) + '.pth'
    #    config.train_data = config.train_data + 'trainsfer_' + str(args.fold) + '.pth'

    config.model_name = args.model_name
    model = getattr(models, config.model_name)()

    model = model.to(device)
    # data
    if args.model_kind == 1:
        import dataset2
        train_dataset = dataset2.ECGDataset(data_path=config.train_data,
                                            train=True,
                                            transform=True)
        train_dataloader = DataLoader(train_dataset,
                                      collate_fn=my_collate_fn,
                                      batch_size=config.batch_size,
                                      shuffle=True,
                                      num_workers=6)
    else:
        train_dataset = ECGDataset(data_path=config.train_data,
                                   train=True,
                                   transform=True)
        train_dataloader = DataLoader(
            train_dataset,  #collate_fn=my_collate_fn,
            batch_size=config.batch_size,
            shuffle=True,
            num_workers=6)

    val_dataset = ECGDataset(data_path=config.train_data, train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=6)
    print("train_datasize", len(train_dataset), "val_datasize",
          len(val_dataset))
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    # optimizer = optim.RMSprop(model.parameters(), lr=config.lr)
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    if args.model_kind == 1:
        criterion = utils.WeightedMultilabel(w)
        print(1)
    else:
        criterion = utils2.WeightedMultilabel(w)


#    criterion = utils.My_loss(w)
# directory for saving model checkpoints
    model_save_dir = '%s/%s' % (config.ckpt + str(args.model_kind),
                                config.model_name + '_' + str(args.fold))
    args.ckpt = model_save_dir
    # if args.ex: model_save_dir += args.ex
    best_f1 = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    # resume training from the last checkpoint
    if args.resume:
        if os.path.exists(args.ckpt):  # this is the directory that holds the saved weights
            # model_save_dir = args.ckpt
            current_w = torch.load(os.path.join(args.ckpt, config.current_w))
            best_w = torch.load(os.path.join(model_save_dir, config.best_w))
            best_f1 = best_w['best_f']
            start_epoch = current_w['epoch'] + 1
            lr = current_w['lr']
            stage = current_w['stage']
            model.load_state_dict(current_w['state_dict'])
            # if the interruption point happens to be exactly a stage-transition epoch
            if start_epoch - 1 in config.stage_epoch:
                stage += 1
                lr /= config.lr_decay
                utils.adjust_learning_rate(optimizer, lr)
                model.load_state_dict(best_w['state_dict'])
            print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
    else:
        path = '%s/%s' % (config.ckpt, config.model_name + '_transfer')
        print(path)
        current_w = torch.load(os.path.join(path, config.best_w))
        model.load_state_dict(current_w['state_dict'])

    logger = Logger(logdir=model_save_dir, flush_secs=2)
    # =========> start training <=========
    val_loss = 10
    val_f1 = -1
    state = {}
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_f1, best_f1 = train_epoch(
            model, optimizer, criterion, train_dataloader, epoch, lr, best_f1,
            val_dataloader, model_save_dir, state, 0)
        # if epoch % 2 == 1:
        val_loss, val_f1, _, _ = val_epoch(model, criterion, val_dataloader)
        print(
            '#epoch:%02d stage:%d train_loss:%.3e train_f1:%.3f  val_loss:%0.3e val_f1:%.3f time:%s'
            % (epoch, stage, train_loss, train_f1, val_loss, val_f1,
               utils.print_time_cost(since)))
        logger.log_value('train_loss', train_loss, step=epoch)
        logger.log_value('train_f1', train_f1, step=epoch)
        logger.log_value('val_loss', val_loss, step=epoch)
        logger.log_value('val_f1', val_f1, step=epoch)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage,
            "best_f": best_f1
        }
        if best_f1 < val_f1:
            save_ckpt(state, best_f1 < val_f1, model_save_dir)
            print('save best')
        else:
            save_ckpt(state, False, model_save_dir)
        best_f1 = max(best_f1, val_f1)

        if epoch in config.stage_epoch:
            stage += 1
            lr /= config.lr_decay
            #            best_w = os.path.join(model_save_dir, config.best_w)
            #            model.load_state_dict(torch.load(best_w)['state_dict'])
            print("*" * 10, "step into stage%02d lr %.3ef" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)
Example #15
    # Dumping of state was done before epoch callback, so do that now (model is loaded)
    baseline.epoch_callback(model, epoch)

    print("Resuming after epoch {}".format(epoch))
    epoch_start = epoch + 1
    step = 0

    # Evaluate on held-out set
    val_dataset = TSP.make_dataset(filename=opts.val_dataset,
                                   batch_size=opts.batch_size,
                                   num_samples=opts.val_size,
                                   neighbors=opts.neighbors,
                                   knn_strat=opts.knn_strat,
                                   supervised=True)
    avg_reward, avg_opt_gap = validate(model, val_dataset, problem, opts)
    tb_logger.log_value('val_ft/avg_reward', avg_reward, step)
    tb_logger.log_value('val_ft/opt_gap', avg_opt_gap, step)

    if opts.ft_strategy == "active":
        # Active search: finetune on the test set
        train_dataset = baseline.wrap_dataset(val_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=opts.batch_size,
                                      shuffle=True,
                                      num_workers=opts.num_workers)

    elif opts.ft_strategy == "fixed":
        # Fixed finetuning: finetune on a fixed training set
        train_dataset = baseline.wrap_dataset(
            problem.make_dataset(min_size=opts.min_size,
                                 max_size=opts.max_size,
Example #16
class PlotLogger:
    def __init__(self, path):
        self.plot_instance = Logger(path)

    def log_value(self, name, value, step):
        self.plot_instance.log_value(name, value, step)
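A short usage sketch for the PlotLogger wrapper above; the log directory, metric name, and loss values are placeholders:

plot_logger = PlotLogger('runs/exp1')            # placeholder log directory
for step, loss in enumerate([0.9, 0.7, 0.55]):   # placeholder loss curve
    plot_logger.log_value('train/loss', loss, step)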
Example #17
        G_BA.load_state_dict(torch.load(G_BA_file))
        
    networks = [D_B, D_A, G_AB, G_BA]
    
    optimizer_D_B = optim.Adam(D_B.parameters(), lr=lr_1)
    optimizer_D_A = optim.Adam(D_A.parameters(), lr=lr_1)
    optimizer_G_AB = optim.Adam(G_AB.parameters(), lr=lr_1)
    optimizer_G_BA = optim.Adam(G_BA.parameters(), lr=lr_1)
    optimizers = [optimizer_G_AB, optimizer_G_BA, optimizer_D_B, optimizer_D_A]
    
    pool_A = ImagePool(pool_size=50)
    pool_B = ImagePool(pool_size=50)
    pools = [pool_A, pool_B]
    
    for epoch in range(starting_epoch, nb_epochs + 1):
        losses = train(epoch, loader, networks, optimizers, pools, max_steps=max_steps, verbose=False)
        train_logger.log_value('D_B loss', losses[0], epoch)
        train_logger.log_value('D_A loss', losses[1], epoch)
        train_logger.log_value('G_AB loss', losses[2], epoch)
        train_logger.log_value('G_BA loss', losses[3], epoch)
        total_loss = sum(losses)
        print("\nLoss at epoch n.{} : {}".format(epoch, total_loss))
        D_B_file = 'models/' + experiment_name + '/D_B_' + str(epoch) + '.pth'
        D_A_file = 'models/' + experiment_name + '/D_A_' + str(epoch) + '.pth'
        G_AB_file = 'models/' + experiment_name + '/G_AB_' + str(epoch) + '.pth'
        G_BA_file = 'models/' + experiment_name + '/G_BA_' + str(epoch) + '.pth'
        torch.save(D_B.state_dict(), D_B_file)
        torch.save(D_A.state_dict(), D_A_file)
        torch.save(G_AB.state_dict(), G_AB_file)
        torch.save(G_BA.state_dict(), G_BA_file)
Example #18
class TrainNetwork:
    def __init__(self,
                 batch_size=64,
                 image_size=64,
                 load_model=None,
                 iterations_start=0,
                 seed=None,
                 epochs=10000,
                 lr_decay_iter=250000,
                 lr=1e-3,
                 betas=(0.9, 0.999),
                 eps=1e-8,
                 weight_decay=1e-3,
                 weighting_factor=0.5,
                 regression=False,
                 loss='MultiLabelSoftMarginLoss',
                 network='ColorfulImageColorization',
                 convert_on_gpu=False,
                 do_not_log=False):
        # hyperparameters
        self.batch_size = batch_size
        self.image_size = image_size
        self.iterations = iterations_start
        self.seed = np.random.randint(0, 10000) if seed is None else seed
        self.epochs = epochs
        self.lr_decay_iter = lr_decay_iter
        self.lr0 = lr
        self.betas = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.weighting_factor = weighting_factor
        self.lr = self.calc_learning_rate()
        self.regression = regression
        self.loss = loss
        self.model_name = '{date:%Y_%m_%d__%H_%M_%S}_{net}_{loss}_{dataset}'.format(
            date=datetime.datetime.now(),
            net=network,
            loss=self.loss,
            dataset=dataset)
        torch.manual_seed(self.seed)
        self.convert_on_gpu = convert_on_gpu

        # tensorboard
        if not do_not_log:
            self.logger = Logger(logs_path + self.model_name)
            self.log_hyperparameter()

        # model, loss, optimizer
        assert network in [
            'DeepKoalarization', 'CheapConvNet', 'ColorfulImageColorization',
            'DeepKoalarizationNorm'
        ]

        if self.regression:
            out_channels = 2
        else:
            out_channels = int((256 / grid_size)**2)

        if network == 'DeepKoalarization':
            self.model = DeepKoalarization(
                out_channels=out_channels,
                to_rgb=(self.loss == 'PerceptualLoss'))
        elif network == 'DeepKoalarizationNorm':
            self.model = DeepKoalarizationNorm(
                out_channels=out_channels,
                to_rgb=(self.loss == 'PerceptualLoss'))
        elif network == 'CheapConvNet':
            assert self.loss != 'PerceptualLoss'
            self.model = CheapConvNet(out_channels=out_channels)
        elif network == 'ColorfulImageColorization':
            self.model = ColorfulImageColorization(
                out_channels=out_channels,
                to_rgb=(self.loss == 'PerceptualLoss'))

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.lr,
                                    betas=self.betas,
                                    eps=self.eps,
                                    weight_decay=self.weight_decay)

        assert loss in [
            'PerceptualLoss', 'MSELoss', 'MultiLabelSoftMarginLoss',
            'BCEWithLogitsLoss', 'MultinomialCrossEntropyLoss'
        ]

        if loss == 'PerceptualLoss':
            self.loss_fn = PerceptualLoss()
        elif loss == 'MSELoss':
            assert self.regression
            self.loss_fn = nn.MSELoss()
        elif loss == 'MultiLabelSoftMarginLoss':
            assert not self.regression
            w = torch.load(images_path +
                           'classification_weights_{}_{}.pth'.format(
                               grid_size, self.weighting_factor))
            self.loss_fn = nn.MultiLabelSoftMarginLoss(weight=w.view(-1, 1, 1))
        elif loss == 'BCEWithLogitsLoss':
            assert not self.regression
            w = torch.load(images_path +
                           'classification_weights_{}_{}.pth'.format(
                               grid_size, self.weighting_factor))
            self.loss_fn = nn.BCEWithLogitsLoss(weight=w.view(-1, 1, 1))
        elif loss == 'MultinomialCrossEntropyLoss':
            assert not self.regression
            w = torch.load(images_path +
                           'classification_weights_{}_{}.pth'.format(
                               grid_size, self.weighting_factor))
            self.loss_fn = MultinomialCrossEntropyLoss(weights=w)

        # load model
        if load_model is not None:
            # Load pre learned AlexNet with changed number of output classes
            state_dict = torch.load(trained_models_path + load_model,
                                    map_location='cpu')
            self.model.load_state_dict(state_dict['model'])
            self.optimizer.load_state_dict(state_dict['optimizer'])
            self.adjust_learning_rate()

        # Use cuda if available
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.model.cuda()
            self.loss_fn.cuda()

            if load_model is not None:
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()

        # Load dataset
        kwargs = {'num_workers': 8, 'pin_memory': True} if self.cuda else {}
        self.train_loader = torch.utils.data.DataLoader(ColorizationDataset(
            images_path,
            train=True,
            size=(self.image_size, self.image_size),
            target_rgb=(loss == 'PerceptualLoss'),
            convert_to_categorical=(not self.regression),
            do_not_convert=convert_on_gpu),
                                                        batch_size=batch_size,
                                                        shuffle=True,
                                                        **kwargs)
        kwargs = {'num_workers': 1, 'pin_memory': True} if self.cuda else {}
        self.test_loader = torch.utils.data.DataLoader(ColorizationDataset(
            images_path,
            train=False,
            size=(self.image_size, self.image_size),
            target_rgb=(loss == 'PerceptualLoss'),
            convert_to_categorical=(not self.regression),
            do_not_convert=convert_on_gpu),
                                                       batch_size=8,
                                                       drop_last=True,
                                                       shuffle=True,
                                                       **kwargs)
        self.test_iterator = iter(self.test_loader)

    def calc_learning_rate(self):
        """
        Reduce the learning rate by factor 0.5 every lr_decay_iter
        :return: None
        """
        lr = self.lr0 * (0.1**(self.iterations // self.lr_decay_iter))
        return lr

    def adjust_learning_rate(self):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.lr

    def reduce_learning_rate(self):
        lr = self.calc_learning_rate()

        if abs(lr - self.lr) > 1e-7:
            self.lr = lr
            self.adjust_learning_rate()

    def train(self):
        """
        Train the model for self.epochs epochs; snapshots are saved periodically as .pth files
        :return: None
        """
        self.model.train()

        for epoch in range(1, self.epochs + 1):
            train_loss_epoche = 0
            batch_start_time = time.perf_counter()
            batch_idx = 0
            for batch_idx, (data, target) in enumerate(self.train_loader):
                self.reduce_learning_rate()
                train_loss_epoche += self.train_one_iter(
                    data, target, epoch, batch_idx)
                self.iterations += 1
                print('Batch ' + str(batch_idx + 1) + ' took ' +
                      str(time.perf_counter() - batch_start_time) + ' seconds')
                batch_start_time = time.perf_counter()

            # Print information about current epoch
            train_loss_epoche /= (batch_idx + 1)
            print('Train Epoch: {} \tAverage loss: {:.6f}'.format(
                epoch, train_loss_epoche))

    def train_one_iter(self, data, target, epoch, batch_idx):

        if self.cuda:
            data = data.cuda()
            target = target.cuda()

        if self.convert_on_gpu:
            lab = pdc.rgb2lab(data.float() / 255)
            data, target = torch.split(lab, [1, 2], dim=1)
            if not self.regression:
                target = conversion_batch(target)

        # Optimize using backpropagation
        self.optimizer.zero_grad()
        output = self.model(data)

        loss = self.loss_fn(output, target)
        loss.backward()
        self.optimizer.step()

        # Print information about current step
        print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
            epoch, batch_idx + 1, len(self.train_loader), loss.item()))
        if self.iterations % 5 == 0:
            # log loss
            test_data, test_target = self.get_next_test_batch()
            self.model.eval()
            test_output = self.model(test_data)
            self.model.train()
            test_loss = self.loss_fn(test_output, test_target)
            self.log_scalars(self.iterations, loss, test_loss)

            if self.iterations % 50 == 0:
                # log images
                self.log_images(self.iterations, test_data, test_target,
                                test_output, data, target, output)
        if self.iterations % 1000 == 0 and self.iterations > 0:
            # Save snapshot
            model_name = self.model_name + '_iter{}'.format(self.iterations)
            torch.save(
                {
                    'model': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict()
                }, trained_models_path + '{}.pth'.format(model_name))
        if self.iterations % 600 == 0:
            # log stuff
            self.log_values_gradients(self.iterations)

        return loss.item()

    def log_hyperparameter(self):
        info = {
            'batch_size': self.batch_size,
            'image_size': self.image_size,
            'seed': self.seed,
            'epochs': self.epochs,
            'learning_decay_iter': self.lr_decay_iter,
            'learning_rate_0': self.lr0,
            'betas[0]': self.betas[0],
            'betas[1]': self.betas[1],
            'eps_optimizer': self.eps,
            'weight_decay_optimizer': self.weight_decay,
            'weighting_factor': self.weighting_factor,
            'regression': self.regression,
            'convert_on_gpu': self.convert_on_gpu
        }

        for tag, value in info.items():
            self.logger.log_value(tag, value, 0)

    def log_scalars(self, step, train_loss, test_loss):
        # adapted from https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/04-utils/tensorboard/main.py
        # 1. Log scalar values (scalar summary)
        info = {
            'train_loss': train_loss.item(),
            'test_loss': test_loss.item(),
            'learning_rate': self.lr
        }

        for tag, value in info.items():
            self.logger.log_value(tag, value, step)

    def log_images(self, step, test_data, test_target, test_output, train_data,
                   train_target, train_output):
        # 3. Log test images (image summary)
        num_images = 1
        test_original = self.convert_to_images(test_data[:num_images],
                                               test_target[:num_images],
                                               is_target=True)
        test_colorized = self.convert_to_images(test_data[:num_images],
                                                test_output[:num_images],
                                                is_target=False)
        train_original = self.convert_to_images(train_data[:num_images],
                                                train_target[:num_images],
                                                is_target=True)
        train_colorized = self.convert_to_images(train_data[:num_images],
                                                 train_output[:num_images],
                                                 is_target=False)
        info = {
            'test colorized': test_colorized,
            'test original': test_original,
            'train colorized': train_colorized,
            'train original': train_original
        }

        for tag, images in info.items():
            self.logger.log_images(tag, images, step)

    def log_values_gradients(self, step):
        # adapted from https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/04-utils/tensorboard/main.py
        # 2. Log values and gradients of the parameters (histogram summary)
        for tag, value in self.model.named_parameters():
            if 'feature_extractor' not in tag:
                tag = tag.replace('.', '/')
                self.logger.log_histogram(tag, value.data.cpu().numpy(), step)
                self.logger.log_histogram(tag + '/grad',
                                          value.grad.data.cpu().numpy(), step)

    def convert_to_images(self, data, output_or_target, is_target, t=0.38):
        if self.loss == 'PerceptualLoss':
            return output_or_target.detach().cpu().permute(
                0, 2, 3, 1).numpy().astype(np.float64)
        else:
            return generate_images_numpy(data,
                                         output_or_target,
                                         is_target,
                                         regression=self.regression,
                                         t=t)

    def get_next_test_batch(self):
        try:
            data, target = next(self.test_iterator)
        except StopIteration:
            self.test_iterator = iter(self.test_loader)
            data, target = next(self.test_iterator)

        if self.cuda:
            data = data.cuda()
            target = target.cuda()

        if self.convert_on_gpu:
            lab = pdc.rgb2lab(data.float() / 255)
            data, target = torch.split(lab, [1, 2], dim=1)
            if not self.regression:
                target = conversion_batch(target)

        return data, target
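A hedged sketch of how TrainNetwork might be driven, assuming the module-level globals it relies on (images_path, logs_path, trained_models_path, dataset, grid_size) are already defined; the argument values are illustrative only:

# Hypothetical driver for the TrainNetwork class above.
trainer = TrainNetwork(batch_size=32,
                       image_size=64,
                       lr=3e-4,
                       network='ColorfulImageColorization',
                       loss='MultiLabelSoftMarginLoss')
trainer.train()  # runs the training loop, logging scalars and images to tensorboard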
Example #19
def trainModel(model, trainData, validData, dataset, optim):
    logger = Logger(os.path.join(opt.save_path, 'tb'))
    iterations = 0

    print(model)
    model.train()
    # Define criterion of each GPU.
    criterion = NMTCriterion(dataset['dicts']['tgt'].size())
    start_time = time.time()

    for epoch in range(opt.start_epoch, opt.epochs + 1):
        print('')

        #  (1) train for one epoch on the training set
        if opt.extra_shuffle and epoch > opt.curriculum:
            trainData.shuffle()
        # Shuffle mini batch order.
        batchOrder = torch.randperm(len(trainData))

        total_loss, total_words, total_num_correct = 0, 0, 0
        report_loss, report_tgt_words = 0, 0
        report_src_words, report_num_correct = 0, 0
        start = time.time()
        for i in range(len(trainData)):
            iterations += 1

            batchIdx = batchOrder[i] if epoch > opt.curriculum else i
            # Exclude original indices.
            batch = trainData[batchIdx][:-1]

            model.zero_grad()
            outputs = model(batch)
            # Exclude <s> from targets.
            targets = batch[1][1:]
            loss, gradOutput, num_correct = memoryEfficientLoss(
                outputs, targets, model.generator, criterion)

            outputs.backward(gradOutput)

            # Update the parameters.
            optim.step()

            num_words = targets.data.ne(onmt.Constants.PAD).sum()
            report_loss += loss
            report_num_correct += num_correct
            report_tgt_words += num_words
            report_src_words += batch[0][1].data.sum()
            total_loss += loss
            total_num_correct += num_correct
            total_words += num_words
            if iterations % opt.log_interval == -1 % opt.log_interval:
                print((
                    "Epoch %d, %d/%d; acc: %.2f; ppl: %.2f; %.0f src tok/s; %.0f tgt tok/s; %.0fs elapsed"
                ) % (epoch, i + 1, len(trainData),
                     report_num_correct / report_tgt_words * 100.0,
                     math.exp(
                         report_loss / report_tgt_words), report_src_words /
                     (time.time() - start), report_tgt_words /
                     (time.time() - start), time.time() - start_time))
                # log to tensorboard
                logger.log_value("word_acc",
                                 float(report_num_correct) /
                                 float(report_tgt_words),
                                 step=iterations)
                logger.log_value("ppl",
                                 math.exp(report_loss / report_tgt_words),
                                 step=iterations)

                report_loss, report_tgt_words = 0, 0
                report_src_words, report_num_correct = 0, 0
                start = time.time()
        train_loss, train_acc = total_loss / total_words, total_num_correct / total_words

        train_ppl = math.exp(min(train_loss, 100))
        print('Train perplexity: %g' % train_ppl)
        print('Train word accuracy: %g' % (train_acc * 100))

        #  (2) evaluate on the validation set
        valid_loss, valid_acc = eval(model, criterion, validData)
        valid_ppl = math.exp(min(valid_loss, 100))
        print('Validation perplexity: %g' % valid_ppl)
        print('Validation word accuracy: %g' % (valid_acc * 100))

        #  (3) update the learning rate
        optim.updateLearningRate(valid_ppl, epoch)

        model_state_dict = (model.module.state_dict()
                            if len(opt.gpus) > 1 else model.state_dict())
        model_state_dict = {
            k: v
            for k, v in model_state_dict.items() if 'generator' not in k
        }
        generator_state_dict = (model.generator.module.state_dict()
                                if len(opt.gpus) > 1 else
                                model.generator.state_dict())
        #  (4) drop a checkpoint
        checkpoint = {
            'model': model_state_dict,
            'generator': generator_state_dict,
            'dicts': dataset['dicts'],
            'opt': opt,
            'epoch': epoch,
            'optim': optim
        }
        if epoch % 5 == 0:
            torch.save(
                checkpoint,
                os.path.join(opt.save_path,
                             'm_%d_acc_%.2f.pt' % (epoch, 100.0 * valid_acc)))
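The checkpoint block above unwraps nn.DataParallel (model.module) when several GPUs are used and strips generator weights from the saved state dict. A minimal sketch of that filtering step, with build_checkpoint_state as an illustrative helper name:

def build_checkpoint_state(model, multi_gpu):
    # Unwrap DataParallel if needed, then drop generator parameters,
    # mirroring the checkpointing step in the example above.
    raw = model.module.state_dict() if multi_gpu else model.state_dict()
    return {k: v for k, v in raw.items() if 'generator' not in k}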
Example #20
0
class Logger(object):

	def __init__(self, log_dir, label, titles, append_steps=1):
		"""
		log_dir      : str, directory where all the logs will be written.
		label        : str, root filename for the logs. It shouldn't contain an extension, such as .txt
		titles       : list, title for each log attribute.
		append_steps : int, the averaged meter values are appended (and written to the logs) whenever step is a multiple of append_steps.
		"""

		self.log_dir = log_dir
		self.label = label
		self.titles = titles
		self.append_steps = append_steps

		self.logs = {} # all title-log pairs that will be traced for this instance
		self.meters = {}
		for t in titles:
			self.logs[t] = []
			self.meters[t] = AverageMeter()

		if not os.path.exists(self.log_dir): os.makedirs(self.log_dir)
		self.tb_logger = TBLogger(self.log_dir)
		self.f_txt = open(os.path.join(self.log_dir, '{}.txt'.format(self.label)), 'w')

	def flush(self):
		self.save_as_arrays()
		self.save_as_figures()

	def close(self):
		self.flush()
		self.f_txt.close()

	def update(self, values, step):
		"""
		Adds a new log value for each title, also updates corresponding average meters.
		If step is multiple of append_steps, then self.append is called.

		values : list, must be of the same size as self.titles.
		step   : int, a step number
		"""
		assert len(self.titles) == len(values)

		for t, v in zip(self.titles, values):
			self.meters[t].update(v, 1)

		if step % self.append_steps == 0:
			values = [m.avg for m in self.meters.values()]
			self.append(values, step)

	def append(self, values, step):
		"""
		Adds a new log value for each title.

		values : list, must be of the same size as self.titles.
		step   : int, a step number
		"""
		assert len(self.titles) == len(values)
		
		step_log = OrderedDict()
		step_log['step'] = str(step)
		step_log['time'] = datetime.datetime.now().strftime("%y-%m-%d %H:%M:%S")

		for t, v in zip(self.titles, values):
			self.logs[t].append(v)
			step_log[t] = v
			self.tb_logger.log_value(t, v, step)

		json.dump(step_log, self.f_txt, indent=4)
		self.f_txt.write('\n')
		self.f_txt.flush()

	def save_as_arrays(self):
		"""
		Converts all logs to numpy arrays and saves them into self.log_dir.
		"""
		arrays = {}
		for t, v in self.logs.items():
			v = np.array(v)
			arrays[t] = v

		np.savez(os.path.join(self.log_dir, '{}.npz'.format(self.label)), **arrays)

	def save_as_figures(self):
		"""
		First, converts all logs to numpy arrays, then plots them using matplotlib. Finally, saves the plots into self.log_dir.
		"""
		for t, v in self.logs.items():
			v = np.array(v)

			fig = plt.figure(dpi=400)
			ax = fig.add_subplot(111)
			ax.plot(v)
			ax.set_title(t)
			ax.grid(True)
			fig.savefig(
				os.path.join(self.log_dir, '{}_{}.png'.format(self.label, t)),
				bbox_inches='tight' )
			plt.close()
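A minimal usage sketch for the Logger above, with a hypothetical directory, label, and dummy values (it assumes the AverageMeter and TBLogger used by the class are importable in the surrounding project):

logger = Logger(log_dir='./logs', label='train', titles=['loss', 'acc'], append_steps=10)
for step in range(1, 101):
    loss, acc = 1.0 / step, step / 100.0  # dummy values for illustration
    logger.update([loss, acc], step)      # averaged, appended every 10 steps
logger.close()                            # saves arrays and figures, then closes the text log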
Example #21
0
def train_cv(input_directory, output_directory):
    # model
    # directory where model checkpoints are saved
    model_save_dir = '%s/%s_%s' % (
        config.ckpt, config.model_name + "_cv", time.strftime("%Y%m%d%H%M")
    )  #'%s/%s_%s' % (config.ckpt, args.model_name+"_cv", time.strftime("%Y%m%d%H%M"))
    for fold in range(config.kfold):
        print("***************************fold : {}***********************".
              format(fold))
        model = getattr(models, config.model_name)(fold=fold)
        # if args.ckpt and not args.resume:
        #     state = torch.load(args.ckpt, map_location='cpu')
        #     model.load_state_dict(state['state_dict'])
        #     print('train with pretrained weight val_f1', state['f1'])

        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, config.num_classes)

        # 2019/11/11
        # save dense/fc weights for the 55-class pretraining setup
        # model = MyModel()
        # num_ftrs = model.classifier.out_features
        # model.fc = nn.Linear(55, config.num_classes)

        model = model.to(device)
        # data
        train_dataset = ECGDataset(data_path=config.train_data_cv.format(fold),
                                   data_dir=input_directory,
                                   train=True)

        train_dataloader = DataLoader(train_dataset,
                                      batch_size=config.batch_size,
                                      shuffle=True,
                                      drop_last=True,
                                      num_workers=6)

        val_dataset = ECGDataset(data_path=config.train_data_cv.format(fold),
                                 data_dir=input_directory,
                                 train=False)

        val_dataloader = DataLoader(val_dataset,
                                    batch_size=config.batch_size,
                                    drop_last=True,
                                    num_workers=4)

        print("fold_{}_train_datasize".format(fold), len(train_dataset),
              "fold_{}_val_datasize".format(fold), len(val_dataset))
        # optimizer and loss
        optimizer = radam.RAdam(
            model.parameters(),
            lr=config.lr)  # alternative: optim.Adam(model.parameters(), lr=config.lr)
        w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
        criterion = utils.WeightedMultilabel(w)  # alternative: utils.FocalLoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                         'max',
                                                         verbose=True,
                                                         factor=0.1,
                                                         patience=5,
                                                         min_lr=1e-06,
                                                         eps=1e-08)

        # if args.ex: model_save_dir += args.ex
        # best_f1 = -1
        # lr = config.lr
        # start_epoch = 1
        # stage = 1

        best_f1 = -1
        best_cm = -1
        lr = config.lr
        start_epoch = 1
        stage = 1
        epoch_cum = 0  # epochs since the last improvement in val_cm
        # Resume training from the last checkpoint
        #         if args.resume:
        #             if os.path.exists(args.ckpt):  # directory holding the saved weights
        #                 model_save_dir = args.ckpt
        #                 current_w = torch.load(os.path.join(args.ckpt, config.current_w))
        #                 best_w = torch.load(os.path.join(model_save_dir, config.best_w))
        #                 best_f1 = best_w['loss']
        #                 start_epoch = current_w['epoch'] + 1
        #                 lr = current_w['lr']
        #                 stage = current_w['stage']
        #                 model.load_state_dict(current_w['state_dict'])
        #                 # if the run was interrupted exactly at a stage-transition epoch
        #                 if start_epoch - 1 in config.stage_epoch:
        #                     stage += 1
        #                     lr /= config.lr_decay
        #                     utils.adjust_learning_rate(optimizer, lr)
        #                     model.load_state_dict(best_w['state_dict'])
        #                 print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
        logger = Logger(logdir=model_save_dir, flush_secs=2)
        # =========> start training <=========
        for epoch in range(start_epoch, config.max_epoch + 1):
            since = time.time()
            train_loss, train_acc, train_f1, train_f2, train_g2, train_cm = train_epoch(
                model,
                optimizer,
                criterion,
                train_dataloader,
                show_interval=100)
            val_loss, val_acc, val_f1, val_f2, val_g2, val_cm = val_epoch(
                model, criterion, val_dataloader)

            # train_loss, train_f1 = train_beat_epoch(model, optimizer, criterion, train_dataloader, show_interval=100)
            # val_loss, val_f1 = val_beat_epoch(model, criterion, val_dataloader)

            print('#epoch:%02d, stage:%d, train_loss:%.3e, train_acc:%.3f, train_f1:%.3f, train_f2:%.3f, train_g2:%.3f,train_cm:%.3f,\n \
                    val_loss:%0.3e, val_acc:%.3f, val_f1:%.3f, val_f2:%.3f, val_g2:%.3f, val_cm:%.3f,time:%s\n'
                  % (epoch, stage, train_loss, train_acc,train_f1,train_f2,train_g2,train_cm, \
                    val_loss, val_acc, val_f1, val_f2, val_g2, val_cm,utils.print_time_cost(since)))

            logger.log_value('fold{}_train_loss'.format(fold),
                             train_loss,
                             step=epoch)
            logger.log_value('fold{}_train_f1'.format(fold),
                             train_f1,
                             step=epoch)
            logger.log_value('fold{}_val_loss'.format(fold),
                             val_loss,
                             step=epoch)
            logger.log_value('fold{}_val_f1'.format(fold), val_f1, step=epoch)
            state = {
                "state_dict": model.state_dict(),
                "epoch": epoch,
                "loss": val_loss,
                'f1': val_f1,
                'lr': lr,
                'stage': stage
            }

            save_ckpt_cv(state, best_cm < val_cm, model_save_dir, fold,
                         output_directory)
            best_cm = max(best_cm, val_cm)

            scheduler.step(val_cm)
            # scheduler.step()

            if val_cm < best_cm:
                epoch_cum += 1
            else:
                epoch_cum = 0

            # save_ckpt_cv(state, best_f1 < val_f1, model_save_dir,fold)
            # best_f1 = max(best_f1, val_f1)

            # if val_f1 < best_f1:
            #     epoch_cum += 1
            # else:
            #     epoch_cum = 0

            # if epoch in config.stage_epoch:
            # if epoch_cum == 5:
            #     stage += 1
            #     lr /= config.lr_decay
            #     if lr < 1e-6:
            #         lr = 1e-6
            #         print("*" * 20, "step into stage%02d lr %.3ef" % (stage, lr))
            #     best_w = os.path.join(model_save_dir, config.best_w_cv.format(fold))
            #     model.load_state_dict(torch.load(best_w)['state_dict'])
            #     print("*" * 10, "step into stage%02d lr %.3ef" % (stage, lr))
            #     utils.adjust_learning_rate(optimizer, lr)

            # elif epoch_cum >= 12:
            #     print("*" * 20, "step into stage%02d lr %.3ef" % (stage, lr))
            #     break

            if epoch_cum >= 12:
                print("*" * 20, "step into stage%02d lr %.3e" % (stage, lr))
                break
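The epoch_cum counter above implements patience-style early stopping on val_cm. A minimal sketch of the same idea with hypothetical names (train_with_patience and validate are illustrative, not part of the original code):

def train_with_patience(validate, max_epochs=100, patience=12):
    # Stop once the validation metric has not improved for `patience`
    # consecutive epochs (mirrors the epoch_cum counter in the example above).
    best_metric, bad_epochs = float('-inf'), 0
    for epoch in range(1, max_epochs + 1):
        metric = validate(epoch)  # assumed to return the metric to maximize
        if metric > best_metric:
            best_metric, bad_epochs = metric, 0
        else:
            bad_epochs += 1
        if bad_epochs >= patience:
            break
    return best_metric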
Example #22
0
def train(input_directory, output_directory):
    # model
    model = getattr(models, config.model_name)()

    # if args.ckpt and not args.resume:
    #     state = torch.load(args.ckpt, map_location='cpu')
    #     model.load_state_dict(state['state_dict'])
    #     print('train with pretrained weight val_f1', state['f1'])

    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, config.num_classes)

    model = model.to(device)
    # data
    train_dataset = ECGDataset(data_path=config.train_data,
                               data_dir=input_directory,
                               train=True)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=6)
    val_dataset = ECGDataset(data_path=config.train_data,
                             data_dir=input_directory,
                             train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=4)

    print("train_datasize", len(train_dataset), "val_datasize",
          len(val_dataset))
    # optimizer and loss
    #optimizer = optim.Adam(model.parameters(), lr=config.lr)
    optimizer = radam.RAdam(model.parameters(),
                            lr=config.lr,
                            weight_decay=1e-4)
    #optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, dampening=0, weight_decay=1e-4, nesterov=False)
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    criterion = utils.WeightedMultilabel(w)  # alternative: utils.FocalLoss()

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        'max',
        verbose=True,
        factor=0.1,
        patience=5,
        min_lr=1e-06,
        eps=1e-08)  # alternatives: CosineAnnealingLR, CosineAnnealingWithRestartsLR
    #scheduler = pytorchtools.CosineAnnealingWithRestartsLR(optimizer,T_max=30, T_mult = 1.2, eta_min=1e-6)

    # optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, nesterov=True)
    # scheduler = pytorchtools.CosineAnnealingLR_with_Restart(optimizer, T_max=12, T_mult=1, model=model, out_dir='./snapshot',take_snapshot=True, eta_min=1e-9)

    # directory where model checkpoints are saved
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name,
                                   time.strftime("%Y%m%d%H%M"))

    # if args.ex: model_save_dir += args.ex

    best_f1 = -1
    best_cm = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    epoch_cum = 0  # epochs since the last improvement in val_cm

    # Resume training from the last checkpoint
    # if args.resume:
    #     if os.path.exists(args.ckpt):  # directory holding the saved weights
    #         model_save_dir = args.ckpt
    #         current_w = torch.load(os.path.join(args.ckpt, config.current_w))
    #         best_w = torch.load(os.path.join(model_save_dir, config.best_w))
    #         best_f1 = best_w['loss']
    #         start_epoch = current_w['epoch'] + 1
    #         lr = current_w['lr']
    #         stage = current_w['stage']
    #         model.load_state_dict(current_w['state_dict'])
    #         # if the run was interrupted exactly at a stage-transition epoch
    #         if start_epoch - 1 in config.stage_epoch:
    #             stage += 1
    #             lr /= config.lr_decay
    #             utils.adjust_learning_rate(optimizer, lr)
    #             model.load_state_dict(best_w['state_dict'])
    #         print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))

    logger = Logger(logdir=model_save_dir, flush_secs=2)
    # =========> start training <=========
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_acc, train_f1, train_f2, train_g2, train_cm = train_epoch(
            model, optimizer, criterion, train_dataloader, show_interval=100)
        val_loss, val_acc, val_f1, val_f2, val_g2, val_cm = val_epoch(
            model, criterion, val_dataloader)

        # train_loss, train_f1 = train_beat_epoch(model, optimizer, criterion, train_dataloader, show_interval=100)
        # val_loss, val_f1 = val_beat_epoch(model, criterion, val_dataloader)

        print('#epoch:%02d, stage:%d, train_loss:%.3e, train_acc:%.3f, train_f1:%.3f, train_f2:%.3f, train_g2:%.3f,train_cm:%.3f,\n \
                val_loss:%0.3e, val_acc:%.3f, val_f1:%.3f, val_f2:%.3f, val_g2:%.3f, val_cm:%.3f,time:%s\n'
              % (epoch, stage, train_loss, train_acc,train_f1,train_f2,train_g2,train_cm, \
                val_loss, val_acc, val_f1, val_f2, val_g2, val_cm,utils.print_time_cost(since)))

        logger.log_value('train_loss', train_loss, step=epoch)
        logger.log_value('train_f1', train_f1, step=epoch)
        logger.log_value('val_loss', val_loss, step=epoch)
        logger.log_value('val_f1', val_f1, step=epoch)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage
        }

        save_ckpt(state, best_cm < val_cm, model_save_dir, output_directory)
        best_cm = max(best_cm, val_cm)

        scheduler.step(val_cm)
        # scheduler.step()

        if val_cm < best_cm:
            epoch_cum += 1
        else:
            epoch_cum = 0


#         # if epoch in config.stage_epoch:
#         if epoch_cum == 5:
#             stage += 1
#             lr /= config.lr_decay
#             if lr < 1e-6:
#                 lr = 1e-6
#                 print("*" * 20, "step into stage%02d lr %.3ef" % (stage, lr))
#             best_w = os.path.join(model_save_dir, config.best_w)
#             model.load_state_dict(torch.load(best_w)['state_dict'])
#             print("*" * 10, "step into stage%02d lr %.3ef" % (stage, lr))
#             utils.adjust_learning_rate(optimizer, lr)

#         elif epoch_cum >= 12:
#             print("*" * 20, "step into stage%02d lr %.3ef" % (stage, lr))
#             break

        if epoch_cum >= 12:
            print("*" * 20, "step into stage%02d lr %.3e" % (stage, lr))
            break
Example #23
0
File: logger.py Project: sxhxliang/BasicSR
class Logger(object):
    def __init__(self, opt):
        self.exp_name = opt['name']
        self.use_tb_logger = opt['use_tb_logger']
        self.opt = opt['logger']
        self.log_dir = opt['path']['log']
        # loss log file
        self.loss_log_path = os.path.join(self.log_dir, 'loss_log.txt')
        with open(self.loss_log_path, "a") as log_file:
            log_file.write('=============== Time: ' + get_timestamp() +
                           ' =============\n')
            log_file.write(
                '================ Training Losses ================\n')
        # val results log file
        self.val_log_path = os.path.join(self.log_dir, 'val_log.txt')
        with open(self.val_log_path, "a") as log_file:
            log_file.write('================ Time: ' + get_timestamp() +
                           ' ===============\n')
            log_file.write(
                '================ Validation Results ================\n')
        if self.use_tb_logger and 'debug' not in self.exp_name:
            from tensorboard_logger import Logger as TensorboardLogger
            self.tb_logger = TensorboardLogger('../tb_logger/' + self.exp_name)

    # def print_format_results(self, mode, rlt):
    #     epoch = rlt.pop('epoch')
    #     iters = rlt.pop('iters')
    #     time = rlt.pop('time')
    #     model = rlt.pop('model')
    #     message = '<epoch:{:3d}, iter:{:9,d}, time: {:.2f}> '.format(epoch, iters, time)
    #     if mode == 'train':
    #         if 'gan' in model: # srgan, sftgan, sftgan_acd
    #             loss_g_pixel = rlt['loss_g_pixel']  if 'loss_g_pixel' in rlt else -1
    #             loss_g_fea = rlt['loss_g_fea']  if 'loss_g_fea' in rlt else -1
    #             loss_g_gan = rlt['loss_g_gan']  if 'loss_g_gan' in rlt else -1
    #             loss_d_real = rlt['loss_d_real']  if 'loss_d_real' in rlt else -1
    #             loss_d_fake = rlt['loss_d_fake'] if 'loss_d_fake' in rlt else -1
    #             D_out_real = rlt['D_out_real']  if 'D_out_real' in rlt else -1
    #             D_out_fake = rlt['D_out_fake']  if 'D_out_fake' in rlt else -1
    #             lr = rlt['lr']

    #             # tensorboard logger - common
    #             if self.use_tb_logger and 'debug' not in self.exp_name:
    #                 if loss_g_pixel != -1 :
    #                     self.tb_logger.log_value('loss_g_pixel', loss_g_pixel, iters)
    #                 if loss_g_fea != -1:
    #                     self.tb_logger.log_value('loss_g_fea', loss_g_fea, iters)
    #                 self.tb_logger.log_value('loss_g_gan', loss_g_gan, iters)
    #                 self.tb_logger.log_value('loss_d_real', loss_d_real, iters)
    #                 self.tb_logger.log_value('loss_d_fake', loss_d_fake, iters)

    #             if 'loss_d_gp' in rlt: # wgan-gp
    #                 loss_d_gp = rlt['loss_d_gp']
    #                 format_str = ('<loss_G: pixel: {:.2e}, fea: {:.2e}, gan: {:.2e}><loss_D: '
    #                     'real: {:.2e} , fake: {:.2e}, gp: {:.2e}><Dout: G: {:.2f}, D: {:.2f}> '
    #                     'lr: {:.2e}'.format(loss_g_pixel, loss_g_fea, loss_g_gan, loss_d_real, \
    #                     loss_d_fake, loss_d_gp, D_out_real, D_out_fake, lr))
    #                 # tensorboard logger - wgan-gp
    #                 if self.use_tb_logger and 'debug' not in self.exp_name:
    #                     self.tb_logger.log_value('loss_d_gp', loss_d_gp, iters)
    #                     self.tb_logger.log_value('Wasserstein_dist', D_out_real - D_out_fake, iters)

    #             else:
    #                 format_str = ('<loss_G: pixel: {:.2e}, fea: {:.2e}, gan: {:.2e}><loss_D: '
    #                     'real: {:.2e} , fake: {:.2e}><Dout: G: {:.2f}, D: {:.2f}> '
    #                     'lr: {:.2e}'.format(loss_g_pixel, loss_g_fea, loss_g_gan, loss_d_real, \
    #                     loss_d_fake, D_out_real, D_out_fake, lr))

    #                 # tensorboard logger - vanilla gan | lsgan
    #                 if self.use_tb_logger and 'debug' not in self.exp_name:
    #                     self.tb_logger.log_value('D_out_real', D_out_real, iters)
    #                     self.tb_logger.log_value('D_out_fake', D_out_fake, iters)

    #         else: # sr and others
    #             loss_pixel = rlt['loss_pixel']  if 'loss_pixel' in rlt else -1
    #             lr = rlt['lr']
    #             format_str = '<loss: {:.2e}> lr: {:.2e}'.format(loss_pixel, lr)
    #             # tensorboard logger
    #             if self.use_tb_logger and 'debug' not in self.exp_name:
    #                 self.tb_logger.log_value('loss_pixel', loss_pixel, iters)

    #         message += format_str
    #     else:
    #         for label, value in rlt.items():
    #             message += '%s: %.4e ' % (label, value)
    #             # tensorboard logger
    #             if self.use_tb_logger and 'debug' not in self.exp_name:
    #                 self.tb_logger.log_value(label, value, iters)

    #     # print in console
    #     print(message)
    #     # write in log file
    #     if mode == 'train':
    #         with open(self.loss_log_path, "a") as log_file:
    #             log_file.write('%s\n' % message)
    #     elif mode == 'val':
    #         with open(self.val_log_path, "a") as log_file:
    #             log_file.write('%s\n' % message)

    def print_format_results(self, mode, rlt):
        epoch = rlt.pop('epoch')
        iters = rlt.pop('iters')
        time = rlt.pop('time')
        model = rlt.pop('model')
        if 'lr' in rlt:
            lr = rlt.pop('lr')
            message = '<epoch:{:3d}, iter:{:8,d}, time:{:.2f}, lr:{:.1e}> '.format(
                epoch, iters, time, lr)
        else:
            message = '<epoch:{:3d}, iter:{:8,d}, time:{:.2f}> '.format(
                epoch, iters, time)

        for label, value in rlt.items():
            message += '%s: %.2e ' % (label, value)
            # tensorboard logger
            if self.use_tb_logger and 'debug' not in self.exp_name:
                self.tb_logger.log_value(label, value, iters)

        # print in console
        print(message)
        # write in log file
        if mode == 'train':
            with open(self.loss_log_path, "a") as log_file:
                log_file.write('%s\n' % message)
        elif mode == 'val':
            with open(self.val_log_path, "a") as log_file:
                log_file.write('%s\n' % message)
Example #24
0
def main(args, net=None):
    global oldassignment

    datadir = get_data_dir(args.db)
    outputdir = get_output_dir(args.db)

    logger = None
    if args.tensorboard:
        # Create a folder for storing the logs
        logging_dir = os.path.join(outputdir, 'runs', 'DCC')
        if not os.path.exists(logging_dir):
            os.makedirs(logging_dir)
        logging_dir = os.path.join(logging_dir, '%s' % (args.id))
        if args.clean_log:
            remove_files_in_dir(logging_dir)
        logger = Logger(logging_dir)

    use_cuda = torch.cuda.is_available()

    # Set the seed for reproducing the results
    random.seed(args.manualSeed)
    np.random.seed(args.manualSeed)
    torch.manual_seed(args.manualSeed)
    if use_cuda:
        torch.cuda.manual_seed_all(args.manualSeed)
        torch.backends.cudnn.enabled = True
        cudnn.benchmark = True


    startepoch = 0
    kwargs = {'num_workers': 5, 'pin_memory': True} if use_cuda else {}

    # setting up dataset specific objects
    trainset = DCCPT_data(root=datadir, train=True, h5=args.h5)
    testset = DCCPT_data(root=datadir, train=False, h5=args.h5)
    numeval = len(trainset) + len(testset)

    # extracting training data from the pretrained.mat file
    data, labels, pairs, Z, sampweight = makeDCCinp(args)

    # For simplicity, placeholders are created for each dataset and for the model
    load_pretraining = True if net is None else False
    if net is None:
        net = dp.load_predefined_extract_net(args)

    # reshaping data for some datasets
    if args.db == 'cmnist':
        data = data.reshape((-1, 1, 28, 28))
    elif args.db == 'ccoil100':
        data = data.reshape((-1, 3, 128, 128))
    elif args.db == 'cytf':
        data = data.reshape((-1, 3, 55, 55))
    elif args.db == 'cyale':
        data = data.reshape((-1, 1, 168, 192))

    totalset = torch.utils.data.ConcatDataset([trainset, testset])

    # computing and initializing the hyperparams
    _sigma1, _sigma2, _lambda, _delta, _delta1, _delta2, lmdb, lmdb_data = computeHyperParams(pairs, Z)
    oldassignment = np.zeros(len(pairs))
    stopping_threshold = int(math.ceil(cfg.STOPPING_CRITERION * float(len(pairs))))

    # Create dataset and random batch sampler for Finetuning stage
    trainset = DCCFT_data(pairs, data, sampweight)
    batch_sampler = DCCSampler(trainset, shuffle=True, batch_size=args.batchsize)

    # copying model params from Pretrained (SDAE) weights file
    if load_pretraining:
        load_weights(args, outputdir, net)


    # creating objects for loss functions, U's are initialized to Z here
    # Criterion1 corresponds to reconstruction loss
    criterion1 = DCCWeightedELoss(size_average=True)
    # Criterion2 corresponds to sum of pairwise and data loss terms
    criterion2 = DCCLoss(Z.shape[0], Z.shape[1], Z, size_average=True)

    if use_cuda:
        net.cuda()
        criterion1 = criterion1.cuda()
        criterion2 = criterion2.cuda()

    # setting up data loader for training and testing phase
    trainloader = torch.utils.data.DataLoader(trainset, batch_sampler=batch_sampler, **kwargs)
    testloader = torch.utils.data.DataLoader(totalset, batch_size=args.batchsize, shuffle=False, **kwargs)

    # setting up the optimizer: bias params get twice the learning rate of the weight params
    bias_params = filter(lambda x: ('bias' in x[0]), net.named_parameters())
    bias_params = list(map(lambda x: x[1], bias_params))
    nonbias_params = filter(lambda x: ('bias' not in x[0]), net.named_parameters())
    nonbias_params = list(map(lambda x: x[1], nonbias_params))

    optimizer = optim.Adam([{'params': bias_params, 'lr': 2*args.lr},
                            {'params': nonbias_params},
                            {'params': criterion2.parameters(), 'lr': args.lr},
                            ], lr=args.lr, betas=(0.99, 0.999))

    # this is needed for WARM START
    if args.resume:
        filename = outputdir+'/FTcheckpoint_%d.pth.tar' % args.level
        if os.path.isfile(filename):
            print("==> loading checkpoint '{}'".format(filename))
            checkpoint = torch.load(filename)
            net.load_state_dict(checkpoint['state_dict'])
            criterion2.load_state_dict(checkpoint['criterion_state_dict'])
            startepoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])
            _sigma1 = checkpoint['sigma1']
            _sigma2 = checkpoint['sigma2']
            _lambda = checkpoint['lambda']
            _delta = checkpoint['delta']
            _delta1 = checkpoint['delta1']
            _delta2 = checkpoint['delta2']
        else:
            print("==> no checkpoint found at '{}'".format(filename))
            raise ValueError

    # This is the actual Algorithm
    flag = 0
    for epoch in range(startepoch, args.nepoch):
        if logger:
            logger.log_value('sigma1', _sigma1, epoch)
            logger.log_value('sigma2', _sigma2, epoch)
            logger.log_value('lambda', _lambda, epoch)

        train(trainloader, net, optimizer, criterion1, criterion2, epoch, use_cuda, _sigma1, _sigma2, _lambda, logger)
        Z, U, change_in_assign, assignment = test(testloader, net, criterion2, epoch, use_cuda, _delta, pairs, numeval, flag, logger)

        if flag:
            # As long as the change in label assignment < threshold, DCC continues to run.
            # Note: This condition is always met in the very first epoch after the flag is set.
            # This false criterion is overwritten by checking for the condition twice.
            if change_in_assign > stopping_threshold:
                flag += 1
            if flag == 4:
                break

        if((epoch+1) % args.M == 0):
            _sigma1 = max(_delta1, _sigma1 / 2)
            _sigma2 = max(_delta2, _sigma2 / 2)
            if _sigma2 == _delta2 and flag == 0:
                # Start checking for stopping criterion
                flag = 1

        # Save checkpoint
        index = (epoch // args.M) * args.M
        save_checkpoint({'epoch': epoch+1,
                         'state_dict': net.state_dict(),
                         'criterion_state_dict': criterion2.state_dict(),
                         'optimizer': optimizer.state_dict(),
                         'sigma1': _sigma1,
                         'sigma2': _sigma2,
                         'lambda': _lambda,
                         'delta': _delta,
                         'delta1': _delta1,
                         'delta2': _delta2,
                         }, index, filename=outputdir)

    output = {'Z': Z, 'U': U, 'gtlabels': labels, 'w': pairs, 'cluster':assignment}
    sio.savemat(os.path.join(outputdir, 'features'), output)
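The optimizer setup above gives the bias parameters twice the base learning rate via parameter groups. A minimal sketch of that idea in isolation (build_optimizer is an illustrative helper, assuming net is a torch.nn.Module; it is not part of the original code):

import torch.optim as optim

def build_optimizer(net, lr):
    # Bias parameters get 2x the base learning rate; everything else uses the default group lr.
    bias_params = [p for name, p in net.named_parameters() if 'bias' in name]
    other_params = [p for name, p in net.named_parameters() if 'bias' not in name]
    return optim.Adam([{'params': bias_params, 'lr': 2 * lr},
                       {'params': other_params}],
                      lr=lr, betas=(0.99, 0.999))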
Example #25
0
File: train.py Project: rkashuka/BME49500
    avg_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        if cuda:
            data, target = data.cuda(), target.cuda()

        optimizer.zero_grad()

        out = model(data)
        loss = loss_fun(out, target)

        loss.backward()
        optimizer.step()

        avg_loss = 0.9 * avg_loss + 0.1 * loss.item()  # exponential moving average of the batch loss

    log_train.log_value('loss', avg_loss, epoch)
    print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, avg_loss))

    torch.save(model.state_dict(), 'colornet_params.pkl')

    if epoch % 50 == 0:
        torch.save(model.state_dict(), './param_backup/colornet_params_' + str(epoch) + '.pkl')

    with torch.no_grad():
        avg_loss = 0.0
        for batch_idx, (data, target) in enumerate(test_loader):
            if cuda:
                data, target = data.cuda(), target.cuda()

            out = model(data)