Exemplo n.º 1
0
def main():

    global best_prec1
    global writer

    # create model, mark it to run on the GPU
    model = LArFlowUResNet(inplanes=22,
                           input_channels=1,
                           num_classes=2,
                           showsizes=False,
                           use_visi=USE_VISI)
    if GPUMODE:
        if USE_DATA_PARALLEL:
            model.encoder = nn.DataParallel(
                model.encoder, device_ids=DEVICE_IDS)  # distribute
            model.decoder1 = nn.DataParallel(
                model.decoder1, device_ids=DEVICE_IDS_2)  # distribute
            model.decoder2 = nn.DataParallel(
                model.decoder2, device_ids=DEVICE_IDS_2)  # distribute
            model.encoder.cuda(DEVICE_IDS[0])
            model.decoder1.cuda(DEVICE_IDS_2[0])
            model.decoder2.cuda(DEVICE_IDS_2[0])
        else:
            model.cuda(DEVICE_IDS[0])
    # Resume training option
    if RESUME_FROM_CHECKPOINT:
        print "RESUMING FROM CHECKPOINT FILE ", CHECKPOINT_FILE
        checkpoint = torch.load(
            CHECKPOINT_FILE,
            map_location=CHECKPOINT_MAP_LOCATIONS)  # load weights to gpuid
        best_prec1 = checkpoint["best_prec1"]
        #best_prec1 = 0.158
        model.load_state_dict(checkpoint["state_dict"])

    ## register hook


#    for n,m in model.named_children():
#	print("name = ",n)
#	for n1, m1 in m.named_children():
#	    print("name 1 = ", n1 )
#	    m1.register_backward_hook(print_module_name)
#	    m1.register_forward_hook( print_module_name_fwd)
#
# define loss function (criterion) and optimizer
    if GPUMODE:
        criterion = LArFlowLoss(VISI_WEIGHT)
    else:
        criterion = LArFlowLoss(VISI_WEIGHT)

    # training parameters
    lr = 1.0e-4
    momentum = 0.9
    weight_decay = 1.0e-4

    # training length
    batchsize_train = 12  #*len(DEVICE_IDS)
    batchsize_valid = 6  #*len(DEVICE_IDS)
    start_epoch = 0
    epochs = 10
    num_iters = 10000
    iter_per_epoch = None  # determined later
    iter_per_valid = 10
    iter_per_checkpoint = 500

    nbatches_per_itertrain = 10
    itersize_train = batchsize_train * nbatches_per_itertrain
    trainbatches_per_print = 100

    nbatches_per_itervalid = 20
    itersize_valid = batchsize_valid * nbatches_per_itervalid
    validbatches_per_print = 100

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)
    # optimize algorithms based on input size (good if input size is constant)
    cudnn.benchmark = True

    # LOAD THE DATASET

    iotrain = LArCVDataset(TRAIN_LARCV_CONFIG, "ThreadProcessorTrain")
    iovalid = LArCVDataset(VALID_LARCV_CONFIG, "ThreadProcessorValid")
    iotrain.start(batchsize_train)
    iovalid.start(batchsize_valid)

    NENTRIES = len(iotrain)
    print "Number of entries in training set: ", NENTRIES

    if NENTRIES > 0:
        iter_per_epoch = NENTRIES / (itersize_train)
        if num_iters is None:
            # we set it by the number of request epochs
            num_iters = (epochs - start_epoch) * NENTRIES
        else:
            epochs = num_iters / NENTRIES
    else:
        iter_per_epoch = 1

    print "Number of epochs: ", epochs
    print "Iter per epoch: ", iter_per_epoch

    with torch.autograd.profiler.profile(enabled=RUNPROFILER) as prof:

        for ii in range(start_iter, num_iters):

            adjust_learning_rate(optimizer, ii, lr)
            print "MainLoop Iter:%d Epoch:%d.%d " % (ii, ii / iter_per_epoch,
                                                     ii % iter_per_epoch),
            for param_group in optimizer.param_groups:
                print "lr=%.3e" % (param_group['lr']),
                print

            # train for one iteration
            try:
                train_ave_loss, train_ave_acc = train(
                    iotrain, batchsize_train, model, criterion, optimizer,
                    nbatches_per_itertrain, ii, trainbatches_per_print)
    #mem_report()

            except Exception, e:
                print "Error in training routine!"
                print e.message
                print e.__class__.__name__
                traceback.print_exc(e)
                break
            print "Train Iter:%d Epoch:%d.%d train aveloss=%.3f aveacc=%.3f" % (
                ii, ii / iter_per_epoch, ii % iter_per_epoch, train_ave_loss,
                train_ave_acc)

            # evaluate on validation set
            if ii % iter_per_valid == 0:
                try:
                    prec1 = validate(iovalid, batchsize_valid, model,
                                     criterion, nbatches_per_itervalid,
                                     validbatches_per_print, ii)
                except Exception, e:
                    print "Error in validation routine!"
                    print e.message
                    print e.__class__.__name__
                    traceback.print_exc(e)
                    break

                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)

                # check point for best model
                if is_best:
                    print "Saving best model"
                    save_checkpoint(
                        {
                            'iter': ii,
                            'epoch': ii / iter_per_epoch,
                            'state_dict': model.state_dict(),
                            'best_prec1': best_prec1,
                            'optimizer': optimizer.state_dict(),
                        }, is_best, -1)

            # periodic checkpoint
            if ii > 0 and ii % iter_per_checkpoint == 0:
                print "saving periodic checkpoint"
                save_checkpoint(
                    {
                        'iter': ii,
                        'epoch': ii / iter_per_epoch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict(),
                    }, False, ii)
            # flush the print buffer after iteration
            sys.stdout.flush()
Exemplo n.º 2
0
class LArCV2ThreadIOWorker( WorkerService ):
    """ This worker simply receives data and replies with dummy string. prints shape of array. """

    def __init__( self,configfile,fillername,identity,ipaddress,port=0,batchsize=None,verbosity=0):
        super( LArCV2ThreadIOWorker, self ).__init__(identity,ipaddress)
        self.configfile = configfile
        self.fillername = fillername
        self.batchsize = batchsize
        self.larcvloader = LArCVDataset(self.configfile,fillername)
        self.products = {}
        self.compression_level = 4
        self.print_msg_size = False
        self.num_reads = 0
        if self.batchsize is not None:
            self.start_dataloader(self.batchsize)
        print "LArCV2ThreadIOWorker[{}] is loaded.".format(self._identity)
        
    def process_message(self, frames ):
        """ just a request. nothing to parse
        """
        return True

    def fetch_data(self):
        """ load up the next data set. we've already sent out the message. so here we try to hide latency while gpu running. """
        
        # get data
        tstart = time.time()
        while self.larcvloader.io._proc.thread_running():
            #print "finish load"
            time.sleep(0.001)
        self.products = self.larcvloader[0]
        while self.larcvloader.io._proc.thread_running():
            #print "finish load"
            time.sleep(0.001)        
        #print "[",self.num_reads,":{}] ".format(self._identity),self.products.keys()
        self.num_reads += 1        
        print "LArCV2ThreadIOWorker[{}] fetched data. time={} secs. nreads={}".format(self._identity,time.time()-tstart,self.num_reads)
        return
    
    def generate_reply(self):
        """
        our job is to return our data set, then load another
        """
        self.fetch_data()
        
        reply = [self._identity]
        totmsgsize = 0.0
        totcompsize = 0.0
        tstart = time.time()
        for key,arr in self.products.items():
                
            # encode
            x_enc = msgpack.packb( arr, default=m.encode )
            x_comp = zlib.compress(x_enc,self.compression_level)

            # for debug: inspect compression gains (usually reduction to 1% or lower of original size)
            if self.print_msg_size:
                encframe = zmq.Frame(x_enc)
                comframe = zmq.Frame(x_comp)
                totmsgsize  += len(encframe.bytes)
                totcompsize += len(comframe.bytes)
                
            reply.append( key.encode('utf-8') )
            reply.append( x_comp )

        if self.print_msg_size:
            print "LArCV2ThreadIOWorker[{}]: size of array portion={} MB (uncompressed {} MB)".format(self._identity,totcompsize/1.0e6,totmsgsize/1.0e6)
        print "LArCV2ThreadIOWorker[{}]: generate msg in {} secs".format(self._identity,time.time()-tstart)
        return reply
        
    def start_dataloader(self,batchsize):
        print "LArCV2ThreadIOWorker[{}] starting loader w/ batchsize={}".format(self._identity,self.batchsize)
        self.batchsize = batchsize
        self.larcvloader.start(self.batchsize)
        print "LArCV2ThreadIOWorker[{}] dataloader ready, loading first product set".format(self._identity,self.batchsize)
        while not self.larcvloader.io._proc.manager_started():
            time.sleep(1.0)            
            print "LArCV2ThreadIOWorker[{}] waiting for larcv_threadio".format(self._identity)
        #self.post_reply() # get first batch
        print "LArCV2ThreadIOWorker[{}] manager started. syncing with client".format(self._identity)
Exemplo n.º 3
0
import os, sys
import ROOT as rt
from larcv import larcv
from uresnet import UResNet
from larcvdataset import LArCVDataset

#net = UResNet( num_classes=3, input_channels=1, inplanes=16 )

# we load in a test image
#iotest = LArCVDataset("test_dataloader.cfg", "ThreadProcessorTest")
iotest = LArCVDataset("test_threadfiller.cfg", "ThreadProcessorTest")
iotest.start(1)

data = iotest[0]
print data
#print net
iotest.stop()
Exemplo n.º 4
0
def main():

    global best_prec1
    global writer

    # create model, mark it to run on the GPU
    if GPUMODE:
        model = UResNet(inplanes=32,
                        input_channels=1,
                        num_classes=NCLASSES,
                        showsizes=False)
        model.to(device=torch.device(DEVICE))  # put onto gpuid
    else:
        model = UResNet(inplanes=32, input_channels=1, num_classes=NCLASSES)

    # Resume training option
    if RESUME_FROM_CHECKPOINT:
        print "RESUMING FROM CHECKPOINT FILE ", CHECKPOINT_FILE
        checkpoint = torch.load(
            CHECKPOINT_FILE,
            map_location=CHECKPOINT_MAP_LOCATIONS)  # load weights to gpuid
        best_prec1 = checkpoint["best_prec1"]
        if CHECKPOINT_FROM_DATA_PARALLEL:
            model = nn.DataParallel(
                model, device_ids=DEVICE_IDS)  # distribute across device_ids
        model.load_state_dict(checkpoint["state_dict"])

    if not CHECKPOINT_FROM_DATA_PARALLEL and len(DEVICE_IDS) > 1:
        model = nn.DataParallel(
            model, device_ids=DEVICE_IDS)  # distribute across device_ids

    # uncomment to dump model
    print "Loaded model: ", model
    # check where model pars are
    #for p in model.parameters():
    #    print p.is_cuda

    # define loss function (criterion) and optimizer
    if GPUMODE:
        criterion = PixelWiseNLLLoss()
        criterion.to(device=torch.device(DEVICE))
    else:
        criterion = PixelWiseNLLLoss()

    # training parameters
    lr = 1.0e-5
    momentum = 0.9
    weight_decay = 1.0e-4

    # training length
    if "cuda" in DEVICE:
        batchsize_train = 4 * len(DEVICE_IDS)
        batchsize_valid = 2 * len(DEVICE_IDS)
    else:
        batchsize_train = 4
        batchsize_valid = 2

    start_epoch = 0
    epochs = 10
    num_iters = 30000
    iter_per_epoch = None  # determined later
    iter_per_valid = 10
    iter_per_checkpoint = 500

    nbatches_per_itertrain = 20
    itersize_train = batchsize_train * nbatches_per_itertrain
    trainbatches_per_print = 100

    nbatches_per_itervalid = 40
    itersize_valid = batchsize_valid * nbatches_per_itervalid
    validbatches_per_print = 100

    # SETUP OPTIMIZER

    # SGD w/ momentum
    #optimizer = torch.optim.SGD(model.parameters(), lr,
    #                            momentum=momentum,
    #                            weight_decay=weight_decay)

    # ADAM
    # betas default: (0.9, 0.999) for (grad, grad^2). smoothing coefficient for grad. magnitude calc.
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)

    # optimize algorithms based on input size (good if input size is constant)
    cudnn.benchmark = True

    # LOAD THE DATASET

    iotrain = LArCVDataset(TRAIN_LARCV_CONFIG, "ThreadProcessorTrain")
    iovalid = LArCVDataset(VALID_LARCV_CONFIG, "ThreadProcessorValid")
    iotrain.start(batchsize_train)
    iovalid.start(batchsize_valid)
    iosample = {"valid": iovalid, "train": iotrain}

    NENTRIES = len(iotrain)
    print "Number of entries in training set: ", NENTRIES

    if NENTRIES > 0:
        iter_per_epoch = NENTRIES / (itersize_train)
        if num_iters is None:
            # we set it by the number of request epochs
            num_iters = (epochs - start_epoch) * NENTRIES
        else:
            epochs = num_iters / NENTRIES
    else:
        iter_per_epoch = 1

    print "Number of epochs: ", epochs
    print "Iter per epoch: ", iter_per_epoch

    if False:
        # for debugging/testing data
        sample = "train"
        print "TEST BATCH: sample=", sample
        adc_t, label_t, weight_t = prep_data(iosample[sample], sample,
                                             batchsize_train, IMAGE_WIDTH,
                                             IMAGE_HEIGHT, ADC_THRESH)
        print "adc shape: ", adc_t.shape
        print "label shape: ", label_t.shape
        print "weight shape: ", weight_t.shape

        # load opencv, to dump png of image
        import cv2 as cv
        cv.imwrite("testout_adc.png", adc_t.numpy()[0, 0, :, :])
        cv.imwrite("testout_label.png", label_t.numpy()[0, :, :])
        cv.imwrite("testout_weight.png", weight_t.numpy()[0, 0, :, :])

        print "STOP FOR DEBUGGING"
        iotrain.stop()
        iovalid.stop()
        sys.exit(-1)

    with torch.autograd.profiler.profile(enabled=RUNPROFILER) as prof:

        # Resume training option
        #if RESUME_FROM_CHECKPOINT:
        #    print "RESUMING FROM CHECKPOINT FILE ",CHECKPOINT_FILE
        #    checkpoint = torch.load( CHECKPOINT_FILE, map_location=CHECKPOINT_MAP_LOCATIONS )
        #    best_prec1 = checkpoint["best_prec1"]
        #    model.load_state_dict(checkpoint["state_dict"])
        #optimizer.load_state_dict(checkpoint['optimizer'])
        #if GPUMODE:
        #    optimizer.cuda(GPUID)

        for ii in range(start_iter, num_iters):

            adjust_learning_rate(optimizer, ii, lr)
            print "MainLoop Iter:%d Epoch:%d.%d " % (ii, ii / iter_per_epoch,
                                                     ii % iter_per_epoch),
            for param_group in optimizer.param_groups:
                print "lr=%.3e" % (param_group['lr']),
                print

            # train for one iteration
            try:
                train_ave_loss, train_ave_acc = train(iotrain, batchsize_train,
                                                      model, criterion,
                                                      optimizer,
                                                      nbatches_per_itertrain,
                                                      ii, NCLASSES,
                                                      trainbatches_per_print)
            except Exception, e:
                print "Error in training routine!"
                print e.message
                print e.__class__.__name__
                traceback.print_exc(e)
                break
            print "Train Iter:%d Epoch:%d.%d train aveloss=%.3f aveacc=%.3f" % (
                ii, ii / iter_per_epoch, ii % iter_per_epoch, train_ave_loss,
                train_ave_acc)

            # evaluate on validation set
            if ii % iter_per_valid == 0:
                try:
                    prec1 = validate(iovalid, batchsize_valid, model,
                                     criterion, nbatches_per_itervalid,
                                     validbatches_per_print, ii)
                except Exception, e:
                    print "Error in validation routine!"
                    print e.message
                    print e.__class__.__name__
                    traceback.print_exc(e)
                    break

                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)

                # check point for best model
                if is_best:
                    print "Saving best model"
                    save_checkpoint(
                        {
                            'iter': ii,
                            'epoch': ii / iter_per_epoch,
                            'state_dict': model.state_dict(),
                            'best_prec1': best_prec1,
                            'optimizer': optimizer.state_dict(),
                        }, is_best, -1)

            # periodic checkpoint
            if ii > 0 and ii % iter_per_checkpoint == 0:
                print "saving periodic checkpoint"
                save_checkpoint(
                    {
                        'iter': ii,
                        'epoch': ii / iter_per_epoch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict(),
                    }, False, ii)
            # flush the print buffer after iteration
            sys.stdout.flush()
Exemplo n.º 5
0
def main():

    global best_prec1_vis
    global best_prec1_flow
    global writer
    
    model = network.mymodel( num_classes=1, input_channels=1, showsizes=False)
    model.cuda()
    #print "Loaded model: ",model

    # define loss function (criterion) and optimizer
    criterion1 = myfunc.PixelWiseFlowLoss(minval=4).cuda()
    criterion2 = myfunc.PixelWiseNLLLoss().cuda()
    
    # training parameters
    lmbd = 0.5
    lr = 1.0e-4 #-3 
    momentum = 0.9
    weight_decay = 1.0e-3
    batchsize_train = 8
    batchsize_valid = 8
    start_epoch = 0
    epochs      = 50 #1500
    nbatches_per_iter = 25
    
    if len(sys.argv)>1:
        epochs = int(sys.argv[1])
    print "Number of epochs: ", epochs
    print "Train batch: ", batchsize_train
    print "# batch per iter: ", nbatches_per_iter

    optimizer = torch.optim.SGD(model.parameters(), lr,
                                momentum=momentum,
                                weight_decay=weight_decay)

    cudnn.benchmark = True

    # dataset
    #iotrain = LArCVDataset("train_dataloader.cfg", "ThreadProcessor", loadallinmem=True)
    iotrain = LArCVDataset("train_dataloader.cfg", "ThreadProcessor")
    iovalid = LArCVDataset("valid_dataloader.cfg", "ThreadProcessorTest")
    
    iotrain.start(batchsize_train)
    iovalid.start(batchsize_valid)

    #nbatch per epoch
    NENTRIES = iotrain.io.fetch_n_entries()
    
    #NENTRIES=0;
    if NENTRIES>0:
            nbatches_per_epoch = NENTRIES/batchsize_train
            nbatches_per_valid = NENTRIES/batchsize_valid
    else:
            nbatches_per_epoch = 1
            nbatches_per_valid = 1
                

    iter_per_epoch = nbatches_per_epoch/nbatches_per_iter
    iter_per_valid = 5
    iter_per_checkpoint = 150
    num_iters = iter_per_epoch*epochs
    print "Iterations: ", num_iters
    # Resume training option
    if False:
        checkpoint = torch.load( "checkpoint.pth.p01.tar" )
        best_prec1 = checkpoint["best_prec1"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint['optimizer'])


    for ii in range(0, num_iters):
        
        myfunc.adjust_learning_rate(optimizer, ii, lr)
        print "Iter:%d Epoch:%d.%d "%(ii,ii/iter_per_epoch,ii%iter_per_epoch),
        for param_group in optimizer.param_groups:
            print "lr=%.3e"%(param_group['lr']),
            print
            
            # train for one epoch
            try:
                train_ave_loss, train_ave_acc_vis, train_ave_acc_flow = train(iotrain, model, criterion1, criterion2, lmbd, optimizer, nbatches_per_iter, ii, 10)
                
            except Exception,e:
                print "Error in training routine!"
                print e.message
                print e.__class__.__name__
                traceback.print_exc(e)
                break
            print "Iter:%d Epoch [%d.%d] train aveloss=%.3f aveacc_vis=%.3f aveacc_flow=%.3f"%(ii,ii/iter_per_epoch,ii%iter_per_epoch,
                                                                                               train_ave_loss,train_ave_acc_vis,train_ave_acc_flow)             
            # evaluate on validation set
            if ii%iter_per_valid==0:
                try:
                    prec1_vis, prec1_flow = validate(iovalid, model, criterion1, criterion2, lmbd, nbatches_per_iter, ii, 10)
                except Exception,e:
                    print "Error in validation routine!"
                    print e.message
                    print e.__class__.__name__
                    traceback.print_exc(e)
                    break
                
                # remember best prec@1 and save checkpoint
                is_best_flow = prec1_flow > best_prec1_flow
                best_prec1_flow = max(prec1_flow, best_prec1_flow)
                is_best_vis = prec1_vis > best_prec1_vis
                best_prec1_vis = max(prec1_vis, best_prec1_vis)
                
                # check point for best model
                if is_best_flow:
                    print "Saving best model"
                    myfunc.save_checkpoint({
                        'iter':ii,
                        'epoch': ii/iter_per_epoch,
                        'state_dict': model.state_dict(),
                        'best_prec1_vis': best_prec1_vis,
                        'best_prec1_flow': best_prec1_flow,
                        'optimizer' : optimizer.state_dict(),
                    }, is_best_flow, -1)
                    
            # periodic checkpoint
            if ii>0 and ii%iter_per_checkpoint==0:
                print "saving periodic checkpoint"
                myfunc.save_checkpoint({
                    'iter':ii,
                    'epoch': ii/iter_per_epoch,
                    'state_dict': model.state_dict(),
                    'best_prec1_vis': best_prec1_vis,
                    'best_prec1_flow': best_prec1_flow,
                    'optimizer' : optimizer.state_dict(),
                }, False, ii)
Exemplo n.º 6
0
def main():

    global best_prec1

    # create model: loading resnet18 as defined in torchvision module
    #model = resnet_example.resnet18(pretrained=False, num_classes=5, input_channels=1)
    model = resnet_example.resnet14(pretrained=False,
                                    num_classes=5,
                                    input_channels=1)
    model.cuda()

    print "Loaded model: ", model

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    # training parameters
    lr = 1.0e-3
    momentum = 0.9
    weight_decay = 1.0e-3
    batchsize = 50
    batchsize_valid = 500
    start_epoch = 0
    epochs = 1500
    nbatches_per_epoch = 10000 / batchsize
    nbatches_per_valid = 1000 / batchsize_valid

    optimizer = torch.optim.SGD(model.parameters(),
                                lr,
                                momentum=momentum,
                                weight_decay=weight_decay)

    cudnn.benchmark = True

    # dataset
    iotrain = LArCVDataset("train_dataloader.cfg",
                           "ThreadProcessor",
                           loadallinmem=True)
    iovalid = LArCVDataset("valid_dataloader.cfg", "ThreadProcessorTest")

    iotrain.start(batchsize)
    iovalid.start(batchsize_valid)

    # Resume training option
    if False:
        checkpoint = torch.load("checkpoint.pth.p01.tar")
        best_prec1 = checkpoint["best_prec1"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint['optimizer'])

    if False:
        data = iotrain[0]
        img = data["image"]
        lbl = data["label"]
        img_np = np.zeros((img.shape[0], 1, 256, 256), dtype=np.float32)
        lbl_np = np.zeros((lbl.shape[0]), dtype=np.int)
        for j in range(img.shape[0]):
            imgtemp = img[j].reshape((256, 256))
            print imgtemp.shape
            img_np[j, 0, :, :] = padandcrop(imgtemp)
            lbl_np[j] = np.argmax(lbl[j])

        print "Train label"
        print lbl_np

        datatest = iovalid[0]
        imgtest = data["image"]
        print "Test image shape"
        print imgtest.shape

        iotrain.stop()
        iovalid.stop()

        return

    for epoch in range(start_epoch, epochs):

        adjust_learning_rate(optimizer, epoch, lr)
        print "Epoch [%d]: " % (epoch),
        for param_group in optimizer.param_groups:
            print "lr=%.3e" % (param_group['lr']),
        print

        # train for one epoch
        try:
            train_ave_loss, train_ave_acc = train(iotrain, model, criterion,
                                                  optimizer,
                                                  nbatches_per_epoch, epoch,
                                                  50)
        except Exception, e:
            print "Error in training routine!"
            print e.message
            print e.__class__.__name__
            traceback.print_exc(e)
            break
        print "Epoch [%d] train aveloss=%.3f aveacc=%.3f" % (
            epoch, train_ave_loss, train_ave_acc)

        # evaluate on validation set
        try:
            prec1 = validate(iovalid, model, criterion, nbatches_per_valid, 1)
        except Exception, e:
            print "Error in validation routine!"
            print e.message
            print e.__class__.__name__
            traceback.print_exc(e)
            break
Exemplo n.º 7
0
def main():

    global best_prec1_vis
    global best_prec1_flow
    global writer
    
    model = network.mymodel( num_classes=1, input_channels=1, showsizes=False)
    model.cuda()
    #print "Loaded model: ",model

    # define loss function (criterion) and optimizer
    criterion1 = myfunc.PixelWiseFlowLoss(minval=4).cuda()
    criterion2 = myfunc.PixelWiseNLLLoss().cuda()
    
    # training parameters
    lmbd = 0.5
    lr = 1.0e-4 #-3 
    momentum = 0.9
    weight_decay = 1.0e-3
    batchsize = 8
    batchsize_valid = 8
    start_epoch = 0
    epochs      = 50 #1500
    if len(sys.argv)>1:
        epochs = int(sys.argv[1])
    print "Number of epochs: ", epochs
    print "Train batch: ", batchsize
    

    optimizer = torch.optim.SGD(model.parameters(), lr,
                                momentum=momentum,
                                weight_decay=weight_decay)

    cudnn.benchmark = True

    # dataset
    #iotrain = LArCVDataset("train_dataloader.cfg", "ThreadProcessor", loadallinmem=True)
    iotrain = LArCVDataset("train_dataloader.cfg", "ThreadProcessor")
    iovalid = LArCVDataset("valid_dataloader.cfg", "ThreadProcessorTest")
    
    iotrain.start(batchsize)
    iovalid.start(batchsize_valid)

    #nbatch per epoch
    NENTRIES = iotrain.io.fetch_n_entries()
    #NENTRIES=0;
    if NENTRIES>0:
            nbatches_per_epoch = NENTRIES/batchsize
            nbatches_per_valid = NENTRIES/batchsize_valid
    else:
            nbatches_per_epoch = 1
            nbatches_per_valid = 1
                
                
    # Resume training option
    if False:
        checkpoint = torch.load( "checkpoint.pth.p01.tar" )
        best_prec1 = checkpoint["best_prec1"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint['optimizer'])
    
    if False: #debug
        data = iotrain[0]
        img  = data["imageY"]
        img2 = data["imageU"]
        lbl  = data["label"]
        vis  = data["match"]
        '''
        img_np  = np.zeros( (img.shape[0],  1, 512, 512), dtype=np.float32 )
        img2_np = np.zeros( (img2.shape[0], 1, 512, 512), dtype=np.float32 )
        lbl_np  = np.zeros( (lbl.shape[0], 1, 512, 512), dtype=np.int )
        vis_np  = np.zeros( (vis.shape[0], 512, 512), dtype=np.int )
        fvis_np  = np.zeros( (vis.shape[0], 1, 512, 512), dtype=np.float32 )

        for j in range(img.shape[0]):
            img_np[j,0,:,:]  = img[j].reshape( (512,512) )
            img2_np[j,0,:,:] = img2[j].reshape( (512,512) )
            lbl_np[j,0,:,:]  = lbl[j].reshape( (512,512) )
            vis_np[j,:,:]    = vis[j].reshape( (512,512) )
            fvis_np[j,0,:,:]  = vis[j].reshape( (512,512) ) 
        '''
        img_np  = np.zeros( ( 512, 512), dtype=np.float32 )
        img2_np = np.zeros( ( 512, 512), dtype=np.float32 )
        lbl_np  = np.zeros( ( 512, 512), dtype=np.int )
        vis_np  = np.zeros( ( 512, 512), dtype=np.int )
        fvis_np = np.zeros( ( 512, 512), dtype=np.float32 )

        for j in range(1):#img.shape[0]):
            img_np[:,:]  = img[j].reshape( (512,512) )
            img2_np[:,:] = img2[j].reshape( (512,512) )
            lbl_np[:,:]  = lbl[j].reshape( (512,512) )
            vis_np[:,:]  = vis[j].reshape( (512,512) )
            fvis_np[:,:] = vis[j].reshape( (512,512) ) 

        tar_x_visi = np.multiply(lbl_np,fvis_np)
        abs_tar_x_visi = np.fabs(tar_x_visi)
        thresh =   abs_tar_x_visi >0
        threshint = thresh.astype(int)
        
        datatest = iovalid[0]
        imgtest = datatest["imageYtest"]
        print "Test image shape"
        print imgtest.shape

        cv.imwrite( "testout_srcY.png", img_np  )
        cv.imwrite( "testout_srcU.png", img2_np  )
        cv.imwrite( "testout_tar.png", lbl_np  )
        cv.imwrite( "testout_vis.png", fvis_np*100  )
        cv.imwrite( "testout_tarXvis.png", tar_x_visi  )
        cv.imwrite( "testout_abs_tarXvis.png", abs_tar_x_visi*100  )
        cv.imwrite( "testout_thresh_tarXvis.png", threshint*100  )
        
        iotrain.stop()
        iovalid.stop()
        
        return


    #data = iotrain[0]
    #data2 = iovalid[0]
    for epoch in range(start_epoch, epochs):

        myfunc.adjust_learning_rate(optimizer, epoch, lr)
        print "Epoch [%d]: "%(epoch),
        for param_group in optimizer.param_groups:
            print "lr=%.3e"%(param_group['lr']),
        print

        # train for one epoch
        try:
            train_ave_loss, train_ave_acc_vis, train_ave_acc_flow = train(iotrain, model, criterion1, criterion2, lmbd, optimizer, nbatches_per_epoch, epoch, 100)
            #train_ave_loss, train_ave_acc_vis, train_ave_acc_flow = train(data, model, criterion1, criterion2, lmbd, optimizer, nbatches_per_epoch, epoch, 50)
        except Exception,e:
            print "Error in training routine!"            
            print e.message
            print e.__class__.__name__
            traceback.print_exc(e)
            break
        print "Epoch [%d] train aveloss=%.3f aveacc_vis=%.3f aveacc_flow=%.3f"%(epoch,train_ave_loss,train_ave_acc_vis,train_ave_acc_flow)

        # evaluate on validation set
        try:
            prec1_vis, prec1_flow = validate(iovalid, model, criterion1, criterion2, lmbd, nbatches_per_valid, epoch, 100)
            #prec1_vis, prec1_flow = validate(data2, model, criterion1, criterion2, lmbd, nbatches_per_valid, epoch, 50)
        except Exception,e:
            print "Error in validation routine!"            
            print e.message
            print e.__class__.__name__
            traceback.print_exc(e)
            break