Example #1
def gen_folds( dataset, options, nrun ):
    nids = len( set( dataset[0,:] ) )
    ids  = options['numpy_rng'].permutation(nids)

    # train / test ids
    trainsizeElem = int(round( options['trainsize']*nids ))
    train_ids = ids[0:trainsizeElem]
    test_ids  = ids[trainsizeElem:nids]   # remaining ids form the held-out test split

    print >> sys.stderr, test_ids   

    if options['verbose']> 2:
        print >> sys.stderr, "Train IDS"
        print >> sys.stderr, train_ids
        print >> sys.stderr, "Test IDS"
        print >> sys.stderr, test_ids

    # val ids
    val_ids   = numpy.copy(train_ids)
    nitems    = len(val_ids)/options['folds']

    val_ids.resize((options['folds'],nitems))

    folds = range(0,options['folds'])

    trainval = []
    valval   = []
    testval  = []
    for k in folds:
        others = list( set([k]).symmetric_difference(set(folds)) )
        # the other two folds serve as the validation and test splits (assumes options['folds'] == 3)

        train  = val_ids[k].flatten()
        val    = val_ids[others[0]]
        test   = val_ids[others[1]]

        xtrain,ytrain,minv,maxv = get_data( dataset, train, options )
        xval,yval   = get_data( dataset,  val, options, isFirst=False, minvalue=minv, maxvalue=maxv )[0:2]
        xtest,ytest = get_data( dataset, test, options, isFirst=False, minvalue=minv, maxvalue=maxv )[0:2]
        
        trainval.append( (xtrain,ytrain) )
        valval.append( (xval,yval) )
        testval.append( (xtest,ytest) )

        if options['verbose'] > 0:
            print 'Train set with size %d for fold %d' % (ytrain.shape.eval(),k)
            print 'Test  set with size %d for fold %d' % (ytest.shape.eval(),k)
            if options['verbose'] > 5:
                for cls in range(0,2):
                    print >>sys.stderr, "\tNumber of training elements for cls {0:02d} is {1:05d}".format(cls,sum(ytrain.eval() == cls))
                    print >>sys.stderr, "\tNumber of testing elements for cls {0:02d} is {1:05d}".format(cls,sum(ytest.eval() == cls))

    # final ids
    final_ids = numpy.copy(train_ids)
    nitems    = len(final_ids)/2
    final_ids.resize((2,nitems))
    trainfinal_ids = final_ids[0]
    valfinal_ids   = final_ids[1]

    xtrain,ytrain,minv,maxv = get_data( dataset, trainfinal_ids, options )
    xval,yval    = get_data( dataset, valfinal_ids, options, isFirst = False, minvalue=minv, maxvalue=maxv )[0:2]
    xtest,ytest  = get_data( dataset, test_ids    , options, isFirst = True , minvalue=minv, maxvalue=maxv )[0:2]

    trainFinal = (xtrain,ytrain)
    valFinal   = (xval,yval)
    testFinal  = (xtest,ytest)

    print >> sys.stderr, test_ids
    
    if options['verbose'] > 0:
        print 'Train set with size %d ' % (ytrain.shape.eval())
        print 'Val  set with size %d ' % (yval.shape.eval())
        print 'Test  set with size %d ' % (ytest.shape.eval())
        if options['verbose'] > 5:
            for cls in range(0,2):
                print >>sys.stderr, "\tNumber of training elements for cls {0:02d} is {1:05d}".format(cls,sum(ytrain.eval() == cls))
                print >>sys.stderr, "\tNumber of validation elements for cls {0:02d} is {1:05d}".format(cls,sum(yval.eval() == cls))
                print >>sys.stderr, "\tNumber of testing elements for cls {0:02d} is {1:05d}".format(cls,sum(ytest.eval() == cls))

    basefilename = '{0:s}/{1:05d}_{2:03d}_'.format(options['outputfolder'],nrun,string.atoi(options['resolution']))

    trainfilename = basefilename + 'train_ids.pkl.gz'
    valfilename   = basefilename + 'val_ids.pkl.gz'
    trainfinalfilename = basefilename + 'trainfinal_ids.pkl.gz'
    valfinalfilename   = basefilename + 'valfinal_ids.pkl.gz'
    testfilename       = basefilename + 'test_ids.pkl.gz'

    save_gzdata(trainfilename,train_ids)
    save_gzdata(valfilename,val_ids)
    save_gzdata(trainfinalfilename,trainfinal_ids)
    save_gzdata(valfinalfilename,valfinal_ids)
    save_gzdata(testfilename,test_ids)
    
    if options['verbose'] > 0:
        print 'Train set with size %d' % (trainFinal[1].shape.eval())
        print 'Val   set with size %d' % (valFinal[1].shape.eval())
        print 'Test  set with size %d' % (testFinal[1].shape.eval())

    rval = [trainval, valval, testval, trainFinal, valFinal, testFinal ]

    return rval
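The fold construction above boils down to a permutation of ids, a train/test cut, and a reshape of the training ids into one row per fold. A minimal numpy-only sketch of that scheme, with toy sizes and 3 folds assumed (no project helpers such as get_data are needed here):

# numpy-only sketch of the id-splitting scheme (toy sizes; helper-free)
import numpy

rng = numpy.random.RandomState(0)
nids = 12
trainsize = 0.75
ids = rng.permutation(nids)

ntrain = int(round(trainsize * nids))
train_ids = ids[:ntrain]          # ids available for the cross-validation folds
test_ids = ids[ntrain:]           # held-out ids for the final evaluation

folds = 3
nitems = len(train_ids) // folds
val_ids = numpy.copy(train_ids)
val_ids.resize((folds, nitems))   # one row of ids per fold

for k in range(folds):
    others = [f for f in range(folds) if f != k]
    fold_train = val_ids[k]           # fold k trains ...
    fold_val = val_ids[others[0]]     # ... one of the remaining folds validates ...
    fold_test = val_ids[others[1]]    # ... and the last one tests
    print((k, fold_train.tolist(), fold_val.tolist(), fold_test.tolist()))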
Example #2
def pretrain_finetune_model(sda, pretraining_fns, train_set, test_set,
                            options):
    train_set_x, train_set_y = train_set

    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= options['batchsize']

    if options['retrain'] == 0:

        bestmodelsda = copy.copy(sda)
        # -----------------------------------------------
        # PRETRAINING
        # -----------------------------------------------
        if options['verbose'] > 5:
            print >> sys.stderr, ('... pre-training the model')
        start_time = time.clock()
        ## Pre-train layer-wise
        corruption_levels = options['corruptlevels']
        for i in xrange(sda.n_layers):
            # go through pretraining epochs
            for epoch in xrange(options['pretraining_epochs']):
                # go through the training set
                c = []
                for batch_index in xrange(n_train_batches):
                    c.append(pretraining_fns[i](
                        index=batch_index,
                        corruption=corruption_levels[i],
                        lr=options['pretrain_lr']))

                if epoch % 100 == 0 and options['verbose'] > 5:
                    print >> sys.stderr, (
                        'Pre-training layer %02i, epoch %04d, cost ' %
                        (i, epoch)),
                    print >> sys.stderr, (numpy.mean(c))
        end_time = time.clock()
        if options['savetimes']:
            filename = '{0:s}/times_pr_{1:03d}_{2:03d}.pkl.gz'.format(
                options['outputfolderres'], options['nrun'],
                string.atoi(options['resolution']))
            save_gzdata(filename, end_time - start_time)

        if options['verbose'] > 4:
            print >> sys.stderr, ('The pretraining code for file ' +
                                  os.path.split(__file__)[1] +
                                  ' ran for %.2fm' %
                                  ((end_time - start_time) / 60.))

        # get the training, validation and testing function for the model
        #dataset = [folds[0][0], folds[1][0], folds[2]]
        dataset = [train_set, test_set]

        if options['verbose'] > 5:
            print >> sys.stderr, ('... getting the finetuning functions')
        train_fn, validate_model = sda.build_finetune_functions(
            datasets=dataset,
            batch_size=options['batchsize'],
            learning_rate=options['finetune_lr'])

    else:
        dataset = [train_set, test_set]

        train_fn, validate_model = sda.build_finetune_functions_reuse(
            datasets=dataset,
            batch_size=options['batchsize'],
            learning_rate=options['finetune_lr'],
            update_layerwise=options['retrain_ft_layers'])

    # ------------------------------------------------------------------------------------------------

    # -----------------------------------------------
    # FINETUNE
    # -----------------------------------------------
    if options['verbose'] > 5:
        print >> sys.stderr, ('... fine-tuning the model')
    # early-stopping parameters
    patience = 10 * n_train_batches  # look at this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                   # go through this many minibatches before
                                   # checking the network on the validation
                                   # set; in this case we check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.

    start_time = time.clock()

    done_looping = False
    epoch = 0

    while (epoch < options['training_epochs']) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_fn(minibatch_index)

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                #this_validation_loss    = numpy.mean( validate_model() )
                (y_valid, y_pred, y_pred_prob) = validate_model()


                # we are going to control the predictions according to their prob
                if options['threshold'] != None:
                    y_pred = numpy.array(
                        y_pred_prob[:, 0] < options['threshold'],
                        dtype=numpy.uint8)
                else:
                    y_pred = numpy.argmax(y_pred_prob, axis=1)

                this_validation_loss = evaluate_error(y_valid, y_pred, options)

                # if epoch % 10 == 0:
                #     cm = confusion_matrix(y_valid, y_pred, options['nclasses'])
                #     print >> sys.stderr, cm, this_validation_loss

                # print >> sys.stderr, this_validation_loss
                # this_validation_loss = numpy.mean(validation_losses)
                if epoch % 30 == 0 and options['verbose'] > 5:
                    # print >> sys.stderr, y_valid
                    # print >> sys.stderr, y_pred_prob
                    # print >> sys.stderr, y_pred
                    # print >> sys.stderr, y_valid.shape
                    # print >> sys.stderr, y_pred.shape
                    # print >> sys.stderr, y_pred_prob[1:10,:], y_pred[1:10], y_valid[1:10]
                    # print >> sys.stderr, test_set[1].eval()

                    print >> sys.stderr, (
                        'epoch %04i, minibatch %04i/%04i, validation error %03f %%'
                        % (epoch, minibatch_index + 1, n_train_batches,
                           this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    bestmodelsda = copy.copy(sda)

                    # % ------------------------------------------------------------
                    if options['oneclass'] == True:
                        options['nclasses'] = 2

                    #print >> sys.stderr, sda.params[-2].get_value().T, sda.params[-1].get_value()
                    pos = numpy.random.randint(len(y_pred), size=(10, ))
                    # print options
                    # print pos
                    # print y_pred_prob.shape
                    #print >> sys.stderr, options['threshold']
                    # print >> sys.stderr, numpy.array( y_pred_prob[:,0] < options['threshold'], dtype=numpy.uint8)
                    #print >> sys.stderr, y_pred_prob[pos,:].T
                    #print >> sys.stderr, y_pred[pos]
                    cm = confusion_matrix(y_valid, y_pred, options['nclasses'])
                    #print >> sys.stderr, ("Fine tune...epoch %04i" %  epoch)
                    #print >> sys.stderr, this_validation_loss
                    #print >> sys.stderr, cm
                    # options['nclasses'] = 1
                    # % ------------------------------------------------------------

                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                # stop once patience is exhausted (checked at every validation step)
                if patience <= iter:
                    done_looping = True
                    break

    end_time = time.clock()

    if options['savetimes']:
        filename = '{0:s}/times_fn_{1:03d}_{2:03d}.pkl.gz'.format(
            options['outputfolderres'], options['nrun'],
            string.atoi(options['resolution']))
        save_gzdata(filename, end_time - start_time)

    print >> sys.stderr, ("Stopped at epoch %04i" % epoch)
    return (best_validation_loss, bestmodelsda)
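The fine-tuning loop above implements patience-based early stopping. A standalone sketch of just that mechanism, driven by a made-up validation-loss schedule instead of a real model (all numbers are illustrative):

# standalone sketch of patience-based early stopping (fake loss schedule)
import numpy

patience = 20                  # minimum number of iterations to run
patience_increase = 2.0        # extend patience when a clearly better loss appears
improvement_threshold = 0.995  # relative improvement considered significant

best_loss = numpy.inf
losses = [0.9, 0.8, 0.7, 0.69, 0.689] + [0.6889] * 45   # illustrative validation losses

for it, loss in enumerate(losses):
    if loss < best_loss:
        if loss < best_loss * improvement_threshold:
            patience = max(patience, it * patience_increase)
        best_loss = loss
    if patience <= it:
        print('stopping at iteration %d with best loss %.4f' % (it, best_loss))
        break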
Example #3
def do_experiment(folds, options, nrun, sda_reuse_model):

    modeloptions = {}
    param = list(
        itertools.product(options['nneurons'], options['hlayers'],
                          options['pretraining_epochs'],
                          options['training_epochs'], options['pretrain_lr'],
                          options['finetune_lr'], options['batchsize'],
                          options['threshold'], options['corruptlevels']))

    print >> sys.stderr, param

    print >> sys.stderr, ('Number of combinations {0:03d}'.format(len(param)))

    step = 0
    besterror = numpy.inf
    # ---------------------------------------------------------------
    # cross validation
    # ---------------------------------------------------------------
    for k in range(0, len(param)):

        (nneurons, hlayers, pretraining_epochs, training_epochs, pretrain_lr,
         finetune_lr, batchsize, threshold, corruptlevels) = param[k]

        modeloptions = {
            'savetimes': False,
            'outputfolder': options['outputfolder'],
            'outputfolderres': options['outputfolderres'],
            'resolution': options['resolution'],
            'retrain': options['retrain'],
            'verbose': options['verbose'],
            'ndim': options['ndim'],
            'nclasses_source': options['nclasses_source'],
            'nclasses': options['nclasses'],
            'numpy_rng': options['numpy_rng'],
            'theano_rng': options['theano_rng'],
            'measure': options['measure'],
            'oneclass': options['oneclass'],
            'batchsize': batchsize,
            'hlayers': nneurons * numpy.ones((hlayers, )),
            # numpy.array(nneurons * numpy.ones((hlayers,)) * (1/(2*numpy.arange(1,hlayers+1)*1.)),dtype=numpy.int),
            'corruptlevels': corruptlevels * numpy.ones((hlayers, ), dtype=numpy.float32),
            'pretraining_epochs': pretraining_epochs,
            'training_epochs': training_epochs,
            'pretrain_lr': pretrain_lr,
            'finetune_lr': finetune_lr,
            'threshold': threshold,
            'sda_reuse_model': sda_reuse_model,
            'retrain_ft_layers': options['retrain_ft_layers'],
            'weight': options['weight'],
        }
        if k == 0:
            bestmodeloptions = copy.copy(modeloptions)

        if modeloptions['verbose'] > 2:
            print >> sys.stderr, "######################################################"
            print >> sys.stderr, "                     CROSS-VAL                        "
            print >> sys.stderr, "######################################################"
            print >> sys.stderr, modeloptions

        merror = 0
        merrori = 0
        for cv in range(0, options['folds']):
            counter = step / (len(param) * options['folds'] * 1.)
            print >> sys.stderr, ('###### {t:0{format}.1f}% ({e:0.2f})'.format(
                format=5, t=counter * 100, e=besterror))
            trainset = folds[0]
            valset = folds[1]
            testset = folds[2]

            # print >> sys.stderr, sda_reuse_model
            (sda, pretraining_fns) = build_model(trainset[cv], modeloptions)
            # print >> sys.stderr, sda
            sda = pretrain_finetune_model(sda, pretraining_fns, trainset[cv],
                                          valset[cv], modeloptions)[1]
            merrori = evaluate_model(sda, testset[cv], modeloptions)[0]
            merror = merror + merrori
            # print >> sys.stderr, sda, sda_reuse_model

            step = step + 1
        merror = merror / options['folds']

        if merror < besterror:
            besterror = merror
            bestmodeloptions = copy.copy(modeloptions)

    # print >> sys.stderr, "------------------------------"
    # -------------------------------------------------------------------
    # end of cross validation

    if modeloptions['verbose'] > 0:
        print >> sys.stderr, "######################################################"
        print >> sys.stderr, "                     TRAIN/TEST                       "
        print >> sys.stderr, "######################################################"
    print >> sys.stderr, (bestmodeloptions)
    trainset = folds[3]
    valset = folds[4]
    testset = folds[5]

    bestmodeloptions['savetimes'] = True
    bestmodeloptions['nrun'] = nrun

    # print >> sys.stderr, sda_reuse_model
    start_time = time.clock()
    (sda, pretraining_fns) = build_model(trainset, bestmodeloptions)
    end_time = time.clock()

    pretrain_time = end_time - start_time

    start_time = time.clock()
    sda = pretrain_finetune_model(sda, pretraining_fns, trainset, valset,
                                  bestmodeloptions)[1]
    end_time = time.clock()
    finetune_time = end_time - start_time

    result = evaluate_model(sda, testset, bestmodeloptions)
    # print >> sys.stderr, sda, sda_reuse_model
    print >> sys.stderr, "time pretrain: {0:f} | time fine-tune: {1:f}".format(
        pretrain_time, finetune_time)
    result = result + (pretrain_time, finetune_time)

    filename = '{0:s}/{1:05d}_{2:03d}_model.pkl.gz'.format(
        options['outputfolder'], nrun, string.atoi(options['resolution']))
    save_gzdata(filename, sda)

    filename = '{0:s}/{1:05d}_{2:03d}_options.pkl.gz'.format(
        options['outputfolder'], nrun, string.atoi(options['resolution']))
    save_gzdata(filename, bestmodeloptions)

    return result
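The cross-validation above is a plain grid search over itertools.product of the hyperparameter lists. A self-contained sketch of that pattern with a toy error function standing in for the cross-validated model error (grid values and toy_error are illustrative only):

# sketch of the itertools.product grid search driving the cross-validation
import itertools

grid = {
    'nneurons': [50, 100],
    'hlayers': [1, 2],
    'finetune_lr': [0.1, 0.01],
}
param = list(itertools.product(grid['nneurons'], grid['hlayers'], grid['finetune_lr']))

def toy_error(nneurons, hlayers, lr):
    # stand-in for the averaged cross-validation error of a trained model
    return abs(lr - 0.01) + abs(nneurons - 100) / 100.0 + abs(hlayers - 2)

besterror = float('inf')
bestparams = None
for (nneurons, hlayers, lr) in param:
    err = toy_error(nneurons, hlayers, lr)
    if err < besterror:
        besterror = err
        bestparams = (nneurons, hlayers, lr)

print('best parameters: %s with error %.3f' % (str(bestparams), besterror))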
Example #4
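                # excerpt: inside the per-point loop; class 0 (nanoparticle) detections are circled on the image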
                if ypred[i] == 0:
                    # red, SdA
                    cv2.circle(img, (int(pti[0]), int(pti[1])), 30,
                               (0, 0, 255), 5)

            filename = "imgs_debug/{0:s}_r={1:d}_th={2:d}_{3:03d}_cv={4:d}.jpg".format(
                imgspath[ids[count]], rd, th, nrunImg, cv)
            print >> sys.stderr, "Saving image..:" + filename
            cv2.imwrite(filename, img)

            filename = "imgs_debug/{0:s}_r={1:d}_th={2:d}_{3:03d}_cv={4:d}_LoG.pkl.gz".format(
                imgspath[ids[count]], rd, th, nrunImg, cv)
            print >> sys.stderr, "(LoG) Precision: {0:05f} | Recall: {1:05f} ".format(
                Precision_LoG_, Recall_LoG_)
            save_gzdata(filename, [Precision_LoG_, Recall_LoG_])

            filename = "imgs_debug/{0:s}_r={1:d}_th={2:d}_{3:03d}_cv={4:d}.pkl.gz".format(
                imgspath[ids[count]], rd, th, nrunImg, cv)
            print >> sys.stderr, "(SdA) Precision: {0:05f} | Recall: {1:05f} ".format(
                Precision_, Recall_)
            save_gzdata(filename, [Precision_, Recall_])

            print >> sys.stderr, (
                "Ann: {0:05d} | Nano (SdA): {1:05d}| Back (SdA): {2:05d}| LoG: {3:05d} "
            ).format(nmbrAnn, sum(numpy.array(ypred) == 0),
                     sum(numpy.array(ypred) == 1), nelem_x)
            print >> sys.stderr, "-------------------------"

    # average over all files
    Precision = Precision / (nfiles / 2)
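The snippet overlays detections with cv2.circle and writes the annotated image to disk. A minimal OpenCV sketch of that drawing step, assuming opencv-python and numpy are installed (image size, points, and the output filename are made up):

# minimal OpenCV sketch of the circle-overlay step (toy image and points)
import numpy
import cv2

img = numpy.zeros((200, 200, 3), dtype=numpy.uint8)   # blank BGR image
detections = [(50, 60), (120, 140)]                    # made-up detection centres
for (x, y) in detections:
    cv2.circle(img, (int(x), int(y)), 30, (0, 0, 255), 5)   # red circle, thickness 5
cv2.imwrite('circles_demo.jpg', img)                   # hypothetical output file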
Example #5
def main(resolution, method, pathRes):
    # load results from LoG

    imgpathsae = '../../imgs_nanoparticles/{0:03d}/db2/resultado_sae/'.format(
        string.atoi(resolution))

    if method == 'baseline':
        basepath = './{0:s}/{1:05d}/models/res_baseline_resized_{1:05d}_111111/'.format(
            pathRes, string.atoi(resolution))
    elif method == 'tl':
        basepath = './{0:s}/{1:05d}/models/res_tl_resized_50000_{1:05d}_111111/'.format(
            pathRes, string.atoi(resolution))

    # annotations
    annbasepath = '../../imgs_nanoparticles/{0:03d}/db2/annotation/user/'.format(
        string.atoi(resolution))
    annfiles = [
        f for f in os.listdir(annbasepath) if re.match(r'[\w\W]*csv', f)
    ]
    annfiles = sorted(annfiles)

    # imgs base paths
    imgsbasepath = '../../imgs_nanoparticles/{0:03d}/db2/'.format(
        string.atoi(resolution))
    imgspath = os.listdir(imgsbasepath)
    imgspath = sorted(imgspath)

    # ------------------------------------------------------------------------------------------------
    # TEST DATA

    PrecisionAll = []
    RecallAll = []

    PrecisionLoGAll = []
    RecallLoGAll = []

    nDetectionsAll = []

    for nrun in range(1, 21):  #
        print >> sys.stderr, "\n**************************\n"
        print >> sys.stderr, "NRUN {0:05d}/20 ".format(nrun)

        filename = '{0:s}/{1:05d}_{2:03d}_model.pkl.gz'.format(
            basepath, nrun, string.atoi(resolution))
        print >> sys.stderr, "Loading " + filename
        model = load_savedgzdata(filename)

        # get ids
        pathids = '{0:s}/{1:05d}_{2:05d}_test_ids.pkl.gz'.format(
            basepath, nrun, string.atoi(resolution))
        print >> sys.stderr, 'Loading ' + pathids + '...'
        ids = load_savedgzdata(pathids)
        print >> sys.stderr, ids

        reg = 'detectedNanoParticlesDetectionResult_log_detector_test_{0:03d}_'.format(
            nrun)
        files = [f for f in os.listdir(imgpathsae) if re.match(reg, f)]
        # order data
        files = sorted(files)

        nfiles = len(files)

        (Precision, Recall, PrecisionLoG, RecallLoG,
         nDetections) = getPrecisionRecall(nfiles,
                                           files,
                                           ids,
                                           imgpathsae,
                                           imgsbasepath,
                                           imgspath,
                                           annbasepath,
                                           annfiles,
                                           model, (0, 0, nrun, 0),
                                           printImg=True)

        print >> sys.stderr, "Precision LoG: {0:05f} | Recall LoG: {1:05f}".format(
            PrecisionLoG, RecallLoG)
        print >> sys.stderr, "Precision SdA: {0:05f} | Recall SdA: {1:05f}".format(
            Precision, Recall)

        PrecisionAll.append(Precision)
        RecallAll.append(Recall)

        PrecisionLoGAll.append(PrecisionLoG)
        RecallLoGAll.append(RecallLoG)

        nDetectionsAll.append(nDetections)

    # ---------------------------------------------------------
    PrecisionAll = numpy.array(PrecisionAll)
    RecallAll = numpy.array(RecallAll)

    PrecisionLoGAll = numpy.array(PrecisionLoGAll)
    RecallLoGAll = numpy.array(RecallLoGAll)

    nDetectionsAll = numpy.array(nDetectionsAll)

    print "--------------------------------------------\n"
    print "Precision LoG: {0:03f} ({1:03f}) | Recall LoG: {2:03f} ({3:03f})".format(
        numpy.mean(PrecisionLoGAll), numpy.std(PrecisionLoGAll),
        numpy.mean(RecallLoGAll), numpy.std(RecallLoGAll))
    print "Precision SdA: {0:03f} ({1:03f}) | Recall SdA: {2:03f} ({3:03f})".format(
        numpy.mean(PrecisionAll), numpy.std(PrecisionAll),
        numpy.mean(RecallAll), numpy.std(RecallAll))
    print "number detections: {0:03f} ({1:03f})".format(
        numpy.mean(nDetectionsAll), numpy.std(nDetectionsAll))

    PrecisionRecall = numpy.c_[PrecisionAll, RecallAll]
    filename = 'results/sae_{0:s}_{1:s}_test_all.pkl.gz'.format(
        method, resolution)
    save_gzdata(filename, PrecisionRecall)

    PrecisionRecallLoG = numpy.c_[PrecisionLoGAll, RecallLoGAll]
    filename = 'results/log_{0:s}_{1:s}_test_all.pkl.gz'.format(
        method, resolution)
    save_gzdata(filename, PrecisionRecallLoG)

    PrecisionRecall = numpy.r_[numpy.mean(PrecisionAll), numpy.mean(RecallAll)]
    filename = 'results/sae_{0:s}_{1:s}_test.pkl.gz'.format(method, resolution)
    save_gzdata(filename, PrecisionRecall)

    PrecisionRecallLoG = numpy.r_[numpy.mean(PrecisionLoGAll),
                                  numpy.mean(RecallLoGAll)]
    filename = 'results/log_{0:s}_{1:s}_test.pkl.gz'.format(method, resolution)
    save_gzdata(filename, PrecisionRecallLoG)

    filename = 'results/ndetections_{0:s}_{1:s}_test.pkl.gz'.format(
        method, resolution)
    save_gzdata(filename, nDetectionsAll)
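The per-run precision/recall values above are stacked with numpy.c_ / numpy.r_ and summarized with mean and standard deviation. A small sketch of that aggregation with illustrative numbers:

# sketch of the per-run precision/recall aggregation (illustrative numbers)
import numpy

PrecisionAll = numpy.array([0.91, 0.88, 0.93])
RecallAll = numpy.array([0.84, 0.86, 0.82])

print("Precision: {0:03f} ({1:03f}) | Recall: {2:03f} ({3:03f})".format(
    numpy.mean(PrecisionAll), numpy.std(PrecisionAll),
    numpy.mean(RecallAll), numpy.std(RecallAll)))

per_run = numpy.c_[PrecisionAll, RecallAll]                          # one (precision, recall) row per run
summary = numpy.r_[numpy.mean(PrecisionAll), numpy.mean(RecallAll)]  # overall means, ready to pickle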
Example #6
def do_experiment( folds, options, nrun, sda_reuse_model ):

    modeloptions = {}
    param = list(itertools.product(
        options['nneurons'],
        options['hlayers'],
        options['pretraining_epochs'],
        options['training_epochs'],
        options['pretrain_lr'],
        options['finetune_lr'],
        options['batchsize'],
        options['threshold'],
        options['corruptlevels']
    )
    )

    print >> sys.stderr, param
    
    print >> sys.stderr, ('Number of combinations {0:03d}'.format(len(param)))

    step = 0
    besterror = numpy.inf
    # ---------------------------------------------------------------
    # cross validation
    # ---------------------------------------------------------------
    for k in range(0,len(param)):
        
        (nneurons,
         hlayers,
         pretraining_epochs,
         training_epochs,
         pretrain_lr,
         finetune_lr,
         batchsize,
         threshold,
         corruptlevels) = param[k]

        modeloptions = {
            'savetimes'          : False,
            'outputfolder'       : options['outputfolder'],
            'outputfolderres'    : options['outputfolderres'],
            'resolution'         : options['resolution'],
            'retrain'            : options['retrain'],
            'verbose'            : options['verbose'],
            'ndim'               : options['ndim'],
            'nclasses_source'    : options['nclasses_source'],
            'nclasses'           : options['nclasses'],
            'numpy_rng'          : options['numpy_rng'],
            'theano_rng'         : options['theano_rng'],
            'measure'            : options['measure'],
            'oneclass'           : options['oneclass'],
            'batchsize'          : batchsize,
            'hlayers'            : nneurons * numpy.ones((hlayers,)),
            # numpy.array(nneurons * numpy.ones((hlayers,)) * (1/(2*numpy.arange(1,hlayers+1)*1.)),dtype=numpy.int),
            'corruptlevels'      : corruptlevels*numpy.ones((hlayers,),dtype=numpy.float32),
            'pretraining_epochs' : pretraining_epochs,
            'training_epochs'    : training_epochs,
            'pretrain_lr'        : pretrain_lr,
            'finetune_lr'        : finetune_lr,
            'threshold'          : threshold,
            'sda_reuse_model'    : sda_reuse_model,
            'retrain_ft_layers'  : options['retrain_ft_layers'],
            'weight'             : options['weight'],
        }
        if k == 0:
            bestmodeloptions = copy.copy( modeloptions )

        if modeloptions['verbose'] > 2:
            print >> sys.stderr, "######################################################"
            print >> sys.stderr, "                     CROSS-VAL                        "
            print >> sys.stderr, "######################################################"
            print >> sys.stderr, modeloptions

        merror  = 0
        merrori = 0
        for cv in range(0,options['folds']):
            counter = step/(len(param)*options['folds']*1.)
            print >> sys.stderr, ('###### {t:0{format}.1f}% ({e:0.2f})'.format(format=5,t=counter*100,e=besterror) )
            trainset = folds[0]
            valset   = folds[1]
            testset  = folds[2]

            # print >> sys.stderr, sda_reuse_model
            (sda,pretraining_fns) = build_model(trainset[cv],modeloptions)
            # print >> sys.stderr, sda
            sda  = pretrain_finetune_model(sda,pretraining_fns,
                                           trainset[cv],
                                           valset[cv],
                                           modeloptions)[1]
            merrori = evaluate_model(sda,testset[cv],modeloptions)[0]
            merror  = merror + merrori
            # print >> sys.stderr, sda, sda_reuse_model
            
            step = step + 1
        merror = merror / options['folds']

        if merror < besterror:
            besterror        = merror
            bestmodeloptions = copy.copy( modeloptions )

    # print >> sys.stderr, "------------------------------"
    # -------------------------------------------------------------------
    # end of cross validation
    
    if modeloptions['verbose'] > 0:
        print >> sys.stderr, "######################################################"
        print >> sys.stderr, "                     TRAIN/TEST                       "
        print >> sys.stderr, "######################################################"
    print >> sys.stderr, (bestmodeloptions)
    trainset = folds[3]
    valset   = folds[4]
    testset  = folds[5]
    
    bestmodeloptions['savetimes'] = True 
    bestmodeloptions['nrun']      = nrun

    # print >> sys.stderr, sda_reuse_model
    start_time = time.clock()
    (sda,pretraining_fns) = build_model(trainset, bestmodeloptions)
    end_time = time.clock()
    
    pretrain_time = end_time - start_time

    start_time = time.clock()
    sda = pretrain_finetune_model(sda, pretraining_fns,
                                  trainset,
                                  valset,
                                  bestmodeloptions)[1]
    end_time = time.clock()
    finetune_time = end_time - start_time
    
    result = evaluate_model( sda, testset, bestmodeloptions )
    # print >> sys.stderr, sda, sda_reuse_model
    print >> sys.stderr, "time pretrain: {0:f} | time fine-tune: {1:f}".format(pretrain_time, finetune_time)
    result = result + ( pretrain_time, finetune_time )

    filename = '{0:s}/{1:05d}_{2:03d}_model.pkl.gz'.format(options['outputfolder'],nrun,string.atoi(options['resolution']))
    save_gzdata(filename, sda)

    filename = '{0:s}/{1:05d}_{2:03d}_options.pkl.gz'.format(options['outputfolder'],nrun,string.atoi(options['resolution']))
    save_gzdata(filename, bestmodeloptions)
    
    return result
Example #7
def pretrain_finetune_model(sda,pretraining_fns,train_set,test_set,options):
    train_set_x, train_set_y = train_set

    n_train_batches  = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= options['batchsize']

    if options['retrain'] == 0:    
    
        bestmodelsda = copy.copy( sda )
        # -----------------------------------------------
        # PRETRAINING
        # -----------------------------------------------  
        if options['verbose'] > 5:
            print >> sys.stderr, ('... pre-training the model')
        start_time = time.clock()
        ## Pre-train layer-wise
        corruption_levels = options['corruptlevels']
        for i in xrange(sda.n_layers):
            # go through pretraining epochs
            for epoch in xrange(options['pretraining_epochs']):
                # go through the training set
                c = []
                for batch_index in xrange(n_train_batches):
                    c.append(pretraining_fns[i](index=batch_index,
                                                corruption=corruption_levels[i],
                                                lr=options['pretrain_lr']))

                if epoch % 100 == 0 and options['verbose'] > 5:
                    print >> sys.stderr, ('Pre-training layer %02i, epoch %04d, cost ' % (i, epoch)),
                    print >> sys.stderr, (numpy.mean(c))
        end_time = time.clock()
        if options['savetimes']:
            filename = '{0:s}/times_pr_{1:03d}_{2:03d}.pkl.gz'.format(options['outputfolderres'],options['nrun'],string.atoi(options['resolution']))
            save_gzdata(filename, end_time - start_time)
        
        if options['verbose'] > 4:
            print  >> sys.stderr, ('The pretraining code for file ' +
                                   os.path.split(__file__)[1] +
                                   ' ran for %.2fm' % ((end_time - start_time) / 60.))

        # get the training, validation and testing function for the model
        #dataset = [folds[0][0], folds[1][0], folds[2]]
        dataset = [train_set, test_set]
    
        if options['verbose'] > 5:
            print >> sys.stderr,('... getting the finetuning functions')
        train_fn, validate_model = sda.build_finetune_functions(
            datasets=dataset,
            batch_size=options['batchsize'],
            learning_rate=options['finetune_lr']
        )

    else:
        dataset = [train_set, test_set]

        train_fn, validate_model = sda.build_finetune_functions_reuse(
            datasets=dataset, batch_size=options['batchsize'],
            learning_rate=options['finetune_lr'], update_layerwise=options['retrain_ft_layers'])
        
    # ------------------------------------------------------------------------------------------------
        
    # -----------------------------------------------
    # FINETUNE
    # -----------------------------------------------  
    if options['verbose'] > 5:
        print >> sys.stderr, ('... fine-tuning the model')
    # early-stopping parameters
    patience = 10 * n_train_batches  # look at this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                   # go through this many minibatches before
                                   # checking the network on the validation
                                   # set; in this case we check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.

    start_time = time.clock()

    done_looping = False
    epoch = 0

    while (epoch < options['training_epochs']) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_fn(minibatch_index)

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                #this_validation_loss    = numpy.mean( validate_model() )
                (y_valid, y_pred, y_pred_prob)  = validate_model()


                # we are going to control the predictions according to their prob
                if options['threshold'] != None:
                    y_pred = numpy.array( y_pred_prob[:,0] < options['threshold'], dtype=numpy.uint8)
                else:
                    y_pred = numpy.argmax( y_pred_prob, axis = 1 )

                this_validation_loss = evaluate_error( y_valid, y_pred, options )

                # if epoch % 10 == 0:
                #     cm = confusion_matrix(y_valid, y_pred, options['nclasses'])
                #     print >> sys.stderr, cm, this_validation_loss
                    
                # print >> sys.stderr, this_validation_loss
                # this_validation_loss = numpy.mean(validation_losses)
                if  epoch % 30 == 0 and options['verbose'] > 5:
                    # print >> sys.stderr, y_valid
                    # print >> sys.stderr, y_pred_prob
                    # print >> sys.stderr, y_pred
                    # print >> sys.stderr, y_valid.shape
                    # print >> sys.stderr, y_pred.shape
                    # print >> sys.stderr, y_pred_prob[1:10,:], y_pred[1:10], y_valid[1:10]
                    # print >> sys.stderr, test_set[1].eval()
                    
                    print >> sys.stderr,('epoch %04i, minibatch %04i/%04i, validation error %03f %%' %
                                         (epoch, minibatch_index + 1, n_train_batches,
                                          this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    bestmodelsda = copy.copy(sda)

                    # % ------------------------------------------------------------
                    if options['oneclass'] == True:
                        options['nclasses'] = 2

                    #print >> sys.stderr, sda.params[-2].get_value().T, sda.params[-1].get_value()
                    pos = numpy.random.randint(len(y_pred),size=(10,))
                    # print options
                    # print pos
                    # print y_pred_prob.shape
                    #print >> sys.stderr, options['threshold']
                    # print >> sys.stderr, numpy.array( y_pred_prob[:,0] < options['threshold'], dtype=numpy.uint8)
                    #print >> sys.stderr, y_pred_prob[pos,:].T
                    #print >> sys.stderr, y_pred[pos]
                    cm = confusion_matrix(y_valid, y_pred, options['nclasses'])
                    #print >> sys.stderr, ("Fine tune...epoch %04i" %  epoch)
                    #print >> sys.stderr, this_validation_loss
                    #print >> sys.stderr, cm
                    # options['nclasses'] = 1
                    # % ------------------------------------------------------------

                    
                    # improve patience if loss improvement is good enough
                    if (
                            this_validation_loss < best_validation_loss *
                            improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                # stop once patience is exhausted (checked at every validation step)
                if patience <= iter:
                    done_looping = True
                    break

    end_time = time.clock()

    if options['savetimes']:
        filename = '{0:s}/times_fn_{1:03d}_{2:03d}.pkl.gz'.format(options['outputfolderres'],options['nrun'],string.atoi(options['resolution']))
        save_gzdata(filename, end_time - start_time)

    print >> sys.stderr, ("Stopped at epoch %04i" % epoch )
    return (best_validation_loss,bestmodelsda)
Example #8
def main(resolution,method,pathRes):
    # load results from LoG
    
    imgpathsae  = '../../imgs_nanoparticles/{0:03d}/db2/resultado_sae/'.format(string.atoi(resolution))

    if method == 'baseline':
        basepath = './{0:s}/{1:05d}/models/res_baseline_resized_{1:05d}_111111/'.format(pathRes,string.atoi(resolution))
    elif method == 'tl':
        basepath = './{0:s}/{1:05d}/models/res_tl_resized_50000_{1:05d}_111111/'.format(pathRes,string.atoi(resolution))
    
    # annotations
    annbasepath = '../../imgs_nanoparticles/{0:03d}/db2/annotation/user/'.format(string.atoi(resolution))
    annfiles = [f for f in os.listdir(annbasepath) if re.match(r'[\w\W]*csv', f)]
    annfiles = sorted( annfiles )

    # imgs base paths
    imgsbasepath = '../../imgs_nanoparticles/{0:03d}/db2/'.format(string.atoi(resolution))
    imgspath = os.listdir(imgsbasepath)
    imgspath = sorted( imgspath )

    # ------------------------------------------------------------------------------------------------
    # TEST DATA

    PrecisionAll = []
    RecallAll    = []

    PrecisionLoGAll = []
    RecallLoGAll    = []

    nDetectionsAll  = []
    
    for nrun in range(1,21): #
        print >> sys.stderr, "\n**************************\n"
        print >> sys.stderr, "NRUN {0:05d}/20 ".format(nrun)
        
        filename = '{0:s}/{1:05d}_{2:03d}_model.pkl.gz'.format(basepath,nrun,string.atoi(resolution))
        print >> sys.stderr, "Loading " + filename
        model    = load_savedgzdata(filename)

        # get ids
        pathids = '{0:s}/{1:05d}_{2:05d}_test_ids.pkl.gz'.format(basepath,nrun,string.atoi(resolution))
        print >> sys.stderr, 'Loading ' + pathids + '...'
        ids = load_savedgzdata(pathids)
        print >> sys.stderr, ids
        
        reg = 'detectedNanoParticlesDetectionResult_log_detector_test_{0:03d}_'.format(nrun)
        files = [f for f in os.listdir(imgpathsae) if re.match(reg, f)]
        # order data
        files = sorted( files )
        
        nfiles = len(files)

        (Precision, Recall, PrecisionLoG,RecallLoG,nDetections) = getPrecisionRecall(nfiles,files,ids,imgpathsae,imgsbasepath,imgspath,annbasepath,annfiles,model,(0,0,nrun,0),printImg=True)
        
        print >> sys.stderr, "Precision LoG: {0:05f} | Recall LoG: {1:05f}".format(PrecisionLoG, RecallLoG)
        print >> sys.stderr, "Precision SdA: {0:05f} | Recall SdA: {1:05f}".format(Precision, Recall)
        
        PrecisionAll.append( Precision )
        RecallAll.append( Recall )

        PrecisionLoGAll.append( PrecisionLoG )
        RecallLoGAll.append( RecallLoG )

        nDetectionsAll.append( nDetections )
        
    # ---------------------------------------------------------
    PrecisionAll = numpy.array( PrecisionAll ) 
    RecallAll    = numpy.array( RecallAll )

    PrecisionLoGAll = numpy.array( PrecisionLoGAll )
    RecallLoGAll    = numpy.array( RecallLoGAll )

    nDetectionsAll  = numpy.array( nDetectionsAll )
    
    print "--------------------------------------------\n"
    print "Precision LoG: {0:03f} ({1:03f}) | Recall LoG: {2:03f} ({3:03f})".format(numpy.mean(PrecisionLoGAll),numpy.std(PrecisionLoGAll),numpy.mean(RecallLoGAll),numpy.std(RecallLoGAll))
    print "Precision SdA: {0:03f} ({1:03f}) | Recall SdA: {2:03f} ({3:03f})".format(numpy.mean(PrecisionAll),numpy.std(PrecisionAll),numpy.mean(RecallAll),numpy.std(RecallAll))
    print "number detections: {0:03f} ({1:03f})".format(numpy.mean(nDetectionsAll),numpy.std(nDetectionsAll))
    
    PrecisionRecall = numpy.c_[PrecisionAll,RecallAll]
    filename = 'results/sae_{0:s}_{1:s}_test_all.pkl.gz'.format(method,resolution)
    save_gzdata(filename, PrecisionRecall )

    PrecisionRecallLoG = numpy.c_[PrecisionLoGAll,RecallLoGAll]
    filename = 'results/log_{0:s}_{1:s}_test_all.pkl.gz'.format(method,resolution)
    save_gzdata(filename, PrecisionRecallLoG )

    PrecisionRecall = numpy.r_[numpy.mean(PrecisionAll),numpy.mean(RecallAll)]
    filename = 'results/sae_{0:s}_{1:s}_test.pkl.gz'.format(method,resolution)
    save_gzdata(filename, PrecisionRecall )

    PrecisionRecallLoG = numpy.r_[numpy.mean(PrecisionLoGAll),numpy.mean(RecallLoGAll)]
    filename = 'results/log_{0:s}_{1:s}_test.pkl.gz'.format(method,resolution)
    save_gzdata(filename, PrecisionRecallLoG )

    filename = 'results/ndetections_{0:s}_{1:s}_test.pkl.gz'.format(method,resolution)
    save_gzdata(filename, nDetectionsAll )
Example #9
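                # excerpt: inside the per-point loop; LoG detections are drawn in blue, SdA detections in red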
                if ypredlog[i] == 0:
                    # blue, log
                    cv2.circle(img,(int(pti[0]),int(pti[1])),20,(255,0,0),5)

                if ypred[i] == 0:
                    # red, SdA
                    cv2.circle(img,(int(pti[0]),int(pti[1])),30,(0,0,255),5)

            filename = "imgs_debug/{0:s}_r={1:d}_th={2:d}_{3:03d}_cv={4:d}.jpg".format(imgspath[ids[count]],rd,th,nrunImg,cv)
            print >> sys.stderr, "Saving image..:" + filename
            cv2.imwrite(filename,img)

            filename = "imgs_debug/{0:s}_r={1:d}_th={2:d}_{3:03d}_cv={4:d}_LoG.pkl.gz".format(imgspath[ids[count]],rd,th,nrunImg,cv)
            print >> sys.stderr, "(LoG) Precision: {0:05f} | Recall: {1:05f} ".format(Precision_LoG_, Recall_LoG_)
            save_gzdata(filename,[Precision_LoG_,Recall_LoG_])

            filename = "imgs_debug/{0:s}_r={1:d}_th={2:d}_{3:03d}_cv={4:d}.pkl.gz".format(imgspath[ids[count]],rd,th,nrunImg,cv)
            print >> sys.stderr, "(SdA) Precision: {0:05f} | Recall: {1:05f} ".format(Precision_, Recall_)
            save_gzdata(filename,[Precision_,Recall_])

            print >> sys.stderr, ("Ann: {0:05d} | Nano (SdA): {1:05d}| Back (SdA): {2:05d}| LoG: {3:05d} ").format(nmbrAnn, sum(numpy.array(ypred)==0), sum(numpy.array(ypred)==1), nelem_x)
            print >> sys.stderr, "-------------------------"

            
    # average over all files
    Precision = Precision / (nfiles/2)
    Recall    = Recall / (nfiles/2)

    Precision_LoG = Precision_LoG / (nfiles/2)
    Recall_LoG    = Recall_LoG / (nfiles/2)
Example #10
def gen_folds(dataset, options, nrun):
    nids = len(set(dataset[0, :]))
    ids = options['numpy_rng'].permutation(nids)

    # train / test ids
    trainsizeElem = int(round(options['trainsize'] * nids))
    train_ids = ids[0:trainsizeElem]
    test_ids = ids[trainsizeElem:nids]  # remaining ids form the held-out test split

    print >> sys.stderr, test_ids

    if options['verbose'] > 2:
        print >> sys.stderr, "Train IDS"
        print >> sys.stderr, train_ids
        print >> sys.stderr, "Test IDS"
        print >> sys.stderr, test_ids

    # val ids
    val_ids = numpy.copy(train_ids)
    nitems = len(val_ids) / options['folds']

    val_ids.resize((options['folds'], nitems))

    folds = range(0, options['folds'])

    trainval = []
    valval = []
    testval = []
    for k in folds:
        others = list(set([k]).symmetric_difference(set(folds)))
        # the other two folds serve as the validation and test splits (assumes options['folds'] == 3)

        train = val_ids[k].flatten()
        val = val_ids[others[0]]
        test = val_ids[others[1]]

        xtrain, ytrain, minv, maxv = get_data(dataset, train, options)
        xval, yval = get_data(dataset,
                              val,
                              options,
                              isFirst=False,
                              minvalue=minv,
                              maxvalue=maxv)[0:2]
        xtest, ytest = get_data(dataset,
                                test,
                                options,
                                isFirst=False,
                                minvalue=minv,
                                maxvalue=maxv)[0:2]

        trainval.append((xtrain, ytrain))
        valval.append((xval, yval))
        testval.append((xtest, ytest))

        if options['verbose'] > 0:
            print 'Train set with size %d for fold %d' % (ytrain.shape.eval(),
                                                          k)
            print 'Test  set with size %d for fold %d' % (ytest.shape.eval(),
                                                          k)
            if options['verbose'] > 5:
                for cls in range(0, 2):
                    print >> sys.stderr, "\tNumber of training elements for cls {0:02d} is {1:05d}".format(
                        cls, sum(ytrain.eval() == cls))
                    print >> sys.stderr, "\tNumber of testing elements for cls {0:02d} is {1:05d}".format(
                        cls, sum(ytest.eval() == cls))

    # final ids
    final_ids = numpy.copy(train_ids)
    nitems = len(final_ids) / 2
    final_ids.resize((2, nitems))
    trainfinal_ids = final_ids[0]
    valfinal_ids = final_ids[1]

    xtrain, ytrain, minv, maxv = get_data(dataset, trainfinal_ids, options)
    xval, yval = get_data(dataset,
                          valfinal_ids,
                          options,
                          isFirst=False,
                          minvalue=minv,
                          maxvalue=maxv)[0:2]
    xtest, ytest = get_data(dataset,
                            test_ids,
                            options,
                            isFirst=True,
                            minvalue=minv,
                            maxvalue=maxv)[0:2]

    trainFinal = (xtrain, ytrain)
    valFinal = (xval, yval)
    testFinal = (xtest, ytest)

    print >> sys.stderr, test_ids

    if options['verbose'] > 0:
        print 'Train set with size %d ' % (ytrain.shape.eval())
        print 'Val  set with size %d ' % (yval.shape.eval())
        print 'Test  set with size %d ' % (ytest.shape.eval())
        if options['verbose'] > 5:
            for cls in range(0, 2):
                print >> sys.stderr, "\tNumber of training elements for cls {0:02d} is {1:05d}".format(
                    cls, sum(ytrain.eval() == cls))
                print >> sys.stderr, "\tNumber of validation elements for cls {0:02d} is {1:05d}".format(
                    cls, sum(yval.eval() == cls))
                print >> sys.stderr, "\tNumber of testing elements for cls {0:02d} is {1:05d}".format(
                    cls, sum(ytest.eval() == cls))

    basefilename = '{0:s}/{1:05d}_{2:03d}_'.format(
        options['outputfolder'], nrun, string.atoi(options['resolution']))

    trainfilename = basefilename + 'train_ids.pkl.gz'
    valfilename = basefilename + 'val_ids.pkl.gz'
    trainfinalfilename = basefilename + 'trainfinal_ids.pkl.gz'
    valfinalfilename = basefilename + 'valfinal_ids.pkl.gz'
    testfilename = basefilename + 'test_ids.pkl.gz'

    save_gzdata(trainfilename, train_ids)
    save_gzdata(valfilename, val_ids)
    save_gzdata(trainfinalfilename, trainfinal_ids)
    save_gzdata(valfinalfilename, valfinal_ids)
    save_gzdata(testfilename, test_ids)

    if options['verbose'] > 0:
        print 'Train set with size %d' % (trainFinal[1].shape.eval())
        print 'Val   set with size %d' % (valFinal[1].shape.eval())
        print 'Test  set with size %d' % (testFinal[1].shape.eval())

    rval = [trainval, valval, testval, trainFinal, valFinal, testFinal]

    return rval