def CrossValidateSMOTE(data, labels, clf, folds=10, runSMOTE=True):
    """Cross-validate *clf* with stratified folds, optionally oversampling
    each training fold with borderline-1 SMOTE.

    The code treats ``sum(labels)`` as the number of positive ("bug")
    samples and accumulates a 2x2 confusion matrix, so labels are
    presumably binary 0/1 -- confirm against the callers.

    :param data: feature matrix; anything other than an ndarray is converted
        with ``np.asarray``.
    :param labels: class labels; anything other than an ndarray is converted
        with ``np.asarray`` and flattened.
    :param clf: estimator exposing ``fit(X, y)`` and ``predict(X)``.
    :param folds: number of stratified folds.
    :param runSMOTE: when true, oversample every training fold before fitting.
    :returns: ``(acc, fmeasure, recall, precision, cm)`` -- four lists with
        one score per fold, plus the confusion matrix summed over all folds.
    :raises ValueError: if SMOTE is requested and a training fold contains no
        positive samples (the oversampling ratio would be undefined).
    """
    from unbalanced_dataset import SMOTE
    from sklearn.metrics import confusion_matrix as confmat
    from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

    if not isinstance(data, np.ndarray):
        data = np.asarray(data)

    if not isinstance(labels, np.ndarray):
        labels = np.asarray(labels).ravel()

    skf = StratifiedKFold(labels, n_folds=folds, shuffle=False)

    acc = []
    fmeasure = []
    recall = []
    precision = []
    cm = np.zeros((2, 2), dtype=int)

    for train_idx, test_idx in skf:
        data_train = data[train_idx]
        labels_train = labels[train_idx]
        data_test = data[test_idx]
        labels_test = labels[test_idx]

        if runSMOTE:
            # Majority/minority ratio of this fold drives the amount of
            # synthetic minority samples requested from SMOTE.
            bugs = np.sum(labels_train)
            if bugs == 0:
                raise ValueError(
                    'cannot run SMOTE: training fold has no positive samples')
            ratio = float(len(labels_train) - bugs) / bugs
            smote = SMOTE(ratio=ratio, verbose=False, kind='borderline1')
            data_train, labels_train = smote.fit_transform(data_train,
                                                           labels_train)

        clf.fit(data_train, labels_train)
        hypot = clf.predict(data_test)

        # sklearn metrics take (y_true, y_pred). Passing them the other way
        # round silently swaps precision and recall.
        acc.append(accuracy_score(labels_test, hypot))
        fmeasure.append(f1_score(labels_test, hypot))
        recall.append(recall_score(labels_test, hypot))
        precision.append(precision_score(labels_test, hypot))

        cm += confmat(labels_test, hypot)

    return acc, fmeasure, recall, precision, cm
예제 #2
0
def _printItems(*items):
    '''Print *items* on one line, separated by single spaces.

    Produces the same text as a Python 2 ``print a, b, c`` statement while
    being valid syntax on both Python 2 and Python 3.'''
    print(' '.join(str(item) for item in items))


def doNetRun(DoPretrain, actType, numHidden, numNodes, dropOut, NumReps,
             AdjustWforDropout, L1, L2,
             LearningRate, Momentum, Algorithm, maxUpdate, BatchSize, Patience,
             MinImprovement, ValidateEveryN):
    '''Performs *NumReps* neural network runs, saving the best run so far.

    Reads the module globals ``numFeatures``, ``numCats``, ``train``,
    ``valid`` and ``test``; increments ``kountRuns`` once per run; and, when a
    run's test-set log-loss is <= ``bestLoss``, updates ``bestLoss`` and
    ``bestParams`` and saves the network to 'bestModel'.

    :params: NN hyper-parameters. ``DoPretrain`` adds a 'layerwise' training
        phase before the main one; ``AdjustWforDropout`` multiplies the
        weights of every non-input layer by ``1 - dropOut`` after training.
    :returns: run parameters + performance measures (DataFrame), one row per
        run, columns in signature order followed by 'TrLoss', 'VldLoss',
        'TstLoss', 'TestAcc', 'Time', 'BestEpoch'.'''
    global kountRuns, bestLoss, bestParams

    # Snapshot the call arguments in signature order, then drop the frame
    # reference so it does not keep this call's locals alive in a cycle.
    frame = inspect.currentframe()
    try:
        argNames, _, _, argValues = inspect.getargvalues(frame)
        runArgs = [(name, argValues[name]) for name in argNames]
    finally:
        del frame

    # input size, numHidden identical hidden layers (one dict each, so no
    # layer definition is shared), output size
    netDef = [numFeatures]
    netDef.extend(dict(size=numNodes, activation=actType)
                  for _ in range(numHidden))
    netDef.append(numCats)

    echo = []
    for name, value in runArgs:
        echo.extend([name, ': ', value, ','])
    _printItems(*echo)

    saveCols = [name for name, _ in runArgs] + \
        ['TrLoss', 'VldLoss', 'TstLoss', 'TestAcc', 'Time', 'BestEpoch']

    # Options shared by the pre-training and the main training phase.
    trainOpts = dict(patience=Patience,
                     learning_rate=LearningRate,
                     momentum=Momentum,
                     min_improvement=MinImprovement,
                     validate_every=ValidateEveryN,
                     max_updates=maxUpdate,
                     input_dropout=dropOut,
                     hidden_dropout=dropOut,
                     weight_l1=L1,  # L1 norm sparsity
                     weight_l2=L2,  # L2 norm weight decay
                     batch_size=BatchSize)

    # time.clock was removed in Python 3.8; keep using it where it exists so
    # the measured quantity does not change on older interpreters.
    timer = getattr(time, 'clock', None) or time.perf_counter

    runs = []
    for _ in range(NumReps):
        kountRuns = kountRuns + 1

        # use new seed for each run
        # although numpy.random.seed should be uint32, Theanets only checks
        # for int and fails if given a uint32 stored as a long in Python
        seed = random.randint(0, np.iinfo(np.int32).max)
        _printItems('seed: ', seed)

        net = thts.Classifier(layers=netDef, rng=seed)

        t0 = timer()
        if DoPretrain:
            print('Train phase I:')
            net.train(train, valid, algo='layerwise', **trainOpts)

        print('Train phase II:')
        Epoch = 0
        lastLoss = np.inf
        lastEpoch = 0
        for tr, vl in net.itertrain(train, valid, algo=Algorithm,
                                    **trainOpts):
            Epoch = Epoch + 1
            vloss = vl['loss']

            # Track the last epoch that improved validation loss by at least
            # MinImprovement; it is reported as 'BestEpoch'.
            if (lastLoss - vloss) >= MinImprovement:
                lastLoss = vloss
                lastEpoch = Epoch
                flg = ' *' + str(lastEpoch)
            else:
                flg = ''
            _printItems(Epoch, 'trLoss: %.4f' % tr['loss'],
                        ' vlLoss: %.4f' % vloss,
                        ' vlacc: %.4f' % vl['acc'], flg)

        t1 = timer() - t0
        _printItems('Time: ', t1, ' Epochs:', Epoch)

        if AdjustWforDropout:
            # Scale the weights of every non-input layer by (1 - dropOut).
            fact = 1.0 - dropOut
            for ll in net.layers:
                if (ll.name != 'in'):
                    w = net.find(ll.name, 'w')
                    w.set_value(w.get_value() * fact)

        X, y = train
        trnLoss = log_loss(y, net.predict_proba(X))

        X, y = valid
        vldLoss = log_loss(y, net.predict_proba(X))

        X, y = test
        ypp = net.predict_proba(X)
        yp = net.predict(X)
        acc = net.score(X, y)
        tstLoss = log_loss(y, ypp)

        # `acc` is measured on the test set, so label it as such.
        _printItems(Epoch, 'trLoss: %.4f' % trnLoss,
                    ' vlLoss: %.4f' % vldLoss,
                    ' tstacc: %.4f' % acc)
        _printItems('Best Epoch: ', lastEpoch)

        cf = confmat(y, yp)
        print('Test-set confusion matrix:')
        print(cf)

        dta = dict(runArgs)
        dta['TrLoss'] = trnLoss
        dta['VldLoss'] = vldLoss
        dta['TstLoss'] = tstLoss
        dta['TestAcc'] = acc
        dta['Time'] = t1
        dta['BestEpoch'] = lastEpoch

        nr = pd.DataFrame([dta])

        if (tstLoss <= bestLoss):
            bestLoss = tstLoss
            net.save('bestModel')
            bestParams = nr

        runs.append(nr)

    if not runs:
        return pd.DataFrame(columns=saveCols)

    # Concatenate once (DataFrame.append no longer exists in pandas 2) and
    # re-order columns.
    return pd.concat(runs, ignore_index=True)[saveCols]
예제 #3
0
    def fit(self,
            dataloaders,
            criterion,
            optimizer,
            scheduler,
            num_epochs=(0, 500)):
        """Train ``self.model`` and validate it after every epoch.

        Each epoch runs a 'train' phase followed by a 'val' phase. Per phase
        the sample-weighted mean loss, the accuracy and the confusion matrix
        are printed, and the loss/accuracy are recorded through ``Stats`` and
        written to ``self.model_dir``. The model is saved whenever validation
        accuracy exceeds the best value seen in this call, and additionally
        every 50 epochs via ``self.save_model(epoch + 1)``.

        Args:
            dataloaders: mapping with 'train' and 'val' entries, each an
                iterable of ``(inputs, labels)`` batches.
            criterion: loss callable invoked as ``criterion(outputs, labels)``.
            optimizer: optimizer updating the model parameters.
            scheduler: learning-rate scheduler, stepped once per epoch after
                the training phase.
            num_epochs: ``(start, stop)`` epoch range; ``stop`` is exclusive.
        """
        epoch_stats = Stats(phases=['train', 'val'])

        best_acc = 0.0
        for epoch in range(num_epochs[0], num_epochs[1]):
            for phase in ['train', 'val']:
                is_train = phase == 'train'
                if is_train:
                    self.model.train()  # Set model to training mode
                else:
                    self.model.eval()  # Set model to evaluate mode

                running_loss_epoch = 0.0
                running_corrects_epoch = 0.0
                running_loss = 0.0
                samples_seen = 0

                # Per-batch results are gathered on the CPU and joined once
                # at the end of the phase instead of re-concatenating a
                # growing tensor on every batch.
                batch_preds = []
                batch_trues = []

                # Iterate over data.
                for i, (inputs, labels) in enumerate(dataloaders[phase]):
                    inputs = inputs.to(self.processor)
                    labels = labels.to(self.processor)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    with torch.set_grad_enabled(is_train):
                        outputs = self.model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if is_train:
                            loss.backward()
                            optimizer.step()

                    # Read the scalar once; every .item() call forces a
                    # device synchronisation.
                    batch_loss = loss.item()
                    batch_size = inputs.size(0)

                    running_loss += batch_loss
                    if i % 100 == 99:  # print every 100 minibatches
                        print('[%d, %d] loss: %.3f' %
                              (epoch + 1, i + 1, running_loss / 100))
                        running_loss = 0.0

                    samples_seen += batch_size
                    running_loss_epoch += batch_loss * batch_size
                    running_corrects_epoch += torch.sum(preds == labels).item()

                    batch_preds.append(preds.detach().cpu())
                    batch_trues.append(labels.detach().cpu())

                if is_train:
                    # Step the schedule only after this epoch's optimizer
                    # updates; stepping first skips the initial learning rate
                    # on PyTorch >= 1.1.
                    scheduler.step()

                # Average over the samples actually iterated, which stays
                # correct when the loader drops or resamples items. The
                # max() guard keeps an empty loader from dividing by zero.
                denom = max(samples_seen, 1)
                epoch_loss = running_loss_epoch / denom
                epoch_acc = running_corrects_epoch / denom

                if batch_preds:
                    y_preds = torch.cat(batch_preds, dim=0)
                    y_trues = torch.cat(batch_trues, dim=0)
                else:
                    y_preds = torch.LongTensor()
                    y_trues = torch.LongTensor()

                print(f'{self.model_name} at {self.model_dir}')
                print('Epoch #{} {} Loss: {:.3f}'.format(
                    epoch + 1, phase, epoch_loss))
                print('Epoch #{} {} Accuracy: {:.3f}'.format(
                    epoch + 1, phase, epoch_acc))
                print(confmat(y_trues.numpy(), y_preds.numpy()))

                # keeping track of the epoch progress
                epoch_stats(phase, epoch_loss, epoch_acc)
                epoch_stats.write2file(self.model_dir)

                # save the model whenever validation accuracy improves
                if phase == 'val' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    self.save_model()

            # write model to file at every 50 epochs.
            if epoch % 50 == 49:
                self.save_model(epoch + 1)