def CrossValidateSMOTE(data, labels, clf, folds=10, runSMOTE=True):
    """Stratified k-fold cross-validation with optional SMOTE oversampling of the
    training folds only. Returns per-fold accuracy, F-measure, recall and precision,
    plus the confusion matrix summed over all folds.

    Assumes numpy (np) and StratifiedKFold are imported at module level."""
    from unbalanced_dataset import SMOTE
    from sklearn.metrics import confusion_matrix as confmat
    from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

    # accept pandas objects as well as numpy arrays
    if type(data) is not np.ndarray:
        data = data.as_matrix()
    if type(labels) is not np.ndarray:
        labels = labels.as_matrix().ravel()

    skf = StratifiedKFold(labels, n_folds=folds, shuffle=False)
    sets = [{'train': train, 'test': test} for train, test in skf]

    acc = []
    fmeasure = []
    recall = []
    precision = []
    cm = np.zeros((2, 2), dtype=int)

    for fold in sets:
        data_train = data[fold['train']]
        labels_train = labels[fold['train']]
        bugs = sum(labels_train)
        # majority-to-minority ratio used by SMOTE
        ratio = float(len(labels_train) - bugs) / bugs
        data_test = data[fold['test']]
        labels_test = labels[fold['test']]

        if runSMOTE:
            # oversample the minority class in the training fold only
            smote = SMOTE(ratio=ratio, verbose=False, kind='borderline1')
            data_train, labels_train = smote.fit_transform(data_train, labels_train)

        clf.fit(data_train, labels_train)
        hypot = clf.predict(data_test)

        # sklearn metrics expect (y_true, y_pred)
        acc.append(accuracy_score(labels_test, hypot))
        fmeasure.append(f1_score(labels_test, hypot))
        recall.append(recall_score(labels_test, hypot))
        precision.append(precision_score(labels_test, hypot))
        cm += confmat(labels_test, hypot)

    return acc, fmeasure, recall, precision, cm
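
# Minimal usage sketch for CrossValidateSMOTE. The synthetic data and the
# RandomForestClassifier below are illustrative placeholders only, not part of
# the original experiment.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=500, n_features=20,
                                     weights=[0.9, 0.1], random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0)
acc, f1, rec, prec, cm = CrossValidateSMOTE(X_demo, y_demo, rf,
                                            folds=10, runSMOTE=True)
print('mean F-measure over folds: %.3f' % (sum(f1) / len(f1)))
print(cm)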
def doNetRun(DoPretrain, actType, numHidden, numNodes, dropOut, NumReps,
             AdjustWforDropout, L1, L2, LearningRate, Momentum, Algorithm,
             maxUpdate, BatchSize, Patience, MinImprovement, ValidateEveryN):
    '''Performs *NumReps* neural network runs, saving the best run so far
    (global var).

    :params: NN hyper-parameters.
    :returns: run parameters + performance measures (DataFrame).
    '''
    # network definition: input layer, numHidden hidden layers, softmax output
    layerDef = dict(size=numNodes, activation=actType)
    netDef = [numFeatures]
    for _ in range(numHidden):
        netDef.append(layerDef)
    netDef.append(numCats)

    # capture the call arguments so they can be stored alongside the results
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    saveCols = []
    for arg in args:
        print arg, ': ', values[arg], ',',
        saveCols.append(arg)
    print
    saveCols = saveCols + \
        ['TrLoss', 'VldLoss', 'TstLoss', 'TestAcc', 'Time', 'BestEpoch']
    hist = pd.DataFrame(columns=saveCols)

    for _ in range(NumReps):
        global kountRuns
        kountRuns = kountRuns + 1

        # use a new seed for each run
        ii32 = np.iinfo(np.int32)
        seed = random.randint(0, ii32.max)
        print 'seed: ', seed
        # although numpy.random.seed accepts a uint32, Theanets only checks for an
        # int and fails if given a uint32 stored as a Python long
        net = thts.Classifier(layers=netDef, rng=seed)

        t0 = time.clock()
        Epoch = 0
        if DoPretrain:
            # optional layerwise pre-training pass
            print('Train phase I:')
            net.train(train, valid,
                      patience=Patience,
                      learning_rate=LearningRate,
                      momentum=Momentum,
                      min_improvement=MinImprovement,
                      validate_every=ValidateEveryN,
                      max_updates=maxUpdate,
                      input_dropout=dropOut,
                      hidden_dropout=dropOut,
                      algo='layerwise',
                      weight_l1=L1,   # L1 norm sparsity
                      weight_l2=L2,   # L2 norm weight decay
                      batch_size=BatchSize)

        print('Train phase II:')
        Epoch = 0
        lastLoss = np.Inf
        lastEpoch = 0
        for tr, vl in net.itertrain(train, valid,
                                    patience=Patience,
                                    learning_rate=LearningRate,
                                    momentum=Momentum,
                                    min_improvement=MinImprovement,
                                    validate_every=ValidateEveryN,
                                    max_updates=maxUpdate,
                                    input_dropout=dropOut,
                                    hidden_dropout=dropOut,
                                    algo=Algorithm,
                                    weight_l1=L1,   # L1 norm sparsity
                                    weight_l2=L2,   # L2 norm weight decay
                                    batch_size=BatchSize):
            Epoch = Epoch + 1
            vloss = vl['loss']
            # track the last epoch that gave a sufficient improvement in validation loss
            if (lastLoss - vloss) >= MinImprovement:
                lastLoss = vloss
                lastEpoch = Epoch
                flg = ' *' + str(lastEpoch)
            else:
                flg = ''
            print Epoch, 'trLoss: %.4f' % tr['loss'], ' vlLoss: %.4f' % vloss, \
                ' vlacc: %.4f' % vl['acc'], flg

        t1 = time.clock() - t0
        print 'Time: ', t1, ' Epochs:', Epoch

        if AdjustWforDropout:
            # rescale the weights of every non-input layer to compensate for dropout
            fact = 1.0 - dropOut
            for ll in net.layers:
                if ll.name != 'in':
                    w = net.find(ll.name, 'w')
                    w.set_value(w.get_value() * fact)

        # final losses on the training, validation and test sets
        X, y = train
        trnLoss = log_loss(y, net.predict_proba(X))
        X, y = valid
        vldLoss = log_loss(y, net.predict_proba(X))
        X, y = test
        ypp = net.predict_proba(X)
        yp = net.predict(X)
        acc = net.score(X, y)
        tstLoss = log_loss(y, ypp)

        print Epoch, 'trLoss: %.4f' % trnLoss, ' vlLoss: %.4f' % vldLoss, \
            ' tstAcc: %.4f' % acc
        print 'Best Epoch: ', lastEpoch
        cf = confmat(y, yp)
        print 'Test-set confusion matrix:'
        print cf

        # record this run; keep the model with the lowest test loss so far
        global bestLoss
        global bestParams
        dta = dict()
        for arg in args:
            dta[arg] = values[arg]
        dta['TrLoss'] = trnLoss
        dta['VldLoss'] = vldLoss
        dta['TstLoss'] = tstLoss
        dta['TestAcc'] = acc
        dta['Time'] = t1
        dta['BestEpoch'] = lastEpoch
        nr = pd.DataFrame([dta])
        if tstLoss <= bestLoss:
            bestLoss = tstLoss
            net.save('bestModel')
            bestParams = nr
        hist = hist.append(nr, ignore_index=True)

    # re-order columns so hyper-parameters precede the performance measures
    hist = hist[saveCols]
    return hist
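
# Hedged usage sketch for doNetRun. It assumes the module-level globals the
# function reads (train, valid, test, numFeatures, numCats, kountRuns, bestLoss,
# bestParams) have already been prepared; the grid of dropout and learning-rate
# values below is illustrative only, not the original experiment.
kountRuns = 0
bestLoss = np.Inf
bestParams = None

allRuns = pd.DataFrame()
for dropOut in (0.0, 0.25, 0.5):
    for lr in (0.01, 0.001):
        hist = doNetRun(DoPretrain=False, actType='relu', numHidden=2,
                        numNodes=100, dropOut=dropOut, NumReps=3,
                        AdjustWforDropout=True, L1=0.0, L2=1e-4,
                        LearningRate=lr, Momentum=0.9, Algorithm='sgd',
                        maxUpdate=1000, BatchSize=32, Patience=10,
                        MinImprovement=0.001, ValidateEveryN=5)
        allRuns = allRuns.append(hist, ignore_index=True)
allRuns.to_csv('netRuns.csv', index=False)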
def fit(self, dataloaders, criterion, optimizer, scheduler,
        num_epochs=(0, 500)):
    epoch_stats = Stats(phases=['train', 'val'])
    dataset_sizes = {x: len(dataloaders[x].dataset) for x in ['train', 'val']}
    best_acc = 0.0

    for epoch in range(num_epochs[0], num_epochs[1]):
        for phase in ['train', 'val']:
            if phase == 'train':
                # step the LR scheduler once per training epoch (pre-1.1 PyTorch convention)
                scheduler.step()
                self.model.train()   # set model to training mode
            else:
                self.model.eval()    # set model to evaluation mode

            running_loss_epoch = 0.0
            running_corrects_epoch = 0.0
            running_loss = 0.0

            # iterate over the data, accumulating predictions for the
            # epoch-level confusion matrix
            y_preds = torch.LongTensor().to(self.processor)
            y_trues = torch.LongTensor().to(self.processor)
            for i, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs = inputs.to(self.processor)
                labels = labels.to(self.processor)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward pass; track gradients only in the training phase
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = self.model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward pass + optimizer step only in the training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item()
                if i % 100 == 99:   # print every 100 mini-batches
                    print('[%d, %d] loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / 100))
                    running_loss = 0.0

                running_loss_epoch += loss.item() * inputs.size(0)
                running_corrects_epoch += torch.sum(preds == labels).item()
                y_preds = torch.cat((y_preds, preds), dim=0)
                y_trues = torch.cat((y_trues, labels), dim=0)

            epoch_loss = running_loss_epoch / dataset_sizes[phase]
            epoch_acc = running_corrects_epoch / dataset_sizes[phase]

            print(f'{self.model_name} at {self.model_dir}')
            print('Epoch #{} {} Loss: {:.3f}'.format(epoch + 1, phase, epoch_loss))
            print('Epoch #{} {} Accuracy: {:.3f}'.format(epoch + 1, phase, epoch_acc))
            print(confmat(y_trues.cpu().numpy(), y_preds.cpu().numpy()))

            # keep track of the epoch progress
            epoch_stats(phase, epoch_loss, epoch_acc)
            epoch_stats.write2file(self.model_dir)

            # save the best model seen so far on the validation set
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                self.save_model()

        # write the model to file every 50 epochs
        if epoch % 50 == 49:
            self.save_model(epoch + 1)
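
# Hedged usage sketch for fit(). It assumes the enclosing class has been
# instantiated as `trainer` and exposes the attributes the method uses
# (self.model, self.processor, self.model_name, self.model_dir, save_model);
# the dataset layout, transforms and hyper-parameters below are placeholders only.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, transforms

tfm = transforms.Compose([transforms.Resize(224),
                          transforms.CenterCrop(224),
                          transforms.ToTensor()])
dataloaders = {
    phase: torch.utils.data.DataLoader(
        datasets.ImageFolder('data/' + phase, tfm),   # hypothetical data layout
        batch_size=32, shuffle=(phase == 'train'))
    for phase in ['train', 'val']
}
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(trainer.model.parameters(), lr=0.001, momentum=0.9)
scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

trainer.fit(dataloaders, criterion, optimizer, scheduler, num_epochs=(0, 100))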