Exemplo n.º 1
0
    def run(self):
        """Run nested (inner) k-fold cross-validation over regularization values.

        For each outer stratified fold, every candidate regularization value
        is scored with an inner stratified k-fold on that fold's training
        portion. The value with the best mean inner score is then used to
        refit on the whole outer training portion, and that model is scored
        on the outer held-out portion.

        Side effects:
            Appends one entry per outer fold to ``self.devresults`` (best mean
            inner score, in percent) and to ``self.testresults`` (outer
            held-out score, in percent).

        Returns:
            tuple: ``(devaccuracy, testaccuracy)``, the means of
            ``self.devresults`` and ``self.testresults`` rounded to 2 decimals.

        Raises:
            ValueError: if ``self.usepytorch`` is set and ``self.classifier``
                is neither ``'LogReg'`` nor ``'MLP'``.
        """
        logging.info(
            'Training {0} with (inner) {1}-fold cross-validation'.format(
                self.modelname, self.k))

        # Candidate grid: l2 penalties for the PyTorch models, values of C
        # for LogisticRegression.
        if self.usepytorch:
            regs = [10**t for t in range(-5, -1)]
        else:
            regs = [2**t for t in range(-2, 4, 1)]

        def build_clf(reg):
            # Single construction point shared by the inner search and the
            # per-fold refit, so both always build the model the same way.
            if not self.usepytorch:
                return LogisticRegression(C=reg, random_state=self.seed)
            if self.classifier == 'LogReg':
                return LogReg(inputdim=self.featdim,
                              nclasses=self.nclasses,
                              l2reg=reg,
                              seed=self.seed)
            if self.classifier == 'MLP':
                return MLP(inputdim=self.featdim,
                           hiddendim=self.nhid,
                           nclasses=self.nclasses,
                           l2reg=reg,
                           seed=self.seed)
            # Previously an unknown name fell through and failed later with
            # an unbound-variable error.
            raise ValueError(
                'Unsupported classifier: {0!r}'.format(self.classifier))

        skf = StratifiedKFold(n_splits=self.k, shuffle=True, random_state=1111)
        innerskf = StratifiedKFold(n_splits=self.k,
                                   shuffle=True,
                                   random_state=1111)

        for count, (train_idx, test_idx) in enumerate(
                skf.split(self.X, self.y), start=1):
            X_train, X_test = self.X[train_idx], self.X[test_idx]
            y_train, y_test = self.y[train_idx], self.y[test_idx]

            # Mean inner-fold score (in percent) for each candidate value.
            scores = []
            for reg in regs:
                regscores = []
                for inner_train_idx, inner_test_idx in innerskf.split(
                        X_train, y_train):
                    X_in_train = X_train[inner_train_idx]
                    X_in_test = X_train[inner_test_idx]
                    y_in_train = y_train[inner_train_idx]
                    y_in_test = y_train[inner_test_idx]

                    clf = build_clf(reg)
                    if self.usepytorch:
                        clf.fit(X_in_train,
                                y_in_train,
                                validation_data=(X_in_test, y_in_test))
                    else:
                        clf.fit(X_in_train, y_in_train)
                    regscores.append(clf.score(X_in_test, y_in_test))
                scores.append(round(100 * np.mean(regscores), 2))

            optreg = regs[np.argmax(scores)]
            bestscore = np.max(scores)
            # The message is built from one clean literal: a backslash
            # continuation inside the string would leak source indentation
            # into the log output.
            logging.info(
                'Best param found at split {0}: l2reg = {1} with score {2}'
                .format(count, optreg, bestscore))
            self.devresults.append(bestscore)

            # Refit on the full outer training portion with the selected value.
            clf = build_clf(optreg)
            if self.usepytorch:
                clf.fit(X_train, y_train, validation_split=0.05)
            else:
                clf.fit(X_train, y_train)

            self.testresults.append(round(100 * clf.score(X_test, y_test), 2))

        devaccuracy = round(np.mean(self.devresults), 2)
        testaccuracy = round(np.mean(self.testresults), 2)
        return devaccuracy, testaccuracy
Exemplo n.º 2
0
    def run(self):
        """Select a regularization value on the validation split, then test.

        Every candidate value is used to train a classifier on
        ``self.X['train']`` / ``self.y['train']``, which is scored on the
        ``'valid'`` split. A classifier is then trained again with the
        best-scoring value and scored on the ``'test'`` split.

        Returns:
            tuple: ``(devaccuracy, testaccuracy)``, the best validation score
            and the test score, both in percent rounded to 2 decimals.

        Raises:
            ValueError: if ``self.usepytorch`` is set and ``self.classifier``
                is neither ``'LogReg'`` nor ``'MLP'``.
        """
        logging.info('Training {0} with standard validation..'.format(
            self.modelname))

        if self.usepytorch:
            regs = [10**t for t in range(-5, -1)]
        else:
            regs = [2**t for t in range(-2, 4, 1)]
        if self.noreg:
            # Regularization disabled: evaluate the single value 0.
            regs = [0.]

        def fit_clf(reg):
            # Build and train one classifier. The search loop and the final
            # retraining share this so they cannot drift apart.
            if not self.usepytorch:
                clf = LogisticRegression(C=reg, random_state=self.seed)
                clf.fit(self.X['train'], self.y['train'])
                return clf
            if self.classifier == 'LogReg':
                clf = LogReg(inputdim=self.featdim,
                             nclasses=self.nclasses,
                             l2reg=reg,
                             seed=self.seed,
                             cudaEfficient=self.cudaEfficient)
            elif self.classifier == 'MLP':
                clf = MLP(inputdim=self.featdim,
                          hiddendim=self.nhid,
                          nclasses=self.nclasses,
                          l2reg=reg,
                          seed=self.seed,
                          cudaEfficient=self.cudaEfficient)
            else:
                # Previously an unknown name failed later with an
                # unbound-variable error.
                raise ValueError(
                    'Unsupported classifier: {0!r}'.format(self.classifier))
            # small hack : MultiNLI/SNLI specific -- optional overrides of
            # the classifier's epoch settings.
            if self.nepoches:
                clf.nepoches = self.nepoches
            if self.maxepoch:
                clf.maxepoch = self.maxepoch
            clf.fit(self.X['train'],
                    self.y['train'],
                    validation_data=(self.X['valid'], self.y['valid']))
            return clf

        # Validation score (in percent) for each candidate value.
        scores = []
        for reg in regs:
            clf = fit_clf(reg)
            scores.append(
                round(100 * clf.score(self.X['valid'], self.y['valid']), 2))
        logging.info([('reg:' + str(reg), score)
                      for reg, score in zip(regs, scores)])

        optreg = regs[np.argmax(scores)]
        devaccuracy = np.max(scores)
        # One clean literal: a backslash continuation inside the string would
        # leak source indentation into the log output.
        logging.info(
            'Validation : best param found is reg = {0} with score {1}'
            .format(optreg, devaccuracy))

        logging.info('Evaluating...')
        # Retrain with the selected value. The unconditional
        # LogisticRegression that used to be built here was always
        # overwritten before use, so it has been removed.
        clf = fit_clf(optreg)

        testaccuracy = clf.score(self.X['test'], self.y['test'])
        testaccuracy = round(100 * testaccuracy, 2)
        return devaccuracy, testaccuracy
Exemplo n.º 3
0
    def run(self):
        """Tune regularization on the validation split, then evaluate.

        Unless ``self.noreg`` is set, every candidate value is used to train
        a classifier on the ``'train'`` split, which is scored on the
        ``'valid'`` split, and the best-scoring value is kept. With
        ``self.noreg`` the search is skipped and the value 0 is used. A
        classifier is then trained with the chosen value and evaluated.

        Returns:
            tuple: ``(devaccuracy, testaccuracy, devprobs, testprobs)``.
            The two scores are in percent rounded to 2 decimals; the last two
            items are the ``predict_proba`` outputs of the final classifier
            on the ``'valid'`` and ``'test'`` splits.

        Raises:
            ValueError: if ``self.usepytorch`` is set and ``self.classifier``
                is neither ``'LogReg'`` nor ``'MLP'``.
        """
        logging.info('Training {0} with standard validation..'.format(
            self.modelname))

        if self.usepytorch:
            regs = [10**t for t in range(-5, 6)]
        else:
            regs = [2**t for t in range(-2, 4, 1)]
        if self.noreg:
            # No search: the grid is emptied and the final model uses 0.
            regs = []
            optreg = 0.
        else:
            logging.debug("\tStarting to tune regularization")

        def fit_clf(reg, announce=False):
            # Build and train one classifier. ``announce`` enables the debug
            # messages that are only emitted for the final model.
            if not self.usepytorch:
                clf = LogisticRegression(C=reg, random_state=self.seed)
                clf.fit(self.X['train'], self.y['train'])
                return clf
            if self.classifier == 'LogReg':
                if announce:
                    logging.debug("\tCreating log reg classifier")
                clf = LogReg(inputdim=self.featdim,
                             nclasses=self.nclasses,
                             l2reg=reg,
                             seed=self.seed,
                             cudaEfficient=self.cudaEfficient,
                             train_rank=self.train_rank)
            elif self.classifier == 'MLP':
                if announce:
                    # Was "\Creating ..." (invalid escape sequence); the tab
                    # matches the sibling LogReg message.
                    logging.debug("\tCreating MLP classifier")
                clf = MLP(inputdim=self.featdim,
                          hiddendim=self.nhid,
                          nclasses=self.nclasses,
                          l2reg=reg,
                          seed=self.seed,
                          cudaEfficient=self.cudaEfficient,
                          train_rank=self.train_rank)
            else:
                # Previously an unknown name failed later with an
                # unbound-variable error.
                raise ValueError(
                    'Unsupported classifier: {0!r}'.format(self.classifier))
            # small hack : MultiNLI/SNLI specific -- optional overrides of
            # the classifier's epoch settings.
            if self.nepoches:
                clf.nepoches = self.nepoches
            if self.maxepoch:
                clf.maxepoch = self.maxepoch
            clf.fit(self.X['train'],
                    self.y['train'],
                    validation_data=(self.X['valid'], self.y['valid']))
            return clf

        # Validation score (in percent) for each candidate value; stays empty
        # when the search is skipped.
        scores = []
        for reg in regs:
            clf = fit_clf(reg)
            scores.append(
                round(100 * clf.score(self.X['valid'], self.y['valid']), 2))
        logging.info([('reg:' + str(reg), score)
                      for reg, score in zip(regs, scores)])

        if not self.noreg:
            optreg = regs[np.argmax(scores)]
            devaccuracy = np.max(scores)
            # One clean literal: a backslash continuation inside the string
            # would leak source indentation into the log output.
            logging.info(
                'Validation : best param found is reg = {0} with score {1}'
                .format(optreg, devaccuracy))

        logging.info('Evaluating...')
        clf = fit_clf(optreg, announce=True)
        logging.debug("\tFinished training!")

        if self.noreg:
            # No search was run, so the validation score comes from the
            # final model itself.
            devaccuracy = round(
                100 * clf.score(self.X['valid'], self.y['valid']), 2)

        testaccuracy = clf.score(self.X['test'], self.y['test'])
        testaccuracy = round(100 * testaccuracy, 2)
        devprobs = clf.predict_proba(self.X['valid'])
        testprobs = clf.predict_proba(self.X['test'])
        return devaccuracy, testaccuracy, devprobs, testprobs
Exemplo n.º 4
0
    def run(self):
        """Pick a regularization value by k-fold cross-validation, then test.

        Each candidate value is scored by stratified k-fold cross-validation
        on ``self.train``. A classifier is then trained on all of
        ``self.train`` with the best-scoring value and evaluated on
        ``self.test``.

        Returns:
            tuple: ``(devaccuracy, testaccuracy, yhat)``: the best mean
            cross-validation score and the test score (both in percent,
            rounded to 2 decimals), and the final classifier's predictions
            for ``self.test['X']``.

        Raises:
            ValueError: if ``self.usepytorch`` is set and ``self.classifier``
                is neither ``'LogReg'`` nor ``'MLP'``.
        """
        # cross-validation
        logging.info('Training {0} with {1}-fold cross-validation'.format(
            self.modelname, self.k))

        if self.usepytorch:
            regs = [10**t for t in range(-5, -1)]
        else:
            regs = [2**t for t in range(-1, 6, 1)]

        def build_clf(reg):
            # Single construction point shared by the search and the final
            # training, so both always build the model the same way.
            if not self.usepytorch:
                return LogisticRegression(C=reg, random_state=self.seed)
            if self.classifier == 'LogReg':
                return LogReg(inputdim=self.featdim,
                              nclasses=self.nclasses,
                              l2reg=reg,
                              seed=self.seed)
            if self.classifier == 'MLP':
                return MLP(inputdim=self.featdim,
                           hiddendim=self.nhid,
                           nclasses=self.nclasses,
                           l2reg=reg,
                           seed=self.seed)
            # Previously an unknown name failed later with an
            # unbound-variable error.
            raise ValueError(
                'Unsupported classifier: {0!r}'.format(self.classifier))

        skf = StratifiedKFold(n_splits=self.k,
                              shuffle=True,
                              random_state=self.seed)

        # Mean cross-validation score (in percent) for each candidate value.
        scores = []
        for reg in regs:
            scanscores = []
            for train_idx, test_idx in skf.split(self.train['X'],
                                                 self.train['y']):
                # Split data
                X_train = self.train['X'][train_idx]
                y_train = self.train['y'][train_idx]
                X_test = self.train['X'][test_idx]
                y_test = self.train['y'][test_idx]

                # Train classifier
                clf = build_clf(reg)
                if self.usepytorch:
                    clf.fit(X_train, y_train, validation_data=(X_test, y_test))
                else:
                    clf.fit(X_train, y_train)
                scanscores.append(clf.score(X_test, y_test))
            # Append mean score
            scores.append(round(100 * np.mean(scanscores), 2))

        # evaluation
        logging.info([('reg:' + str(reg), score)
                      for reg, score in zip(regs, scores)])
        optreg = regs[np.argmax(scores)]
        devaccuracy = np.max(scores)
        # One clean literal: a backslash continuation inside the string would
        # leak source indentation into the log output.
        logging.info(
            'Cross-validation : best param found is reg = {0} with score {1}'
            .format(optreg, devaccuracy))

        logging.info('Evaluating...')
        clf = build_clf(optreg)
        if self.usepytorch:
            clf.fit(self.train['X'], self.train['y'], validation_split=0.05)
        else:
            clf.fit(self.train['X'], self.train['y'])
        yhat = clf.predict(self.test['X'])

        testaccuracy = clf.score(self.test['X'], self.test['y'])
        testaccuracy = round(100 * testaccuracy, 2)

        return devaccuracy, testaccuracy, yhat
Exemplo n.º 5
0
    def run(self):
        """Search regularization values with ``self.fit()``, then score test.

        Similar to the split classifier, this is the MAIN method, called from
        outside to obtain the task dev/test scores. For every candidate value
        a fresh classifier and optimizer are created, ``self.fit()`` is run
        and ``self.score()`` gives the dev score. Unless ``self.noreg`` is
        set, a classifier is then trained again with the best value before
        the test score is taken.

        Side effects:
            Rebinds ``self.clf`` and ``self.optimizer``.

        Returns:
            tuple: ``(devaccuracy, testaccuracy)``, both in percent rounded
            to 2 decimals.

        Raises:
            Exception: if ``self.usepytorch`` is false; only the PyTorch path
                is implemented.
        """
        logging.info('Training {0} with standard validation..'
                     .format(self.modelname))
        # Checked once up front; the original raised the same exception from
        # inside the first iteration of the search loop.
        if not self.usepytorch:
            raise Exception("Must use PyTorch")

        regs = [10 ** t for t in range(-5, -1)]
        if self.noreg:
            regs = [0.]

        def prepare(l2reg):
            # Create the classifier and an optimizer bound to its parameters.
            # NOTE(review): an unrecognized self.classifier leaves self.clf
            # untouched, exactly as before -- confirm whether that should be
            # an error instead.
            if self.classifier == 'LogReg':
                self.clf = LogReg(inputdim=self.featdim,
                                  nclasses=self.nclasses,
                                  l2reg=l2reg, seed=self.seed,
                                  cudaEfficient=self.cudaEfficient,
                                  batch_size=32)
            elif self.classifier == 'MLP':
                self.clf = FCNet(inputdim=self.featdim, hiddendim=self.nhid,
                                 nclasses=self.nclasses, l2reg=l2reg,
                                 seed=self.seed,
                                 cudaEfficient=self.cudaEfficient,
                                 batch_size=32)
            # One optimizer over the parameters of both the encoder and the
            # classifier. It must be rebuilt whenever self.clf is replaced,
            # otherwise it keeps pointing at the previous classifier.
            self.optimizer = optim.Adam(
                list(self.encoder.parameters())
                + list(self.clf.model.parameters()),
                weight_decay=self.clf.l2reg)

        # Dev score (in percent) for each candidate value.
        scores = []
        for reg in regs:
            logging.info("Searching reg {}".format(reg))
            prepare(reg)
            self.fit()
            cur_dev_acc = self.score()
            logging.info("Epoch {} dev accuracy {}".format(self.nepoch,cur_dev_acc))
            scores.append(round(100 * cur_dev_acc, 2))
        logging.info([('reg:' + str(reg), score)
                      for reg, score in zip(regs, scores)])

        optreg = regs[np.argmax(scores)]
        devaccuracy = np.max(scores)
        # One clean literal: a backslash continuation inside the string would
        # leak source indentation into the log output.
        logging.info(
            'Validation : best param found is reg = {0} with score {1}'
            .format(optreg, devaccuracy))

        logging.info('Evaluating...')
        # retrain with best hyper-param; with noreg the single model trained
        # above is reused as is
        if not self.noreg:
            # Fix: the optimizer is now rebuilt together with the classifier.
            # It used to keep the parameters and weight decay of the last
            # classifier from the search loop.
            # NOTE(review): fit() is not visible here; presumably it steps
            # self.optimizer -- confirm.
            prepare(optreg)
            # small hack : MultiNLI/SNLI specific
            if self.nepoches:
                self.clf.nepoches = self.nepoches
            if self.maxepoch:
                self.clf.maxepoch = self.maxepoch
            self.fit()

        testaccuracy = self.score(test=True)
        testaccuracy = round(100 * testaccuracy, 2)

        return devaccuracy, testaccuracy