Example #1
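    # Snippet context (assumed, not visible on this page): the __init__ of a
    # prediction class. It relies on the ExtraFeatures helper sketched after
    # this example and on a trained model like the one built in Example #2.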
    def __init__(self, testData, model, ytrue=None):  # ytrue=None avoids a mutable default argument
        N = 10  # candidate count for the (disabled) inverted-index lookup below
        print model.rng
        data = ExtraFeatures()
        data.fit_transform(testData.X)  # populates data.features; the return value is unused here
        self.predictions=[]
        self.mainModelPredictions=self.evaluateMainModel(data, model)
        #self.invIdxModelPredictions=self.lookupIdx(testData, model, N)
        #self.distanceBasedClassifier=self.distanceBasedClassifier(data,model,0)

        for i, vector in enumerate(data.features):
            label = -1
            k = 0  # tracks which classifier produced the prediction
            if self.mainModelPredictions[i] >= 1:
                label = 1
                k = 5
            # elif data[0] > 0 and len(data[-1]) <= 3:
            #     for word in data[-1]:
            #         if word in model.plist:
            #             label = 1
            #             k = 1
            #             break
            # elif len(data[-1]) == 1 and data[-1][0] in model.plist:
            #     k = 2
            #     label = 1
            # elif vector[2] > 0:
            #     k = 3
            #     label = 1
            # elif self.invIdxModelPredictions[i] == 1:
            #     label = 1
            #     k = 4

            self.predictions.append(label)
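
Both examples lean on an ExtraFeatures helper that is not shown on this page. A minimal stub consistent with how the snippets call it (every name and behavior here is inferred from the call sites, not taken from the original source) might look like:

class ExtraFeatures(object):
    """Hypothetical stand-in exposing only the attributes the snippets touch."""

    def __init__(self):
        self.features = []       # one numeric feature vector per comment
        self.new_documents = []  # cleaned text later fed to the TfidfVectorizer

    def fit_transform(self, X, y=None):
        # The real implementation presumably derives hand-crafted features
        # (counts, flags, token lists) from each raw comment in X.
        for comment in X:
            self.new_documents.append(comment.lower())
            self.features.append([len(comment), comment.count("!")])
        return self.features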
Example #2
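    # Snippet context (assumed): the __init__ of the training-side class,
    # excerpted from a larger Python 2 module. The calls below match an early
    # scikit-learn API (note the loss_func argument and the n_iter-style
    # StratifiedShuffleSplit), so the module presumably carries imports
    # along the lines of:
    #   import copy
    #   import numpy as np
    #   import scipy.sparse
    #   from sklearn import metrics
    #   from sklearn.cross_validation import StratifiedShuffleSplit
    #   from sklearn.feature_extraction.text import TfidfVectorizer
    #   from sklearn.grid_search import GridSearchCV
    #   from sklearn.linear_model import LogisticRegression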
    def __init__(self, trainingData):

        comments = trainingData.X
        yTrain = trainingData.y
        # print yTrain

        # inverted-index model, no longer used
        self.invIdx = {}

        data = ExtraFeatures()
        data.fit_transform(comments, yTrain)
        lf = np.asmatrix(np.asarray(data.features))  # dense matrix of hand-crafted features, one row per comment
        print lf[0]
        mins = np.min(lf, axis=0)
        maxs = np.max(lf, axis=0)
        self.rng = copy.deepcopy(maxs - mins)  # per-feature range, kept so test data can be scaled identically
        print self.rng
        print self.rng.shape
        lf = lf / self.rng  # min-max style scaling; beware: divides by zero for any constant feature
        # print lf

        self.WordFeatures = TfidfVectorizer(
            # NOTE: stop_words entries are matched against single tokens, so the
            # bigram "you you" is never actually removed by the default tokenizer.
            ngram_range=(1, 3), smooth_idf=True, max_features=2500, stop_words=["you you"]
        )

        wf = self.WordFeatures.fit_transform(data.new_documents)  # fit and transform in one pass

        # XTrain = lf
        XTrain = scipy.sparse.coo_matrix(np.concatenate((wf.todense(), lf), axis=1))
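        # Alternative (an assumption, not in the original):
        #   scipy.sparse.hstack([wf, scipy.sparse.coo_matrix(lf)])
        # stacks the two blocks without densifying the TF-IDF matrix via todense().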
        # XTrain = wf

        # Scaling seems to worsen the performance

        # Smaller C implies stronger regularization
        # mainModel = RandomForestClassifier(n_estimators=10)
        mainModel = LogisticRegression(penalty="l1")
        wModel = LogisticRegression(penalty="l1")
        lModel = LogisticRegression(penalty="l2")
        # mainModel = svm.LinearSVC(penalty='l1', loss='l2', dual=False,
        #        tol=0.0001, fit_intercept=True, random_state=1)

        score_func = metrics.roc_auc_score
        cv = StratifiedShuffleSplit(yTrain, n_iter=5, test_size=0.20)
        param_grid = {"C": [0.1, 0.25, 0.5, 1], "class_weight": [{-1: 1, 1: 2}, {-1: 1, 1: 2.25}, {-1: 1, 1: 1.5}]}

        # bs = Bootstrap(nsamples, n_iter=25, test_size=0.20, random_state=1)
        self.classifier = GridSearchCV(
            mainModel, param_grid, loss_func=None, scoring="roc_auc", n_jobs=3, refit=True, cv=cv, verbose=1
        )

        self.classifier.fit(XTrain, yTrain)
        print "Best %s: %0.3f" % (score_func.__name__, self.classifier.best_score_)
        print "Best parameters set:"
        best_parameters = self.classifier.best_estimator_.get_params()
        for param_name in param_grid.keys():
            print "\t%s: %r" % (param_name, best_parameters[param_name])

        #        newY=[]
        #        ypred = self.classifier.predict(XTrain)
        #        print ypred
        #        for i in xrange(len(yTrain)-1, 0, -1):
        #            if ypred[i] != yTrain[i]:
        #                newY.append(yTrain[i])
        #            else:
        #                np.delete(lf, i)
        #
        #        newY = np.asarray(newY.reverse())  # BUG: list.reverse() returns None; newY[::-1] was probably intended
        #        print lf.shape, newY.shape
        #
        #        self.classifier2 = GridSearchCV(mainModel, param_grid, loss_func=None,
        #                scoring='roc_auc', n_jobs=3, refit=True, cv=cv, verbose=1)
        #
        #        self.classifier2.fit(lf, newY)
        #        print "Best %s: %0.3f" % (score_func.__name__, self.classifier2.best_score_)
        #        print "Best parameters set:"
        #        best_parameters = self.classifier2.best_estimator_.get_params()
        #        for param_name in param_grid.keys():
        #            print "\t%s: %r" % (param_name, best_parameters[param_name])

        self.wclassifier = GridSearchCV(
            wModel, param_grid, loss_func=None, scoring="roc_auc", n_jobs=3, refit=True, cv=cv, verbose=1
        )

        self.wclassifier.fit(wf, yTrain)
        print "Best %s: %0.3f" % (score_func.__name__, self.wclassifier.best_score_)
        print "Best parameters set:"
        best_parameters = self.wclassifier.best_estimator_.get_params()
        for param_name in param_grid.keys():
            print "\t%s: %r" % (param_name, best_parameters[param_name])

        self.lclassifier = GridSearchCV(
            lModel, param_grid, loss_func=None, scoring="roc_auc", n_jobs=3, refit=True, cv=cv, verbose=1
        )

        self.lclassifier.fit(lf, yTrain)
        print "Best %s: %0.3f" % (score_func.__name__, self.lclassifier.best_score_)
        print "Best parameters set:"
        best_parameters = self.lclassifier.best_estimator_.get_params()
        for param_name in param_grid.keys():
            print "\t%s: %r" % (param_name, best_parameters[param_name])
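
The same grid-search pattern drives all three classifier fits above. Below is a minimal, self-contained sketch of that pattern against the current scikit-learn API (module paths and splitter signatures have changed since the snippet was written; the dataset here is synthetic and purely illustrative):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

X, y = make_classification(n_samples=500, n_features=20, random_state=1)

# Smaller C means stronger L1 regularization; class_weight offsets class imbalance.
param_grid = {
    "C": [0.1, 0.25, 0.5, 1],
    "class_weight": [{0: 1, 1: 2}, {0: 1, 1: 1.5}],
}

# StratifiedShuffleSplit is now parameterised by n_splits; it receives y in split().
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.20, random_state=1)

clf = GridSearchCV(
    LogisticRegression(penalty="l1", solver="liblinear"),  # liblinear supports L1
    param_grid, scoring="roc_auc", n_jobs=3, refit=True, cv=cv, verbose=1,
)
clf.fit(X, y)
print("Best roc_auc: %0.3f" % clf.best_score_)
print("Best parameters:", clf.best_params_)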