def __init__(self, testData, model, ytrue=None):
    """Label every test sample as 1 or -1 using the main model.

    Features are extracted from ``testData.X`` with ``ExtraFeatures`` and
    scored through ``self.evaluateMainModel``. A sample is labelled ``1``
    when its main-model prediction is ``>= 1`` and ``-1`` otherwise. The
    labels are stored, in sample order, in ``self.predictions``; the raw
    main-model output is kept in ``self.mainModelPredictions``.

    Args:
        testData: Object whose ``X`` attribute holds the raw test samples.
        model: Trained model; it is handed to ``evaluateMainModel`` and its
            ``rng`` attribute is printed for debugging.
        ytrue: Unused by this method. Defaults to ``None`` rather than a
            mutable list shared between calls.
    """
    # Debug output. The single-argument parenthesised form prints the same
    # text under Python 2 and also parses under Python 3.
    print(model.rng)

    data = ExtraFeatures()
    data.fit_transform(testData.X)

    self.predictions = []
    self.mainModelPredictions = self.evaluateMainModel(data, model)

    # Only the main model decides the label. Earlier revisions carried
    # (commented-out) fallbacks based on model.plist word lookups and an
    # inverted-index model; they were already disabled and are not consulted.
    main_scores = self.mainModelPredictions
    # Index by feature position, as before, so the number of labels always
    # follows len(data.features).
    self.predictions.extend(
        1 if main_scores[i] >= 1 else -1
        for i, _ in enumerate(data.features)
    )
def __init__(self, trainingData):
    """Fit the TF-IDF vectorizer and three grid-searched logistic regressions.

    Steps performed:
      1. Extract extra features from the comments with ``ExtraFeatures`` and
         divide each feature column by its range (max - min). The range is
         stored in ``self.rng``.
      2. Fit ``self.WordFeatures`` (a ``TfidfVectorizer`` over 1- to 3-grams)
         on ``data.new_documents``.
      3. Grid-search ``LogisticRegression`` over ``C`` and ``class_weight``,
         scored with ROC AUC on stratified shuffle splits:
           * ``self.classifier``  - L1 penalty, TF-IDF + scaled extra features
           * ``self.wclassifier`` - L1 penalty, TF-IDF features only
           * ``self.lclassifier`` - L2 penalty, scaled extra features only
         The best score and parameters of each search are printed.

    Args:
        trainingData: Object exposing ``X`` (the raw comments) and ``y``
            (the labels; the ``class_weight`` grid is keyed on -1 and 1).
    """
    comments = trainingData.X
    yTrain = trainingData.y

    # this model is no longer useful (attribute kept, left empty)
    self.invIdx = {}

    data = ExtraFeatures()
    data.fit_transform(comments, yTrain)

    lf = np.asmatrix(np.asarray(data.features))
    print(lf[0])

    # Per-column range of the extra features. The subtraction already yields
    # a fresh matrix, so no additional copy is required.
    self.rng = np.max(lf, axis=0) - np.min(lf, axis=0)
    print(self.rng)
    print(self.rng.shape)
    # NOTE(review): a constant feature column gives a zero range and therefore
    # a division by zero here; and if the features are integers, Python 2
    # performs floor division. Left unchanged - confirm against real data.
    lf = lf / self.rng

    # NOTE(review): "you you" is presumably meant to drop that bigram; verify
    # that a multi-word stop word has any effect in the installed version.
    self.WordFeatures = TfidfVectorizer(
        ngram_range=(1, 3), smooth_idf=True, max_features=2500, stop_words=["you you"]
    )
    # fit_transform returns the TF-IDF matrix of the training documents, so
    # the documents do not need to be vectorized a second time via transform.
    wf = self.WordFeatures.fit_transform(data.new_documents)

    # Combined design matrix: TF-IDF columns followed by the scaled features.
    XTrain = scipy.sparse.coo_matrix(np.concatenate((wf.todense(), lf), axis=1))

    # Scaling seems to worsen the performance
    # Smaller C implies stronger regularization
    score_func = metrics.roc_auc_score
    cv = StratifiedShuffleSplit(yTrain, n_iter=5, test_size=0.20)
    param_grid = {
        "C": [0.1, 0.25, 0.5, 1],
        "class_weight": [{-1: 1, 1: 2}, {-1: 1, 1: 2.25}, {-1: 1, 1: 1.5}],
    }

    def _grid_search(estimator, features):
        """Grid-search `estimator` on `features`, print a report, return it."""
        search = GridSearchCV(
            estimator,
            param_grid,
            loss_func=None,
            scoring="roc_auc",
            n_jobs=3,
            refit=True,
            cv=cv,
            verbose=1,
        )
        search.fit(features, yTrain)
        print("Best %s: %0.3f" % (score_func.__name__, search.best_score_))
        print("Best parameters set:")
        best_parameters = search.best_estimator_.get_params()
        for param_name in param_grid:
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
        return search

    # Same order as before: combined model, words-only model, features-only
    # model. All three share the `cv` splitter and the parameter grid.
    self.classifier = _grid_search(LogisticRegression(penalty="l1"), XTrain)
    self.wclassifier = _grid_search(LogisticRegression(penalty="l1"), wf)
    self.lclassifier = _grid_search(LogisticRegression(penalty="l2"), lf)