Example #1
import pickle
import sys

import numpy
from sklearn.ensemble import ExtraTreesClassifier


def main():
    # read command-line parameters
    training_set_features = sys.argv[1]
    training_set_classes = training_set_features.replace('features', 'classes')
    forest_file = sys.argv[2]

    # load the training features (numpy.load expects a binary file handle)
    with open(training_set_features, 'rb') as f:
        training_feature_vector = numpy.load(f)
        if 1 == training_feature_vector.ndim:
            training_feature_vector = numpy.expand_dims(training_feature_vector, -1)
    with open(training_set_classes, 'rb') as f:
        training_class_vector = numpy.load(f)

    # prepare and train the decision forest
    # (the original passed compute_importances=True, which has been removed
    # from scikit-learn; feature_importances_ is now always available)
    forest = ExtraTreesClassifier(n_estimators=200,
                                  criterion='entropy',
                                  max_features=None,
                                  min_samples_split=2,
                                  min_samples_leaf=1,
                                  max_depth=500,
                                  bootstrap=True,
                                  oob_score=False,
                                  random_state=0,
                                  n_jobs=-1)  # -1 = use all available cores
    forest.fit(training_feature_vector, training_class_vector)

    # save the decision forest
    with open(forest_file, 'wb') as f:
        pickle.dump(forest, f)
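
A minimal sketch of preparing inputs and invoking the script, assuming the arrays were written with numpy.save (the file names here are made up; note the script derives the classes path by replacing 'features' with 'classes'):

import sys

import numpy

# toy inputs so the sketch is self-contained
rng = numpy.random.RandomState(0)
numpy.save('train_features.npy', rng.rand(100, 5))       # 2-D feature array
numpy.save('train_classes.npy', rng.randint(0, 2, 100))  # 1-D label array

# equivalent to: python train_forest.py train_features.npy forest.pkl
sys.argv = ['train_forest.py', 'train_features.npy', 'forest.pkl']
main()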
Example #2
# SKLModel aliases scikit-learn's estimator in the original project
from sklearn.ensemble import ExtraTreesClassifier as SKLModel


class ExtraTreesClassifierImpl:

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight='balanced'):
        self._hyperparams = {
            'n_estimators': n_estimators,
            'criterion': criterion,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'bootstrap': bootstrap,
            'oob_score': oob_score,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose,
            'warm_start': warm_start,
            'class_weight': class_weight}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
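
A short usage sketch of the wrapper above on a toy dataset (the data is synthetic, purely for illustration):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=8, random_state=0)
model = ExtraTreesClassifierImpl(n_estimators=25, random_state=0).fit(X, y)
print(model.predict(X[:5]))        # hard class labels
print(model.predict_proba(X[:5]))  # per-class probabilities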
Example #3
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier


def select_features(X, y, X_test, n_features=100):
    '''
    Keep the n_features columns with the highest extra-trees importance.
    '''
    forest = ExtraTreesClassifier(n_estimators=100, random_state=571)
    forest.fit(X, y)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]  # features by descending importance
    X = X[:, indices[:n_features]]
    X_test = X_test[:, indices[:n_features]]
    return X, X_test
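
A quick sketch of calling it (toy arrays, purely illustrative):

rng = np.random.RandomState(0)
X, y = rng.rand(50, 300), rng.randint(0, 2, 50)
X_test = rng.rand(10, 300)
X_sel, X_test_sel = select_features(X, y, X_test, n_features=100)
print(X_sel.shape, X_test_sel.shape)  # (50, 100) (10, 100)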
Example #4
from sklearn import feature_extraction, metrics, preprocessing
from sklearn.ensemble import ExtraTreesClassifier


def run_decision_tree_probabilistic_classification(train, train_labels, validate, validate_labels):
    # transform counts to TF-IDF features
    tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
    train = tfidf.fit_transform(train).toarray()
    validate = tfidf.transform(validate).toarray()

    # encode labels
    label_encode = preprocessing.LabelEncoder()
    train_labels = label_encode.fit_transform(train_labels)

    decisionTree = ExtraTreesClassifier(n_jobs=4, n_estimators=1000, max_features=20, min_samples_split=3,
                                        bootstrap=False, verbose=3, random_state=23)
    decisionTree.fit(train, train_labels)
    predicted_labels = decisionTree.predict_proba(validate)
    print("Extra Trees Classifier LogLoss")
    print(metrics.log_loss(validate_labels, predicted_labels))
def build_sample(regressor, name):
    # feature selection (sample_X, sample_y, sample_t, sample_id, features,
    # SEED, mape and store_csv come from the surrounding script;
    # train_test_split comes from sklearn.model_selection, since the old
    # cross_validation module was removed)
    clf = ExtraTreesClassifier()
    clf = clf.fit(sample_X, sample_y)
    print(clf.feature_importances_)
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(sample_X)
    print(X_new.shape)  # X_new is a numpy array, so it has a shape but no columns
    # repeat the CV procedure 10 times to get more precise results
    n = 10
    # for each iteration, randomly hold out 10% of the data as a CV set
    for i in range(n):
        X_train, X_cv, y_train, y_cv = train_test_split(
            sample_X[:, features], sample_y, test_size=.10, random_state=i * SEED)
        # train...
        regressor = regressor.fit(X_train, y_train)
        # save model
        #store_pkl(regressor, name + ".pkl")
        # predict on the held-out fold
        preds = regressor.predict(X_cv)
        #preds = DataFrame(preds, columns=["prime_tot_ttc_preds"])
        # mean absolute percentage error for this fold
        mape_r = mape(y_cv, preds)
        print("MAPE of (fold %d/%d) of %s is : %f" % (i + 1, n, name, mape_r))
    # predict on test
    predict_res = regressor.predict(sample_t[:, features])
    preds_on_test = DataFrame(list(zip(sample_id, predict_res)), columns=["ID", "CODIS"])
    preds_on_test['ID'] = preds_on_test['ID'].astype(int)  # astype returns a copy, so assign it back
    # save predictions
    store_csv(preds_on_test, name + ".csv")
    return predict_res
def classify(X, y, cv):
    #clf = DecisionTreeClassifier()
    #clf = RandomForestClassifier()
    #clf = AdaBoostClassifier()
    clf = ExtraTreesClassifier()
    score = cross_val_score(clf, X, y, cv=cv)
    print('%s-fold cross validation accuracy: %s' % (cv, sum(score) / score.shape[0]))
    clf = clf.fit(X, y)
    #print('Feature Importances')
    #print(clf.feature_importances_)
    #X = SelectFromModel(clf, threshold=.3, prefit=True).transform(X)  # forest.transform() no longer exists

    preds = clf.predict(X)  # note: predictions on the training set itself
    print('predictions counter')
    print(Counter(preds))
    fp = tp = fn = tn = 0
    for a in range(len(y)):
        if y[a] == preds[a]:
            if preds[a] == 0:
                tn += 1
            elif preds[a] == 1:
                tp += 1
        elif preds[a] == 1:
            fp += 1
        elif preds[a] == 0:
            fn += 1

    print('correct positives:', tp)
    print('correct negatives:', tn)
    print('false positives:', fp)
    print('false negatives:', fn)
    print('precision:', float(tp) / (tp + fp))
    print('recall (tp)/(tp+fn):', float(tp) / (tp + fn))
    print('false positive rate (fp)/(fp+tn):', float(fp) / (fp + tn))
    print('false discovery rate (fp)/(fp+tp):', float(fp) / (fp + tp))
    print('prediction accuracy: %s%%\n' % (100 * float(tp + tn) / (tp + tn + fp + fn)))
    return clf
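
The hand-rolled counting above reproduces what scikit-learn's metrics already provide; a sketch of the equivalent calls inside classify (binary 0/1 labels, as the loop assumes):

from sklearn.metrics import classification_report, confusion_matrix

tn, fp, fn, tp = confusion_matrix(y, preds).ravel()  # same four counts as the loop
print(classification_report(y, preds))               # precision/recall for both classes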
#print ",dfeatures[features][:-1]\n",dfeatures[features][:-1]
pd.set_option('display.max_columns', None)
print "ALL columns of dfeatures[features]"
print dfeatures[features].head(1)

# create a test and training set
x_train, x_test, y_train, y_test = train_test_split(
    dfeatures[features],
    dfeatures.author_num.values,
    test_size=0.4,
    random_state=123)
x, y = dfeatures[features], dfeatures.author_num.values

# CLASSIFIER
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

scores = cross_val_score(etclf, x, y)
print scores.mean()

# Print Confusion Matrix
print metrics.confusion_matrix(etclf.predict(x_test), y_test)
# print authors
"""
# # PREVIOUS RESULT 0.671469386087

############# RESULT WITH ALL FEATURES ############
/Users/jhave/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:401: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=3.
  % (min_labels, self.n_folds)), Warning)
0.148101533384
[[0 0 0 ..., 0 0 0]
"""
import random

from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

authorship = read_csv("http://people.stern.nyu.edu/jsimonof/AnalCatData/Data/Comma_separated/authorship.csv")
authors = list(set(authorship.Author.values))
le = preprocessing.LabelEncoder()
le.fit(authors)
authorship["Author_num"] = le.transform(authorship["Author"])

# What are some of the stop words we're looking at?
features = list(authorship.columns)
features.remove("Author")
features.remove("Author_num")

# Create a random probe variable (a useful baseline when reading forest
# feature importances) and create a test and training set
authorship["random"] = [random.random() for i in range(len(authorship))]
x_train, x_test, y_train, y_test = train_test_split(
    authorship[features], authorship.Author_num.values, test_size=0.4, random_state=123
)


# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

# Print Confusion Matrix (true labels first, predictions second)
print(metrics.confusion_matrix(y_test, etclf.predict(x_test)))
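
For a single summary number next to the confusion matrix, accuracy_score is a quick check (a sketch, same names as above):

print(metrics.accuracy_score(y_test, etclf.predict(x_test)))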
Example #9
File: models.py Project: kalpanki/pp
def etree_classify(X, Y):
    clf = ExtraTreesClassifier(n_estimators=500, max_depth=10, criterion='gini', min_samples_split=2,
                               min_samples_leaf=1, max_features=None, bootstrap=False, oob_score=False, n_jobs=-1)
    clf.fit(X, Y)
    return clf
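
A quick sketch of calling it on synthetic data (make_classification is used just to have something to fit):

from sklearn.datasets import make_classification

X, Y = make_classification(n_samples=300, n_features=10, random_state=0)
clf = etree_classify(X, Y)
print(clf.score(X, Y))  # accuracy on the training data itself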
Example #10
# also tested: svm.SVC(kernel='linear', C=1.0) and GaussianNB()
# neither improves the score, and both take long

# running cross-validation scores on all classifiers
for clf in classifiers:
    score = cross_val_score(clf, X, y, cv=cv)
    print("%s \n Accuracy: %0.2f (+/- %0.2f)\n" % (clf, score.mean(), score.std() / 2))

# now for the out-of-sample (OOS) test
testX = test[['Sex01', 'Fare', 'SibSp', 'Parch', 'Pclass']].copy()  # .copy() avoids pandas' chained-assignment warning
medianFare = testX.Fare.median()
testX['Fare'] = testX['Fare'].fillna(medianFare)

#print results to CSV files for Kaggle submission
clf = ExtraTreesClassifier()
clf.fit(X, y)
test['Survived'] = pd.Series(clf.predict(testX))
test[['PassengerId','Survived']].to_csv('ETClf.csv',index=False)

clf = RandomForestClassifier()
clf.fit(X, y)
test['Survived'] = pd.Series(clf.predict(testX))
test[['PassengerId','Survived']].to_csv('RFClf.csv',index=False)

clf = DecisionTreeClassifier()
clf.fit(X, y)
test['Survived'] = pd.Series(clf.predict(testX))
test[['PassengerId','Survived']].to_csv('DTClf.csv',index=False)
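
The three blocks above differ only in the estimator; the same work as a loop (a sketch, reusing the names above):

for name, est in [('ETClf', ExtraTreesClassifier()),
                  ('RFClf', RandomForestClassifier()),
                  ('DTClf', DecisionTreeClassifier())]:
    est.fit(X, y)
    test['Survived'] = pd.Series(est.predict(testX))
    test[['PassengerId', 'Survived']].to_csv(name + '.csv', index=False)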

Example #11
class BestClassifiers(object):
    '''
    Best classifier per pattern (SVM or extra-trees). Loads a saved model and
    enables prediction on new data.
    '''
    def __init__(self, patternEnum=PatternEnum.EVENTUALLY):
        '''
        Initialize the pattern's object with the corresponding model file name and
        the best classifier and pre-processing method identified beforehand.
        '''
        self.patternEnum = patternEnum
        self.pattern = Pattern(patternEnum)
        modelFile = str(
            patternEnum.order) + "_" + patternEnum.getFullName() + ".pkl"
        self.modelFile = config.PROJECT_ROOT + os.sep + "models" + os.sep + modelFile
        self.preProcessMethod = "NONE"
        if (patternEnum == PatternEnum.EVENTUALLY):
            self.maxRandState = 196558
            # random seed to shuffle data for training
            self.preProcessMethod = "NORMALIZE"
            self.clf = \
            SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
                decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
                max_iter=-1, probability=False, random_state=None, shrinking=True,
                tol=0.001, verbose=False)
        elif (patternEnum == PatternEnum.ALWAYS):
            self.maxRandState = 124255
            # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
               max_depth=None, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
               oob_score=False, random_state=0, verbose=0, warm_start=False)
        elif (patternEnum == PatternEnum.FOLLOWS):
            self.maxRandState = 196588
            # random seed to shuffle data for training
            self.preProcessMethod = "NORMALIZE"
            self.clf = \
            SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
              decision_function_shape=None, degree=3, gamma=1.0, kernel='rbf',
              max_iter=-1, probability=False, random_state=None, shrinking=True,
              tol=0.001, verbose=False)
        elif (patternEnum == PatternEnum.PRECEDES):
            self.maxRandState = 187708
            # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
               max_depth=None, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
               oob_score=False, random_state=0, verbose=0, warm_start=False)
        elif (patternEnum == PatternEnum.NEVER):
            self.maxRandState = 182526
            # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                max_depth=None, max_features=None, max_leaf_nodes=None,
                min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
                oob_score=False, random_state=0, verbose=0, warm_start=False)
        elif (patternEnum == PatternEnum.STEADY_STATE):
            self.maxRandState = 119746
            # random seed to shuffle data for training
            self.preProcessMethod = "NORMALIZE"
            self.clf = \
            SVC(C=10000000.0, cache_size=200, class_weight=None, coef0=0.0,
              decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
              max_iter=-1, probability=False, random_state=None, shrinking=True,
              tol=0.001, verbose=False)
        elif (patternEnum == PatternEnum.UNTIL):
            self.maxRandState = 114007
            # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
               max_depth=None, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
               oob_score=False, random_state=0, verbose=0, warm_start=False)
        elif (patternEnum == PatternEnum.INFINITELY_OFTEN):
            self.maxRandState = 150000
            # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
               max_depth=None, max_features='log2', max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
               oob_score=False, random_state=0, verbose=0, warm_start=False)
        elif (patternEnum == PatternEnum.NEXT):
            self.maxRandState = 173977
            # random seed to shuffle data for training
            self.preProcessMethod = "NORMALIZE"
            self.clf = \
            SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
              decision_function_shape=None, degree=3, gamma=1.0, kernel='rbf',
              max_iter=-1, probability=False, random_state=None, shrinking=True,
              tol=0.001, verbose=False)
        elif (patternEnum == PatternEnum.RELEASE):
            self.maxRandState = 105454
            # random seed to shuffle data for training
            self.preProcessMethod = "SCALE"
            self.clf = \
            SVC(C=10000000.0, cache_size=200, class_weight=None, coef0=0.0,
              decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
              max_iter=-1, probability=False, random_state=None, shrinking=True,
              tol=0.001, verbose=False)
        elif (patternEnum == PatternEnum.WEAK_UNTIL):
            self.maxRandState = 163090
            # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
               max_depth=None, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
               oob_score=False, random_state=0, verbose=0, warm_start=False)

    def getModel(self):
        "Get the classifier, and the unique class labels (the class names)"
        try:
            #             print("self.modelFile",self.modelFile)
            clf, preprocessor = pickle.load(open(self.modelFile, "rb"))
            #             print("Classifier found. It is loading.")
            return clf, preprocessor
        except (OSError,
                IOError):  # Model does not exist, first train then save it
            #             print("Classifier not found. New classifier is training.")
            X, preprocessor = processData(self.pattern.feature,
                                          self.preProcessMethod)
            # shuffle data
            shuffled_X, shuffled_y = shuffle(X,
                                             self.pattern.y,
                                             random_state=self.maxRandState)
            self.clf.fit(shuffled_X, shuffled_y)
            # save the model
            pickle.dump((self.clf, preprocessor), open(self.modelFile, "wb"))
            return self.clf, preprocessor
        except Exception as e:
            print(e)

    def predict(self, properties):
        clf, preprocessor = self.getModel()
        if preprocessor:
            # apply the same pre-processing that was used on the training data
            properties = preprocessor.transform(properties)
        targetMC = clf.predict(properties)
        return targetMC
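
A minimal usage sketch of the class above (the feature matrix here is hypothetical; PatternEnum comes from the surrounding project):

# hypothetical 2-D feature matrix for the traces to classify
new_properties = [[0.1, 0.4, 0.7], [0.9, 0.2, 0.3]]
model = BestClassifiers(PatternEnum.ALWAYS)
print(model.predict(new_properties))  # trains, caches and reuses the model file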
Example #12

# X, y, the fold count i and the `features` iterator come from the surrounding script
extra_trees = ExtraTreesClassifier()
extra_score = cross_val_score(extra_trees, X, y, cv=i)
print('\nextra trees %s-fold cross validation accuracy: %s' % (i, sum(extra_score) / extra_score.shape[0]))

extra_fit = extra_trees.fit(X, y)
print('Feature Importances %s' % (extra_fit.feature_importances_,))
for f in extra_fit.feature_importances_:
    print('{}: {}'.format(next(features), f))

# forest.transform() was removed from scikit-learn; SelectFromModel replaces it
# (threshold = the minimum importance, so every feature is kept here)
selector = SelectFromModel(extra_fit, threshold=min(extra_fit.feature_importances_), prefit=True)
X_for_preds = selector.transform(X)
preds = extra_fit.predict(X_for_preds)
print('predictions counter %s' % (Counter(preds),))

fp = 0
tp = 0
fn = 0
tn = 0
for a in range(len(y)):
    if y[a] == preds[a]:
        if preds[a] == 0:
            tn += 1
        elif preds[a] == 1:
            tp += 1
    elif preds[a] == 1:
        fp += 1
    elif preds[a] == 0:
        fn += 1

print('correct positives:', tp)
print('correct negatives:', tn)
print('false positives:', fp)
print('false negatives:', fn)
Example #13
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures

# Apply some feature engineering (degree=1 only adds a bias column; raise the
# degree for interaction terms)
poly_reg = PolynomialFeatures(degree=1)

# Transform into numpy objects (fit on train, then reuse that fit on test)
x_train = poly_reg.fit_transform(X_train)
X_test = poly_reg.transform(X_test)
y_test = np.array(y_test.iloc[:, 0])   # .ix was removed from pandas; .iloc replaces it
y_train = np.array(y_train.iloc[:, 0])

# Build model with good params
# (min_impurity_split was dropped from scikit-learn, so it is omitted here)
model = ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
                             max_depth=None, max_features=0.6, max_leaf_nodes=None,
                             min_samples_leaf=1, min_samples_split=4,
                             min_weight_fraction_leaf=0.0, n_estimators=100,
                             n_jobs=1, oob_score=False, random_state=None,
                             verbose=0, warm_start=False)

# Fit the model
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Scoring ('regression' is a flag defined earlier in the surrounding script)
if regression:
    print('Score on test set:', mean_absolute_error(y_test, y_pred))
else:
    print('Score on test set:', accuracy_score(y_test, y_pred))