Example #1
titanic['Sex'] = titanic['Sex'].apply(lambda s: 0 if s == 'male' else 1)
print(titanic)

x = titanic.iloc[:, [1, 3, 4]]
y = titanic.iloc[:, 0]

# Split into training / test sets.
titanic_x, test_x, titanic_y, test_y = train_test_split(x,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
# print(titanic_x.head())
# print(titanic_y.head())

ml = RandomForestClassifier(criterion='entropy').fit(
    titanic_x, titanic_y)  # (criterion='entropy', random_state = 0)
print(ml)

titanic_pred_y = ml.predict(test_x)
print('Actual values : ', test_y)
print('Predicted values : ', titanic_pred_y)

print('Total test samples %d, errors %d' % (len(test_y), (test_y != titanic_pred_y).sum()))
print('Classification accuracy : %.3f' % accuracy_score(test_y, titanic_pred_y))

# Cross-validation
print('Cross-validation : \n',
      model_selection.cross_val_score(ml, titanic_x, titanic_y, cv=5))

# Predict on new data
new_data = np.array([[1, 0, 24], [1, 1, 43], [2, 1, 48], [3, 0, 33],
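# The array literal above is cut off in the source. A hedged sketch of how the prediction
# on new samples might continue (hypothetical rows; columns assumed to be [Pclass, Sex, Age]
# to match x = titanic.iloc[:, [1, 3, 4]]):
#
#   new_data = np.array([[1, 0, 24], [1, 1, 43], [2, 1, 48], [3, 0, 33]])
#   new_pred = ml.predict(new_data)
#   print('Predictions for new data : ', new_pred)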
Example #2
x = dataset.iloc[:, 2:-1].values
y = dataset.iloc[:, -1].values

# Splitting into TestSet and Training Set
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
scX = StandardScaler()
xTrain = scX.fit_transform(xTrain)
xTest = scX.transform(xTest)

# Fitting Classifier on Training Set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(xTrain, yTrain)

# Predict Test Set Results
yPred = classifier.predict(xTest)

# Make Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(yTest, yPred)

# Visualising Training Set Results
from matplotlib.colors import ListedColormap
xSet, ySet = xTrain, yTrain
X1, X2 = np.meshgrid(np.arange(start = xSet[:, 0].min() - 1, stop = xSet[:, 0].max() + 1, step = 0.01), 
                     np.arange(start = xSet[:, 1].min() - 1, stop = xSet[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
Example #3
    def RF_trainandtest(self,
                        testsize,
                        cv,
                        feature_sel,
                        varthreshold,
                        ntrees,
                        nodes,
                        rfmethod,
                        nclusters=10,
                        cmethod=None):

        #Split the data set into a training set and a test set
        data_feature = self.data.loc[:, self.data.columns != 'default']
        data_target = self.data['default']
        X_train, X_test, y_train, y_test = train_test_split(data_feature,
                                                            data_target,
                                                            test_size=testsize,
                                                            random_state=0)

        #Coarse-bin and WOE-transform the training set, and apply the same binning and WOE transform to the test set
        X_train, X_test = self.binandwoe_traintest(X_train, y_train, X_test,
                                                   nclusters, cmethod)

        #Feature selection on the training set, using methods from sklearn.feature_selection
        if feature_sel == "VarianceThreshold":
            selector = VarianceThreshold(threshold=varthreshold)
            X_train1 = pd.DataFrame(selector.fit_transform(X_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        elif feature_sel == "RFECV":
            estimator = LogisticRegression()
            selector = RFECV(estimator, step=1, cv=cv)
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        elif feature_sel == "SelectFromModel":
            estimator = LogisticRegression()
            selector = SelectFromModel(estimator)
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        elif feature_sel == "SelectKBest":
            selector = SelectKBest()
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        else:
            X_train1, X_test1 = X_train, X_test

        #Train the random forest model and predict
        if rfmethod == 'RandomForest':
            classifier = RandomForestClassifier(n_estimators=ntrees,
                                                min_samples_split=nodes * 2,
                                                min_samples_leaf=nodes)
        elif rfmethod == 'ExtraTrees':
            classifier = ExtraTreesClassifier(n_estimators=ntrees,
                                              min_samples_split=nodes * 2,
                                              min_samples_leaf=nodes)
        elif rfmethod == 'GradientBoosting':
            classifier = GradientBoostingClassifier(n_estimators=ntrees,
                                                    min_samples_split=nodes *
                                                    2,
                                                    min_samples_leaf=nodes)

        classifier.fit(X_train1, y_train)
        probability = classifier.predict_proba(X_test1)[:, 1]

        predresult = pd.DataFrame({
            'target': y_test,
            'probability': probability
        })

        return predresult
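    # A hedged usage sketch (not part of the original class): the returned DataFrame pairs
    # each test label with its predicted default probability, so it can be scored directly.
    # Assuming `model` is an instance of the surrounding (unnamed) class:
    #
    #     pred = model.RF_trainandtest(testsize=0.3, cv=5, feature_sel=None,
    #                                  varthreshold=0.0, ntrees=200, nodes=5,
    #                                  rfmethod='RandomForest')
    #     from sklearn.metrics import roc_auc_score
    #     print('test AUC: %.3f' % roc_auc_score(pred['target'], pred['probability']))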
Example #4
    def runns(resp_var, size_of_test_data, dataset, positive_class,
              predictor_var, n_estimators, important_features,
              dealing_with_nulls):
        dataset = pd.read_csv('raw_data.csv',
                              low_memory=False)  # For testing purposes
        #----DATA PREPROCESSING
        #-------dealing with NULL values in the data
        #----------remove the rows in which the response is null

        dataset = dataset.dropna(subset=[resp_var])
        #----------dealing with nulls
        dataset = deal_with_nulls(dealing_with_nulls, dataset)
        #----FEATURE SELECTION
        #-------get predictors important in predicting the response
        #-----------transform categorical predictors to dummy variables
        predictors = dataset[predictor_var]
        predictors = pd.get_dummies(predictors)
        #-----------balance the classes in the response var
        ros = RandomOverSampler(random_state=0)
        resp = dataset[resp_var]
        prds, resp = ros.fit_resample(predictors, resp)
        #-----------fit the random forest classifier to give us the important predictors
        rf_clf = RandomForestClassifier(n_estimators=n_estimators)
        rf_clf.fit(prds, resp)
        #-------get the important predictors
        feature_imp = pd.Series(
            rf_clf.feature_importances_,
            index=list(predictors.columns)).sort_values(ascending=False)
        #-------names of the important predictors
        important_predictor_names = feature_imp.index[0:important_features]
        #-------subset the data to get only the important predictors and the response
        resp = pd.DataFrame(data=resp, columns=[resp_var])
        predictors = pd.DataFrame(prds, columns=list(predictors))
        dataset = pd.concat([resp, predictors], axis=1)
        #---------------------------------------------------------
        #----MODEL TRAINING
        #--------Remove the response variables from the features variables - axis 1 refers to the columns
        m_data = dataset.drop(resp_var, axis=1, inplace=False)
        # Response variables are the values we want to predict
        resp_var = np.array(dataset[resp_var])

        dataset = pd.get_dummies(m_data)

        # Saving feature names for later use
        feature_list = list(m_data.columns)
        # Convert to numpy array
        dataset = np.array(dataset)

        # Split the data into training and testing sets
        train_features, test_features, train_labels, test_labels = train_test_split(
            dataset,
            resp_var,
            test_size=float(size_of_test_data),
            random_state=402)

        # Instantiate model with n_estimators decision trees
        clf = RandomForestClassifier(n_jobs=1,
                                     n_estimators=n_estimators,
                                     random_state=142)

        # Train the model on training data
        clf.fit(train_features, train_labels)
        # evaluation
        predicted = clf.predict(test_features)
        pred_prob = clf.predict_proba(test_features)

        accuracy = accuracy_score(test_labels, predicted)
        #confusion matrix
        cnf = (confusion_matrix(test_labels, predicted))
        #precision score
        precision = precision_score(test_labels,
                                    predicted,
                                    pos_label=positive_class)
        #avg pres
        avg_precision = average_precision_score(test_labels, pred_prob[:, [1]])
        #recall score
        rec = recall_score(test_labels, predicted, pos_label=positive_class)
        #f1 score
        fscore = f1_score(test_labels, predicted, pos_label=positive_class)
        #fbeta score
        fbeta = fbeta_score(test_labels, predicted, beta=0.5)
        #hamming_loss
        hamming = hamming_loss(test_labels, predicted)
        #jaccard similarity score
        jaccard = jaccard_similarity_score(test_labels, predicted)
        #logloss
        logloss = log_loss(test_labels, predicted)
        #zero-oneloss
        zero_one = zero_one_loss(test_labels, predicted)
        #auc roc
        area_under_roc = roc_auc_score(test_labels, pred_prob[:, [1]])
        #cohen_score
        cohen = cohen_kappa_score(test_labels, predicted)
        #mathews corr
        mathews = matthews_corrcoef(test_labels, predicted)
        # Variable importances from the important features selection stage
        variable_importance_list = list(zip(prds, feature_imp))
        output = {
            "accuracy": accuracy,
            "precision": precision,
            "average precision": avg_precision,
            "recall": rec,
            "fscore": fscore,
            "fbeta": fbeta,
            "hamming": hamming,
            "jaccard": jaccard,
            "logloss": logloss,
            "zero_one": zero_one,
            "area_under_roc": area_under_roc,
            "cohen": cohen,
            "mathews": mathews
        }
        output = json.dumps(output)
        return output
Example #5
train_8 = pd.read_csv("./future/train_all_split_8.csv", header=0, encoding="UTF-8", error_bad_lines=False, sep=",", index_col=0)
train_9 = pd.read_csv("./future/train_all_split_9.csv", header=0, encoding="UTF-8", error_bad_lines=False, sep=",", index_col=0)
train_10 = pd.read_csv("./future/train_all_split_10.csv", header=0, encoding="UTF-8", error_bad_lines=False, sep=",", index_col=0)
train_11 = pd.read_csv("./future/train_all_split_11.csv", header=0, encoding="UTF-8", error_bad_lines=False, sep=",", index_col=0)
train_12 = pd.read_csv("./future/train_all_split_12.csv", header=0, encoding="UTF-8", error_bad_lines=False, sep=",", index_col=0)


test_2 = pd.read_csv("./future/test_v2_all_future.csv", header=0, encoding="UTF-8", error_bad_lines=False, sep=",", index_col=0)

results_df = pd.DataFrame(index=test_2.index)
resultCol = pd.Series(index=test_2.index,dtype=object)

#try random forest on feature A
n_estimators = 100

rfcModel = RandomForestClassifier(n_estimators=n_estimators)

lr = LogisticRegression()

#svr =  SVR(C=1.0, epsilon=0.2)
#C = 10.0  # SVM regularization parameter
#svc = svm.SVC(kernel='linear', C=C)
#rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C)
#poly_svc = svm.SVC(kernel='poly', degree=3, C=C)

def filter_features(train_df, model, prodStr='A'):

    input_df = train_df.copy(deep=True)
    #input_df = cleanup(input_df)
    y = input_df[prodStr].values
    input_df=input_df.drop([prodStr], axis=1)
Example #6
from uci_comparison import compare_estimators
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from rr_forest import RRForestClassifier
from rr_extra_forest import RRExtraTreesClassifier

estimators = {
    'RandomForest': RandomForestClassifier(n_estimators=20),
    'RndRotForest': RRForestClassifier(n_estimators=20),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=20),
    'RndRotETrees': RRExtraTreesClassifier(n_estimators=20),
}

# optionally, pass a list of UCI dataset identifiers as the datasets parameter, e.g. datasets=['iris', 'diabetes']
# optionally, pass a dict of scoring functions as the metric parameter, e.g. metrics={'F1-score': f1_score}
compare_estimators(estimators)
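# A hedged sketch of the optional keyword arguments described in the comments above
# (the dataset identifiers and the metrics dict are illustrative, not from an actual run):
#
#   from sklearn.metrics import f1_score
#   compare_estimators(estimators,
#                      datasets=['iris', 'diabetes'],
#                      metrics={'F1-score': f1_score})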
Example #7
index = 16
cls.fit(xtrain, train_label)
d = xtest[index]
d.shape = (28, 28)
plt.imshow(d, cmap='gray')
plt.show()
print(cls.predict([xtest[index]]))

pre = cls.predict(xtest)
count = 0
for i in range(0, 21000):
    count += 1 if pre[i] == test_label[i] else 0
print("Accuracy", (count / 21000) * 100)

rf = RandomForestClassifier(n_estimators=100)
rf.fit(xtrain, train_label)

pre = rf.predict(xtest)
count = 0
for i in range(len(pre)):
    count += 1 if pre[i] == test_label[i] else 0
print("Accuracy", (count / len(pre)) * 100)

pre = rf.predict(test)
print(pre)
sub.head()
sub.Label = pre

sub.head()
sub.info()
Example #8
X_path = feature + '_features.h5'

# Loading the cars dataset features
test_size = 0.3
cars_train_X, cars_test_X, cars_train_y, cars_test_y = split_dataset(
    X_path, feature, test_size)
cars_train_X = np.asarray(cars_train_X).reshape(
    cars_train_X.shape[0], np.prod(cars_train_X.shape[1:]))
cars_test_X = np.asarray(cars_test_X).reshape(cars_test_X.shape[0],
                                              np.prod(cars_test_X.shape[1:]))

#cls = RandomForestClassifier(n_estimators=100, verbose=True, n_jobs=-1)
# the classifier is a random forest; the uncommented call below uses feature subsetting
# the argument max_features=0.2 limits each split to 1/5 of the total features
cls = RandomForestClassifier(n_estimators=100,
                             verbose=True,
                             n_jobs=-1,
                             max_features=0.2)
cls.fit(cars_train_X, cars_train_y)

# if desired to show accuracy and std-dev values for the training set, uncomment the 3 lines below
# scores = cross_val_score(cls, cars_train_X, cars_train_y, cv=5, verbose=True)
# print(scores)
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# save predictions for plotting the heatmap
y_pred = cls.predict(cars_test_X)

with open('bagging.sav', 'wb') as f:
    pkl.dump((y_pred, cars_test_y), f)

y_true = cars_test_y
Example #9
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')

        # Reading the CSV file and converting it into a pandas data-frame
        df = pd.read_csv(path, encoding="ISO-8859-1")

        # Reading the name for the file for the model that will be saved
        filename = request.form['filename']

        # Reading the names of the feature and label as strings
        str1 = request.form['feature']
        str2 = request.form['label']

        # Assigning the feature and label variables to the respective columns
        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')
        '''
        # Removing the punctuations and HTTP links in the feature text input
        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)
        '''
        X = X.str.lower()

        # Optional use of Tokenization and Lemmatization using Natural Language Processing in SpaCy
        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)

        X = pd.Series(texts)
        """

        # Splitting the data-set into 2 parts : Training data and Test data
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33,
                                                            shuffle=True)

        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        # Fitting all the classification models one by one and recording their accuracies and execution times

        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))

        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1,
                                  multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)

        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()

        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)

        clf11.fit(X_train_tfidf, y_train)
        pred = clf11.predict(tfidfvect.transform(X_test))
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))
        start = time()
        clf12 = SGDClassifier(n_jobs=-1)
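        # Note: clf12 is a second SGDClassifier; the "XGBC" label printed below appears to be a leftover and no XGBoost model is fitted here.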

        clf12.fit(X_train_tfidf, y_train)
        pred = clf12.predict(tfidfvect.transform(X_test))
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        # Comparing the accuracies of all the models, then saving (dumping) the model with the highest accuracy using pickle for later use.

        acu_list = [a1, a2, a3, a4, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html",
                               ac1=a1,
                               ac2=a2,
                               ac3=a3,
                               ac4=a4,
                               ac11=a11,
                               ac12=a12)
Example #10
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# sklearn provides the iris species as integer values since this is required for classification
# here we're just adding a column with the species names to the dataframe for visualisation
df['species'] = np.array([iris.target_names[i] for i in iris.target])
# sns.pairplot(df, hue='species')
# plt.show()
X_train, X_test, y_train, y_test = train_test_split(df[iris.feature_names],
                                                    iris.target,
                                                    test_size=0.5,
                                                    stratify=iris.target,
                                                    random_state=123456)
rf = RandomForestClassifier(
    n_estimators=100, oob_score=True,
    random_state=123456)  # Uses RandomForestClassifier with 100 trees
rf.fit(X_train, y_train)
predicted = rf.predict(X_test)
accuracy = accuracy_score(y_test, predicted)

print(f'Out-of-bag score estimate: {rf.oob_score_:.3}')
print(f'Mean accuracy score: {accuracy:.3}')

cm = pd.DataFrame(confusion_matrix(y_test, predicted),
                  columns=iris.target_names,
                  index=iris.target_names)
sns.heatmap(cm, annot=True)
plt.show()
Example #11
    test_data['age'] = test_data['age'].apply(lambda x: age_disperse(x))
    test_data_disperse, label_encoders, onehot_encoder \
        = onehot_encoder_disperse(test_data[disperse_cols], disperse_cols, label_encoders, onehot_encoder)
    test_data = test_data.reset_index()
    test_data = pd.concat([test_data_disperse, test_data[continuous_cols]],
                          axis=1,
                          ignore_index=True)

    train_size = len(train_data)
    data = pd.concat([train_data, test_data])
    print(data.columns[104])
    data = standardized(data)

    # test_data = train_data
    random_forest = RandomForestClassifier(n_estimators=10,
                                           verbose=1,
                                           max_depth=10)
    random_forest.fit(X=data.iloc[:train_size, :-1],
                      y=train_data.iloc[:train_size, -1])
    prediction = random_forest.predict(data.iloc[train_size:, :-1])
    # mlp = MLPClassifier(hidden_layer_sizes=[20, 40, 20 ], verbose=1)
    # mlp.fit(X=train_data.iloc[:, :-1], y=train_data.iloc[:, -1])
    # prediction = mlp.predict(test_data.iloc[:, :-1])
    counter = 0
    for i in range(len(prediction)):
        y = data.iloc[train_size + i, -1]
        if prediction[i] == y:
            counter += 1
        # else:
        #     print('ground truth: {0}, prediction: {1}'.format(prediction[i], y))
    print('accuracy: {0}'.format(counter / len(prediction)))
Example #12
print(featureScores.nlargest(20, 'Score'))  #print the 20 best features

#2 extratree classifier
model = ExtraTreesClassifier()
model.fit(nyse_scaled, eps)
print(model.feature_importances_
      )  #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_,
                             index=nyse_scaled.columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.show()

#3 randomforest
rf_exp = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=100))
rf_exp.fit(nyse_scaled, eps)
rf_exp.get_support()
selected_feat = nyse_scaled.columns[(rf_exp.get_support())]
print(nyse_scaled.columns[rf_exp.get_support()])

nyse_scaled1 = nyse_scaled[[
    'Other Operating Items', 'Pre-Tax Margin', 'Profit Margin',
    'Operating Margin', 'Long-Term Investments', 'Non-Recurring Items',
    'Common Stocks', 'Depreciation', 'Accounts Payable', 'Cost of Revenue',
    'Total Current Liabilities', 'Total Revenue', 'Other Liabilities',
    'Net Receivables', 'Research and Development', 'Total Current Assets',
    'Inventory', 'Deferred Liability Charges', 'Total Liabilities & Equity',
    'Total Assets'
]]
eps1 = nyse.iloc[:, 75:76]
Example #13
unique_test_pixels=pd.unique(test.values.ravel())
#print "Unique Test Values:", unique_test_pixels

#Check if pixels in test are a subset of pixels in train. If yes, easier to do predictions
#print "Test values in Train?", np.in1d(unique_test_pixels, unique_train_pixels, assume_unique=True)

#Check if there is linear correlation between pixel<x> columns and label
#If yes, we should dive into the columns with correlation. Linear / logistic regression may work well with the data.
#In this case, makes sense that there is no correlation - higher pixel values does not mean that label value will be higher
#print "Correlation:", train.corr()["label"]

#Check that the algorithm used gives good accuracy by using part of the training set to validate
train_train, train_test=train_test_split(train, test_size=0.3)

#Train model
model=RandomForestClassifier(n_estimators = 100, oob_score = True, random_state =10, max_features = "auto", min_samples_leaf = 20)
#model=KNeighborsClassifier(n_neighbors=6)


#if getting this error, it is because a matrix with 1 column
#is being passed in when a 1d array is expected. ravel() will work.
#DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
#To resolve this error, convert label values to int or str as float is not a valid label-type
#raise ValueError("Unknown label type: %r" % y) ValueError: Unknown label type: array
#model.fit(train_train.ix[:,'pixel0':'pixel783'], np.asarray(train_train.ix[:,'label'].astype(int)))
#print "model.score:", model.score(train_test.ix[:,'pixel0':'pixel783'], np.asarray(train_test.ix[:,'label'].astype(int)))
#print "cross validation score:", cross_validation.cross_val_score(model, train_train.ix[:,'pixel0':'pixel783'], train_train.ix[:,'label'], cv=3)
model.fit(train_train.loc[:, 'pixel0':'pixel783'], train_train.loc[:, 'label'].values.ravel())
print("model.score", model.score(train_test.loc[:, 'pixel0':'pixel783'], train_test.loc[:, 'label'].values.ravel()))

Example #14
                       index_col=0)
train_sets[12] = train_12

test_2 = pd.read_csv("./future/test_v2_all_future.csv",
                     header=0,
                     encoding="UTF-8",
                     error_bad_lines=False,
                     sep=",",
                     index_col=0)

results_df = pd.DataFrame(index=test_2.index)
resultCol = pd.Series(index=test_2.index, dtype=object)

n_estimators = 30

rfcModel = RandomForestClassifier(n_estimators=n_estimators, max_features=None)
rfrmodel = RandomForestRegressor(n_estimators=n_estimators)
lr = LogisticRegression()

#svr =  SVR(C=1.0, epsilon=0.2)
#C = 10.0  # SVM regularization parameter
#svc = svm.SVC(kernel='linear', C=C)
#rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C)
#poly_svc = svm.SVC(kernel='poly', degree=3, C=C)


def cleanup(df):

    #find columns that contain null values
    inds = pd.isnull(df).any(0).nonzero()
Example #15
def drawfeature(
        TRAIN_DATA_PATH='/home/samuelchan/PycharmProjects/emotion-analysis/train',
        train_filename='thetrain.csv',
        TEST_DATA_PATH='/home/samuelchan/PycharmProjects/emotion-analysis/test',
        test_filename='thetest.csv'):
    train_file = os.path.join(TRAIN_DATA_PATH, train_filename)
    train_data = pd.read_csv(train_file)
    n_data_train = train_data['text'].size

    test_file = os.path.join(TEST_DATA_PATH, test_filename)
    test_data = pd.read_csv(test_file)
    n_data_test = test_data['text'].size

    # # bag of words model + tfidf
    # vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
    # transformer = TfidfTransformer()

    # bigram + tf
    vectorizer = HashingVectorizer(ngram_range=(2, 2), alternate_sign=False)  # keep features non-negative for MultinomialNB

    # train
    print('Start cutting words in the train data set')
    train_data_word = []
    for i in range(n_data_train):
        if ((i + 1) % 1000 == 0):
            print('Drawfeatures Line %d of %d' % (i + 1, n_data_train))
        train_data_word.append(word_to_feature(train_data['text'][i]))

    # print 'Start bag of word in train data set'
    # # draw features
    # train_data_features = vectorizer.fit_transform(train_data_word)
    # # train_data_features = train_data_features.toarray()
    # print 'Start tfidf in train data set'
    # train_data_features = transformer.fit_transform(train_data_features)
    # # train_data_features = train_data_features.toarray()

    print('Start bigram model in the train data set')
    train_data_features = vectorizer.fit_transform(train_data_word)

    # test
    print('Start cutting words in the test data set')
    test_data_words = []
    for i in range(n_data_test):
        if ((i + 1) % 1000 == 0):
            print('Drawfeatures Line %d of %d' % (i + 1, n_data_test))
        test_data_words.append(word_to_feature(test_data['text'][i]))

    # # draw feature
    # print 'Start bag of word in test data set'
    # test_data_features = vectorizer.fit_transform(test_data_words)
    # # test_data_features = test_data_features.toarray()
    # print 'Start tfidf in test data set'
    # test_data_features = transformer.fit_transform(test_data_features)
    # # test_data_features = test_data_features.toarray()

    print('Start bigram model in the test data set')
    test_data_features = vectorizer.fit_transform(test_data_words)

    # random forest
    print('random forest')
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_data_features, train_data['label'])
    pred = forest.predict(test_data_features)
    pred = pd.Series(pred, name='TARGET')
    # pred.to_csv('BOW_TFIDF_RF5.csv', index=None, header=True)
    pred.to_csv('BI_W2V_RF1.csv', index=None, header=True)

    # multinomial naive Bayes
    print('Multinomial naive Bayes')
    mnb = MultinomialNB(alpha=0.01)
    mnb = mnb.fit(train_data_features, train_data['label'])
    pred = mnb.predict(test_data_features)
    pred = pd.Series(pred, name='TARGET')
    # pred.to_csv('BOW_TFIDF_MNB5.csv', index=None, header=True)
    pred.to_csv('BI_W2V_MNB1.csv', index=None, header=True)

    # # KNN
    # print 'KNN'
    # knn = KNeighborsClassifier()
    # knn = knn.fit(train_data_features, train_data['label'])
    # pred = knn.predict(test_data_features)
    # pred = pd.Series(pred, name='TARGET')
    # pred.to_csv('BOW_TFIDF_KNN2.csv', index=None, header=True)

    # SVM
    print('SVM')
    svm = SVC(kernel='linear')
    svm = svm.fit(train_data_features, train_data['label'])
    pred = svm.predict(test_data_features)
    pred = pd.Series(pred, name='TARGET')
    # pred.to_csv('BOW_TFIDF_SVM5.csv', index=None, header=True)
    pred.to_csv('BI_W2V_SVM1.csv', index=None, header=True)

    # GBDT
    print('GBDT')
    gbdt = GradientBoostingClassifier()
    gbdt = gbdt.fit(train_data_features, train_data['label'])
    pred = gbdt.predict(test_data_features)
    pred = pd.Series(pred, name='TARGET')
    pred.to_csv('BI_W2V_GBDT1.csv', index=None, header=True)
Example #16
def evalColumns(columns):

    overallY = []
    overallPred = []

    for location in locations:
        location2s = [l for l in locations if l != location]

        print("Location: " + str(location) + ", location2: " + str(location2s))

        trainPreds = defaultdict(list)
        testPreds = defaultdict(list)

        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(
                    location, location2, "location", data, features, "target")
                model = RandomForestRegressor(min_samples_leaf=9,
                                              n_estimators=59,
                                              n_jobs=-1,
                                              random_state=42)
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(testX)
                train1Rmse = str(rmseEval(trainY1, train1Prediction)[1])
                train2Rmse = str(rmseEval(trainY2, train2Prediction)[1])
                testRmse = str(rmseEval(testY, testPrediction)[1])
                print("\t\ttrain1 rmse: " + train1Rmse)
                print("\t\ttrain2 rmse: " + train2Rmse)
                print("\t\ttest rmse: " + testRmse)
                for x in train2Prediction:
                    trainPreds[tag].append(x)
                for x in testPrediction:
                    testPreds[tag].append(x)

        t2Y = []
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            t2Y = t2Y + trainY2

        labelt2Y = []

        for i in range(0, len(t2Y)):
            bestModel = 0
            bestAbs = abs(t2Y[i] - trainPreds[topTags[0]][i])
            for j in range(0, len(topTags)):
                tag = topTags[j]
                modelAbs = abs(t2Y[i] - trainPreds[tag][i])
                if modelAbs < bestAbs:
                    bestAbs = modelAbs
                    bestModel = j
            labelt2Y.append(bestModel)

        print("#labelt2Y:" + str(len(labelt2Y)))
        tX2 = []
        testX = []
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, tX, testY = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            for row in trainX2:
                tX2.append(row)
            for row in tX:
                testX.append(row)

        for pred in topTags:
            for i in range(0, len(trainPreds[pred])):
                tX2[i].append(trainPreds[pred][i])

        reducedTrainX2 = []
        for d in tX2:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTrainX2.append(reducedD)

        model = RandomForestClassifier(random_state=42,
                                       n_estimators=100,
                                       max_depth=15)
        model.fit(reducedTrainX2, labelt2Y)

        for pred in topTags:
            for i in range(0, len(testPreds[pred])):
                testX[i].append(testPreds[pred][i])

        reducedTestX = []
        for d in testX:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTestX.append(reducedD)

        pred = model.predict(reducedTestX)

        finalPrediction = []
        for i in range(0, len(testY)):
            p = testPreds[topTags[pred[i]]][i]
            finalPrediction.append(p)
        rmse = str(rmseEval(testY, finalPrediction)[1])
        print("\tRMSE: " + str(rmse))

        for x in testY:
            overallY.append(x)
        for x in finalPrediction:
            overallPred.append(x)

    rmse = rmseEval(overallPred, overallY)[1]
    return rmse
Example #17
from sklearn.linear_model import Ridge
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier


DECISION_TREE = DecisionTreeClassifier()
LOGISTIC_REGRESSION = LogisticRegression()
NAIVE_BAYS = GaussianNB()

K_N_N = KNeighborsClassifier()
SUPPORT_VECTOR = svm.SVC(kernel="linear")

# Ensemble classifiers
RANDOM_FOREST = RandomForestClassifier(n_estimators=100)
GRADIENT_BOOST_CL = GradientBoostingClassifier(n_estimators=100)
ADA_BOOST = AdaBoostClassifier(n_estimators=100)
EXTRA_TREE = ExtraTreesClassifier(n_estimators=100)


# Regressors
GRADIENT_BOOST_RG = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
LINEAR_RG = LinearRegression()
RIDGE_RG = Ridge()
LASSO_RG = Lasso()
SVR_RG = SVR()

def getClassifierMap():
    CLASSIFIER_MAP = {
    "DECISION_TREE": DECISION_TREE,
Example #18
def __get_classifier_model(classifier, args):
    """
    Convenience function for obtaining a classification model

    Args:
        classifier(str): A string indicating the name of the classifier
        args: An arguments object

    Returns:
        A classification model based on the given classifier string
    """
    # Make SGD Logistic Regression model the default
    model = SGDClassifier(loss='log',
                          penalty='l2',
                          shuffle=True,
                          n_iter=5,
                          n_jobs=-1,
                          random_state=179)
    if classifier == SVM:
        model = SVC(kernel=args.kernel,
                    class_weight="balanced",
                    cache_size=8096,
                    random_state=17,
                    probability=True)
    elif classifier == ADA_BOOST:
        dt = DecisionTreeClassifier(max_depth=15,
                                    criterion='gini',
                                    max_features='auto',
                                    class_weight='balanced',
                                    random_state=39)
        model = AdaBoostClassifier(base_estimator=dt,
                                   n_estimators=400,
                                   random_state=17)
    elif classifier == RF:
        # Configure the classifier to use all available CPU cores
        model = RandomForestClassifier(class_weight="balanced",
                                       n_jobs=-1,
                                       n_estimators=400,
                                       random_state=17,
                                       max_features='auto',
                                       max_depth=15,
                                       criterion='gini')
    elif classifier == GRADIENT_BOOST:
        model = GradientBoostingClassifier(random_state=17,
                                           n_estimators=400,
                                           max_features='auto')
    elif classifier == EXTRA_TREES:
        model = ExtraTreesClassifier(random_state=17,
                                     n_estimators=400,
                                     n_jobs=-1,
                                     class_weight='balanced',
                                     max_depth=15,
                                     max_features='auto',
                                     criterion='gini')
    elif classifier == BAGGING:
        dt = DecisionTreeClassifier(max_depth=15,
                                    criterion='gini',
                                    max_features='auto',
                                    class_weight='balanced',
                                    random_state=39)
        model = BaggingClassifier(base_estimator=dt,
                                  n_estimators=400,
                                  random_state=17,
                                  n_jobs=-1,
                                  max_features=0.8,
                                  max_samples=0.8,
                                  bootstrap=False)
    elif classifier == PASSIVE_AGGRESSIVE:
        model = PassiveAggressiveClassifier(n_iter=10,
                                            class_weight='balanced',
                                            n_jobs=-1,
                                            random_state=41)
    elif classifier == PERCEPTRON:
        model = Perceptron(n_jobs=-1,
                           n_iter=10,
                           penalty='l2',
                           class_weight='balanced',
                           alpha=0.25)
    return model
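# A hedged usage sketch (the string constants and the args object below are hypothetical
# stand-ins; the original module presumably defines its own SVM/ADA_BOOST/RF/... values):
from types import SimpleNamespace

SVM, ADA_BOOST, RF = 'svm', 'ada_boost', 'rf'
GRADIENT_BOOST, EXTRA_TREES, BAGGING = 'gradient_boost', 'extra_trees', 'bagging'
PASSIVE_AGGRESSIVE, PERCEPTRON = 'passive_aggressive', 'perceptron'

args = SimpleNamespace(kernel='rbf')   # only the SVM branch reads args.kernel
model = __get_classifier_model(RF, args)
print(type(model).__name__)            # RandomForestClassifier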
Example #19
y = df.loc[:, task_target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#:# preprocessing

transform_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train), columns=X_train.columns)

#:# model

params = {'max_depth': 5, 'n_estimators': 75}

classifier = RandomForestClassifier(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# 8187bc79526114bd041f226851977941
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(transform_pipeline.transform(X_test))[:,1]

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'auc: {roc_auc_score(y_test, y_pred_proba)}')
Example #20
def grid_search(model,
                xdata,
                ydata,
                mode,
                param_grid=None,
                cv_=None,
                n_iter_=None):

    if model == 'RF' and mode == 'RANDOMIZE':
        n_estimators = [
            int(x) for x in np.linspace(start=200, stop=2000, num=10)
        ]
        max_features = ['auto', 'sqrt']
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        min_samples_split = [2, 5, 10]
        min_samples_leaf = [1, 2, 4]
        bootstrap = [True, False]
        random_grid = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap
        }
        rf = RandomForestClassifier()
        rf_random = RandomizedSearchCV(estimator=rf,
                                       param_distributions=random_grid,
                                       n_iter=5,
                                       cv=2,
                                       verbose=2,
                                       random_state=0,
                                       n_jobs=-1)
        rf_random.fit(xdata, ydata)
        return rf_random.best_params_

    elif model == 'RF' and mode == 'FOCUSED':
        rf = RandomForestClassifier()
        rf_random = GridSearchCV(estimator=rf,
                                 param_grid=param_grid,
                                 cv=3,
                                 verbose=2,
                                 n_jobs=-1)
        rf_random.fit(xdata, ydata)
        return rf_random.best_params_

    elif model == 'RF' and mode == 'EXACT':
        res_matrix = np.zeros(
            (len(param_grid['n_estimators']), len(param_grid['max_depth']),
             len(param_grid['min_samples_leaf'])))
        for n_estimator_index, n_estimator in enumerate(
                param_grid['n_estimators']):
            for max_depth_index, max_depth in enumerate(
                    param_grid['max_depth']):
                for min_samples_leaf_index, min_samples_leaf in enumerate(
                        param_grid['min_samples_leaf']):
                    model = RandomForestClassifier(
                        n_jobs=-1,
                        max_depth=int(max_depth),
                        n_estimators=int(n_estimator),
                        min_samples_leaf=int(min_samples_leaf),
                        random_state=0)
                    predicted = cross_val_predict(model, xdata, ydata, cv=3)
                    res_matrix[n_estimator_index, max_depth_index,
                               min_samples_leaf_index] = accuracy_score(
                                   ydata, predicted)
                    print(
                        '\rGRID SEARCHING RF: processing set:| %s | %s | %s |'
                        % (n_estimator_index, max_depth_index,
                           min_samples_leaf_index))
        best_p = np.where(res_matrix == res_matrix.max())
        return res_matrix, (param_grid['n_estimators'][best_p[0][0]],
                            param_grid['max_depth'][best_p[1][0]],
                            param_grid['min_samples_leaf'][best_p[2][0]])

    elif model == 'GB' and mode == 'RANDOMIZE':

        loss = ['deviance', 'exponential']

        #There is a trade-off between learning_rate and n_estimators
        learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
        n_estimators = [10, 50, 100, 200]

        max_depth = [2, 4, 8]
        max_features = [5, 10, 'auto']

        min_samples_split = [2, 4, 8]
        min_samples_leaf = [1, 2, 4]

        random_grid = {
            'loss': loss,
            'learning_rate': learning_rates,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
        }

        gb = GradientBoostingClassifier()
        model_random = RandomizedSearchCV(estimator=gb,
                                          param_distributions=random_grid,
                                          n_iter=n_iter_,
                                          cv=cv_,
                                          verbose=2,
                                          random_state=0,
                                          n_jobs=-1)
        model_random.fit(xdata, ydata)

        return model_random

    elif model == 'GB' and mode == 'FOCUSED':
        gb = GradientBoostingClassifier()
        model_focused = GridSearchCV(estimator=gb,
                                     param_grid=param_grid,
                                     cv=cv_,
                                     verbose=2,
                                     n_jobs=-1)
        model_focused.fit(xdata, ydata)
        return model_focused
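# A hedged usage sketch (X and y are synthetic placeholders; the EXACT grid below mirrors
# the keys that branch expects):
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
print(grid_search('RF', X, y, 'RANDOMIZE'))

exact_grid = {'n_estimators': [50, 100], 'max_depth': [4, 8], 'min_samples_leaf': [1, 2]}
scores, best_exact = grid_search('RF', X, y, 'EXACT', param_grid=exact_grid)
print(best_exact)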
Example #21
### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==0]
bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==0]
grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==1]
bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==1]


#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color = "b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color = "r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")

### your code here!  name your classifier object clf if you want the
### visualization code below (prettyPicture) to draw its decision boundary
clf=RandomForestClassifier()
clf.fit(features_train,labels_train)
### draw the decision boundary with the test points overlaid
prettyPicture(clf, features_test, labels_test)

plt.show()
################################################################################




Example #22
def train_model(X_train,
                y_train,
                model_name='logistic_regression',
                is_cv=False):
    """
            Train a classification model. Defaults to logistic regression and, by default, does not run cross-validation.
    """
    if model_name == 'logistic_regression':
        # Logistic regression
        lr_model = linear_model.LogisticRegression()
        if is_cv:
            print('Cross-validation...')
            params = {'C': [1e-4, 1e-3, 1e-2, 0.1, 1]}
            gs_model = GridSearchCV(lr_model,
                                    param_grid=params,
                                    cv=5,
                                    scoring='roc_auc',
                                    verbose=3)
            gs_model.fit(X_train, y_train)
            print('Best parameters:', gs_model.best_params_)
            best_model = gs_model.best_estimator_
        else:
            print('Using the model default parameters...')
            lr_model.fit(X_train, y_train)
            best_model = lr_model

    elif model_name == 'svm':
        # Support vector machine
        svm_model = svm.SVC(probability=True)
        if is_cv:
            print('Cross-validation...')
            #             params = {'kernel': ('linear', 'rbf'),
            #                       'C': [0.01, 0.1, 1, 10, 100]}
            params = {
                'C': [0.1, 1, 10, 100, 1000],
                'gamma': [1e-5, 1e-4, 1e-3, 1e-2, 0.1],
            }
            gs_model = GridSearchCV(svm_model,
                                    param_grid=params,
                                    cv=5,
                                    scoring='roc_auc',
                                    verbose=3)
            gs_model.fit(X_train, y_train)
            print('Best parameters:', gs_model.best_params_)
            best_model = gs_model.best_estimator_
        else:
            print('Using the model default parameters...')
            svm_model.fit(X_train, y_train)
            best_model = svm_model

    elif model_name == 'random_forest':
        # Random forest
        rf_model = RandomForestClassifier()
        if is_cv:
            print('Cross-validation...')
            params = {'n_estimators': [20, 40, 60, 80, 100]}
            gs_model = GridSearchCV(rf_model,
                                    param_grid=params,
                                    cv=5,
                                    scoring='roc_auc',
                                    verbose=3)
            gs_model.fit(X_train, y_train)
            print('Best parameters:', gs_model.best_params_)
            best_model = gs_model.best_estimator_
        else:
            print('Using the model default parameters...')
            rf_model.fit(X_train, y_train)
            best_model = rf_model

    else:
        # More models can be added here for experimentation
        print('This model is not supported yet...')
        best_model = None

    return best_model
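# A hedged usage sketch with synthetic data (roc_auc_score is assumed to be importable):
if __name__ == '__main__':
    from sklearn.datasets import make_classification
    from sklearn.metrics import roc_auc_score

    X_demo, y_demo = make_classification(n_samples=500, random_state=0)
    rf_best = train_model(X_demo, y_demo, model_name='random_forest', is_cv=True)
    print('train AUC:', roc_auc_score(y_demo, rf_best.predict_proba(X_demo)[:, 1]))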
Example #23
def evalOne(n_estimators, min_samples_leaf):
    Y = []
    P = []
    for group in range(0, 5):
        #     print("Test group " + str(group + 1))
        trainStationList = []
        testStationList = []
        for i in range(0, 5):
            if i == group:
                testStationList.extend(groups[i])
            else:
                trainStationList.extend(groups[i])
        trainStations = set(float(station) for station in trainStationList)
        # reorder train stations
        #     print("\ttrainStationList:" + str(trainStationList))
        trainStationList = [
            s for s in all_stations if float(s) in trainStations
        ]
        #     print("\ttrainStationList:" + str(trainStationList))
        testStations = set(float(station) for station in testStationList)
        #     print("\ttestStationList:" + str(testStationList))
        trainX, testX, trainY, testY, trainLocation, testLocation = splitDataForXValidationWithLocation(
            trainStations, testStations, "location", data, columns, "target")

        train_lower = [
            float(trainStationList[i]) for i in range(0, len(trainStationList))
            if i < (len(trainStationList) / 2.0)
        ]
        #         train_upper = [float(trainStationList[i]) for i in range(0, len(trainStationList)) if i >= (len(trainStationList) / 2.0)]

        test_lower = [
            float(testStationList[i]) for i in range(0, len(testStationList))
            if i < (len(testStationList) / 2.0)
        ]
        #         test_upper = [float(testStationList[i]) for i in range(0, len(testStationList)) if i >= (len(testStationList) / 2.0)]

        trainY = []
        for l in trainLocation:
            if l in train_lower:
                trainY.append(0)
            else:
                trainY.append(1)

        testY = []
        for l in testLocation:
            if l in test_lower:
                testY.append(0)
            else:
                testY.append(1)

        model = RandomForestClassifier(random_state=42,
                                       n_estimators=n_estimators,
                                       min_samples_leaf=min_samples_leaf,
                                       n_jobs=-1)
        model.fit(trainX, trainY)
        predY = model.predict(testX)

        Y.extend(testY)
        P.extend(predY)

    f1 = f1_score(Y, P)
    accuracy = accuracy_score(Y, P)
    return f1, accuracy
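# A hedged usage sketch: sweep a small hyper-parameter grid with the evalOne helper above
# (the grid values are illustrative; groups, data, columns, etc. come from the module):
for n_est in (20, 50, 100):
    for leaf in (1, 5, 10):
        f1, acc = evalOne(n_est, leaf)
        print('n_estimators=%d, min_samples_leaf=%d -> f1=%.3f, accuracy=%.3f'
              % (n_est, leaf, f1, acc))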
Example #24
     print("\t\ttest rmse: " + testRmse)
     trainPreds[tag] = train2Prediction
     testPreds[tag] = testPrediction
  
 labelt2Y = []
  
 for i in range(0, len(t2Y)):
     bestModel = 0
     bestAbs = abs(t2Y[i] - trainPreds[top10tags[0]][i])
     for j in range(0, len(top10tags)):
         tag = top10tags[j]
         modelAbs = abs(t2Y[i] - trainPreds[tag][i])
         if modelAbs < bestAbs:
             bestAbs = modelAbs
             bestModel = j
     labelt2Y.append(bestModel)
     
 trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(location, location2, "location", data, all_features, "target")
      
 model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)
 model.fit(trainX2, labelt2Y)
 
 pred = model.predict(testX)
 
 finalPrediction = []
 for i in range(0, len(testY)):
     p = testPreds[top10tags[pred[i]]][i]
     finalPrediction.append(p)      
 rmse = str(rmseEval(testY, finalPrediction)[1])
 print("\tRMSE: " + str(rmse))
 
Example #25
    hist = pickle.load(f)
print("got hists")
print(hist[0])
N = hist.shape[0]
I = permutation(N)
controls = np.array(controls)
Itr = I[:N // 2]
Ite = I[N // 2:]
Xtr = hist[Itr, :]
ttr = controls[Itr]

Xte = hist[Ite, :]
tte = controls[Ite]

print("split data", len(Xtr), len(Xte))
forest = RandomForestClassifier(n_estimators=2)
forest.fit(Xtr, ttr)
print("trained forest")

predictions = []
actual = []

for i in range(100):
    test = Xte[i]
    sum = 0
    for j in test:
        sum += j
    print("sum", sum)
    test = np.reshape(test, (1, 1764))
    pred = forest.predict(test)
Example #26
    label_trainY = []
    for l in label_trainLocation:
        if l in train_lower:
            label_trainY.append(0)
        else:
            label_trainY.append(1)

    label_testY = []
    for l in label_testLocation:
        if l in test_lower:
            label_testY.append(0)
        else:
            label_testY.append(1)

    model = RandomForestClassifier(random_state=42,
                                   n_estimators=50,
                                   max_depth=4,
                                   n_jobs=-1)
    model.fit(label_trainX, label_trainY)
    predY = model.predict(label_testX)

    finalPred = []
    for i in range(0, len(predY)):
        if predY[i] == 0:
            finalPred.append(test_prediction_lower[i])
        else:
            finalPred.append(test_prediction_upper[i])

    rmse = rmseEval(testY, finalPred)[1]
    print("\tupper+lower TW rmse: " + str(rmse))
    pred_combined.extend(finalPred)
Example #27
print(x_valid.shape)
x_test = x_test.reshape((x_test.shape[0], x_test.shape[2]))
print(x_test.shape)


ground = []
test = []
for i in range(y_train.shape[0]):
	ground.append(np.argmax(y_train[i]))

for i in range(y_test.shape[0]):
	test.append(np.argmax(y_test[i]))

# print(ground)

clf = RandomForestClassifier(n_estimators=4, max_depth=2, random_state=0)
# print(clf)
clf.fit(x_train, ground)

# print(clf.feature_importances_)
rf = clf.predict(x_test)
print(rf)
acc_rf = accuracy_score(test, rf)
print(acc_rf)
cm_rf = confusion_matrix(test, rf)
print(cm_rf)

knn = KNeighborsClassifier(n_neighbors = 20).fit(x_train, ground) 
# accuracy = knn.score(x_test, test) 
y_pred = knn.predict(x_test)
acc = accuracy_score(test, y_pred)
Example #28
    tuned_parameters = [{'n_estimators':[5, 10, 100, 200],
                         'max_features':[1, 3, 9],
                         'max_samples':[1, 5, 9, 21],
                         'random_state':[1, 2, 3, 5]
                     }]
    algo = BaggingClassifier()
    
elif choice=='h' or choice=='H':
    print("\n**********************************\n")
    print("  \t Random Forest")
    tuned_parameters = [{'n_estimators':[5, 10, 100, 200],
                         'criterion':['gini', 'entropy'],
                         'max_features':['log2', 'sqrt'],
                         'max_depth':[10, 100]
                     }]
    algo = RandomForestClassifier()

elif choice=='i' or choice=='I':
    print("\n**********************************\n")
    print("  \t AdaBoost Classifier")
    tuned_parameters = [{'n_estimators':[5, 10, 50, 100, 200],
                         'learning_rate':[0.1, 0.2, 0.5, 1],
                         'algorithm':['SAMME', 'SAMME.R'],
                         'random_state':[1, 2, 3, 5]
                     }]
    algo = AdaBoostClassifier()
    
elif choice=='j' or choice=='J':
    print("\n**********************************\n")
    print("  \t Gradient Boosting Classifier")
    tuned_parameters = [{'n_estimators':[5, 10, 50, 100, 200],
Example #29
    def RF_trainandtest_kfold(self,
                              nsplit,
                              cv,
                              feature_sel,
                              varthreshold,
                              ntrees,
                              nodes,
                              rfmethod,
                              nclusters=10,
                              cmethod=None):

        data_feature = self.data.loc[:, self.data.columns != 'default']
        data_target = self.data['default']

        #Split the data set into k folds; for each fold, use that fold as the test set and the rest as the training set
        kf = KFold(n_splits=nsplit, shuffle=True)
        predresult = pd.DataFrame()
        for train_index, test_index in kf.split(data_feature):
            X_train, X_test = data_feature.iloc[
                train_index, ], data_feature.iloc[test_index, ]
            y_train, y_test = data_target.iloc[
                train_index, ], data_target.iloc[test_index, ]

            #If random sampling leaves only one class in train or test, skip this iteration
            if (len(y_train.unique()) == 1) or (len(y_test.unique()) == 1):
                continue

            #Coarse-bin and WOE-transform the training set, and apply the same binning and WOE transform to the test set
            X_train, X_test = self.binandwoe_traintest(X_train, y_train,
                                                       X_test, nclusters,
                                                       cmethod)

            #Feature selection on the training set, using methods from sklearn.feature_selection
            if feature_sel == "VarianceThreshold":
                selector = VarianceThreshold(threshold=varthreshold)
                X_train1 = pd.DataFrame(selector.fit_transform(X_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]
            elif feature_sel == "RFECV":
                estimator = LogisticRegression()
                selector = RFECV(estimator, step=1, cv=cv)
                X_train1 = pd.DataFrame(
                    selector.fit_transform(X_train, y_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]
            elif feature_sel == "SelectFromModel":
                estimator = LogisticRegression()
                selector = SelectFromModel(estimator)
                X_train1 = pd.DataFrame(
                    selector.fit_transform(X_train, y_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]
            elif feature_sel == "SelectKBest":
                selector = SelectKBest()
                X_train1 = pd.DataFrame(
                    selector.fit_transform(X_train, y_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]
            else:
                X_train1, X_test1 = X_train, X_test

            #Train the random forest model and predict
            if rfmethod == 'RandomForest':
                classifier = RandomForestClassifier(n_estimators=ntrees,
                                                    min_samples_split=nodes *
                                                    2,
                                                    min_samples_leaf=nodes)
            elif rfmethod == 'ExtraTrees':
                classifier = ExtraTreesClassifier(n_estimators=ntrees,
                                                  min_samples_split=nodes * 2,
                                                  min_samples_leaf=nodes)
            elif rfmethod == 'GradientBoosting':
                classifier = GradientBoostingClassifier(
                    n_estimators=ntrees,
                    min_samples_split=nodes * 2,
                    min_samples_leaf=nodes)

            classifier.fit(X_train1, y_train)
            probability = classifier.predict_proba(X_test1)[:, 1]

            temp = pd.DataFrame({'target': y_test, 'probability': probability})
            predresult = pd.concat([predresult, temp], ignore_index=True)

        return predresult
Example #30
x = iris.data[:, [2, 3]]  # petal.length, petal.width
y = iris.target

# Split into training / test data.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)  # test_size: hold out 30% of the data for testing (0.3 : 0.7); random_state: fixed random seed.

# Standardization (scaling) preprocessing: improves stability and convergence speed, and helps prevent overflow/underflow.
sc = StandardScaler()
sc.fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)

#ml = svm.SVC()  # create an SVM model.
ml = RandomForestClassifier(criterion='entropy',
                            n_estimators=10)  # n_estimators: number of decision trees

result = ml.fit(x_train_std, y_train)  # Fit the model on the training data.
print(result)

y_pred = ml.predict(x_test_std)  # Evaluate the fitted model on the test set.
print('Actual values : ', y_test)
print('Predicted values : ', y_pred)

print('Total test samples %d, errors %d' % (len(y_test), (y_test != y_pred).sum()))

# Accuracy, method 1
print('분류 정확도 : %.3f' % accuracy_score(y_test, y_pred))

# Accuracy, method 2
import pandas as pd
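# A hedged sketch of what "method 2" might look like (the original snippet ends at the
# pandas import above): accuracy from a cross-tabulation of actual vs. predicted labels.
tab = pd.crosstab(y_test, y_pred, rownames=['actual'], colnames=['predicted'])
print(tab)
print('Classification accuracy :', (y_test == y_pred).mean())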