titanic['Sex'] = titanic['Sex'].apply(lambda s: 0 if s == 'male' else 1)
print(titanic)
x = titanic.iloc[:, [1, 3, 4]]
y = titanic.iloc[:, 0]

# Split into training / test sets.
titanic_x, test_x, titanic_y, test_y = train_test_split(x, y, test_size=0.3, random_state=0)
# print(titanic_x.head())
# print(titanic_y.head())

ml = RandomForestClassifier(criterion='entropy').fit(titanic_x, titanic_y)  # (criterion='entropy', random_state=0)
print(ml)

titanic_pred_y = ml.predict(test_x)
print('Actual values : ', test_y)
print('Predicted values : ', titanic_pred_y)
print('Total test samples %d, errors %d' % (len(test_y), (test_y != titanic_pred_y).sum()))
print('Classification accuracy : %.3f' % accuracy_score(test_y, titanic_pred_y))

# Cross-validation
print('Cross-validation : \n', model_selection.cross_val_score(ml, titanic_x, titanic_y, cv=5))

# Predict on new data
new_data = np.array([[1, 0, 24], [1, 1, 43], [2, 1, 48], [3, 0, 33],
x = dataset.iloc[:, 2:-1].values
y = dataset.iloc[:, -1].values

# Splitting into Test Set and Training Set
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
scX = StandardScaler()
xTrain = scX.fit_transform(xTrain)
xTest = scX.transform(xTest)

# Fitting Classifier on Training Set
from sklearn.ensemble import RandomForestClassifier  # public import path; sklearn.ensemble.forest is deprecated
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(xTrain, yTrain)

# Predict Test Set Results
yPred = classifier.predict(xTest)

# Make Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(yTest, yPred)

# Visualising Training Set Results
from matplotlib.colors import ListedColormap
xSet, ySet = xTrain, yTrain
X1, X2 = np.meshgrid(np.arange(start=xSet[:, 0].min() - 1, stop=xSet[:, 0].max() + 1, step=0.01),
                     np.arange(start=xSet[:, 1].min() - 1, stop=xSet[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
def RF_trainandtest(self, testsize, cv, feature_sel, varthreshold, ntrees,
                    nodes, rfmethod, nclusters=10, cmethod=None):
    # Split the data into training and test sets
    data_feature = self.data.loc[:, self.data.columns != 'default']
    data_target = self.data['default']
    X_train, X_test, y_train, y_test = train_test_split(data_feature, data_target,
                                                        test_size=testsize, random_state=0)

    # Coarse-bin the training variables and apply the WOE transform,
    # then apply the same binning and WOE transform to the test set
    X_train, X_test = self.binandwoe_traintest(X_train, y_train, X_test, nclusters, cmethod)

    # Feature selection on the training set, using methods from sklearn.feature_selection
    if feature_sel == "VarianceThreshold":
        selector = VarianceThreshold(threshold=varthreshold)
        X_train1 = pd.DataFrame(selector.fit_transform(X_train))
        X_train1.columns = X_train.columns[selector.get_support(True)]
        X_test1 = X_test[X_train1.columns]
    elif feature_sel == "RFECV":
        estimator = LogisticRegression()
        selector = RFECV(estimator, step=1, cv=cv)
        X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
        X_train1.columns = X_train.columns[selector.get_support(True)]
        X_test1 = X_test[X_train1.columns]
    elif feature_sel == "SelectFromModel":
        estimator = LogisticRegression()
        selector = SelectFromModel(estimator)
        X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
        X_train1.columns = X_train.columns[selector.get_support(True)]
        X_test1 = X_test[X_train1.columns]
    elif feature_sel == "SelectKBest":
        selector = SelectKBest()
        X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
        X_train1.columns = X_train.columns[selector.get_support(True)]
        X_test1 = X_test[X_train1.columns]
    else:
        X_train1, X_test1 = X_train, X_test

    # Train the chosen forest model and predict on the test set
    if rfmethod == 'RandomForest':
        classifier = RandomForestClassifier(n_estimators=ntrees,
                                            min_samples_split=nodes * 2,
                                            min_samples_leaf=nodes)
    elif rfmethod == 'ExtraTrees':
        classifier = ExtraTreesClassifier(n_estimators=ntrees,
                                          min_samples_split=nodes * 2,
                                          min_samples_leaf=nodes)
    elif rfmethod == 'GradientBoosting':
        classifier = GradientBoostingClassifier(n_estimators=ntrees,
                                                min_samples_split=nodes * 2,
                                                min_samples_leaf=nodes)
    classifier.fit(X_train1, y_train)
    probability = classifier.predict_proba(X_test1)[:, 1]

    predresult = pd.DataFrame({'target': y_test, 'probability': probability})

    return predresult
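# A minimal usage sketch (not from the original source): the method returns a DataFrame
# with 'target' and 'probability' columns, so a ROC-AUC can be read off directly.
# The wrapper class name and constructor arguments below are assumptions.
# from sklearn.metrics import roc_auc_score
# model = CreditScoreModel(data)   # hypothetical class holding self.data
# predresult = model.RF_trainandtest(testsize=0.3, cv=5, feature_sel="SelectFromModel",
#                                    varthreshold=0.0, ntrees=200, nodes=5,
#                                    rfmethod='RandomForest')
# print('test AUC: %.3f' % roc_auc_score(predresult['target'], predresult['probability']))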
def runns(resp_var, size_of_test_data, dataset, positive_class, predictor_var,
          n_estimators, important_features, dealing_with_nulls):
    dataset = pd.read_csv('raw_data.csv', low_memory=False)  # For testing purposes

    # ---- DATA PREPROCESSING
    # ------- dealing with NULL values in the data
    # ---------- remove the rows in which the response is null
    dataset = dataset.dropna(subset=[resp_var])
    # ---------- dealing with nulls
    dataset = deal_with_nulls(dealing_with_nulls, dataset)

    # ---- FEATURE SELECTION
    # ------- get predictors important in predicting the response
    # ----------- transform categorical predictors to dummy variables
    predictors = dataset[predictor_var]
    predictors = pd.get_dummies(predictors)
    # ----------- balance the classes in the response var
    ros = RandomOverSampler(random_state=0)
    resp = dataset[resp_var]
    prds, resp = ros.fit_sample(predictors, resp)
    # ----------- fit the random forest classifier to give us the important predictors
    rf_clf = RandomForestClassifier(n_estimators=n_estimators)
    rf_clf.fit(prds, resp)
    # ------- get the important predictors
    feature_imp = pd.Series(rf_clf.feature_importances_,
                            index=list(predictors.iloc[:, 0:])).sort_values(ascending=False)
    # ------- names of the important predictors
    important_predictor_names = feature_imp.index[0:important_features]
    # ------- subset the data to get only the important predictors and the response
    resp = pd.DataFrame(data=resp, columns=[resp_var])
    predictors = pd.DataFrame(prds, columns=list(predictors))
    dataset = pd.concat([resp, predictors], axis=1)
    # ---------------------------------------------------------

    # ---- MODEL TRAINING
    # -------- Remove the response variable from the feature variables - axis 1 refers to the columns
    m_data = dataset.drop(resp_var, axis=1, inplace=False)
    # Response variables are the values we want to predict
    resp_var = np.array(dataset[resp_var])

    dataset = pd.get_dummies(m_data)
    # Saving feature names for later use
    feature_list = list(m_data.columns)
    # Convert to numpy array
    dataset = np.array(dataset)

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(
        dataset, resp_var, test_size=float(size_of_test_data), random_state=402)

    # Instantiate model with n_estimators decision trees
    clf = RandomForestClassifier(n_jobs=1, n_estimators=n_estimators, random_state=142)
    # Train the model on training data
    clf.fit(train_features, train_labels)

    # Evaluation
    predicted = clf.predict(test_features)
    pred_prob = clf.predict_proba(test_features)
    accuracy = accuracy_score(test_labels, predicted)
    # confusion matrix
    cnf = confusion_matrix(test_labels, predicted)
    # precision score
    precision = precision_score(test_labels, predicted, pos_label=positive_class)
    # average precision
    avg_precision = average_precision_score(test_labels, pred_prob[:, [1]])
    # recall score
    rec = recall_score(test_labels, predicted, pos_label=positive_class)
    # f1 score
    fscore = f1_score(test_labels, predicted, pos_label=positive_class)
    # fbeta score
    fbeta = fbeta_score(test_labels, predicted, beta=0.5)
    # hamming loss
    hamming = hamming_loss(test_labels, predicted)
    # jaccard similarity score
    jaccard = jaccard_similarity_score(test_labels, predicted)
    # log loss
    logloss = log_loss(test_labels, predicted)
    # zero-one loss
    zero_one = zero_one_loss(test_labels, predicted)
    # area under the ROC curve
    area_under_roc = roc_auc_score(test_labels, pred_prob[:, [1]])
    # Cohen's kappa score
    cohen = cohen_kappa_score(test_labels, predicted)
    # Matthews correlation coefficient
    mathews = matthews_corrcoef(test_labels, predicted)

    # Variable importances from the important-features selection stage
    variable_importance_list = list(zip(prds, feature_imp))

    output = {
        "accuracy": accuracy,
        "precision": precision,
        "average precision": avg_precision,
        "recall": rec,
        "fscore": fscore,
        "fbeta": fbeta,
        "hamming": hamming,
        "jaccard": jaccard,
        "logloss": logloss,
        "zero_one": zero_one,
        "area_under_roc": area_under_roc,
        "cohen": cohen,
        "mathews": mathews
    }
    output = json.dumps(output)
    return output
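# A minimal usage sketch (an assumption, not part of the original script): runns returns
# the metrics as a JSON string, so the caller can parse it with json.loads. The column
# names used below are hypothetical.
# import json
# raw = pd.read_csv('raw_data.csv', low_memory=False)
# report = runns(resp_var='churned', size_of_test_data=0.25, dataset=raw,
#                positive_class=1, predictor_var=['age', 'plan', 'usage'],
#                n_estimators=200, important_features=10, dealing_with_nulls='drop')
# metrics = json.loads(report)
# print(metrics['accuracy'], metrics['area_under_roc'])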
train_8 = pd.read_csv("./future/train_all_split_8.csv", header=0, encoding="UTF-8",
                      error_bad_lines=False, sep=",", index_col=0)
train_9 = pd.read_csv("./future/train_all_split_9.csv", header=0, encoding="UTF-8",
                      error_bad_lines=False, sep=",", index_col=0)
train_10 = pd.read_csv("./future/train_all_split_10.csv", header=0, encoding="UTF-8",
                       error_bad_lines=False, sep=",", index_col=0)
train_11 = pd.read_csv("./future/train_all_split_11.csv", header=0, encoding="UTF-8",
                       error_bad_lines=False, sep=",", index_col=0)
train_12 = pd.read_csv("./future/train_all_split_12.csv", header=0, encoding="UTF-8",
                       error_bad_lines=False, sep=",", index_col=0)
test_2 = pd.read_csv("./future/test_v2_all_future.csv", header=0, encoding="UTF-8",
                     error_bad_lines=False, sep=",", index_col=0)

results_df = pd.DataFrame(index=test_2.index)
resultCol = pd.Series(index=test_2.index, dtype=object)

# try random forest on feature A
n_estimators = 100
rfcModel = RandomForestClassifier(n_estimators=n_estimators)
lr = LogisticRegression()
#svr = SVR(C=1.0, epsilon=0.2)
#C = 10.0  # SVM regularization parameter
#svc = svm.SVC(kernel='linear', C=C)
#rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C)
#poly_svc = svm.SVC(kernel='poly', degree=3, C=C)


def filter_features(train_df, model, prodStr='A'):
    input_df = train_df.copy(deep=True)
    #input_df = cleanup(input_df)
    y = input_df[prodStr].values
    input_df = input_df.drop([prodStr], axis=1)
from uci_comparison import compare_estimators
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from rr_forest import RRForestClassifier
from rr_extra_forest import RRExtraTreesClassifier

estimators = {
    'RandomForest': RandomForestClassifier(n_estimators=20),
    'RndRotForest': RRForestClassifier(n_estimators=20),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=20),
    'RndRotETrees': RRExtraTreesClassifier(n_estimators=20),
}

# optionally, pass a list of UCI dataset identifiers as the datasets parameter, e.g. datasets=['iris', 'diabetes']
# optionally, pass a dict of scoring functions as the metrics parameter, e.g. metrics={'F1-score': f1_score}
compare_estimators(estimators)
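# A brief illustration of the optional arguments described in the comments above;
# the parameter names follow those comments and are otherwise an assumption about
# uci_comparison's API, not verified here.
# from sklearn.metrics import f1_score
# compare_estimators(estimators,
#                    datasets=['iris', 'diabetes'],
#                    metrics={'F1-score': f1_score})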
index = 16
cls.fit(xtrain, train_label)
d = xtest[index]
d.shape = (28, 28)
plt.imshow(d, cmap='gray')
plt.show()
print(cls.predict([xtest[index]]))

pre = cls.predict(xtest)
count = 0
for i in range(0, 21000):
    count += 1 if pre[i] == test_label[i] else 0
print("Accuracy", (count / 21000) * 100)

rf = RandomForestClassifier(n_estimators=100)
rf.fit(xtrain, train_label)
pre = rf.predict(xtest)
count = 0
for i in range(len(pre)):
    count += 1 if pre[i] == test_label[i] else 0
print("Accuracy", (count / len(pre)) * 100)

pre = rf.predict(test)
print(pre)
sub.head()
sub.Label = pre
sub.head()
sub.info()
X_path = feature + '_features.h5'

# Loading the cars dataset features
test_size = 0.3
cars_train_X, cars_test_X, cars_train_y, cars_test_y = split_dataset(X_path, feature, test_size)
cars_train_X = np.asarray(cars_train_X).reshape(cars_train_X.shape[0],
                                                np.prod(cars_train_X.shape[1:]))
cars_test_X = np.asarray(cars_test_X).reshape(cars_test_X.shape[0],
                                              np.prod(cars_test_X.shape[1:]))

#cls = RandomForestClassifier(n_estimators=100, verbose=True, n_jobs=-1)
# classifier is random forest; to use bagging with 5 feature subsets, use the line below
# (the argument max_features=0.2 restricts each split to 1/5 of the total features)
cls = RandomForestClassifier(n_estimators=100, verbose=True, n_jobs=-1, max_features=0.2)
cls.fit(cars_train_X, cars_train_y)

# to show accuracy and std-dev values for the training set, uncomment the 3 lines below
# scores = cross_val_score(cls, cars_train_X, cars_train_y, cv=5, verbose=True)
# print(scores)
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# save predictions for plotting the heatmap
y_pred = cls.predict(cars_test_X)
with open('bagging.sav', 'wb') as f:
    pkl.dump((y_pred, cars_test_y), f)

y_true = cars_test_y
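# A possible follow-up sketch (an assumption, not from the original file): load the saved
# predictions and draw the confusion-matrix heatmap that the comment above refers to.
# import pickle as pkl
# import seaborn as sns
# from sklearn.metrics import confusion_matrix
# with open('bagging.sav', 'rb') as f:
#     y_pred, y_true = pkl.load(f)
# sns.heatmap(confusion_matrix(y_true, y_pred), annot=True)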
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')

        # Reading the CSV file and converting it into a pandas data-frame
        df = pd.read_csv(path, encoding="ISO-8859-1")

        # Reading the name of the file for the model that will be saved
        filename = request.form['filename']

        # Reading the names of the feature and label columns as strings
        str1 = request.form['feature']
        str2 = request.form['label']

        # Assigning the feature and label variables to the respective columns
        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')

        '''
        # Removing the punctuations and HTTP links in the feature text input
        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)
        '''

        X = X.str.lower()

        # Optional use of Tokenization and Lemmatization using Natural Language Processing in SpaCy
        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)
        X = pd.Series(texts)
        """

        # Splitting the data-set into 2 parts: training data and test data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)

        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        # Fitting all the classification models one by one and recording their accuracies and execution times
        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))
        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1, multi_class='multinomial', solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)
        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()
        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)
        clf11.fit(X_train_tfidf, y_train)
        pred = clf11.predict(tfidfvect.transform(X_test))
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))

        start = time()
        clf12 = SGDClassifier(n_jobs=-1)  # note: reported as "XGBC" below, but this is a second SGDClassifier
        clf12.fit(X_train_tfidf, y_train)
        pred = clf12.predict(tfidfvect.transform(X_test))
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        # Comparing the accuracies of all the models and then saving (dumping) the model
        # with the highest accuracy using pickle for later use.
        acu_list = [a1, a2, a3, a4, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html", ac1=a1, ac2=a2, ac3=a3, ac4=a4,
                               ac11=a11, ac12=a12)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# sklearn provides the iris species as integer values since this is required for classification
# here we're just adding a column with the species names to the dataframe for visualisation
df['species'] = np.array([iris.target_names[i] for i in iris.target])

# sns.pairplot(df, hue='species')
# plt.show()

X_train, X_test, y_train, y_test = train_test_split(df[iris.feature_names], iris.target,
                                                    test_size=0.5, stratify=iris.target,
                                                    random_state=123456)

# RandomForestClassifier with 100 trees; oob_score=True keeps an out-of-bag accuracy estimate
rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=123456)
rf.fit(X_train, y_train)

predicted = rf.predict(X_test)
accuracy = accuracy_score(y_test, predicted)

print(f'Out-of-bag score estimate: {rf.oob_score_:.3}')
print(f'Mean accuracy score: {accuracy:.3}')

cm = pd.DataFrame(confusion_matrix(y_test, predicted),
                  columns=iris.target_names, index=iris.target_names)
sns.heatmap(cm, annot=True)
plt.show()
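# A short optional follow-up (not in the original snippet): the fitted forest also exposes
# per-feature importances, which pair naturally with the out-of-bag estimate printed above.
importances = pd.Series(rf.feature_importances_, index=iris.feature_names).sort_values(ascending=False)
print(importances)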
test_data['age'] = test_data['age'].apply(lambda x: age_disperse(x))
test_data_disperse, label_encoders, onehot_encoder \
    = onehot_encoder_disperse(test_data[disperse_cols], disperse_cols, label_encoders, onehot_encoder)
test_data = test_data.reset_index()
test_data = pd.concat([test_data_disperse, test_data[continuous_cols]], axis=1, ignore_index=True)

train_size = len(train_data)
data = pd.concat([train_data, test_data])
print(data.columns[104])
data = standardized(data)
# test_data = train_data

random_forest = RandomForestClassifier(n_estimators=10, verbose=1, max_depth=10)
random_forest.fit(X=data.iloc[:train_size, :-1], y=train_data.iloc[:train_size, -1])
prediction = random_forest.predict(data.iloc[train_size:, :-1])

# mlp = MLPClassifier(hidden_layer_sizes=[20, 40, 20], verbose=1)
# mlp.fit(X=train_data.iloc[:, :-1], y=train_data.iloc[:, -1])
# prediction = mlp.predict(test_data.iloc[:, :-1])

counter = 0
for i in range(len(prediction)):
    y = data.iloc[train_size + i, -1]
    if prediction[i] == y:
        counter += 1
    # else:
    #     print('ground truth: {0}, prediction: {1}'.format(prediction[i], y))
print('accuracy: {0}'.format(counter / len(prediction)))
print(featureScores.nlargest(20, 'Score'))  # print the 20 best features

# 2. ExtraTrees classifier
model = ExtraTreesClassifier()
model.fit(nyse_scaled, eps)
print(model.feature_importances_)  # use the built-in feature_importances_ attribute of tree-based classifiers
# plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=nyse_scaled.columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.show()

# 3. Random forest
rf_exp = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=100))
rf_exp.fit(nyse_scaled, eps)
rf_exp.get_support()
selected_feat = nyse_scaled.columns[(rf_exp.get_support())]
print(nyse_scaled.columns[rf_exp.get_support()])

nyse_scaled1 = nyse_scaled[[
    'Other Operating Items', 'Pre-Tax Margin', 'Profit Margin', 'Operating Margin',
    'Long-Term Investments', 'Non-Recurring Items', 'Common Stocks', 'Depreciation',
    'Accounts Payable', 'Cost of Revenue', 'Total Current Liabilities', 'Total Revenue',
    'Other Liabilities', 'Net Receivables', 'Research and Development',
    'Total Current Assets', 'Inventory', 'Deferred Liability Charges',
    'Total Liabilities & Equity', 'Total Assets'
]]
eps1 = nyse.iloc[:, 75:76]
unique_test_pixels = pd.unique(test.values.ravel())
#print("Unique Test Values:", unique_test_pixels)

# Check if pixels in test are a subset of pixels in train. If yes, easier to do predictions.
#print("Test values in Train?", np.in1d(unique_test_pixels, unique_train_pixels, assume_unique=True))

# Check if there is linear correlation between pixel<x> columns and label.
# If yes, we should dive into the columns with correlation; linear / logistic regression may work well with the data.
# In this case, it makes sense that there is no correlation - a higher pixel value does not mean the label value will be higher.
#print("Correlation:", train.corr()["label"])

# Check that the algorithm used gives good accuracy by using part of the training set to validate
train_train, train_test = train_test_split(train, test_size=0.3)

# Train model
model = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=10,
                               max_features="auto", min_samples_leaf=20)
#model = KNeighborsClassifier(n_neighbors=6)

# If getting this error, it is because a matrix with 1 column is being passed in
# when a 1d array is expected; ravel() will work:
#   DataConversionWarning: A column-vector y was passed when a 1d array was expected.
#   Please change the shape of y to (n_samples, ), for example using ravel(). if name == 'main':
# To resolve this error, convert label values to int or str, as float is not a valid label type:
#   raise ValueError("Unknown label type: %r" % y)  ValueError: Unknown label type: array
#model.fit(train_train.loc[:, 'pixel0':'pixel783'], np.asarray(train_train.loc[:, 'label'].astype(int)))
#print("model.score:", model.score(train_test.loc[:, 'pixel0':'pixel783'], np.asarray(train_test.loc[:, 'label'].astype(int))))
#print("cross validation score:", cross_validation.cross_val_score(model, train_train.loc[:, 'pixel0':'pixel783'], train_train.loc[:, 'label'], cv=3))

model.fit(train_train.loc[:, 'pixel0':'pixel783'], train_train.loc[:, 'label'].values.ravel())
print("model.score", model.score(train_test.loc[:, 'pixel0':'pixel783'], train_test.loc[:, 'label'].values.ravel()))
                       index_col=0)
train_sets[12] = train_12

test_2 = pd.read_csv("./future/test_v2_all_future.csv", header=0, encoding="UTF-8",
                     error_bad_lines=False, sep=",", index_col=0)

results_df = pd.DataFrame(index=test_2.index)
resultCol = pd.Series(index=test_2.index, dtype=object)

n_estimators = 30
rfcModel = RandomForestClassifier(n_estimators=n_estimators, max_features=None)
rfrmodel = RandomForestRegressor(n_estimators=n_estimators)
lr = LogisticRegression()
#svr = SVR(C=1.0, epsilon=0.2)
#C = 10.0  # SVM regularization parameter
#svc = svm.SVC(kernel='linear', C=C)
#rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C)
#poly_svc = svm.SVC(kernel='poly', degree=3, C=C)


def cleanup(df):
    # find columns that contain null values
    inds = pd.isnull(df).any(0).nonzero()
def drawfeature(
        TRAIN_DATA_PATH='/home/samuelchan/PycharmProjects/emotion-analysis/train',
        train_filename='thetrain.csv',
        TEST_DATA_PATH='/home/samuelchan/PycharmProjects/emotion-analysis/test',
        test_filename='thetest.csv'):
    train_file = os.path.join(TRAIN_DATA_PATH, train_filename)
    train_data = pd.read_csv(train_file)
    n_data_train = train_data['text'].size

    test_file = os.path.join(TEST_DATA_PATH, test_filename)
    test_data = pd.read_csv(test_file)
    n_data_test = test_data['text'].size

    # # bag of words model + tfidf
    # vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
    #                              stop_words=None, max_features=5000)
    # transformer = TfidfTransformer()

    # bigram + tf
    vectorizer = HashingVectorizer(ngram_range=(2, 2), non_negative=True)

    # train
    print('Start cut word in train data set')
    train_data_word = []
    for i in range(n_data_train):
        if (i + 1) % 1000 == 0:
            print('Drawfeatures Line %d of %d' % (i + 1, n_data_train))
        train_data_word.append(word_to_feature(train_data['text'][i]))

    # print('Start bag of word in train data set')
    # # draw features
    # train_data_features = vectorizer.fit_transform(train_data_word)
    # # train_data_features = train_data_features.toarray()
    # print('Start tfidf in train data set')
    # train_data_features = transformer.fit_transform(train_data_features)
    # # train_data_features = train_data_features.toarray()

    print('Start bigram model in train data set')
    train_data_features = vectorizer.fit_transform(train_data_word)

    # test
    print('Start cut words in test data set')
    test_data_words = []
    for i in range(n_data_test):
        if (i + 1) % 1000 == 0:
            print('Drawfeatures Line %d of %d' % (i + 1, n_data_test))
        test_data_words.append(word_to_feature(test_data['text'][i]))

    # # draw feature
    # print('Start bag of word in test data set')
    # test_data_features = vectorizer.fit_transform(test_data_words)
    # # test_data_features = test_data_features.toarray()
    # print('Start tfidf in test data set')
    # test_data_features = transformer.fit_transform(test_data_features)
    # # test_data_features = test_data_features.toarray()

    print('Start bigram model in test data set')
    test_data_features = vectorizer.fit_transform(test_data_words)

    # random forest
    print('random forest')
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_data_features, train_data['label'])
    pred = forest.predict(test_data_features)
    pred = pd.Series(pred, name='TARGET')
    # pred.to_csv('BOW_TFIDF_RF5.csv', index=None, header=True)
    pred.to_csv('BI_W2V_RF1.csv', index=None, header=True)

    # multinomial naive bayes
    print('Multinomial naive bayes')
    mnb = MultinomialNB(alpha=0.01)
    mnb = mnb.fit(train_data_features, train_data['label'])
    pred = mnb.predict(test_data_features)
    pred = pd.Series(pred, name='TARGET')
    # pred.to_csv('BOW_TFIDF_MNB5.csv', index=None, header=True)
    pred.to_csv('BI_W2V_MNB1.csv', index=None, header=True)

    # # KNN
    # print('KNN')
    # knn = KNeighborsClassifier()
    # knn = knn.fit(train_data_features, train_data['label'])
    # pred = knn.predict(test_data_features)
    # pred = pd.Series(pred, name='TARGET')
    # pred.to_csv('BOW_TFIDF_KNN2.csv', index=None, header=True)

    # SVM
    print('SVM')
    svm = SVC(kernel='linear')
    svm = svm.fit(train_data_features, train_data['label'])
    pred = svm.predict(test_data_features)
    pred = pd.Series(pred, name='TARGET')
    # pred.to_csv('BOW_TFIDF_SVM5.csv', index=None, header=True)
    pred.to_csv('BI_W2V_SVM1.csv', index=None, header=True)

    # GBDT
    print('GBDT')
    gbdt = GradientBoostingClassifier()
    gbdt = gbdt.fit(train_data_features, train_data['label'])
    pred = gbdt.predict(test_data_features)
    pred = pd.Series(pred, name='TARGET')
    pred.to_csv('BI_W2V_GBDT1.csv', index=None, header=True)
def evalColumns(columns):
    overallY = []
    overallPred = []
    for location in locations:
        location2s = [l for l in locations if l != location]
        print("Location: " + str(location) + ", location2: " + str(location2s))
        trainPreds = defaultdict(list)
        testPreds = defaultdict(list)
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(
                    location, location2, "location", data, features, "target")
                model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59,
                                              n_jobs=-1, random_state=42)
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(testX)
                train1Rmse = str(rmseEval(trainY1, train1Prediction)[1])
                train2Rmse = str(rmseEval(trainY2, train2Prediction)[1])
                testRmse = str(rmseEval(testY, testPrediction)[1])
                print("\t\ttrain1 rmse: " + train1Rmse)
                print("\t\ttrain2 rmse: " + train2Rmse)
                print("\t\ttest rmse: " + testRmse)
                for x in train2Prediction:
                    trainPreds[tag].append(x)
                for x in testPrediction:
                    testPreds[tag].append(x)

        t2Y = []
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            t2Y = t2Y + trainY2

        labelt2Y = []
        for i in range(0, len(t2Y)):
            bestModel = 0
            bestAbs = abs(t2Y[i] - trainPreds[topTags[0]][i])
            for j in range(0, len(topTags)):
                tag = topTags[j]
                modelAbs = abs(t2Y[i] - trainPreds[tag][i])
                if modelAbs < bestAbs:
                    bestAbs = modelAbs
                    bestModel = j
            labelt2Y.append(bestModel)

        print("#labelt2Y:" + str(len(labelt2Y)))

        tX2 = []
        testX = []
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, tX, testY = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            for row in trainX2:
                tX2.append(row)
            for row in tX:
                testX.append(row)

        # append each tag's train2 predictions as extra feature columns
        for pred in topTags:
            for i in range(0, len(trainPreds[pred])):
                tX2[i].append(trainPreds[pred][i])

        reducedTrainX2 = []
        for d in tX2:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTrainX2.append(reducedD)

        model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)
        model.fit(reducedTrainX2, labelt2Y)

        # append each tag's test predictions as extra feature columns
        for pred in topTags:
            for i in range(0, len(testPreds[pred])):
                testX[i].append(testPreds[pred][i])

        reducedTestX = []
        for d in testX:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTestX.append(reducedD)

        pred = model.predict(reducedTestX)

        finalPrediction = []
        for i in range(0, len(testY)):
            p = testPreds[topTags[pred[i]]][i]
            finalPrediction.append(p)

        rmse = str(rmseEval(testY, finalPrediction)[1])
        print("\tRMSE: " + str(rmse))

        for x in testY:
            overallY.append(x)
        for x in finalPrediction:
            overallPred.append(x)

    rmse = rmseEval(overallPred, overallY)[1]
    return rmse
from sklearn.linear_model import Ridge
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier

DECISION_TREE = DecisionTreeClassifier()
LOGISTIC_REGRESSION = LogisticRegression()
NAIVE_BAYS = GaussianNB()
K_N_N = KNeighborsClassifier()
SUPPORT_VECTOR = svm.SVC(kernel="linear")

# Ensemble classifiers
RANDOM_FOREST = RandomForestClassifier(n_estimators=100)
GRADIENT_BOOST_CL = GradientBoostingClassifier(n_estimators=100)
ADA_BOOST = AdaBoostClassifier(n_estimators=100)
EXTRA_TREE = ExtraTreesClassifier(n_estimators=100)

# Regressors
GRADIENT_BOOST_RG = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
LINEAR_RG = LinearRegression()
RIDGE_RG = Ridge()
LASSO_RG = Lasso()
SVR_RG = SVR()


def getClassifierMap():
    CLASSIFIER_MAP = {
        "DECISION_TREE": DECISION_TREE,
def __get_classifier_model(classifier, args):
    """
    Convenience function for obtaining a classification model

    Args:
        classifier(str): A string indicating the name of the classifier
        args: An arguments object

    Returns:
        A classification model based on the given classifier string
    """
    # Make SGD Logistic Regression model the default
    model = SGDClassifier(loss='log', penalty='l2', shuffle=True, n_iter=5,
                          n_jobs=-1, random_state=179)
    if classifier == SVM:
        model = SVC(kernel=args.kernel, class_weight="balanced", cache_size=8096,
                    random_state=17, probability=True)
    elif classifier == ADA_BOOST:
        dt = DecisionTreeClassifier(max_depth=15, criterion='gini', max_features='auto',
                                    class_weight='balanced', random_state=39)
        model = AdaBoostClassifier(base_estimator=dt, n_estimators=400, random_state=17)
    elif classifier == RF:
        # Configure the classifier to use all available CPU cores
        model = RandomForestClassifier(class_weight="balanced", n_jobs=-1,
                                       n_estimators=400, random_state=17,
                                       max_features='auto', max_depth=15,
                                       criterion='gini')
    elif classifier == GRADIENT_BOOST:
        model = GradientBoostingClassifier(random_state=17, n_estimators=400,
                                           max_features='auto')
    elif classifier == EXTRA_TREES:
        model = ExtraTreesClassifier(random_state=17, n_estimators=400, n_jobs=-1,
                                     class_weight='balanced', max_depth=15,
                                     max_features='auto', criterion='gini')
    elif classifier == BAGGING:
        dt = DecisionTreeClassifier(max_depth=15, criterion='gini', max_features='auto',
                                    class_weight='balanced', random_state=39)
        model = BaggingClassifier(base_estimator=dt, n_estimators=400, random_state=17,
                                  n_jobs=-1, max_features=0.8, max_samples=0.8,
                                  bootstrap=False)
    elif classifier == PASSIVE_AGGRESSIVE:
        model = PassiveAggressiveClassifier(n_iter=10, class_weight='balanced',
                                            n_jobs=-1, random_state=41)
    elif classifier == PERCEPTRON:
        model = Perceptron(n_jobs=-1, n_iter=10, penalty='l2',
                           class_weight='balanced', alpha=0.25)
    return model
y = df.loc[:, task_target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#:# preprocessing
transform_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train), columns=X_train.columns)

#:# model
params = {'max_depth': 5, 'n_estimators': 75}

classifier = RandomForestClassifier(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# 8187bc79526114bd041f226851977941
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(transform_pipeline.transform(X_test))[:, 1]

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'auc: {roc_auc_score(y_test, y_pred_proba)}')
def grid_search(model, xdata, ydata, mode, param_grid=None, cv_=None, n_iter_=None):
    if model == 'RF' and mode == 'RANDOMIZE':
        n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
        max_features = ['auto', 'sqrt']
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        min_samples_split = [2, 5, 10]
        min_samples_leaf = [1, 2, 4]
        bootstrap = [True, False]
        random_grid = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap
        }
        rf = RandomForestClassifier()
        rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                                       n_iter=5, cv=2, verbose=2, random_state=0, n_jobs=-1)
        rf_random.fit(xdata, ydata)
        return rf_random.best_params_

    elif model == 'RF' and mode == 'FOCUSED':
        rf = RandomForestClassifier()
        rf_random = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)
        rf_random.fit(xdata, ydata)
        return rf_random.best_params_

    elif model == 'RF' and mode == 'EXACT':
        # axes of res_matrix: (n_estimators, max_depth, min_samples_leaf)
        res_matrix = np.zeros((len(param_grid['n_estimators']),
                               len(param_grid['max_depth']),
                               len(param_grid['min_samples_leaf'])))
        for n_estimator_index, n_estimator in enumerate(param_grid['n_estimators']):
            for max_depth_index, max_depth in enumerate(param_grid['max_depth']):
                for min_samples_leaf_index, min_samples_leaf in enumerate(param_grid['min_samples_leaf']):
                    model = RandomForestClassifier(n_jobs=-1,
                                                   max_depth=int(max_depth),
                                                   n_estimators=int(n_estimator),
                                                   min_samples_leaf=int(min_samples_leaf),
                                                   random_state=0)
                    predicted = cross_val_predict(model, xdata, ydata, cv=3)
                    res_matrix[n_estimator_index, max_depth_index,
                               min_samples_leaf_index] = accuracy_score(ydata, predicted)
                    print('\rGRID SEARCHING RF: processing set:| %s | %s | %s |'
                          % (n_estimator_index, max_depth_index, min_samples_leaf_index))
        best_p = np.where(res_matrix == res_matrix.max())
        return res_matrix, (param_grid['n_estimators'][best_p[0][0]],
                            param_grid['max_depth'][best_p[1][0]],
                            param_grid['min_samples_leaf'][best_p[2][0]])

    elif model == 'GB' and mode == 'RANDOMIZE':
        loss = ['deviance', 'exponential']
        # There is a trade-off between learning_rate and n_estimators
        learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
        n_estimators = [10, 50, 100, 200]
        max_depth = [2, 4, 8]
        max_features = [5, 10, 'auto']
        min_samples_split = [2, 4, 8]
        min_samples_leaf = [1, 2, 4]
        random_grid = {
            'loss': loss,
            'learning_rate': learning_rates,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
        }
        gb = GradientBoostingClassifier()
        model_random = RandomizedSearchCV(estimator=gb, param_distributions=random_grid,
                                          n_iter=n_iter_, cv=cv_, verbose=2,
                                          random_state=0, n_jobs=-1)
        model_random.fit(xdata, ydata)
        return model_random

    elif model == 'GB' and mode == 'FOCUSED':
        gb = GradientBoostingClassifier()
        model_focused = GridSearchCV(estimator=gb, param_grid=param_grid, cv=cv_,
                                     verbose=2, n_jobs=-1)
        model_focused.fit(xdata, ydata)
        return model_focused
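# A short usage sketch for the function above; only the grid_search signature comes from
# the original code, the data loading below is hypothetical.
# X, y = load_my_data()   # hypothetical loader
# best_rf_params = grid_search('RF', X, y, mode='RANDOMIZE')
# exact_grid = {'n_estimators': [100, 300], 'max_depth': [5, 10], 'min_samples_leaf': [1, 3]}
# scores, best_exact = grid_search('RF', X, y, mode='EXACT', param_grid=exact_grid)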
### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii] == 0]
bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii] == 0]
grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii] == 1]
bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii] == 1]

#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show the decision boundary
clf = RandomForestClassifier()
clf.fit(features_train, labels_train)

### draw the decision boundary with the test points overlaid
prettyPicture(clf, features_test, labels_test)
plt.show()
################################################################################
def train_model(X_train, y_train, model_name='logistic_regression', is_cv=False):
    """
    Train a classification model; defaults to logistic regression and to no cross-validation.
    """
    if model_name == 'logistic_regression':
        # Logistic regression
        lr_model = linear_model.LogisticRegression()
        if is_cv:
            print('Cross-validating...')
            params = {'C': [1e-4, 1e-3, 1e-2, 0.1, 1]}
            gs_model = GridSearchCV(lr_model, param_grid=params, cv=5,
                                    scoring='roc_auc', verbose=3)
            gs_model.fit(X_train, y_train)
            print('Best parameters:', gs_model.best_params_)
            best_model = gs_model.best_estimator_
        else:
            print('Using the model default parameters...')
            lr_model.fit(X_train, y_train)
            best_model = lr_model
    elif model_name == 'svm':
        # Support vector machine
        svm_model = svm.SVC(probability=True)
        if is_cv:
            print('Cross-validating...')
            # params = {'kernel': ('linear', 'rbf'),
            #           'C': [0.01, 0.1, 1, 10, 100]}
            params = {
                'C': [0.1, 1, 10, 100, 1000],
                'gamma': [1e-5, 1e-4, 1e-3, 1e-2, 0.1],
            }
            gs_model = GridSearchCV(svm_model, param_grid=params, cv=5,
                                    scoring='roc_auc', verbose=3)
            gs_model.fit(X_train, y_train)
            print('Best parameters:', gs_model.best_params_)
            best_model = gs_model.best_estimator_
        else:
            print('Using the model default parameters...')
            svm_model.fit(X_train, y_train)
            best_model = svm_model
    elif model_name == 'random_forest':
        # Random forest
        rf_model = RandomForestClassifier()
        if is_cv:
            print('Cross-validating...')
            params = {'n_estimators': [20, 40, 60, 80, 100]}
            gs_model = GridSearchCV(rf_model, param_grid=params, cv=5,
                                    scoring='roc_auc', verbose=3)
            gs_model.fit(X_train, y_train)
            print('Best parameters:', gs_model.best_params_)
            best_model = gs_model.best_estimator_
        else:
            print('Using the model default parameters...')
            rf_model.fit(X_train, y_train)
            best_model = rf_model
    else:
        # More models can be added here for training
        print('This model is not supported yet...')
        best_model = None

    return best_model
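# A minimal usage sketch, not from the original module: exercise train_model on a synthetic
# dataset so the cross-validation branch for the random forest runs end to end.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.2, random_state=0)
best_rf = train_model(X_tr, y_tr, model_name='random_forest', is_cv=True)
print(best_rf.predict_proba(X_te)[:, 1][:5])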
def evalOne(n_estimators, min_samples_leaf):
    Y = []
    P = []
    for group in range(0, 5):
        # print("Test group " + str(group + 1))
        trainStationList = []
        testStationList = []
        for i in range(0, 5):
            if i == group:
                testStationList.extend(groups[i])
            else:
                trainStationList.extend(groups[i])
        trainStations = set(float(station) for station in trainStationList)
        # reorder train stations
        # print("\ttrainStationList:" + str(trainStationList))
        trainStationList = [s for s in all_stations if float(s) in trainStations]
        # print("\ttrainStationList:" + str(trainStationList))
        testStations = set(float(station) for station in testStationList)
        # print("\ttestStationList:" + str(testStationList))
        trainX, testX, trainY, testY, trainLocation, testLocation = splitDataForXValidationWithLocation(
            trainStations, testStations, "location", data, columns, "target")

        train_lower = [float(trainStationList[i]) for i in range(0, len(trainStationList))
                       if i < (len(trainStationList) / 2.0)]
        # train_upper = [float(trainStationList[i]) for i in range(0, len(trainStationList)) if i >= (len(trainStationList) / 2.0)]
        test_lower = [float(testStationList[i]) for i in range(0, len(testStationList))
                      if i < (len(testStationList) / 2.0)]
        # test_upper = [float(testStationList[i]) for i in range(0, len(testStationList)) if i >= (len(testStationList) / 2.0)]

        trainY = []
        for l in trainLocation:
            if l in train_lower:
                trainY.append(0)
            else:
                trainY.append(1)
        testY = []
        for l in testLocation:
            if l in test_lower:
                testY.append(0)
            else:
                testY.append(1)

        model = RandomForestClassifier(random_state=42, n_estimators=n_estimators,
                                       min_samples_leaf=min_samples_leaf, n_jobs=-1)
        model.fit(trainX, trainY)
        predY = model.predict(testX)
        Y.extend(testY)
        P.extend(predY)

    f1 = f1_score(Y, P)
    accuracy = accuracy_score(Y, P)
    return f1, accuracy
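# A small usage sketch (not from the original script): sweep the two hyperparameters that
# evalOne exposes and report the scores for each combination.
for n_estimators in [50, 100, 200]:
    for min_samples_leaf in [1, 5, 10]:
        f1, accuracy = evalOne(n_estimators, min_samples_leaf)
        print("n_estimators=%d, min_samples_leaf=%d -> F1=%.3f, accuracy=%.3f"
              % (n_estimators, min_samples_leaf, f1, accuracy))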
print("\t\ttest rmse: " + testRmse) trainPreds[tag] = train2Prediction testPreds[tag] = testPrediction labelt2Y = [] for i in range(0, len(t2Y)): bestModel = 0 bestAbs = abs(t2Y[i] - trainPreds[top10tags[0]][i]) for j in range(0, len(top10tags)): tag = top10tags[j] modelAbs = abs(t2Y[i] - trainPreds[tag][i]) if modelAbs < bestAbs: bestAbs = modelAbs bestModel = j labelt2Y.append(bestModel) trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(location, location2, "location", data, all_features, "target") model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15) model.fit(trainX2, labelt2Y) pred = model.predict(testX) finalPrediction = [] for i in range(0, len(testY)): p = testPreds[top10tags[pred[i]]][i] finalPrediction.append(p) rmse = str(rmseEval(testY, finalPrediction)[1]) print("\tRMSE: " + str(rmse))
hist = pickle.load(f)
print("got hists")
print(hist[0])

N = hist.shape[0]
I = permutation(N)
controls = np.array(controls)
Itr = I[:N // 2]
Ite = I[N // 2:]
Xtr = hist[Itr, :]
ttr = controls[Itr]
Xte = hist[Ite, :]
tte = controls[Ite]
print("split data", len(Xtr), len(Xte))

forest = RandomForestClassifier(n_estimators=2)
forest.fit(Xtr, ttr)
print("trained forest")

predictions = []
actual = []
for i in range(100):
    test = Xte[i]
    total = 0  # sum of the pixel values in this test histogram
    for j in test:
        total += j
    print("sum", total)
    test = np.reshape(test, (1, 1764))
    pred = forest.predict(test)
    label_trainY = []
    for l in label_trainLocation:
        if l in train_lower:
            label_trainY.append(0)
        else:
            label_trainY.append(1)

    label_testY = []
    for l in label_testLocation:
        if l in test_lower:
            label_testY.append(0)
        else:
            label_testY.append(1)

    model = RandomForestClassifier(random_state=42, n_estimators=50, max_depth=4, n_jobs=-1)
    model.fit(label_trainX, label_trainY)
    predY = model.predict(label_testX)

    finalPred = []
    for i in range(0, len(predY)):
        if predY[i] == 0:
            finalPred.append(test_prediction_lower[i])
        else:
            finalPred.append(test_prediction_upper[i])

    rmse = rmseEval(testY, finalPred)[1]
    print("\tupper+lower TW rmse: " + str(rmse))
    pred_combined.extend(finalPred)
print(x_valid.shape)
x_test = x_test.reshape((x_test.shape[0], x_test.shape[2]))
print(x_test.shape)

ground = []
test = []
for i in range(y_train.shape[0]):
    ground.append(np.argmax(y_train[i]))
for i in range(y_test.shape[0]):
    test.append(np.argmax(y_test[i]))
# print(ground)

clf = RandomForestClassifier(n_estimators=4, max_depth=2, random_state=0)
# print(clf)
clf.fit(x_train, ground)
# print(clf.feature_importances_)
rf = clf.predict(x_test)
print(rf)
acc_rf = accuracy_score(test, rf)
print(acc_rf)
cm_rf = confusion_matrix(test, rf)
print(cm_rf)

knn = KNeighborsClassifier(n_neighbors=20).fit(x_train, ground)
# accuracy = knn.score(x_test, test)
y_pred = knn.predict(x_test)
acc = accuracy_score(test, y_pred)
    tuned_parameters = [{'n_estimators': [5, 10, 100, 200],
                         'max_features': [1, 3, 9],
                         'max_samples': [1, 5, 9, 21],
                         'random_state': [1, 2, 3, 5]
                         }]
    algo = BaggingClassifier()

elif choice == 'h' or choice == 'H':
    print("\n**********************************\n")
    print(" \t Random Forest")
    tuned_parameters = [{'n_estimators': [5, 10, 100, 200],
                         'criterion': ['gini', 'entropy'],
                         'max_features': ['log2', 'sqrt'],
                         'max_depth': [10, 100]
                         }]
    algo = RandomForestClassifier()

elif choice == 'i' or choice == 'I':
    print("\n**********************************\n")
    print(" \t AdaBoost Classifier")
    tuned_parameters = [{'n_estimators': [5, 10, 50, 100, 200],
                         'learning_rate': [0.1, 0.2, 0.5, 1],
                         'algorithm': ['SAMME', 'SAMME.R'],
                         'random_state': [1, 2, 3, 5]
                         }]
    algo = AdaBoostClassifier()

elif choice == 'j' or choice == 'J':
    print("\n**********************************\n")
    print(" \t Gradient Boosting Classifier")
    tuned_parameters = [{'n_estimators': [5, 10, 50, 100, 200],
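# A sketch of how the selected algo and tuned_parameters grid would typically be consumed
# (an assumption about the rest of this menu-driven script, which is not shown above):
# from sklearn.model_selection import GridSearchCV
# search = GridSearchCV(algo, tuned_parameters, cv=5, scoring='accuracy', n_jobs=-1)
# search.fit(X_train, y_train)
# print("Best parameters:", search.best_params_)
# print("Best CV score:", search.best_score_)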
def RF_trainandtest_kfold(self, nsplit, cv, feature_sel, varthreshold, ntrees,
                          nodes, rfmethod, nclusters=10, cmethod=None):
    data_feature = self.data.loc[:, self.data.columns != 'default']
    data_target = self.data['default']

    # Split the data into k folds; for each fold, that fold is the test set and
    # the remaining data is the training set
    kf = KFold(n_splits=nsplit, shuffle=True)

    predresult = pd.DataFrame()
    for train_index, test_index in kf.split(data_feature):
        X_train, X_test = data_feature.iloc[train_index, ], data_feature.iloc[test_index, ]
        y_train, y_test = data_target.iloc[train_index, ], data_target.iloc[test_index, ]

        # If random sampling leaves only one class in train or test, skip this prediction
        if (len(y_train.unique()) == 1) or (len(y_test.unique()) == 1):
            continue

        # Coarse-bin the training variables and apply the WOE transform,
        # then apply the same binning and WOE transform to the test set
        X_train, X_test = self.binandwoe_traintest(X_train, y_train, X_test, nclusters, cmethod)

        # Feature selection on the training set, using methods from sklearn.feature_selection
        if feature_sel == "VarianceThreshold":
            selector = VarianceThreshold(threshold=varthreshold)
            X_train1 = pd.DataFrame(selector.fit_transform(X_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        elif feature_sel == "RFECV":
            estimator = LogisticRegression()
            selector = RFECV(estimator, step=1, cv=cv)
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        elif feature_sel == "SelectFromModel":
            estimator = LogisticRegression()
            selector = SelectFromModel(estimator)
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        elif feature_sel == "SelectKBest":
            selector = SelectKBest()
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        else:
            X_train1, X_test1 = X_train, X_test

        # Train the chosen forest model and predict on the test fold
        if rfmethod == 'RandomForest':
            classifier = RandomForestClassifier(n_estimators=ntrees,
                                                min_samples_split=nodes * 2,
                                                min_samples_leaf=nodes)
        elif rfmethod == 'ExtraTrees':
            classifier = ExtraTreesClassifier(n_estimators=ntrees,
                                              min_samples_split=nodes * 2,
                                              min_samples_leaf=nodes)
        elif rfmethod == 'GradientBoosting':
            classifier = GradientBoostingClassifier(n_estimators=ntrees,
                                                    min_samples_split=nodes * 2,
                                                    min_samples_leaf=nodes)
        classifier.fit(X_train1, y_train)
        probability = classifier.predict_proba(X_test1)[:, 1]

        temp = pd.DataFrame({'target': y_test, 'probability': probability})
        predresult = pd.concat([predresult, temp], ignore_index=True)

    return predresult
x = iris.data[:, [2, 3]]  # petal.length, petal.width
y = iris.target

# Split into training / test sets.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)  # test_size: hold out 30% of the data for testing (0.3 : 0.7); random_state: fixed random seed.

# Standardisation (scaling) - preprocessing: improves stability and convergence speed,
# and helps guard against overflow / underflow.
sc = StandardScaler()
sc.fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)

#ml = svm.SVC()  # build an SVM model.
ml = RandomForestClassifier(criterion='entropy', n_estimators=10)  # n_estimators: number of decision trees
result = ml.fit(x_train_std, y_train)  # fit the model on the training data.
print(result)

y_pred = ml.predict(x_test_std)  # evaluate the trained model on the test data
print('Actual values : ', y_test)
print('Predicted values : ', y_pred)
print('Total test samples %d, errors %d' % (len(y_test), (y_test != y_pred).sum()))

# Accuracy, method 1
print('Classification accuracy : %.3f' % accuracy_score(y_test, y_pred))

# Accuracy, method 2
import pandas as pd