def test_unigram_all():
    pos_origin_file = '../data/rule_word_origin_pos.txt'
    neg_origin_file = '../data/rule_word_origin_neg.txt'
    pos_stem_file = '../data/rule_word_stem_pos.txt'
    neg_stem_file = '../data/rule_word_stem_neg.txt'
    svm_origin_score = test_unigram(pos_origin_file, neg_origin_file, NuSVC())
    svm_stem_score = test_unigram(pos_stem_file, neg_stem_file, NuSVC())
    bnb_origin_score = test_unigram(pos_origin_file, neg_origin_file, BernoulliNB())
    bnb_stem_score = test_unigram(pos_stem_file, neg_stem_file, BernoulliNB())
    rfc_origin_score = test_unigram(pos_origin_file, neg_origin_file, RandomForestClassifier())
    rfc_stem_score = test_unigram(pos_stem_file, neg_stem_file, RandomForestClassifier())
    if not os.path.exists('../result'):
        os.mkdir('../result')
    with open('../result/unigram_result.txt', 'wt', encoding='utf-8') as f:
        f.write('original word result:\n')
        f.write('\t\t SVM: {0:.3f}%\n'.format(svm_origin_score * 100))
        f.write('\t\t BNB: {0:.3f}%\n'.format(bnb_origin_score * 100))
        f.write('\t\t RFC: {0:.3f}%\n'.format(rfc_origin_score * 100))
        f.write('\n stem result:\n')
        f.write('\t\t SVM: {0:.3f}%\n'.format(svm_stem_score * 100))
        f.write('\t\t BNB: {0:.3f}%\n'.format(bnb_stem_score * 100))
        f.write('\t\t RFC: {0:.3f}%\n'.format(rfc_stem_score * 100))
def data_feature_importance(self, features_list, title="Feature Importance"):
    from sklearn.preprocessing import LabelEncoder
    from sklearn.impute import SimpleImputer          # Imputer was removed from sklearn.preprocessing
    from sklearn.model_selection import train_test_split  # cross_validation was removed from sklearn
    # extract the columns with the features
    clf_data = self.dataframe.loc[:, features_list]
    # preprocess the data: encode the categorical features
    cat_feats_to_use = list(clf_data.select_dtypes(include=object).columns)
    for feat in cat_feats_to_use:
        encoder = LabelEncoder()
        clf_data[feat] = encoder.fit_transform(clf_data[feat])
    # fill in the missing values
    num_feats_to_use = list(clf_data.select_dtypes(exclude=object).columns)
    for feat in num_feats_to_use:
        imputer = SimpleImputer(strategy='median')
        clf_data[feat] = imputer.fit_transform(
            clf_data[feat].values.reshape(-1, 1)).ravel()
    # separate the target from the features
    X = clf_data.iloc[:, 1:]
    y = clf_data.iloc[:, 0]  # the target was the first column included
    # split the received data for training
    x_train, _, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=35)
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    from sklearn.ensemble import RandomForestClassifier
    # initialize the classifier
    clf = RandomForestClassifier(n_estimators=8, random_state=34)
    clf.fit(x_train, y_train)
    # move the importances into a DataFrame so they can be plotted
    feats_imp = pd.DataFrame(clf.feature_importances_, index=X.columns,
                             columns=['FeatureImportance'])
    feats_imp = feats_imp.sort_values('FeatureImportance', ascending=False)
    feats_imp.plot(kind='barh', figsize=(12, 6), legend=False)
    plt.title(title)
    sns.despine(left=True, bottom=True)
    plt.gca().invert_yaxis()
    plt.savefig(self.DefeaultPath + " feature importance.png", dpi=200)
    plt.cla()
    plt.clf()
    return
def plot_tree(profile, group, avg_acc, n_tree, picture):
    '''
    Select the best random seed and build the model.
    :param profile: abundance table
    :param group: grouping table
    :param avg_acc: output file with the accuracy for each random seed
    :param n_tree: number of trees in the model
    :param picture: name of the output figure
    :return: None
    '''
    acc = pd.read_csv(avg_acc, sep='\t', header=0, index_col=0)
    best_state = int(acc.sort_values('avgAcc').index[-1])
    # train and save the prediction model
    rf = RandomForestClassifier(n_estimators=n_tree, max_leaf_nodes=3,
                                random_state=best_state)
    rf.fit(profile, group['label'])
    joblib.dump(rf, 'rf.pkl')
    # plot one classification tree from the forest
    dot = picture.split('.')[0] + '.dot'
    tree_in_forest = rf.estimators_[rf.n_estimators - 1]
    export_graphviz(tree_in_forest, out_file=dot, feature_names=profile.columns,
                    filled=True, rounded=False, precision=100)
    os.system('dot -Tpng {0} -o {1}'.format(dot, picture))
def estimate_weights_random_forests(X_s, X_t, X_w):
    X_all, all_labels = prepare_data_for_weights_estimation(X_s, X_t)
    # train a random forest domain classifier
    kf = KFold(n_splits=10, shuffle=True)
    param_grid_rf = [{
        "n_estimators": np.array([500]),
        "max_depth": np.array([6]),
        # "max_features": np.array([1, 2, 4, 8, 16]),
        "min_samples_leaf": np.array([100])
    }]
    rf = GridSearchCV(RandomForestClassifier(50, max_depth=10,
                                             class_weight="balanced", n_jobs=-1),
                      param_grid_rf, cv=kf, n_jobs=-1)
    # the grid search above is left in place but overridden by fixed parameters
    rf = RandomForestClassifier(100, max_depth=6, min_samples_leaf=200,
                                class_weight="balanced", n_jobs=-1)
    rf.fit(X_all, all_labels)
    # print("best parameters for rf weights determination: ", rf.best_estimator_)
    probas = rf.predict_proba(X_w)
    weights = probas[:, 1] / probas[:, 0]
    return weights
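# Illustrative sketch, not from the original source: the ratio p(target)/p(source)
# returned above is the standard importance weight for covariate shift. Below is a
# minimal, assumed example of how such weights are typically consumed; all data and
# names are synthetic stand-ins.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X_s = rng.normal(size=(500, 4))                # hypothetical source-domain features
y_s = (X_s[:, 0] + X_s[:, 1] > 0).astype(int)  # hypothetical source-domain labels
weights = rng.gamma(2.0, 1.0, size=500)        # stand-in for estimate_weights_random_forests output

weights = np.clip(weights, 0.0, 10.0)          # clip extreme density ratios for stability
clf = LogisticRegression()
clf.fit(X_s, y_s, sample_weight=weights)       # per-sample importance weighting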
def mymap(data, N):
    data = cPickle.loads(str(data))
    x = data[:, :-1]
    y = data[:, -1]
    model = RandomForestClassifier(n_estimators=N, max_depth=6)
    model = model.fit(x, y)
    return cPickle.dumps(model)
def test_imdb_padded_valid(self):
    num_samples = 32
    num_words = 1024
    (x_train, y_train), (x_test, y_test) = TestUtil.get_imdb(
        word_dictionary_size=num_words, num_subsamples=num_samples)
    explained_model = RandomForestClassifier(n_estimators=64, max_depth=5,
                                             random_state=1)
    counter = CountVectoriser(num_words)
    tfidf_transformer = TfidfTransformer()
    explained_model = Pipeline([('counts', counter),
                                ('tfidf', tfidf_transformer),
                                ('model', explained_model)])
    explained_model.fit(x_train, y_train)
    model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                    num_layers=2, num_units=32, activation="relu",
                                    p_dropout=0.2, verbose=0, batch_size=32,
                                    learning_rate=0.001, num_epochs=2,
                                    early_stopping_patience=128)
    masking_operation = WordDropMasking()
    loss = binary_crossentropy
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)
    x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
    x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int,
                           maxlen=x_train.shape[1])
    explainer.fit(x_train, y_train)
    eval_score = explainer.score(x_test, y_test)
    train_score = explainer.get_last_fit_score()
    median = explainer.predict(x_test)
    self.assertTrue(median.shape == x_test.shape)
def __init__(self, initial_forrest_factor=5, n_estimators=10, **kwargs):
    self._initial_forrest_size = n_estimators * initial_forrest_factor
    self._final_forrest_size = n_estimators
    rf_fit_args = copy(kwargs)
    rf_fit_args.update({'n_estimators': self._initial_forrest_size})
    self._rf = RandomForestClassifier(**rf_fit_args)
class MyRfClassifier(BaseClassifier):

    def __init__(self, n_estimators, max_depth, min_samples_leaf):
        self.classifier = RandomForestClassifier(**{
            'verbose': 1,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf,
            'n_jobs': 40
        })
        self.name = "rf_n{n}_md{md}_ms{ms}".format(**{
            "n": n_estimators,
            "md": max_depth,
            "ms": min_samples_leaf
        })

    def get_name(self):
        return self.name

    def fit(self, X, y, X_t, y_t):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        ipts = dict(zip(feat_names, self.classifier.feature_importances_))
        return ipts
class LexicaseForestClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, initial_forrest_factor=5, n_estimators=10, **kwargs):
        self._initial_forrest_size = n_estimators * initial_forrest_factor
        self._final_forrest_size = n_estimators
        rf_fit_args = copy(kwargs)
        rf_fit_args.update({'n_estimators': self._initial_forrest_size})
        self._rf = RandomForestClassifier(**rf_fit_args)

    def fit(self, X, y):
        self._rf.fit(X, y)
        for t in self._rf.estimators_:
            tree_y_pred = t.predict(X)
            t._error_vector = squared_error_vector(y, tree_y_pred)
        final_estimators = []
        for i in range(self._final_forrest_size):
            final_estimators.append(epsilon_lexicase_selection(self._rf.estimators_))
        self._rf.estimators_ = final_estimators
        self._rf.n_estimators = self._final_forrest_size
        # TODO: Set other self._rf parameters to match the reduced size so that predict works.

    def predict(self, X, y=None):
        return self._rf.predict(X)
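# Illustrative usage sketch, not from the original source: it assumes
# squared_error_vector and epsilon_lexicase_selection are defined elsewhere in
# this codebase; the data below is synthetic.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=8, random_state=0)
clf = LexicaseForestClassifier(initial_forrest_factor=5, n_estimators=10)
clf.fit(X, y)           # grows 50 trees, then keeps the 10 chosen by lexicase selection
preds = clf.predict(X)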
def enemy_detection_clf():
    chars = np.array(['warrior', 'warlock', 'mage', 'druid', 'rogue', 'shaman',
                      'paladin', 'priest', 'hunter'])
    data = []
    target = []
    for c in chars:
        p = path('images/character/new/black')
        for f in os.listdir(p + '/' + c):
            img = Image.open(p + '/' + c + '/' + f)
            w, h = img.size
            pixel = img.load()
            tmp = []
            for y in range(h):
                for x in range(w):
                    tmp.append(float(pixel[x, y] / 255))
            target.append(str(c))
            data.append(np.array(tmp))
    data = np.array(data)
    #image = data.view()
    #image.shape = (-1, 22, 30)
    #clf = svm.SVC(gamma = 0.001)
    clf = RandomForestClassifier()
    clf.fit(data, target)
    return clf
def Random_Forest(x_train, Y_train, n_estimators=10, criterion='gini',
                  max_depth=None, min_samples_split=2, min_samples_leaf=1,
                  max_features='auto', bootstrap=True, oob_score=False,
                  random_state=0, min_density=None):
    # min_density is deprecated and intentionally not forwarded
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 max_features=max_features, bootstrap=bootstrap,
                                 oob_score=oob_score, random_state=random_state)
    clf.fit(x_train, Y_train)
    return clf
def Random_Forest(x_train, Y_train, n_estimators=10, criterion='gini',
                  max_depth=None, min_samples_split=2, min_samples_leaf=1,
                  max_features='auto', bootstrap=True, oob_score=False, n_jobs=1,
                  random_state=None, verbose=0, min_density=None,
                  compute_importances=None, *args):
    # min_density and compute_importances are deprecated and intentionally not forwarded
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 max_features=max_features, bootstrap=bootstrap,
                                 oob_score=oob_score, n_jobs=n_jobs,
                                 random_state=random_state, verbose=verbose)
    clf.fit(x_train, Y_train)
    return clf
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print("Training Random Forest Classifier")
    # note: RandomForestClassifier has no learning_rate parameter
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print("Training Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print("Training SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("Saving the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
def test_nlp_erroneous_rnn_args_invalid(self):
    num_words = 1024
    (x_train, y_train), (x_test, y_test) = TestUtil.get_random_variable_length_dataset(
        max_value=num_words)
    explained_model = RandomForestClassifier(n_estimators=64, max_depth=5,
                                             random_state=1)
    counter = CountVectoriser(num_words)
    tfidf_transformer = TfidfTransformer()
    explained_model = Pipeline([('counts', counter),
                                ('tfidf', tfidf_transformer),
                                ('model', explained_model)])
    explained_model.fit(x_train, y_train)
    with self.assertRaises(ValueError):
        # Must also specify the embedding_size argument.
        _ = RNNModelBuilder(with_embedding=True, verbose=0)
    model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                    verbose=0)
    input_layer = Input(shape=(10, 2))
    with self.assertRaises(ValueError):
        model_builder.build(input_layer)
    input_layer = Input(shape=(10, 3))
    with self.assertRaises(ValueError):
        model_builder.build(input_layer)
def __init__(self, path):
    '''
    Constructor
    '''
    self.path = path
    self.model = RandomForestClassifier(n_estimators=150, n_jobs=8)
    self.model_name = 'rf'
def test_RandomForest(self):
    X = [[0, 1], [1, 1]]
    Y = [0, 1]
    regression = RandomForestClassifier(n_estimators=10)
    regression = regression.fit(X, Y)
    regression.predict_proba(X)
def importance():
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features3.csv')
    features = list(df.columns.values)
    target = 'm_column_result'
    # drop ids, labels, and other non-feature columns
    for col in ['m_match_id', 'm_column_result', 'm_match_date', 'm_goals_home',
                'm_goals_away', 'a_next_match_id', 'h_next_match_id', 'm_favorite',
                'm_medium', 'm_underdog', 'h_last_match_local', 'a_last_match_local',
                'rf1000', 'rf1000_fs1']:
        features.remove(col)
    X = df[features]
    y = df[target]
    # fit a random forest model to the data
    clf = RandomForestClassifier(n_estimators=1000)
    clf.fit(X, y)
    # display the relative importance of each attribute
    for x, y in zip(features, clf.feature_importances_):
        print(x, y)
def stkFoldCrossValidation():
    X = pickle.load(open('X.p', 'rb'))
    X = np.array(X)
    Y = pickle.load(open('Y.p', 'rb'))
    Y = np.array(Y)
    skf = StratifiedKFold(n_splits=10)
    skf.get_n_splits(X, Y)
    k = 1
    for train_index, test_index in skf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        print(k)
        k += 1
        rf = RandomForestClassifier()
        rf.fit(X_train, Y_train)
        yp = rf.predict(X_test)
        print(classification_report(Y_test, yp, digits=6))
def rforest_classify(X, Y):
    #clf = RandomForestClassifier(criterion='gini', max_features=7, n_estimators=100, n_jobs=3, min_samples_leaf=5)
    # min_density is deprecated and no longer passed
    clf = RandomForestClassifier(n_estimators=500, criterion='entropy',
                                 max_depth=None, min_samples_split=2,
                                 min_samples_leaf=1, max_features='auto',
                                 bootstrap=False, oob_score=False, n_jobs=-1)
    clf.fit(X, Y)
    return clf
def RF_Features_Importance(X, Y, outputfile="RF.csv"):
    forest = RandomForestClassifier(n_estimators=300)
    forest.fit(X, Y)
    importances = np.matrix(forest.feature_importances_).tolist()[0]
    df = pd.DataFrame(list(zip(header, importances)),
                      columns=["Features", "Importance"])
    df.to_csv(outputfile, index=False)
def forest(X, y, model_path):
    model = RandomForestClassifier()
    model.fit(X, y)
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
def __init__(self, sig_weight=1., pow_sig=1., pow_bg=1., n_estimators=10,
             criterion="gini", max_depth=None, min_samples_split=2,
             min_samples_leaf=1, max_features="auto", bootstrap=True,
             oob_score=False, n_jobs=1, random_state=None, verbose=0,
             min_density=None, compute_importances=None):
    RandomForestClassifier.__init__(self)
    # Everything should be set via set_params
    self.sig_weight = sig_weight
    self.pow_bg = pow_bg
    self.pow_sig = pow_sig
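# Illustrative sketch, not from the original source: the set_params flow the
# comment above relies on. scikit-learn exposes every constructor argument of an
# estimator through set_params/get_params, so a bare __init__ can defer all
# configuration to a later call.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()                 # defaults only, as in the __init__ above
clf.set_params(n_estimators=200, max_depth=5)  # configure after construction
assert clf.get_params()['n_estimators'] == 200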
def calcRandomForestClassifier(channels_training, channels_testing,
                               target_training, target_testing):
    clf = RandomForestClassifier(n_estimators=500,
                                 max_features=int(sqrt(len(channels_training[0]))))
    clf = clf.fit(channels_training, target_training)
    predictions = clf.predict(channels_testing)
    comp = [predictions, target_testing, channels_testing]
    return clf, comp
def calc_score(test, train):
    test_f, test_l = split_data_label(test)
    train_f, train_l = split_data_label(train)
    # train the classifier and compute its accuracy
    clf = RandomForestClassifier()
    clf.fit(train_f, train_l)
    pre = clf.predict(test_f)
    return metrics.accuracy_score(test_l, pre)
def train_rf(train_vec, train_label):
    from sklearn.ensemble import RandomForestClassifier as RFC
    # rfrclf = RFR(n_estimators=1001)
    # rfrclf.fit(train_vec, train_label)
    # print(rfrclf.feature_importances_)
    trfclf = RFC(n_estimators=1001)
    trfclf.fit(train_vec, train_label)
    # print(trfclf.feature_importances_)
    return trfclf
def drawfeature(train_data_path, train_file_name, test_data_path, test_file_name):
    train_file = os.path.join(train_data_path, train_file_name)
    train_data = pd.read_csv(train_file)
    n_data_train = train_data['text'].size
    print('n_data_train is %s' % n_data_train)
    print(type(n_data_train))

    test_file = os.path.join(test_data_path, test_file_name)
    test_data = pd.read_csv(test_file)
    n_data_test = test_data['text'].size
    print('n_data_test is %s' % n_data_test)
    print(type(n_data_test))

    vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                                 stop_words=None, max_features=5000)
    transformer = TfidfTransformer()

    train_data_words = []
    print('start with words in train data set')
    for i in range(n_data_train):
        if (i + 1) % 1000 == 0:
            print('Drawfeatures line %d of %d' % (i + 1, n_data_train))
        train_data_words.append(words_to_features(train_data['text'][i]))

    print('start bag of words in train data....')
    train_data_features = vectorizer.fit_transform(train_data_words)
    train_data_features = train_data_features.toarray()
    print('start tfidf in train data....')
    train_data_features = transformer.fit_transform(train_data_features)
    train_data_features = train_data_features.toarray()

    # test-data processing
    test_data_words = []
    for i in range(n_data_test):
        if (i + 1) % 1000 == 0:
            print('Drawfeatures line %d of %d' % (i + 1, n_data_test))
        test_data_words.append(words_to_features(test_data['text'][i]))
    # transform (not fit_transform) so the test set shares the train vocabulary,
    # and apply the same tf-idf weighting as the training set
    test_data_features = vectorizer.transform(test_data_words)
    test_data_features = test_data_features.toarray()
    test_data_features = transformer.transform(test_data_features)
    test_data_features = test_data_features.toarray()

    print('random forest go...')
    forest = RandomForestClassifier(n_estimators=13)
    forest = forest.fit(train_data_features, train_data['label'])
    pred = forest.predict(test_data_features)
    pred = pd.Series(pred, name='Target')
    pred.to_csv('SENTI_RF.CSV', index=None, header=None)

    print('naive bayes go...')
    mnb = MultinomialNB(alpha=0.01)
    mnb = mnb.fit(train_data_features, train_data['label'])
    pred = mnb.predict(test_data_features)
    pred = pd.Series(pred, name='Target')
    pred.to_csv('SENTI_MNB', index=None, header=True)
def main(args):
    if args.analyse is not None:
        train_data_x, test_data_x, train_data_y, test_data_y = process_data(args.analyse)
        RT = RandomForestClassifier(n_estimators=100)
        RT.fit(train_data_x, train_data_y)
        print(RT.score(test_data_x, test_data_y))
        return
def random_forest_classifier(features, target):
    """
    Train a random forest classifier on the given features and target data.
    :param features: feature matrix used for training
    :param target: label vector used for training
    :return: trained random forest classifier
    """
    clf = RandomForestClassifier(n_estimators=600, max_depth=50)
    clf.fit(features, target)
    return clf
def my_digits():
    digits = _data()
    n_samples = len(digits.images)
    datas = digits.images.reshape((n_samples, -1))
    classifier = RandomForestClassifier()
    classifier.fit(datas, digits.target)
    return classifier
def test_unigram_all():
    # test_unigram(NuSVC(), 'SVM', '../data/pos_origin.txt', '../data/neg_origin.txt', 'origin')
    # test_unigram(NuSVC(), 'SVM', '../data/pos_stem.txt', '../data/neg_stem.txt', 'stem')
    test_unigram(BernoulliNB(), 'BNB', '../data/pos_origin.txt', '../data/neg_origin.txt', 'origin')
    test_unigram(BernoulliNB(), 'BNB', '../data/pos_stem.txt', '../data/neg_stem.txt', 'stem')
    test_unigram(RandomForestClassifier(), 'RFC', '../data/pos_origin.txt', '../data/neg_origin.txt', 'origin')
    test_unigram(RandomForestClassifier(), 'RFC', '../data/pos_stem.txt', '../data/neg_stem.txt', 'stem')
def RandomForestClassifer(self):
    '''
    Fit a random forest classifier and predict on the test set.
    '''
    train_Array = self.titanic_train_frame.values
    self.test_Array = self.titanic_test_frame.values
    randomForest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    randomForest.fit(train_Array[0::, 1::], train_Array[0::, 0])
    self.predicted_probability = randomForest.predict(self.test_Array[0::, 0::])
    self.predicted_probability_list = self.predicted_probability.tolist()
class RFClassifier(super.abstract_classifier):

    def __init__(self, train_features, train_labels, num_of_trees):
        self.train_features = train_features
        self.train_labels = train_labels
        self.rf_member = RandomForestClassifier(num_of_trees)

    def train(self):
        self.rf_member.fit(self.train_features, self.train_labels)

    def classify(self, newVector):
        return self.rf_member.predict(newVector)
def evalOne(enabledColumns):
    features = [all_features[i] for i in range(0, len(all_features)) if enabledColumns[i]]
    Y = []
    P = []
    for group in range(0, 5):
        # print("Test group " + str(group + 1))
        trainStationList = []
        testStationList = []
        for i in range(0, 5):
            if i == group:
                testStationList.extend(groups[i])
            else:
                trainStationList.extend(groups[i])
        trainStations = set(float(station) for station in trainStationList)
        # reorder train stations
        # print("\ttrainStationList:" + str(trainStationList))
        trainStationList = [s for s in all_stations if float(s) in trainStations]
        # print("\ttrainStationList:" + str(trainStationList))
        testStations = set(float(station) for station in testStationList)
        # print("\ttestStationList:" + str(testStationList))
        trainX, testX, trainY, testY, trainLocation, testLocation = splitDataForXValidationWithLocation(
            trainStations, testStations, "location", data, features, "target")
        train_lower = [float(trainStationList[i]) for i in range(0, len(trainStationList))
                       if i < (len(trainStationList) / 2.0)]
        # train_upper = [float(trainStationList[i]) for i in range(0, len(trainStationList)) if i >= (len(trainStationList) / 2.0)]
        test_lower = [float(testStationList[i]) for i in range(0, len(testStationList))
                      if i < (len(testStationList) / 2.0)]
        # test_upper = [float(testStationList[i]) for i in range(0, len(testStationList)) if i >= (len(testStationList) / 2.0)]
        trainY = []
        for l in trainLocation:
            if l in train_lower:
                trainY.append(0)
            else:
                trainY.append(1)
        testY = []
        for l in testLocation:
            if l in test_lower:
                testY.append(0)
            else:
                testY.append(1)
        model = RandomForestClassifier(random_state=42, n_estimators=20,
                                       max_depth=9, n_jobs=-1)
        model.fit(trainX, trainY)
        predY = model.predict(testX)
        Y.extend(testY)
        P.extend(predY)
    f1 = f1_score(Y, P)
    accuracy = accuracy_score(Y, P)
    return f1, accuracy
def do_training(processed_train_csv_file):
    ## Processed train samples reading
    # read saved processed train samples from the given csv file
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    # inf to nan
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    # nan to 0
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # the rows were sorted earlier; shuffling them here gives better results
    random.shuffle(processed_train_samples_index_lst)
    # organize new train samples and targets
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier
    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2, n_jobs=-1,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("Saved the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
def train_classifier(vocal_frames, non_vocal_frames):
    frames = np.append(vocal_frames, non_vocal_frames, axis=0)
    labels_vocal = np.ones(vocal_frames.shape[0])
    labels_non_vocal = np.zeros(non_vocal_frames.shape[0])
    labels = np.append(labels_vocal, labels_non_vocal, axis=0)
    rfc = RandomForestClassifier(n_estimators=100, max_depth=None)
    rfc.fit(frames, labels)
    return rfc
def RF(pth):
    train_desc = np.load(pth + '/training_features.npy')
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)),
                   'float32')
    # Scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)
    modelRF = RandomForestClassifier(n_estimators=10, max_depth=5, max_features=1,
                                     random_state=0)
    modelRF.fit(train_desc, np.array(train_labels))
    joblib.dump((modelRF, img_classes, stdSlr), pth + "/rf-bof.pkl", compress=3)
    test(pth, "rf-")
def train(self, log_probs, labels):
    '''
    Train the stacked classifier
    @param log_probs: list of probability matrices (channels, label)
    @param labels: label for each feature-vector
    '''
    print('Train stacked classifier with %d windows' % labels.shape[0])
    log_probs = [self.channel_sort.sort(f) for f in log_probs]
    log_probs = np.vstack(log_probs)
    # TODO: classifier as parameter
    self.stacked_classifier = RandomForestClassifier(n_estimators=10)
    self.stacked_classifier.fit(log_probs, labels)
def train_model(X_train, y_train):
    print("training the model ...")
    # create sets for probability calibration
    X_train_train, X_prob_cal, y_train_train, y_prob_cal = train_test_split(
        X_train, y_train, test_size=0.2)
    rf = RandomForestClassifier(
        max_features="auto",
        n_estimators=2000,
        max_depth=8,
        n_jobs=-1,
        class_weight='balanced',
        verbose=1)
    rf.fit(X_train_train, y_train_train)

    # feature importances
    # feature_importance = False
    # if feature_importance:
    #     importances = rf.feature_importances_
    #     std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
    #     indices = np.argsort(importances)[::-1]
    #     col_names = df.drop('bin', axis=1).columns.values
    #     print("Feature ranking:")
    #     for f in range(X_train_train.shape[1]):
    #         print("%d. %s (%f)" % (f + 1, col_names[indices[f]], importances[indices[f]]))
    #     # Plot the feature importances of the forest
    #     plt.figure()
    #     plt.title("Feature importances")
    #     plt.bar(range(X_train_train.shape[1]), importances[indices],
    #             color="r", yerr=std[indices], align="center")
    #     plt.xticks(range(X_train_train.shape[1]), col_names[indices], rotation=50)
    #     plt.xlim([-1, X_train_train.shape[1]])
    #     plt.show()

    # Probability calibration on the held-out calibration set
    sig_clf = CalibratedClassifierCV(rf, method="sigmoid", cv="prefit")
    sig_clf.fit(X_prob_cal, y_prob_cal)
    y_pred_train = sig_clf.predict_proba(X_train)
    print(".. training log_loss : {:0.2f} %".format(log_loss(y_train, y_pred_train) * 100))
    return sig_clf
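# Illustrative follow-up, not from the original source: the calibrated
# classifier returned by train_model behaves like any scikit-learn classifier.
# The data below is synthetic, and the call assumes train_model's own imports
# are in scope.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

sig_clf = train_model(X_tr, y_tr)      # the function defined above
proba = sig_clf.predict_proba(X_te)    # sigmoid-calibrated probabilities
print(proba[:3])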
class Model(BaseModel):
    """Antares implementation of the scikit-learn random forest classifier
    """
    def __init__(self, categorical_features=None, n_estimators=50, n_jobs=-1,
                 max_depth=10):
        '''
        Example:
            >>> from madmex.modeling.supervised.rf import Model
            >>> rf = Model()
            >>> # Write model to db
            >>> rf.to_db(name='test_model', recipe='mexmad', training_set='no')
            >>> # Read model from db
            >>> rf2 = Model.from_db('test_model')
        '''
        super().__init__(categorical_features=categorical_features)
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            n_jobs=n_jobs, max_depth=max_depth)
        self.model_name = 'rf'

    def fit(self, X, y):
        X = self.hot_encode_training(X)
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        X = self.hot_encode_predict(X)
        return self.model.predict(X)

    def predict_confidence(self, X):
        """Get the confidence of every prediction
        """
        X = self.hot_encode_predict(X)
        return self.model.predict_proba(X).max(axis=1)

    def score(self, X, y):
        '''
        Test the model given a dataset and a target vector.

        This method applies the model that this object represents to the given
        dataset using the response variable y. It is a measure of the accuracy
        of the trained model. Usually the original dataset should be split into
        training and testing subsets to cross validate the model.
        '''
        return self.model.score(X, y)
def classic_model(image_dir, image_lists, method):
    X, y = get_X_y(image_dir, image_lists, ['training', 'validation'], method)
    classifier = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    classifier.fit(X, y)
    X_test, y_test = get_X_y(image_dir, image_lists, ['testing'], method)
    predictions = classifier.predict(X_test)
    confusion = pandas.crosstab(y_test, predictions, rownames=['Actual Class'],
                                colnames=['Predicted Class'])
    print(confusion)
    return accuracy_score(y_test, predictions)
def __init__(self, n_estimators, max_depth, min_samples_leaf):
    self.classifier = RandomForestClassifier(**{
        'verbose': 1,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'n_jobs': 40
    })
    self.name = "rf_n{n}_md{md}_ms{ms}".format(**{
        "n": n_estimators,
        "md": max_depth,
        "ms": min_samples_leaf
    })
def try_model(train):
    print(train.shape)
    features = ["phone_brand", "device_model", "event_count", "action_radius_max",
                "medianTime", "minTime", "maxTime", "weekday", "appcounts1"]
    encoder = LabelEncoder()
    train["group"] = encoder.fit_transform(train["group"].values)
    rf = RandomForestClassifier(n_estimators=50, max_depth=15, max_features=6,
                                bootstrap=True, n_jobs=4, random_state=2016,
                                class_weight=None)
    rf.fit(train[features].values, train["group"].values)
    feature_importance(rf, features)
    skf = StratifiedKFold(train["group"].values, n_folds=5, shuffle=True,
                          random_state=2016)
    scores = cross_val_score(rf, train[features].values, train["group"].values,
                             scoring="log_loss", cv=skf, n_jobs=1)
    print(scores)
    print("RF Score: %0.5f" % (-scores.mean()))
    # RF Score: 2.39884
def just_pred(x, y):
    xlen = len(x)
    idx = list(range(xlen))
    np.random.shuffle(idx)
    # apply the shuffled order before splitting into train and test
    x = x.iloc[idx]
    y = y.iloc[idx]
    trainpct = 0.7
    trainlen = int(trainpct * xlen)
    testlen = xlen - trainlen
    xtrain = x.iloc[:trainlen, :]
    ytrain = y.iloc[:trainlen]
    xtest = x.iloc[trainlen:, :]
    ytest = y.iloc[trainlen:]
    rf = RandomForestClassifier()
    rf.fit(xtrain, ytrain)
    ypred = rf.predict(xtest)
    return ytest, ypred
def crossval(x, y, k=5):
    for _ in range(k):
        # shuffle the rows before each split
        idx = list(range(len(x)))
        np.random.shuffle(idx)
        x = x.iloc[idx]
        y = y.iloc[idx]
        xlen = len(x)
        trainpct = 0.7
        trainlen = int(trainpct * xlen)
        testlen = xlen - trainlen
        xtrain = x.iloc[:trainlen, :]
        ytrain = y.iloc[:trainlen]
        xtest = x.iloc[trainlen:, :]
        ytest = y.iloc[trainlen:]
        rf = RandomForestClassifier()
        rf.fit(xtrain, ytrain)
        ypred = rf.predict(xtest)
        print(ypred)
def __init__(self, n_estimators, max_depth, min_samples_leaf):
    self.classifier = RandomForestClassifier(**{
        'verbose': 1,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'n_jobs': 40
    })
    self.name = "rf_n{n}_md{md}_ms{ms}".format(
        **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
    )
def __init__(self, verbose=1, n_estimators=2000, max_depth=8,
             min_samples_leaf=10000, n_jobs=25):
    self.classifier = RandomForestClassifier(**{
        'verbose': verbose,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'n_jobs': n_jobs
    })
    self.name = "rf_n{n}_md{md}_ms{ms}".format(
        **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
    )
class MyRfClassifier(BaseClassifier):

    def __init__(self, n_estimators, max_depth, min_samples_leaf):
        self.classifier = RandomForestClassifier(**{
            'verbose': 1,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf,
            'n_jobs': 40
        })
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )

    def get_name(self):
        return self.name

    def fit(self, X, y, X_t, y_t):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        ipts = dict(zip(feat_names, self.classifier.feature_importances_))
        return ipts
class MyRandomForestClassifier(BaseClassifier):

    def __init__(self, verbose=1, n_estimators=2000, max_depth=8,
                 min_samples_leaf=10000, n_jobs=25):
        self.classifier = RandomForestClassifier(**{
            'verbose': verbose,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf,
            'n_jobs': n_jobs
        })
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        return self.classifier.feature_importances_
def initDecTrees(self, path):
    for filename in os.listdir(path):
        if filename == 'train.csv':
            with open(os.path.join(path, filename)) as infile:
                f = csv.reader(infile)
                aux = next(f)  # skip the header
                x = []
                y = []
                for line in f:
                    if len(line) > 1:
                        if self.option == 1:
                            data = [converter(line[2]), converter(line[3]),
                                    converter(line[4]), converter(line[7]),
                                    converter(line[9])]
                            y.append(converter(line[6]))
                            x.append(data)
                        elif self.option == 2:
                            auxDeputy = fetchDeputyParty(line[2])
                            data = [converter(line[2]), converter(line[3]),
                                    converter(line[4]), converter(line[7]),
                                    converter(line[9]),
                                    encodeParty(auxDeputy['party']),
                                    encodeState(auxDeputy['state'])]
                            y.append(converter(line[6]))
                            x.append(data)
                clf = RandomForestClassifier(n_estimators=5)
                clf.fit(x, y)
                return clf
def drawfeature(train_data_path='./train', train_filename='train_cleaned',
                test_data_path='./test', test_filename='test_cleaned'):
    train_file = os.path.join(train_data_path, train_filename)
    train_data = pd.read_csv(train_file)
    n_train_data = train_data['text'].size
    test_file = os.path.join(test_data_path, test_filename)
    test_data = pd.read_csv(test_file)
    n_test_data = test_data['text'].size

    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                                 stop_words=None, max_features=2000)
    transformer = TfidfTransformer()

    train_data_words = []
    for i in range(n_train_data):
        train_data_words.append(words_to_features(train_data['text'][i]))
    train_data_features = vectorizer.fit_transform(train_data_words)
    train_data_features = train_data_features.toarray()
    train_data_features = transformer.fit_transform(train_data_features)
    train_data_features = train_data_features.toarray()
    # a 2-D feature matrix needs a DataFrame, not a Series
    train_data_pd = pd.DataFrame(train_data_features)
    train_data_pd.to_csv("trainfeature.csv", index=None, header=True)

    test_data_words = []
    for i in range(n_test_data):
        test_data_words.append(words_to_features(test_data['text'][i]))
    # transform (not fit_transform) so the test set shares the train vocabulary
    test_data_features = vectorizer.transform(test_data_words)
    test_data_features = test_data_features.toarray()
    test_data_features = transformer.transform(test_data_features)
    test_data_features = test_data_features.toarray()
    test_data_pd = pd.DataFrame(test_data_features)
    test_data_pd.to_csv("testfeature.csv", index=None, header=True)

    forest = RandomForestClassifier(n_estimators=60)
    forest = forest.fit(train_data_features, train_data['lable'])
    pred = forest.predict(test_data_features)
    pred = pd.Series(pred, name='Target')
    pred.to_csv("bow_tfidf_RF.csv", index=None, header=True)
audit_y = audit_y.astype(int)

print(audit_X.dtype, audit_y.dtype)

def predict_audit(classifier):
    adjusted = DataFrame(classifier.predict(audit_X), columns=["Adjusted"])
    adjusted_proba = DataFrame(classifier.predict_proba(audit_X),
                               columns=["probability_0", "probability_1"])
    return pandas.concat((adjusted, adjusted_proba), axis=1)

audit_tree = DecisionTreeClassifier(random_state=13, min_samples_leaf=5)
audit_tree.fit(audit_X, audit_y)

store_pkl(audit_tree, "DecisionTreeAudit.pkl")
store_csv(predict_audit(audit_tree), "DecisionTreeAudit.csv")

audit_forest = RandomForestClassifier(random_state=13, min_samples_leaf=5)
audit_forest.fit(audit_X, audit_y)

store_pkl(audit_forest, "RandomForestAudit.pkl")
store_csv(predict_audit(audit_forest), "RandomForestAudit.csv")

audit_regression = LogisticRegression()
audit_regression.fit(audit_X, audit_y)

store_pkl(audit_regression, "RegressionAudit.pkl")
store_csv(predict_audit(audit_regression), "RegressionAudit.csv")

#
# Multi-class classification
#
                                     test_size=0.25, random_state=666)
train_ind = X_train.index.values
test_ind = X_test.index.values
g_train = g.iloc[train_ind, :]
g_test = g.iloc[test_ind, :]

clf = tree.DecisionTreeClassifier(criterion='gini', max_depth=6,
                                  min_samples_leaf=3)
####################
clf = RandomForestClassifier(criterion='gini', max_depth=6,
                             min_samples_leaf=3, n_estimators=50)
####################
clf = AdaBoostClassifier(DecisionTreeClassifier(criterion='gini', max_depth=6,
                                                min_samples_leaf=3),
                         n_estimators=200, learning_rate=0.1)
####################
clf = neighbors.KNeighborsClassifier(100, weights='uniform')
clf = neighbors.KNeighborsClassifier(100, weights='distance')
####################
clf = GaussianNB()
##############################
t0 = time()
param_grid = {'C': [150, 500, 750, 1000],
### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii] == 0]
bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii] == 0]
grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii] == 1]
bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii] == 1]

#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")

### your code here!  name your classifier object clf if you want the
### visualization code below to draw its decision boundary
clf = RandomForestClassifier()
clf.fit(features_train, labels_train)

### draw the decision boundary with the test points overlaid
prettyPicture(clf, features_test, labels_test)
plt.show()
################################################################################
def runns(resp_var, size_of_test_data, dataset, positive_class, n_estimators,
          important_features, dealing_with_nulls):
    dataset = pd.read_csv('raw_data.csv', low_memory=False)  # For testing purposes
    # ----DATA PREPROCESSING
    # -------dealing with NULL values in the data
    # ----------remove the rows in which the response is null
    dataset = dataset.dropna(subset=[resp_var])
    # ----------dealing with nulls
    dataset = deal_with_nulls(dealing_with_nulls, dataset)
    # ----FEATURE SELECTION
    # -------get predictors important in predicting the response
    # -----------transform categorical predictors to dummy variables
    predictors = dataset.drop(resp_var, axis=1, inplace=False)
    predictors = pd.get_dummies(predictors)
    # -----------balance the classes in the response var
    ros = RandomOverSampler(random_state=0)
    resp = dataset[resp_var]
    prds, resp = ros.fit_resample(predictors, resp)  # fit_sample was renamed in imbalanced-learn
    # -----------fit the random forest classifier to give us the important predictors
    rf_clf = RandomForestClassifier(n_estimators=n_estimators)
    rf_clf.fit(prds, resp)
    # -------get the important predictors
    feature_imp = pd.Series(rf_clf.feature_importances_,
                            index=list(predictors.iloc[:, 0:])).sort_values(ascending=False)
    # -------names of the important predictors
    important_predictor_names = feature_imp.index[0:important_features]
    # -------subset the data to get only the important predictors and the response
    resp = pd.DataFrame(data=resp, columns=[resp_var])
    predictors = pd.DataFrame(prds, columns=list(predictors))
    dataset = pd.concat([resp, predictors], axis=1)
    # ---------------------------------------------------------
    # ----MODEL TRAINING
    # --------Remove the response variable from the feature variables - axis 1 refers to the columns
    m_data = dataset.drop(resp_var, axis=1, inplace=False)
    # Response variables are the values we want to predict
    resp_var = np.array(dataset[resp_var])
    dataset = pd.get_dummies(m_data)
    # Saving feature names for later use
    feature_list = list(m_data.columns)
    # Convert to numpy array
    dataset = np.array(dataset)
    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(
        dataset, resp_var, test_size=size_of_test_data, random_state=402)
    # Instantiate the final model (an RBF-kernel SVC with probability estimates)
    clf = SVC(kernel='rbf', probability=True)
    # Train the model on training data
    clf.fit(train_features, train_labels)
    # evaluation
    predicted = clf.predict(test_features)
    pred_prob = clf.predict_proba(test_features)
    accuracy = accuracy_score(test_labels, predicted)
    # confusion matrix
    cnf = confusion_matrix(test_labels, predicted)
    # precision score
    precision = precision_score(test_labels, predicted, pos_label=positive_class)
    # average precision
    avg_precision = average_precision_score(test_labels, pred_prob[:, [1]])
    # recall score
    rec = recall_score(test_labels, predicted, pos_label=positive_class)
    # f1 score
    fscore = f1_score(test_labels, predicted, pos_label=positive_class)
    # fbeta score
    fbeta = fbeta_score(test_labels, predicted, beta=0.5)
    # hamming loss
    hamming = hamming_loss(test_labels, predicted)
    # jaccard similarity score
    jaccard = jaccard_similarity_score(test_labels, predicted)
    # log loss
    logloss = log_loss(test_labels, predicted)
    # zero-one loss
    zero_one = zero_one_loss(test_labels, predicted)
    # area under the ROC curve
    area_under_roc = roc_auc_score(test_labels, pred_prob[:, [1]])
    # Cohen's kappa
    cohen = cohen_kappa_score(test_labels, predicted)
    # Matthews correlation coefficient
    mathews = matthews_corrcoef(test_labels, predicted)
    # Variable importances from the important features selection stage
    variable_importance_list = list(zip(prds, feature_imp))
    output = {"accuracy": accuracy, "precision": precision,
              "average precision": avg_precision, "recall": rec,
              "fscore": fscore, "fbeta": fbeta, "hamming": hamming,
              "jaccard": jaccard, "logloss": logloss, "zero_one": zero_one,
              "area_under_roc": area_under_roc, "cohen": cohen,
              "mathews": mathews}
    output = json.dumps(output)
    return jsonify({"Predictions": output})
    #'Date', 'WnvPresent_DateTrapSpecies'], axis=1)

    # Create dfs based on mask
    X_train = X[~msk]
    X_test = X[msk]
    y_train = y[~msk]
    y_test = y[msk]

    return X_train, X_test, y_train, y_test

# Create classifiers
clf = RandomForestClassifier(n_estimators=500, min_samples_leaf=5)
clf = xgbwrapper.XgbWrapper({'objective': 'binary:logistic',
                             'eval_metric': 'auc',
                             'eta': 0.05,
                             'silent': 1})

# Cross validation
if do_cross_val:
    # Leave-one-year-out cross-validation
    scores = []
    total_pred = np.array([])
    total_test = np.array([])
def model_pred(trainX, trainY, testX, model_type):
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators=500, n_jobs=20)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    if model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6, learning_rate=0.9,
                                         random_state=0)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    if model_type == "fusion":
        prob = np.zeros(len(testX))
        params = [100, 200, 300, 400, 500]
        for param in params:
            clf = RandomForestClassifier(n_estimators=param, n_jobs=20,
                                         bootstrap=True)
            clf.fit(trainX, trainY)
            prob += clf.predict(testX)
        '''
        params = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        for param in params:
            clf = GradientBoostingClassifier(n_estimators=param, learning_rate=0.9, random_state=0)
            clf.fit(trainX, trainY)
            prob += clf.predict(testX)
        '''
        # majority vote: positive if at least 3 of the 5 forests predict positive
        pred = list(prob >= 3)
    print("the pos rate is:", float(sum(pred)) / len(pred))
    return pred
    if 'FIGURES' in line:
        break
    vals = line.strip().split('\t')
    text = vals[2]
    corpus_test.append(text)
    if int(vals[0]) == 0:
        y_test.append('0')
    else:
        y_test.append('1')

X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)

clf = RandomForestClassifier(n_estimators=10)
#clf = KNeighborsClassifier(n_neighbors=10)
#clf = LinearSVC()
clf.fit(X_train, y_train)
print(len(y_train))
print(len(y_test))
pred = clf.predict(X_test)
#pred = ['0'] * len(y_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
total.append(score)
results = {
    'problem': [],
    'method': [],
    'score': []
}

if len(sys.argv) > 1 and sys.argv[1] == '--skip-train':
    results = pd.read_csv("./data/results.csv")
else:
    for classification_dataset in classification_dataset_names:
        print("Starting", classification_dataset)
        X, y = fetch_data(classification_dataset, return_X_y=True,
                          local_cache_dir='./data/')
        train_X, test_X, train_y, test_y = train_test_split(X, y)

        rf = RandomForestClassifier()
        lexRF = LexicaseForestClassifier()

        rf.fit(train_X, train_y)
        lexRF.fit(train_X, train_y)

        rf_score = rf.score(test_X, test_y)
        lexRF_score = lexRF.score(test_X, test_y)

        results['problem'] = results['problem'] + ([classification_dataset] * 2)
        results['method'] = results['method'] + ['RF', 'LexRF']
        results['score'].append(rf_score)
        results['score'].append(lexRF_score)
    results = pd.DataFrame(results)
    results.to_csv("./data/results.csv", index=False)