def getRandomForestClf(self, X, Y, param_list):
    clfName = "Random_Forest"
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    clf = rf(n_estimators=300, max_depth=None, min_samples_split=2,
             random_state=0, bootstrap=True, oob_score=True)

    if self._gridSearchFlag:
        log(clfName + " start searching param...")
        tmpLowDepth = 8
        tmpHighDepth = 30

        param_dist = {
            "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
            "max_features": sp_randf(0, 1),
            "min_samples_split": sp_randint(2, 11),
            "min_samples_leaf": sp_randint(1, 11),
            "criterion": ["gini", "entropy"],
            "n_estimators": sp_randint(5, 12),
        }

        clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)
    else:
        if param_list is not None:
            clf = rf()
            clf.set_params(**param_list)
        clf.fit(X, Y)

    return clf

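# doRandomSearch is referenced above but defined elsewhere in that class. A minimal
# sketch of what such a helper could look like, assuming it wraps scikit-learn's
# RandomizedSearchCV (the n_iter_search parameter is hypothetical; log() is the same
# helper used above):
from sklearn.model_selection import RandomizedSearchCV


def doRandomSearch(self, clfName, clf, param_dist, X, Y, n_iter_search=20):
    # Sample n_iter_search parameter settings from param_dist, score each with
    # 5-fold cross-validation, and return the refit best estimator.
    search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                n_iter=n_iter_search, cv=5, random_state=0)
    search.fit(X, Y)
    log(clfName + " best params: " + str(search.best_params_))
    return search.best_estimator_
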
def train():
    # Training the data using the Random Forest algorithm
    from sklearn.ensemble import RandomForestRegressor as rf

    df = pd.read_csv(training_data)
    df_ = df[include]
    df_ = df_.dropna()

    # One-hot encoding categorical variables
    categoricals = []
    for col, col_type in df_.dtypes.items():
        if col_type == 'O':
            categoricals.append(col)
        else:
            # Fill NaNs with 0
            df_[col].fillna(0, inplace=True)

    # get_dummies() effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    # Capture a list of columns that will be used for prediction
    global model_columns
    model_columns = list(x.columns)
    joblib.dump(model_columns, model_columns_file_name)

    global clf
    clf = rf()
    clf.fit(x, y)
    joblib.dump(clf, model_file_name)

    global predicted_value
    predicted_value = -1

    return redirect(url_for('main'))

def train(df):
    df_ = df[include]
    print("Training data sample:\n", df_.head())

    categoricals = []  # going to one-hot encode categorical variables
    for col, col_type in df_.dtypes.items():
        if col_type == 'O':
            categoricals.append(col)
        else:
            df_[col].fillna(0, inplace=True)  # fill NA's with 0 for ints/floats, too generic

    # get_dummies effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    # capture a list of columns that will be used for prediction
    model_columns = list(x.columns)
    print("Model columns are", model_columns)

    model = rf()
    start = time.time()
    model.fit(x, y)
    print('Trained in %.1f seconds' % (time.time() - start))
    print('Model training score: %s' % model.score(x, y))

    return model_columns, model

def buildModel(self):
    from sklearn.linear_model import LogisticRegression

    df = pd.read_csv('data/explodedNoOpenings.csv')
    numbers = [str(i) for i in range(8)]
    cols = [
        col for col in df.columns if col not in [
            'white_rating', 'opening', 'black_rating', 'id', 'victory_type',
            "Unnamed: 0"
        ]
    ]
    print(cols)
    cols = [col for col in cols if col[0] not in numbers]

    # one-hot encode the winner column
    features_to_concat = [df]
    features_to_concat.append(pd.get_dummies(df['winner'], prefix='winner'))
    df = pd.concat(features_to_concat, axis=1)

    cols = [
        col for col in cols
        if col not in ['winner', 'target', 'winner_white', 'winner_black']
    ]
    print(df[cols].shape)

    clf = rf().fit(df[cols], df['winner_white'])
    joblib.dump(clf, 'data/model.pikl')

    import json
    with open('data/features', 'w') as f:
        f.write(json.dumps({'features': cols}))
    print(cols)

def save_model():
    # Read the data
    df = pd.read_csv('train.csv')
    include = ['Age', 'Sex', 'Embarked', 'Survived']
    df_ = df[include]  # only using 4 variables

    categoricals = []
    for col, col_type in df_.dtypes.items():
        if col_type == 'O':
            categoricals.append(col)
        else:
            df_[col].fillna(0, inplace=True)

    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    # using a random forest classifier (can be any classifier)
    dependent_variable = 'Survived'
    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    print(x.head())
    y = df_ohe[dependent_variable]

    clf = rf()
    clf.fit(x, y)

    # Save the model
    dump(clf, model_file_name)

    # Save the model columns
    global model_columns
    model_columns = list(x.columns)
    dump(model_columns, model_columns_file_name)

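# Companion to save_model() above: a minimal sketch of loading the persisted
# classifier and column list back and scoring new rows. It assumes joblib-style
# dump/load for persistence and that query is a DataFrame with the raw 'Age',
# 'Sex', 'Embarked' columns; reindexing against model_columns is the usual way
# to line the incoming one-hot columns up with the training-time layout.
from joblib import load


def predict_survival(query):
    clf = load(model_file_name)
    model_columns = load(model_columns_file_name)
    query_ohe = pd.get_dummies(query)
    # add any training-time columns the query is missing, drop any extras
    query_ohe = query_ohe.reindex(columns=model_columns, fill_value=0)
    return clf.predict(query_ohe)
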
def get_model(choice='lr', class_weight=None):
    if choice == 'svc':
        model = svc(verbose=1, class_weight=class_weight, n_jobs=-1)
    elif choice == 'lsvc':
        model = lsvc(class_weight=class_weight, n_jobs=-1)
    elif choice == 'knn':
        model = KNeighborsClassifier()
    elif choice == 'msvm':
        model = MulticlassSVM(C=0.1, tol=0.01, max_iter=100, random_state=0, verbose=1)
    elif choice == 'gnb':
        model = gnb(class_weight=class_weight)
    elif choice == 'gpc':
        model = gpc(class_weight=class_weight)
    elif choice == 'sgdc':
        model = sgdc(class_weight=class_weight)
    elif choice == 'rf':
        model = rf(class_weight=class_weight)
    # elif choice == 'vw':
    #     model = vw()
    else:
        model = lr(class_weight=class_weight)
    return model

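# Illustrative use of get_model() above (X_train/y_train/X_test are assumed to be
# defined elsewhere; 'balanced' is just one possible class_weight value):
model = get_model(choice='rf', class_weight='balanced')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
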
def genAcc(train, trainLabel, test, testLabel, features):
    clf = rf(n_estimators=100)
    train = train[:, features]
    test = test[:, features]
    clf.fit(train, trainLabel)
    val = clf.score(test, testLabel)
    return val

def treenorms(classes, classes_test, norms, n_cluster, data_res):
    i = 1
    j = 4
    k = 41
    k_f = 6
    classes = mainkmeans(data_res, n_cluster)
    usual_norms = [0.0] * n_cluster
    for q in range(n_cluster):
        b = np.array([unbinarize(classes[r]) == q for r in range(len(classes))])
        usual_norms[q] = np.mean(norms[b])
    data_svr = np.array([np.append(norms[s], usual_norms[unbinarize(classes[s])])
                         for s in range(len(norms))])
    sizes = data_svr[:, 0] / k
    clas = data_svr[:, 1] / k_f
    n_features = i
    cv_sizes, dummy = createTrainList(sizes, n_features, 0)
    cv_data = np.array([np.append(cv_sizes[p], clas[p + 1]) for p in range(len(cv_sizes) - 1)])
    c_train = cv_data[:len(cv_data) - val]
    X_train, y_train = createTrainList(c_train, 1, 1)
    X_train = np.squeeze(X_train, 1)
    y_train = np.squeeze(y_train, 1)
    y_train = y_train[:, n_features - 1]
    X_pred = np.append(sizes[len(sizes) - i:], usual_norms[unbinarize(classes_test)])
    clf = rf(n_estimators=100, max_depth=j)
    clf = clf.fit(X_train, y_train)
    # predict() expects a 2D array, so reshape the single sample
    prediction = clf.predict(X_pred.reshape(1, -1)) * k
    return prediction[0]

def main():
    onehot = lambda val, size: [1 if i == val else 0 for i in range(size)]
    correct = lambda t, p: sum(x == y for x, y in zip(t, p))

    np.random.seed(20180109)
    select = None

    (X_train, y_train) = read('train')
    (X_valid, y_valid) = read('validation')
    (X_test, y_test) = read('test')
    print('train:', X_train.shape, y_train.shape)
    print('test:', X_test.shape, y_test.shape)
    print('valid:', X_valid.shape, y_valid.shape)

    model_name, model = 'rf', rf(n_jobs=-1, n_estimators=500)
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    total = len(pred)
    acc_test = correct(pred, y_test) / total

    pred = model.predict(X_valid)
    total = len(pred)
    acc_valid = correct(pred, y_valid) / total

    pred = model.predict(X_train)
    total = len(pred)
    acc_train = correct(pred, y_train) / total

    with open('result.txt', 'a') as fout:
        fout.write(' '.join(sys.argv[1:]) +
                   '\ttrain: {:.4f}\ttest: {:.4f}\tvalid: {:.4f}\n'.format(
                       acc_train, acc_test, acc_valid))

def train():
    airbnb_data_path = '/Users/Derek/Desktop/3A/436/project/flaskapp/data/AB_NYC_2019.csv'
    airbnb_data = pd.read_csv(airbnb_data_path)

    # set prediction target
    y = airbnb_data.price

    # set prediction features
    global airbnb_features
    airbnb_features = ['latitude', 'longitude']
    X = airbnb_data[airbnb_features]

    # split data into training and validation sets
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

    global forest_model
    forest_model = rf(random_state=1)
    forest_model.fit(train_X, train_y)
    airbnb_price_preds = forest_model.predict(val_X)

    # persist trained model
    joblib.dump(forest_model, 'airbnb_model.pkl')

    model_mae = mean_absolute_error(val_y, airbnb_price_preds)
    return "Model training complete with Mean Absolute Error = " + str(model_mae)

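# A minimal sketch of the matching prediction step for the Airbnb model trained
# above: reload the persisted forest and predict a price for one latitude/longitude
# pair (this helper is illustrative, not part of the original app).
def predict_price(latitude, longitude):
    model = joblib.load('airbnb_model.pkl')
    features = pd.DataFrame([[latitude, longitude]], columns=airbnb_features)
    return float(model.predict(features)[0])
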
def learn_rf(dat_train, lbl_train, estimators=255, min_samples_leaf=1):
    rf_pix = rf(n_estimators=estimators, min_samples_leaf=min_samples_leaf, n_jobs=-1)
    rf_pix.fit(dat_train.astype(np.float32), lbl_train.astype(np.uint8))
    return rf_pix

def __init__(self, trainCorpusObject, devCorpusObject):
    spacyTrain = self.addSpacyDoc(trainCorpusObject)
    spacyDev = self.addSpacyDoc(devCorpusObject)
    dfTrain = self.createDF(spacyTrain)
    dfTest = self.createDF(spacyDev)

    dfClass = []
    for corpusParah in trainCorpusObject.corpus:
        dfClass.append(corpusParah.score)

    randForest = rf(n_estimators=201, n_jobs=2, random_state=0)
    supportvm = svm.SVC(decision_function_shape='ovo')
    adaboostClassifier = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1), n_estimators=200)

    # to save the models
    rfp = pk.dumps(randForest)
    sp = pk.dumps(supportvm)
    ap = pk.dumps(adaboostClassifier)

    randForest.fit(dfTrain, dfClass)
    supportvm.fit(dfTrain, dfClass)
    adaboostClassifier.fit(dfTrain, dfClass)

    devrandForest = []
    devsupportvm = []
    devaDaboost = []
    for n in randForest.predict(dfTest):
        devrandForest.append(n)
    for prediction in supportvm.predict(dfTest):
        devsupportvm.append(prediction)
    for prediction in adaboostClassifier.predict(dfTest):
        devaDaboost.append(prediction)

    linenum = 1
    index = 0
    file = open("data/prediction.txt", 'w+')
    file.write("id Gold Tag\n")
    for corpusParah in devCorpusObject.corpus:
        curind = index + 1
        print(str(curind) + " " + corpusParah.score + " " + devrandForest[index] +
              " " + devsupportvm[index] + " " + devaDaboost[index])
        newLine = "s_" + str(linenum) + "\t" + str(
            self.maxNumber(devrandForest[index], devsupportvm[index], devaDaboost[index]))
        if linenum == len(devrandForest):
            file.write(newLine)
        else:
            file.write(newLine + "\n")
        linenum = linenum + 1
        index = index + 1
    file.close()

def classifierFunc(s):
    if s == 'SVC':
        return SVC()
    elif s == 'rf':
        return rf(n_estimators=50)
    elif s == 'SVC':  # duplicate key: this branch is unreachable as written
        return SVC(kernel='linear')
    elif s == 'rbf':
        return SVC(kernel='rbf', C=0.60, gamma=0.1675, probability=True)

def __init__(self, features, labels, method="rf"):
    self.method = method
    if self.method == "rf":
        features = np.delete(features, np.where(np.isnan(features))[0], axis=0)
        labels = np.delete(labels, np.where(np.isnan(labels))[0], axis=0)
        self.featuresNorm = prep.scale(features)
        self.labels = labels
        self.estimator = rf(n_estimators=80, max_depth=2, random_state=0)

def train():
    """Endpoint used to train the model. This is only one way to do it; a more
    common approach is to train the model separately and then serialise or
    persist it somewhere else."""
    # using random forest as an example
    # can do the training separately and just update the pickles
    from sklearn.ensemble import RandomForestClassifier as rf

    df = pd.read_csv(training_data)
    df_ = df[include]

    categoricals = []  # going to one-hot encode categorical variables
    for col, col_type in df_.dtypes.items():
        if col_type == "O":
            categoricals.append(col)
        else:
            df_[col].fillna(0, inplace=True)  # fill NA's with 0 for ints/floats, too generic

    # get_dummies effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    # capture a list of columns that will be used for prediction
    global model_columns
    model_columns = list(x.columns)

    if not os.path.exists("model"):
        os.makedirs("model")
    with open(model_columns_filename, "wb") as f:
        pickle.dump(model_columns, f)

    global clf
    clf = rf()
    start = time.time()
    clf.fit(x, y)

    with open(model_filename, "wb") as f:
        pickle.dump(clf, f)

    message1 = f"Trained in {time.time() - start} seconds"
    message2 = f"Model training score: {clf.score(x, y)}"
    return_message = "Success. \n{0}. \n{1}.".format(message1, message2)
    return return_message

def chooseClassification(name):
    print("Chosen classifier:", name)
    return {
        'NB': GaussianNB(),
        'ADA': adaBoost(n_estimators=50),
        'RF': rf(n_estimators=100),
        'KNN': knn(n_neighbors=15, p=1),
        'SVM': svm.SVC(kernel='rbf', probability=True),
        'BAG': BaggingClassifier(n_estimators=30),
        # base_estimator=knn(),
        # bootstrap=True,
        # bootstrap_features=True,
        # oob_score=True,
        # max_features=10,
        # max_samples=100),
    }.get(name, GaussianNB())  # default: Gaussian Naive Bayes

def chooseClassification(name):
    print("Chosen classifier:", name)
    return {
        'NB': GaussianNB(),
        'ADA': adaBoost(n_estimators=2),
        'RF': rf(n_estimators=7),
        'KNN': knn(n_neighbors=15, p=1),
        'SVM': svm.SVC(C=0.01, kernel='rbf', probability=True),
        'BAG': BaggingClassifier(n_estimators=7),
        # base_estimator=knn(),
        # bootstrap=True,
        # bootstrap_features=True,
        # oob_score=True,
        # max_features=10,
        # max_samples=100),
    }.get(name, GaussianNB())  # default: Gaussian Naive Bayes

def do_training(df: DataFrame, model_id: str = None) -> str:
    # using random forest as an example
    # can do the training separately and just update the pickles
    from sklearn.ensemble import RandomForestClassifier as rf

    df_ = df[include]

    categoricals = []  # going to one-hot encode categorical variables
    for col, col_type in df_.dtypes.items():
        if col_type == 'O':
            categoricals.append(col)
        else:
            df_[col].fillna(0, inplace=True)  # fill NA's with 0 for ints/floats, too generic

    # get_dummies effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    if not model_id:
        model_id = model_default_name

    # capture a list of columns that will be used for prediction
    model_columns_file_name = '{}/{}_columns.pkl'.format(model_directory, model_id)
    with lock:
        model_columns[model_id] = list(x.columns)
        joblib.dump(model_columns[model_id], model_columns_file_name)

    # build classifier
    with lock:
        clf[model_id] = rf()
        start = time.time()
        clf[model_id].fit(x, y)
        out_file = '{}/{}.pkl'.format(model_directory, model_id)
        joblib.dump(clf[model_id], out_file)

    message1 = 'Trained in %.5f seconds' % (time.time() - start)
    message2 = 'Model training score: %s' % clf[model_id].score(x, y)
    return_message = 'Success. \n{0}. \n{1}.'.format(message1, message2)
    print(return_message)
    return return_message

def train():
    # using random forest as an example
    # training can be done separately; here we just update the pickles
    from sklearn.ensemble import RandomForestClassifier as rf

    df = pd.read_csv(training_data)
    df_ = df[include]

    # Encode categorical variables
    categoricals = []
    for col, col_type in df_.dtypes.items():
        if col_type == 'O':
            categoricals.append(col)
        else:
            # Fill missing values with 0
            df_[col].fillna(0, inplace=True)

    # get_dummies effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    # Capture the list of columns that will be used for prediction
    global model_columns
    model_columns = list(x.columns)
    joblib.dump(model_columns, model_columns_file_name)

    global clf
    clf = rf()
    start = time.time()
    clf.fit(x, y)

    # Training time
    print('\033[1;34m' + 'Trained in %.1f seconds' % (time.time() - start) + '\033[0;0m')
    # Accuracy: around 91% on average in the tests run
    print('\033[1;34m' + 'Model training score: %s' % clf.score(x, y) + '\033[0;0m')

    joblib.dump(clf, model_file_name)
    return 'Model trained successfully!'

def train():
    if not os.path.exists(model_directory):
        os.makedirs(model_directory)

    # using random forest as an example
    # can do the training separately and just update the pickles
    from sklearn.ensemble import RandomForestClassifier as rf

    df = pd.read_csv(training_data)
    df_ = df[include]

    categoricals = []  # going to one-hot encode categorical variables
    for col, col_type in df_.dtypes.items():
        if col_type == 'O':
            categoricals.append(col)
        else:
            # fill NA's with 0 for ints/floats, too generic
            df_[col].fillna(0, inplace=True)

    # get_dummies effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    # capture a list of columns that will be used for prediction
    global model_columns
    model_columns = list(x.columns)
    joblib.dump(model_columns, model_columns_file_name)

    global clf
    clf = rf()
    start = time.time()
    clf.fit(x, y)
    joblib.dump(clf, model_file_name)

    response = make_response({
        'Message': 'Success',
        'Trained in seconds': (time.time() - start),
        'Model training score': clf.score(x, y),
        'max_features': str(clf.max_features),
    })
    response.headers['Content-Type'] = 'application/json; charset=utf-8'
    return response

def main():
    # Read the data in
    df = reader()

    # Split the dataset
    trainX, testX, trainY, testY = splitDataset(df, .7)

    # Build the random forest classifier and fit it on the training data
    clf = rf(n_estimators=30).fit(trainX, trainY)

    # Use the model to predict on the test data
    predictions = clf.predict(testX)

    # For the first five observations, print the actual and predicted values of the test data
    for i in range(0, 5):
        print("Actual outcome :: {} and Predicted outcome :: {}".format(
            list(testY)[i], predictions[i]))

    # Various classification metrics: accuracy, a confusion matrix of false and
    # true positives and negatives, and the ROC AUC score
    print("Train Accuracy :: ", accuracy_score(trainY, clf.predict(trainX)))
    print("Test Accuracy :: ", accuracy_score(testY, predictions))
    print("Confusion matrix :: ", confusion_matrix(testY, predictions))
    print("ROC AUC :: ", roc_auc_score(testY, predictions))

    fpr, tpr, thr = roc_curve(testY, predictions)  # false positive, true positive, threshold

    # Different way of calculating the AUC, helps with the plot
    rocAuc = auc(fpr, tpr)

    # Plot of a ROC curve for a specific class
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % rocAuc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Pickle the model for later use
    joblib.dump(clf, 'lmmodel.pkl')

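# Note on the ROC computation above: roc_curve/roc_auc_score are fed hard 0/1
# predictions, which yields only a two-point curve. A small sketch of the more
# usual alternative, scoring with class probabilities instead (meant to sit inside
# main() above, reusing the same clf, testX, testY):
probs = clf.predict_proba(testX)[:, 1]
print("ROC AUC (probabilities) :: ", roc_auc_score(testY, probs))
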
def train():
    # using random forest as an example
    # can do the training separately and just update the pickles
    from sklearn.ensemble import RandomForestClassifier as rf

    df = pd.read_csv(training_data)
    df_ = df[include]

    categoricals = []  # going to one-hot encode categorical variables
    for col, col_type in df_.dtypes.items():
        if col_type == 'O':
            categoricals.append(col)
        else:
            df_[col].fillna(0, inplace=True)  # fill NA's with 0 for ints/floats, too generic

    # get_dummies effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    # capture a list of columns that will be used for prediction
    global model_columns
    model_columns = list(x.columns)
    joblib.dump(model_columns, model_columns_file_name)

    global clf
    clf = rf()
    start = time.time()
    clf.fit(x, y)
    print('Trained in %.1f seconds' % (time.time() - start))
    print('Model training score: %s' % clf.score(x, y))

    joblib.dump(clf, model_file_name)
    return 'Success'

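# The train() endpoint above persists clf and model_columns as globals; a minimal
# sketch of the matching predict() endpoint, assuming a Flask app where request and
# jsonify are imported from flask and the JSON body holds a list of records:
def predict():
    json_ = request.json
    query = pd.get_dummies(pd.DataFrame(json_))
    # align the incoming columns with those seen at training time
    query = query.reindex(columns=model_columns, fill_value=0)
    prediction = list(clf.predict(query))
    return jsonify({'prediction': [int(p) for p in prediction]})
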
mydict[hour] = 1

countdf = df.from_dict(mydict, orient='index')
countdf['count'] = countdf[0]
countdf = countdf['count']
countdf = countdf.sort_index().values

# In[89]:

target = countdf

# In[90]:

from sklearn.ensemble import RandomForestRegressor as rf
model = rf(n_estimators=100, n_jobs=-1)

# In[91]:

model.fit(X=data[:552, :], y=target[:552])

# In[92]:

predicts = model.predict(data[552:])

# In[93]:

result = df({'target': target[552:], 'predict': predicts})

def pickleSave(obj, name):
    pkl_file = open(name, 'wb')
    cPickle.dump(obj, pkl_file, protocol=-1)
    pkl_file.close()


def pickleLoad(name):
    pkl_file = open(name, 'rb')
    obj = cPickle.load(pkl_file)
    pkl_file.close()
    return obj


print("Validation data")

fitForest = False
if fitForest:
    r = rf(1000, n_jobs=5)
    r.fit(trainX, trainY)
    pickleSave(r, "kaggleDiabeticRetinopathyCompetitionModelFiles/modelAverage.forest.pickle")
else:
    r = pickleLoad("kaggleDiabeticRetinopathyCompetitionModelFiles/modelAverage.forest.pickle")

print("accuracy", numpy.mean(r.predict(validationX) == validationY))
print("mse", numpy.mean((r.predict(validationX) - validationY) ** 2))
a = (r.predict_proba(validationX) * numpy.arange(5).reshape((1, 5))).sum(1)

#########################################################################################

testData = {"left": {}, "right": {}}
for f in fl:
    for l in open(f + ".test"):
        a = l.split("/")[-1].split(",")
        b = a[0].split(".")[0].split("_")

# prediction_weekday = prediction_weekday.set_index("datetime")
# prediction = prediction_weekend
# prediction = prediction.append(prediction_weekday)
# prediction = prediction.sort_index()
# prediction['datetime'] = test_factor['datetime']
# prediction = prediction.set_index('datetime')

##################################################################
# random forest model
##################################################################

model = rf(50)
formula = "count ~ season + weather + temp + windspeed + humidity + holiday + workingday + hour + Sunday + hot"

Y_train = train_factor['count']
X_train = train_factor.drop(['datetime', 'count', 'casual', 'registered', 'date', 'day'], axis=1)
X_test = test_factor.drop(['datetime', 'date', 'day'], axis=1)

model.fit(X_train, Y_train)
prediction = model.predict(X_test)
prediction[prediction < 0] = 0

prediction = pd.DataFrame(prediction)
prediction.columns = ['count']
prediction['datetime'] = test_factor['datetime']
prediction = prediction.set_index("datetime")

# write the submission
prediction.to_csv("Résultats/randomForest50_adrien.csv")

# In[70]:

data['start station cluster'] = list(map(lambda x: station[station['id'] == x]['cluster'],
                                         data['start station id']))
data['end station cluster'] = list(map(lambda x: station[station['id'] == x]['cluster'],
                                       data['end station id']))

# In[62]:

station[station['id'] == 147]['cluster']

# In[75]:

from sklearn.ensemble import RandomForestClassifier as rf
rf1 = rf(n_estimators=10, n_jobs=4)

# In[76]:

rf1.fit(train, target)

# In[77]:

rf1

# In[79]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

nn = Classifier(
    layers=[Layer("Softmax")],
    learning_rate=0.001,
    n_iter=25)
nn.fit(train_normalized, target)
y_predict = nn.predict(test_normalized)

ad_fit = ad(n_estimators=10).fit(X_train, y_train)
y_pred = ad_fit.predict(X_test)
ad_acc = accuracy_score(y_pred, y_test)  # 0.60

y_pred = rf().fit(X_train, y_train).predict(X_test)
rf_acc = accuracy_score(y_pred, y_test)  # 0.59

gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
gnb_acc = accuracy_score(y_pred, y_test)  # 0.075 (extremely low)

svc = svm.SVC()
svc = svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
svc_acc = accuracy_score(y_pred, y_test)

ldaC = LDA().fit(X_train, y_train)
y_pred = ldaC.predict(X_test)
lda_acc = accuracy_score(y_pred, y_test)  # 0.62

p_earned = CPS_dataset.wsal_val + CPS_dataset.semp_val + CPS_dataset.frse_val  # individual earned income
CPS_dataset['p_earned'] = p_earned

# disabled (check reg if categorical or binary is better after the sum)
CPS_dataset['disability'] = np.zeros(len(CPS_dataset))
CPS_dataset.disability = np.where(CPS_dataset.pedisdrs == 1, 1, CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisear == 1, 1, CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pediseye == 1, 1, CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisout == 1, 1, CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisphy == 1, 1, CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisrem == 1, 1, CPS_dataset.disability)

Rf = rf(n_estimators=200)  # Creating Random Forest

CPS_use = CPS_dataset.drop('peridnum', 1)

# Splitting data into training and test sets
train = CPS_use.sample(frac=0.8, random_state=1)
train_x = train.copy()
train_x = train_x.drop(['WIC_child', 'WIC_woman', 'mig_reg', 'mon', 'mig_div', 'migsame',
                        'm5g_div', 'm5g_reg', 'hrnumwic', 'wicyn', 'mig_reg', 'WIC_infant',
                        'hrwicyn', 'pothval', 'hothval', 'fothval', 'fam_unearned_income',
                        'unearned_income', 'hunits', 'hhpos', 'h_seq', 'hrecord', 'ph_seq',
                        'hsup_wgt', 'fsup_wgt', 'marsupwt', 'h_idnum1'], 1)
test_x = CPS_use.loc[~CPS_use.index.isin(train_x.index)]
test_y = test_x['WIC_infant']
test_x = test_x.drop(['WIC_child', 'WIC_woman', 'mig_reg', 'mon', 'mig_div', 'migsame',
                      'm5g_div', 'm5g_reg', 'hrnumwic', 'wicyn', 'mig_reg', 'WIC_infant',
                      'hrwicyn', 'pothval', 'hothval', 'fothval', 'fam_unearned_income',
                      'unearned_income', 'hunits', 'hhpos', 'h_seq', 'hrecord', 'ph_seq',
                      'hsup_wgt', 'fsup_wgt', 'marsupwt', 'h_idnum1'], 1)

for i in range(2):
    for j in range(3):
        train_set.loc[(train_set["Age"].isnull()) & (train_set.Gender == i) &
                      (train_set.Pclass == j + 1), "AgeFill"] = median_age_train[i, j]

for i in range(2):
    for j in range(3):
        test_set.loc[(test_set["Age"].isnull()) & (test_set.Gender == i) &
                     (test_set.Pclass == j + 1), "AgeFill"] = median_age_test[i, j]

# only take the numeric columns for now
train_set = train_set.drop(["PassengerId", "Name", "Age", "Sex", "Ticket", "Cabin", "Embarked", "Fare"], axis=1)
ids = test_set["PassengerId"].values
test_set = test_set.drop(["PassengerId", "Name", "Age", "Sex", "Ticket", "Cabin", "Embarked", "Fare"], axis=1)

train_set = train_set.values
test_set = test_set.values

forest = rf(n_estimators=100)
forest.fit(train_set[0::, 1::], train_set[0::, 0])
output = forest.predict(test_set).astype(int)

prediction_file = open("myfirstforest.csv", "w", newline="")
open_file_object = csv.writer(prediction_file)
open_file_object.writerow(["PassengerId", "Survived"])
open_file_object.writerows(zip(ids, output))
prediction_file.close()

train_data = train_data.drop(['url'], axis=1)        # remove 'url' information
train_data = train_data.drop(['timedelta'], axis=1)  # remove 'timedelta' information
# train_data = train_data[train_data["shares"] < 40000]

X = np.array(train_data.drop(['shares'], axis=1))
y = np.array(train_data['shares'])  # this is the target
X = preprocessing.scale(X)

XTrain = X[:N, :]  # use the first N samples for training
yTrain = y[:N]
XVal = X[N:, :]    # use the rest for validation
yVal = y[N:]

Xtest = test_data.values
Xtest = preprocessing.scale(Xtest)

# print type(XTrain) matrix
for i in [10, 20, 50, 100, 200]:
    model = rf(n_estimators=i, n_jobs=4)
    model.fit(XTrain, yTrain)
    training = model.predict(XTrain)
    validation = model.predict(XVal)
    print("RF" + str(i))
    print("Training error ", np.mean(np.abs(yTrain - training)))
    print("Validation error ", np.mean(np.abs(yVal - validation)))
    print(model.feature_importances_)
    result = model.predict(Xtest)
    np.savetxt('result/resultRF' + str(i) + '.txt', result)

# Create the column containing the hour of the day
tabtrain['hour'] = 0
for i in range(len(tabtrain)):
    tabtrain['hour'][i] = tabtrain['date'][i].hour
tabtrain['hour'] = tabtrain['hour'].astype('category')

tabtest['hour'] = 0
for i in range(len(tabtest)):
    tabtest['hour'][i] = tabtest['date'][i].hour
tabtest['hour'] = tabtest['hour'].astype('category')

# Training arrays
y_train = tabtrain['count']
x_train = tabtrain.drop(['datetime', 'count', 'casual', 'registered', 'date'], axis=1)

# Build the result arrays
x_test = tabtest.drop(['datetime', 'date'], axis=1)

model = rf(100)
model.fit(x_train, y_train)
y_test = model.predict(x_test)

y_test = pa.DataFrame(y_test)
y_test.index = tabtest['datetime']
y_test.to_csv('csv/rf_matthias_1.csv')

X_train = transform_features(X_train_A) - transform_features(X_train_B)
model = linear_model.LogisticRegression(fit_intercept=False)
model.fit(X_train, y_train)

## compute AUC score on the training data using Logistic Regression / Random Forest ##

# Logistic Regression
p_train = model.predict_proba(X_train)
p_train = p_train[:, 1:2]
auc = auc_score(y_train, p_train)
print('AUC score = ', round(auc, 3), 'Using Logistic Regression')
scores_lr = cv.cross_val_score(model, X_train, y_train, cv=5).mean()

# Random Forest
modelrf = rf(n_estimators=300, max_depth=6, max_features='auto', oob_score=True).fit(X_train, y_train)
p_train2 = modelrf.predict_proba(X_train)
p_train2 = p_train2[:, 1:2]
auc2 = auc_score(y_train, p_train2)
print('AUC score = ', round(auc2, 5), 'Using Random Forest')
scores_rf = cv.cross_val_score(modelrf, X_train, y_train, cv=5).mean()

###########################
# LOADING TEST DATA
###########################

# ignore the test header
testfile = open('test.csv')
next(testfile)

    if totalTime < 0:
        continue
    try:
        lst = np.array([int(row[3]), int(row[9]), totalTime])
        exit.append(np.array(status[row[4]]))
        companies.append(lst)
    except ValueError:
        continue

c = np.array(companies)[0:1700]
e = np.array(exit)[0:1700]
c1 = np.array(companies)[1700:]
e1 = np.array(exit)[1700:]

tree = rf(criterion='entropy', bootstrap=False, max_depth=5)
tree.fit(c, e)
print(tree.score(c, e))
print(tree.score(c1, e1))

testBase = "2021-03"
testPattern = "%Y-%m"
testCurrent = int(time.mktime(time.strptime(testBase, testPattern)))

testBase = "2014-01"
testPattern = "%Y-%m"
testStart = int(time.mktime(time.strptime(testBase, testPattern)))

test_point1 = np.array([2500000, 1, testCurrent - testStart]).reshape(1, -1)
test_point2 = np.array([3200000, 2, testCurrent - testStart]).reshape(1, -1)
test_point3 = np.array([40000000, 3, testCurrent - testStart]).reshape(1, -1)

print("status = {'acquired' : 0, 'ipo' : 1, 'operating' : 2, 'closed' : 3}:\n " +
      ' , '.join(list(map(str, tree.predict_proba(test_point1)[0]))))

__author__ = 'Gabriel'

import sys
sys.path.append('../')

from sklearn.ensemble import RandomForestClassifier as rf
import opticalCharacterManipulation
import time
from features import *
import numpy

(X, Y, X_test, Y_test) = opticalCharacterManipulation.loadTrainAndTestRawData()
# (X, Y, X_test, Y_test) = opticalCharacterManipulation.loadTrainAndTestFeaturesData(sidePoints)
# (X, Y, X_test, Y_test) = opticalCharacterManipulation.loadTrainAndTestFeaturesData(sidePoints, gravityPoints)

start = time.time()

# Generate model
N = 200
score = 0
for i in range(0, N):
    classifier = rf()
    classifier.fit(X, numpy.ravel(Y))
    score += classifier.score(X_test, Y_test)

score = round(100 * score / N, 2)
print("Score: " + str(score) + " %")
print("Elapsed time : " + str(time.time() - start))

# Create the column containing the hour of the day
tabtrain['hour'] = 0
for i in range(len(tabtrain)):
    tabtrain['hour'][i] = tabtrain['date'][i].hour
tabtrain['hour'] = tabtrain['hour'].astype('category')

tabtest['hour'] = 0
for i in range(len(tabtest)):
    tabtest['hour'][i] = tabtest['date'][i].hour
tabtest['hour'] = tabtest['hour'].astype('category')

# Training arrays
y_train = tabtrain['count']
x_train = tabtrain.drop(['datetime', 'count', 'casual', 'registered', 'date'], axis=1)

# Build the result arrays
x_test = tabtest.drop(['datetime', 'date'], axis=1)

model = rf(200)
model.fit(x_train, y_train)
y_test = model.predict(x_test)

y_test = pa.DataFrame(y_test)
y_test.index = tabtest['datetime']
y_test.to_csv('csv/rf_matthias_1.csv')

tree_test.fit(records_train_x_final, records_train_y)
records_test_y_pred = tree_test.predict(records_test_x_final)
fpr, tpr, thresholds = roc_curve(records_test_y, records_test_y_pred)
plot_roc(fpr, tpr, "DT Grid Search (Test Set)")
# Increased again to 0.7549, showing the grid search is helpful, but not
# optimal.
export_graphviz(tree_test, out_file='tree_test.dot')

##################################
#         RANDOM FOREST          #
##################################

# Lastly, I will look at the random forest model and see how well it predicts
# the validation set.
rforest = rf(n_estimators=100, max_features='auto', verbose=1, n_jobs=1)
rforest.fit(records_train_x_final, records_train_y)
rf_probabilities = rforest.predict_proba(records_valid_x_final)
roc_auc = roc_auc_score(records_valid_y, rf_probabilities[:, 1])
fpr, tpr, thresholds = roc_curve(records_valid_y, rf_probabilities[:, 1])
plot_roc(fpr, tpr, "Random Forest")