Example #1
    def getRandomForestClf(self, X, Y, param_list):
        clfName = "Random_Forest"
        ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
        clf = rf(n_estimators=300, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True, oob_score=True)  # min_samples_split must be >= 2 in modern scikit-learn
            
        if self._gridSearchFlag:
            log(clfName + " start searching param...")
            tmpLowDepth = 8
            tmpHighDepth = 30
            
            
            param_dist = {
                          "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
                          "max_features": sp_randf(0, 1),
                          "min_samples_split": sp_randint(2, 11),
                          "min_samples_leaf": sp_randint(1, 11),
                          "criterion": ["gini", "entropy"], 
                          "n_estimators" : sp_randint(5, 12),
                          }
            
            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)
        
        else:
            if param_list is not None:
                clf = rf()
                clf.set_params(**param_list)
            clf.fit(X, Y)
            
        return clf
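doRandomSearch is project-local and not shown above; sp_randint and sp_randf are presumably scipy.stats.randint and a uniform sampler. A minimal sketch of what such a helper usually wraps, assuming it delegates to scikit-learn's RandomizedSearchCV (the n_iter, cv, and print choices are guesses, not the project's actual code):

# A minimal sketch, assuming doRandomSearch wraps RandomizedSearchCV;
# n_iter, cv, and the return convention are illustrative guesses.
from sklearn.model_selection import RandomizedSearchCV

def doRandomSearch(clfName, clf, param_dist, X, Y):
    search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                n_iter=20, cv=3, random_state=0)
    search.fit(X, Y)
    print(clfName, "best params:", search.best_params_)
    return search.best_estimator_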
Example #2
def train():
    # Training the data using Random Forest algorithm
    from sklearn.ensemble import RandomForestRegressor as rf
    df = pd.read_csv(training_data)
    df_ = df[include]
    df_ = df_.dropna()
    # One-hot encoding categorical variables
    categoricals = []
    for col, col_type in df_.dtypes.items():  # .iteritems() was removed in pandas 2.0
        if col_type == 'O':
            categoricals.append(col)
        else:
            # Fill NaNs with 0; plain assignment avoids the chained-inplace warning
            df_[col] = df_[col].fillna(0)
    # get_dummies() effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)
    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]
    # Capture a list of columns that will be used for prediction
    global model_columns
    model_columns = list(x.columns)
    joblib.dump(model_columns, model_columns_file_name)
    global clf
    clf = rf()
    clf.fit(x, y)
    joblib.dump(clf, model_file_name)
    global predicted_value
    predicted_value = -1
    return redirect(url_for('main'))
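The endpoint above persists both the fitted model and model_columns so incoming prediction requests can be re-aligned to the training columns. A matching predict endpoint might look like the following sketch; the route name, the Flask app object, and the JSON input format are assumptions not shown in the snippet:

# Hypothetical counterpart to train(); app, clf, and model_columns
# are assumed to exist as in the example above.
from flask import request, jsonify
import pandas as pd

@app.route('/predict', methods=['POST'])
def predict():
    query = pd.get_dummies(pd.DataFrame(request.json))
    # re-align to the training columns; dummies absent from the query become 0
    query = query.reindex(columns=model_columns, fill_value=0)
    return jsonify({'prediction': [float(p) for p in clf.predict(query)]})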
Example #3
def train(df):
    df_ = df[include]
    print("Training data sample:\n", df_.head())
    categoricals = []  # going to one-hot encode categorical variables

    for col, col_type in df_.dtypes.items():
        if col_type == 'O':
            categoricals.append(col)
        else:
            # fill NA's with 0 for ints/floats, too generic;
            # plain assignment avoids pandas' chained-inplace warning
            df_[col] = df_[col].fillna(0)

    # get_dummies effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    # capture a list of columns that will be used for prediction
    model_columns = list(x.columns)
    print("Model columns are", model_columns)

    model = rf()
    start = time.time()
    model.fit(x, y)
    print('Trained in %.1f seconds' % (time.time() - start))
    print('Model training score: %s' % model.score(x, y))

    return model_columns, model
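Unlike the previous example, this train() returns the column list and fitted model instead of persisting them, so the caller handles storage. A hedged usage sketch with joblib (the file paths are invented):

# Hypothetical caller; paths are placeholders.
import joblib
import pandas as pd

df = pd.read_csv('train.csv')
model_columns, model = train(df)
joblib.dump(model_columns, 'model_columns.pkl')
joblib.dump(model, 'model.pkl')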
Example #4
    def buildModel(self):
        from sklearn.linear_model import LogisticRegression
        df = pd.read_csv('data/explodedNoOpenings.csv')
        numbers = [str(i) for i in range(8)]
        cols = [
            col for col in df.columns if col not in [
                'white_rating', 'opening', 'black_rating', 'id',
                'victory_type', "Unnamed: 0"
            ]
        ]
        print(cols)
        cols = [col for col in cols if col[0] not in numbers]

        features_to_concat = [df]
        features_to_concat.append(pd.get_dummies(df['winner'],
                                                 prefix='winner'))
        df = pd.concat(features_to_concat, axis=1)
        cols = [
            col for col in cols
            if col not in ['winner', 'target', 'winner_white', 'winner_black']
        ]
        print(df[cols].shape)
        clf = rf().fit(df[cols], df['winner_white'])
        joblib.dump(clf, 'data/model.pikl')
        import json
        with open('data/features', 'w') as f:
            f.write(json.dumps({'features': cols}))
        print(cols)
Example #5
def save_model():
    # read the data
    df = pd.read_csv('train.csv')
    include = ['Age', 'Sex', 'Embarked', 'Survived']
    df_ = df[include]  # only using 4 variables

    categoricals = []
    for col, col_type in df_.dtypes.items():  # .iteritems() was removed in pandas 2.0
        if col_type == 'O':
            categoricals.append(col)
        else:
            df_[col] = df_[col].fillna(0)
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    # using a random forest classifier (can be any classifier)

    dependent_variable = 'Survived'
    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    print(x.head())
    y = df_ohe[dependent_variable]
    clf = rf()
    clf.fit(x, y)

    # save the model
    dump(clf, model_file_name)

    # save the model columns
    global model_columns
    model_columns = list(x.columns)
    dump(model_columns, model_columns_file_name)
Example #6
def get_model(choice='lr', class_weight=None):
    if choice == 'svc':
        # note: sklearn's SVC has no n_jobs parameter, so it is not passed here
        model = svc(verbose=1, class_weight=class_weight)

    elif choice == 'lsvc':
        model = lsvc(class_weight=class_weight)  # LinearSVC has no n_jobs parameter either
    elif choice == 'knn':
        model = KNeighborsClassifier()
    elif choice == 'msvm':
        model = MulticlassSVM(C=0.1,
                              tol=0.01,
                              max_iter=100,
                              random_state=0,
                              verbose=1)

    elif choice == 'gnb':
        model = gnb()  # GaussianNB does not accept class_weight

    elif choice == 'gpc':
        model = gpc()  # GaussianProcessClassifier does not accept class_weight
    elif choice == 'sgdc':
        model = sgdc(class_weight=class_weight)

    elif choice == 'rf':
        model = rf(class_weight=class_weight)
#   elif choice == 'vw':
#         model = vw()
    else:
        model = lr(class_weight=class_weight)
    return model
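The single-letter aliases are not shown in the snippet. A minimal sketch of the imports this dispatcher appears to assume (MulticlassSVM looks project-local and is omitted):

# Assumed aliases; MulticlassSVM is presumably a project-local class.
from sklearn.svm import SVC as svc, LinearSVC as lsvc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB as gnb
from sklearn.gaussian_process import GaussianProcessClassifier as gpc
from sklearn.linear_model import SGDClassifier as sgdc, LogisticRegression as lr
from sklearn.ensemble import RandomForestClassifier as rf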
Example #7
def genAcc(train, trainLabel, test, testLabel, features):
    clf = rf(n_estimators=100)
    train = train[:, features]
    test = test[:, features]
    clf.fit(train, trainLabel)
    val = clf.score(test, testLabel)
    return val
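genAcc scores a forest on a chosen feature subset, which makes it handy for quick feature-selection sweeps. A hypothetical use, with the column indices invented for illustration:

# Compare candidate feature subsets; the higher test score wins.
for features in ([0, 1, 2], [0, 2, 5, 7]):
    print(features, genAcc(train, trainLabel, test, testLabel, features))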
Example #8
def treenorms(classes,classes_test,norms,n_cluster,data_res):
    i=1
    j=4
    k=41
    k_f=6
    classes = mainkmeans(data_res,n_cluster)
    usual_norms = [0.0]*n_cluster
    for q in range(n_cluster):
        b = np.array([unbinarize(classes[r]) == q for r in range(len(classes))])
        usual_norms[q] = np.mean(norms[b])            
    
    data_svr = np.array([np.append(norms[s],usual_norms[unbinarize(classes[s])]) for s in range(len(norms))])
    
    sizes = data_svr[:,0]/k
    clas = data_svr[:,1]/k_f
    n_features = i
    cv_sizes,dummy = createTrainList(sizes,n_features,0)
    cv_data = np.array([np.append(cv_sizes[p],clas[p+1]) for p in range(len(cv_sizes)-1)])
    
    c_train = cv_data[:len(cv_data) - val]
    X_train,y_train = createTrainList(c_train,1,1)
    X_train = np.squeeze(X_train,1)
    y_train = np.squeeze(y_train,1)
    y_train = y_train[:,n_features-1]
    
    # sklearn's predict expects a 2-D array, hence the reshape
    X_pred = np.append(sizes[len(sizes)-i:], usual_norms[unbinarize(classes_test)]).reshape(1, -1)
    clf = rf(n_estimators=100, max_depth=j)
    clf = clf.fit(X_train, y_train)
    prediction = clf.predict(X_pred) * k
    
    return prediction[0]
Example #9
def main():
    onehot = lambda val, size: [1 if i == val else 0 for i in range(size)]
    correct = lambda t, p: sum(x == y for x, y in zip(t, p))

    np.random.seed(20180109)

    select = None
    (X_train, y_train) = read('train')
    (X_valid, y_valid) = read('validation')
    (X_test, y_test) = read('test')
    print('train:', X_train.shape, y_train.shape)
    print('test:', X_test.shape, y_test.shape)
    print('valid:', X_valid.shape, y_valid.shape)

    model_name, model = 'rf', rf(n_jobs=-1, n_estimators=500)

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    total = len(pred)
    acc_test = correct(pred, y_test) / total

    pred = model.predict(X_valid)
    total = len(pred)
    acc_valid = correct(pred, y_valid) / total

    pred = model.predict(X_train)
    total = len(pred)
    acc_train = correct(pred, y_train) / total

    with open('result.txt', 'a') as fout:
        fout.write(' '.join(sys.argv[1:]) + '\ttrain: {:.4f}\ttest: {:.4f}\tvalid: {:.4f}\n'.format(
            acc_train, acc_test, acc_valid))
Example #10
def train():
    airbnb_data_path = '/Users/Derek/Desktop/3A/436/project/flaskapp/data/AB_NYC_2019.csv'
    airbnb_data = pd.read_csv(airbnb_data_path)

    # set prediction target
    y = airbnb_data.price

    # set prediction features
    global airbnb_features
    airbnb_features = ['latitude', 'longitude']
    X = airbnb_data[airbnb_features]

    # split data into training and validation sets
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

    global forest_model
    forest_model = rf(random_state=1)
    forest_model.fit(train_X, train_y)
    airbnb_price_preds = forest_model.predict(val_X)

    # persist trained model
    joblib.dump(forest_model, 'airbnb_model.pkl')

    model_mae = mean_absolute_error(val_y, airbnb_price_preds)
    return "Model training complete with Mean Absolute Error = " + str(
        model_mae)
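Since train() persists the fitted model with joblib, a lookup for a single point might look like this sketch; the function name and the reload-per-call pattern are illustrative only:

# Hypothetical counterpart: predict the price for one (latitude, longitude) pair.
import joblib
import pandas as pd

def predict_price(lat, lon):
    model = joblib.load('airbnb_model.pkl')
    query = pd.DataFrame([[lat, lon]], columns=['latitude', 'longitude'])
    return float(model.predict(query)[0])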
Example #11
def learn_rf(dat_train, lbl_train, estimators=255, min_samples_leaf=1):
    rf_pix = rf(n_estimators=estimators, min_samples_leaf=min_samples_leaf, n_jobs=-1)
    rf_pix.fit(
            dat_train.astype(np.float32),
            lbl_train.astype(np.uint8  )
            )
    return rf_pix
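A hypothetical smoke test for learn_rf on synthetic data; the shapes, threshold, and estimator count are invented for illustration:

import numpy as np

dat = np.random.rand(1000, 4)             # fake pixel features
lbl = (dat[:, 0] > 0.5).astype(np.uint8)  # fake binary labels
model = learn_rf(dat, lbl, estimators=51)
print(model.score(dat.astype(np.float32), lbl))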
Example #12
    def __init__(self, trainCorpusObject, devCorpusObject):

        spacyTrain = self.addSpacyDoc(trainCorpusObject)
        spacyDev = self.addSpacyDoc(devCorpusObject)

        dfTrain = self.createDF(spacyTrain)
        dfTest = self.createDF(spacyDev)

        dfClass = []
        for corpusParah in trainCorpusObject.corpus:
            dfClass.append(corpusParah.score)

        randForest = rf(n_estimators=201, n_jobs=2, random_state=0)
        supportvm = svm.SVC(decision_function_shape='ovo')
        adaboostClassifier = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=1), n_estimators=200)

        randForest.fit(dfTrain, dfClass)
        supportvm.fit(dfTrain, dfClass)
        adaboostClassifier.fit(dfTrain, dfClass)

        # serialize the models only after fitting; the original dumped
        # them before fit, which pickles untrained classifiers
        rfp = pk.dumps(randForest)
        sp = pk.dumps(supportvm)
        ap = pk.dumps(adaboostClassifier)

        devrandForest = []
        devsupportvm = []
        devaDaboost = []

        for n in randForest.predict(dfTest):
            devrandForest.append(n)

        for prediction in supportvm.predict(dfTest):
            devsupportvm.append(prediction)

        for prediction in adaboostClassifier.predict(dfTest):
            devaDaboost.append(prediction)

        linenum = 1
        index = 0
        file = open("data/prediction.txt", 'w+')
        file.write("id\tGold Tag\n")
        for corpusParah in devCorpusObject.corpus:
            curind = index + 1
            print(str(curind) + " " + corpusParah.score + " " +
                  devrandForest[index] + " " + devsupportvm[index] + " " +
                  devaDaboost[index])
            newLine = "s_" + str(linenum) + "\t" + str(
                self.maxNumber(devrandForest[index], devsupportvm[index],
                               devaDaboost[index]))
            if (linenum == len(devrandForest)):
                file.write(newLine)
            else:
                file.write(newLine + "\n")
            linenum = linenum + 1
            index = index + 1
        file.close()
Example #13
def classifierFunc(s):
    if s == 'SVC':
        return SVC()
    elif s == 'rf':
        return rf(n_estimators=50)
    elif s == 'linear':  # the original repeated 'SVC' here, leaving this branch unreachable
        return SVC(kernel='linear')
    elif s == 'rbf':
        return SVC(kernel='rbf', C=0.60, gamma=0.1675, probability=True)
Example #14
    def __init__(self, features, labels, method="rf"):
        self.method = method
        if self.method == "rf":
            # drop rows that contain NaNs (note: rows are removed independently
            # from features and labels, so this assumes NaNs occur in matching rows)
            features = np.delete(features, np.where(np.isnan(features))[0], axis=0)
            labels = np.delete(labels, np.where(np.isnan(labels))[0], axis=0)
            self.featuresNorm = prep.scale(features)
            self.labels = labels
            self.estimator = rf(n_estimators=80, max_depth=2, random_state=0)
Example #15
def train():
    """ 
    Endpoint used to train the model. This is only one way to do it;
    a more common approach is to train the model separately and then
    serialise or persist it somewhere else."""

    # using random forest as an example
    # can do the training separately and just update the pickles
    from sklearn.ensemble import RandomForestClassifier as rf

    df = pd.read_csv(training_data)
    df_ = df[include]

    categoricals = []  # going to one-hot encode categorical variables

    for col, col_type in df_.dtypes.items():
        if col_type == "O":
            categoricals.append(col)
        else:
            # fill NA's with 0 for ints/floats, too generic;
            # plain assignment avoids pandas' chained-inplace warning
            df_[col] = df_[col].fillna(0)

    # get_dummies effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    # capture a list of columns that will be used for prediction
    global model_columns
    model_columns = list(x.columns)

    if not os.path.exists("model"):
        os.makedirs("model")

    with open(model_columns_filename, "wb") as f:
        pickle.dump(model_columns, f)

    global clf
    clf = rf()
    start = time.time()
    clf.fit(x, y)

    with open(model_filename, "wb") as f:
        pickle.dump(clf, f)

    message1 = f"Trained in {time.time() - start:.1f} seconds"
    message2 = f"Model training score: {clf.score(x, y)}"
    return_message = f"Success.\n{message1}.\n{message2}."
    return return_message
Example #16
def chooseClassification(name):
    print("Chosen classifier:", name)
    return {
        'NB': GaussianNB(),
        'ADA': adaBoost(n_estimators=50),
        'RF': rf(n_estimators = 100),
        'KNN': knn(n_neighbors=15, p=1),
        'SVM': svm.SVC(kernel='rbf', probability=True),
        'BAG': BaggingClassifier(n_estimators=30),
        # other BaggingClassifier options: base_estimator=knn(),
        # bootstrap=True, bootstrap_features=True, oob_score=True,
        # max_features=10, max_samples=100
        }.get(name, GaussianNB())    # default: Gaussian Naive Bayes
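One caveat with this dict-based dispatch (the nearly identical example below shares it): every classifier in the literal is instantiated on each call, even though only one is returned. A lazy variant, assuming the same aliases, defers construction behind callables:

def chooseClassification(name):
    # wrap the constructors in lambdas so only the chosen model is built
    factories = {
        'NB': lambda: GaussianNB(),
        'ADA': lambda: adaBoost(n_estimators=50),
        'RF': lambda: rf(n_estimators=100),
        'KNN': lambda: knn(n_neighbors=15, p=1),
        'SVM': lambda: svm.SVC(kernel='rbf', probability=True),
        'BAG': lambda: BaggingClassifier(n_estimators=30),
    }
    return factories.get(name, lambda: GaussianNB())()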
Example #17
def chooseClassification(name):
    print("Chosen classifier:", name)
    return {
        'NB': GaussianNB(),
        'ADA': adaBoost(n_estimators=2),
        'RF': rf(n_estimators=7),
        'KNN': knn(n_neighbors=15, p=1),
        'SVM': svm.SVC(C=0.01, kernel='rbf', probability=True),
        'BAG': BaggingClassifier(n_estimators=7),
        # other BaggingClassifier options: base_estimator=knn(),
        # bootstrap=True, bootstrap_features=True, oob_score=True,
        # max_features=10, max_samples=100
    }.get(name, GaussianNB())  # default: Gaussian Naive Bayes
Example #18
def do_training(df: DataFrame, model_id: str = None) -> str:
    # using random forest as an example
    # can do the training separately and just update the pickles
    from sklearn.ensemble import RandomForestClassifier as rf
    df_ = df[include]

    categoricals = []  # going to one-hot encode categorical variables

    for col, col_type in df_.dtypes.items():
        if col_type == 'O':
            categoricals.append(col)
        else:
            # fill NA's with 0 for ints/floats, too generic;
            # plain assignment avoids pandas' chained-inplace warning
            df_[col] = df_[col].fillna(0)

    # get_dummies effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)
    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    if not model_id:
        model_id = model_default_name

    # capture a list of columns that will be used for prediction
    model_columns_file_name = '{}/{}_columns.pkl'.format(
        model_directory, model_id)
    with lock:
        model_columns[model_id] = list(x.columns)
    joblib.dump(model_columns[model_id], model_columns_file_name)

    # build classifier
    with lock:
        clf[model_id] = rf()
        start = time.time()
        clf[model_id].fit(x, y)

    out_file = '{}/{}.pkl'.format(model_directory, model_id)
    joblib.dump(clf[model_id], out_file)

    message1 = 'Trained in %.5f seconds' % (time.time() - start)
    message2 = 'Model training score: %s' % clf[model_id].score(x, y)
    return_message = 'Success. \n{0}. \n{1}.'.format(message1, message2)
    print(return_message)
    return return_message
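do_training keys both clf and model_columns by model_id under a lock; the matching scoring path might look like this sketch (the function name and input handling are assumptions):

# Hypothetical counterpart to do_training; relies on the same globals.
def do_scoring(df: DataFrame, model_id: str = None):
    if not model_id:
        model_id = model_default_name
    query = pd.get_dummies(df)
    # re-align to the columns captured at training time
    query = query.reindex(columns=model_columns[model_id], fill_value=0)
    with lock:
        return clf[model_id].predict(query)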
Example #19
def train():
    # using random forest as an example
    # training can be done separately, just updating the pickles
    from sklearn.ensemble import RandomForestClassifier as rf

    df = pd.read_csv(training_data)
    df_ = df[include]
    # One-hot encode categorical variables
    categoricals = []

    for col, col_type in df_.dtypes.items():  # .iteritems() was removed in pandas 2.0
        if col_type == 'O':
            categoricals.append(col)
        else:
            # Fill missing values with 0
            df_[col] = df_[col].fillna(0)

    # get_dummies effectively creates the one-hot encoded variables in one shot
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    # Capture the list of columns that will be used for prediction
    global model_columns
    model_columns = list(x.columns)
    joblib.dump(model_columns, model_columns_file_name)

    global clf
    clf = rf()
    start = time.time()
    clf.fit(x, y)

    # Training time
    print('\033[1;34m' + 'Trained in %.1f seconds' % (time.time() - start) +
          '\033[0;0m')

    # Accuracy: about 91% on average across test runs
    print('\033[1;34m' +
          'Model training score: %s' % clf.score(x, y) +
          '\033[0;0m')

    joblib.dump(clf, model_file_name)

    return 'Model trained successfully!'
Example #20
def train():
    if not os.path.exists(model_directory):
        os.makedirs(model_directory)

    # using random forest as an example
    # can do the training separately and just update the pickles
    from sklearn.ensemble import RandomForestClassifier as rf

    df = pd.read_csv(training_data)
    df_ = df[include]
    categoricals = []  # going to one-hot encode categorical variables
    for col, col_type in df_.dtypes.items():
        if col_type == 'O':
            categoricals.append(col)
        else:
            # fill NA's with 0 for ints/floats, too generic;
            # plain assignment avoids pandas' chained-inplace warning
            df_[col] = df_[col].fillna(0)

    # get_dummies effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    # capture a list of columns that will be used for prediction
    global model_columns
    model_columns = list(x.columns)
    joblib.dump(model_columns, model_columns_file_name)

    global clf
    clf = rf()
    start = time.time()
    clf.fit(x, y)

    joblib.dump(clf, model_file_name)

    response = make_response({
        'Message': 'Success',
        'Trained in seconds': (time.time() - start),
        'Model training score': clf.score(x, y),
        'max_features': str(clf.max_features),
    })
    response.headers['Content-Type'] = 'application/json; charset=utf-8'
    return response
Example #21
def main():

    #Read the data in
    df = reader()
    #split the dataset
    trainX, testX, trainY, testY = splitDataset(df, .7)

    #Build the random forest classifier, and fit it on the training data
    clf = rf(n_estimators=30).fit(trainX, trainY)
    #Use the model to predict on the test data
    predictions = clf.predict(testX)

    #For the first five observations, print the actual and predicted values of the test data
    for i in range(0, 5):

        print("Actual outcome :: {} and Predicted outcome :: {}".format(
            list(testY)[i], predictions[i]))

    #Various classification metrics, such as accuracy, a confusion matrix for false and true positives and negatives, and roc score
    print("Train Accuracy :: ", accuracy_score(trainY, clf.predict(trainX)))
    print("Test Accuracy  :: ", accuracy_score(testY, predictions))
    print("Confusion matrix :: ", confusion_matrix(testY, predictions))
    print("ROC AUC :: ", roc_auc_score(testY, predictions))

    fpr, tpr, thr = roc_curve(
        testY, predictions)  #false positive, true positive, threshold
    # Different way of calculating the AUC, helps with the plot
    rocAuc = auc(fpr, tpr)

    # Plot of a ROC curve for a specific class
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % rocAuc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    #Pickling the model for later use
    joblib.dump(clf, 'lmmodel.pkl')
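One refinement worth noting: the ROC curve above is built from hard 0/1 predictions, which yields a single operating point rather than a full curve. A sketch using class probabilities instead, assuming the same clf and metric imports:

# Probability scores trace the full ROC curve instead of one corner point.
probs = clf.predict_proba(testX)[:, 1]
fpr, tpr, thr = roc_curve(testY, probs)
print("ROC AUC (from probabilities) :: ", roc_auc_score(testY, probs))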
Example #22
def train():
    # using random forest as an example
    # can do the training separately and just update the pickles
    from sklearn.ensemble import RandomForestClassifier as rf

    df = pd.read_csv(training_data)
    df_ = df[include]

    categoricals = []  # going to one-hot encode categorical variables

    for col, col_type in df_.dtypes.items():  # .iteritems() was removed in pandas 2.0
        if col_type == 'O':
            categoricals.append(col)
        else:
            df_[col] = df_[col].fillna(0)  # fill NA's with 0 for ints/floats, too generic

    # get_dummies effectively creates one-hot encoded variables
    df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=True)

    x = df_ohe[df_ohe.columns.difference([dependent_variable])]
    y = df_ohe[dependent_variable]

    # capture a list of columns that will be used for prediction
    global model_columns
    model_columns = list(x.columns)
    joblib.dump(model_columns, model_columns_file_name)

    global clf
    clf = rf()
    start = time.time()
    clf.fit(x, y)
    print('Trained in %.1f seconds' % (time.time() - start))
    print('Model training score: %s' % clf.score(x, y))

    joblib.dump(clf, model_file_name)

    return 'Success'
Example #23
            mydict[hour]=1
countdf = df.from_dict(mydict, orient='index')
countdf['count'] = countdf[0]
countdf = countdf['count']
countdf = countdf.sort_index().values


# In[89]:

target = countdf


# In[90]:

from sklearn.ensemble import RandomForestRegressor as rf
model = rf(n_estimators=100, n_jobs=-1)


# In[91]:

model.fit(X=data[:552, :], y=target[:552])


# In[92]:

predicts = model.predict(data[552:])


# In[93]:

result = df({'target': target[552:], 'predict': predicts})
Example #24
def pickleSave(obj, name):
    # note: on Python 3, cPickle is available as `import pickle as cPickle`
    with open(name, 'wb') as pkl_file:
        cPickle.dump(obj, pkl_file, protocol=-1)

def pickleLoad(name):
    with open(name, 'rb') as pkl_file:
        return cPickle.load(pkl_file)

print("Validation data")

fitForest=False
if fitForest:
    r = rf(1000, n_jobs=5)
    r.fit(trainX,trainY)
    pickleSave(r,"kaggleDiabeticRetinopathyCompetitionModelFiles/modelAverage.forest.pickle")
else:
    r=pickleLoad("kaggleDiabeticRetinopathyCompetitionModelFiles/modelAverage.forest.pickle")
print("accuracy", numpy.mean(r.predict(validationX) == validationY))
print("mse", numpy.mean((r.predict(validationX) - validationY)**2))
a = (r.predict_proba(validationX) * numpy.arange(5).reshape((1, 5))).sum(1)

#########################################################################################
testData={"left": {}, "right": {}}

for f in fl:
    for l in open(f + ".test"):
        a = l.split("/")[-1].split(",")
        b = a[0].split(".")[0].split("_")
Example #25
# prediction_weekday = prediction_weekday.set_index("datetime")

# prediction=prediction_weekend;
# prediction=prediction.append(prediction_weekday)
# prediction=prediction.sort_index()
# prediction['datetime']=test_factor['datetime']
# prediction=prediction.set_index('datetime')

######################################################################################################################################
######################################################################################################################################

# random forest model

######################################################################################################################################
######################################################################################################################################
model = rf(50)
formula = "count ~ season + weather + temp +  windspeed + humidity + holiday + workingday + hour + Sunday + hot"

Y_train = train_factor['count']
X_train = train_factor.drop(['datetime', 'count', 'casual', 'registered', 'date', 'day'], axis=1)
X_test = test_factor.drop(['datetime', 'date', 'day'], axis=1)
model.fit(X_train, Y_train)
prediction = model.predict(X_test)
prediction[prediction < 0] = 0
prediction = pd.DataFrame(prediction)
prediction.columns = ['count']
prediction['datetime'] = test_factor['datetime']
prediction = prediction.set_index("datetime")

#write the submission
prediction.to_csv("Résultats/randomForest50_adrien.csv")
Example #26
# In[70]:

# Python 2's map() returned a list; use Series.map and pull out the scalar cluster value
data['start station cluster'] = data['start station id'].map(lambda x: station.loc[station['id'] == x, 'cluster'].iloc[0])
data['end station cluster'] = data['end station id'].map(lambda x: station.loc[station['id'] == x, 'cluster'].iloc[0])


# In[62]:

station[station['id']==147]['cluster']


# In[75]:

from sklearn.ensemble import RandomForestClassifier as rf
rf1=rf(n_estimators=10,n_jobs=4)


# In[76]:

rf1.fit(train,target)


# In[77]:

rf1


# In[79]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

nn = Classifier(
    layers=[Layer("Softmax")],
    learning_rate=0.001,
    n_iter=25)
nn.fit(train_normalized, target)
y_predict = nn.predict(test_normalized)

Example #27
ad_fit = ad(n_estimators=10).fit(X_train, y_train)
y_pred = ad_fit.predict(X_test)
ad_acc = accuracy_score(y_pred, y_test) #0.60

y_pred = rf().fit(X_train,y_train).predict(X_test)
rf_acc = accuracy_score(y_pred, y_test) #0.59

gnb = GaussianNB() 
y_pred = gnb.fit(X_train, y_train).predict(X_test)
gnb_acc = accuracy_score(y_pred, y_test) #0.075 (extremely low)

svc = svm.SVC()
svc = svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
svc_acc = accuracy_score(y_pred, y_test)


ldaC = LDA().fit(X_train, y_train)
y_pred = ldaC.predict(X_test)
lda_acc = accuracy_score(y_pred, y_test) #0.62
Example #28
p_earned = CPS_dataset.wsal_val + CPS_dataset.semp_val + CPS_dataset.frse_val #individual earned income
CPS_dataset['p_earned'] = p_earned



# disability indicator (check in the regression whether categorical or binary works better after the sum)
CPS_dataset['disability'] = np.zeros(len(CPS_dataset))
CPS_dataset.disability = np.where(CPS_dataset.pedisdrs == 1, 1, CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisear == 1, 1, CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pediseye == 1, 1, CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisout == 1, 1, CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisphy == 1, 1, CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisrem == 1, 1, CPS_dataset.disability)


Rf = rf(n_estimators=200)  # create the random forest
CPS_use = CPS_dataset.drop('peridnum', axis=1)

#Splitting data into training and test sets
train = CPS_use.sample(frac=0.8, random_state=1)
train_x = train.copy()
train_x = train_x.drop(['WIC_child', 'WIC_woman','mig_reg', 'mon', 'mig_div', 'migsame', 'm5g_div', 'm5g_reg','hrnumwic','wicyn', 'mig_reg', 'WIC_infant', 'hrwicyn','pothval', 'hothval', 'fothval','fam_unearned_income','unearned_income',
    'hunits', 'hhpos', 'h_seq', 'hrecord', 'ph_seq',
    'hsup_wgt', 'fsup_wgt', 'marsupwt','h_idnum1'], axis=1)


test_x = CPS_use.loc[~CPS_use.index.isin(train_x.index)]
test_y = test_x['WIC_infant']
test_x = test_x.drop(['WIC_child', 'WIC_woman','mig_reg', 'mon', 'mig_div', 'migsame', 'm5g_div', 'm5g_reg','hrnumwic','wicyn', 'mig_reg', 'WIC_infant', 'hrwicyn','pothval', 'hothval', 'fothval','fam_unearned_income','unearned_income',
    'hunits', 'hhpos', 'h_seq', 'hrecord', 'ph_seq',
    'hsup_wgt', 'fsup_wgt', 'marsupwt','h_idnum1'], axis=1)
Example #29
File: script.py Project: Pegasus99/Kaggle
for i in range(2):  # xrange is Python 2 only
    for j in range(3):
        train_set.loc[(train_set["Age"].isnull()) & (train_set.Gender == i) & (train_set.Pclass == j + 1), "AgeFill"] = median_age_train[i, j]

for i in range(2):
    for j in range(3):
        test_set.loc[(test_set["Age"].isnull()) & (test_set.Gender == i) & (test_set.Pclass == j + 1), "AgeFill"] = median_age_test[i, j]


# keep only the numeric columns for now



train_set=train_set.drop(["PassengerId","Name","Age","Sex","Ticket","Cabin","Embarked","Fare"], axis=1)
ids = test_set["PassengerId"].values


test_set=test_set.drop(["PassengerId","Name","Age","Sex","Ticket","Cabin","Embarked","Fare"], axis=1)

train_set = train_set.values
test_set = test_set.values
forest = rf(n_estimators=100)
forest.fit(train_set[0::, 1::], train_set[0::, 0])
output = forest.predict(test_set).astype(int)

prediction_file = open("myfirstforest.csv", "w", newline="")  # "wb" with csv.writer was the Python 2 idiom
open_file_object = csv.writer(prediction_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(ids,output))
prediction_file.close()
Example #30
train_data = train_data.drop(['url'], axis=1)  # remove 'url' information
train_data = train_data.drop(['timedelta'], axis=1)  # remove 'timedelta' information

# train_data= train_data[train_data["shares"]<40000]

X = np.array(train_data.drop(['shares'], axis=1))
y = np.array(train_data['shares']) #This is the target
X = preprocessing.scale(X)

XTrain = X[:N,:] #use the first N samples for training
yTrain = y[:N]
XVal = X[N:,:] #use the rests for validation
yVal = y[N:]

Xtest = test_data.values
Xtest = preprocessing.scale(Xtest)
# print type(XTrain) matrix
for i in [10, 20, 50, 100, 200]:
    model = rf(n_estimators = i, n_jobs = 4)
    model.fit(XTrain,yTrain)
    training = model.predict(XTrain)
    validation = model.predict(XVal)

    print("RF" + str(i))
    print("Training error ", np.mean(np.abs(yTrain - training)))
    print("Validation error ", np.mean(np.abs(yVal - validation)))
    print(model.feature_importances_)

    result = model.predict(Xtest)
    np.savetxt('result/resultRF' + str(i) + '.txt', result)
Example #31
# Create the column containing the hour of the day.
# Chained indexing like tabtrain['hour'][i] triggers pandas' SettingWithCopyWarning,
# so derive the column in one pass instead.
tabtrain['hour'] = tabtrain['date'].apply(lambda d: d.hour)

tabtrain['hour'] = tabtrain['hour'].astype('category')

tabtest['hour'] = tabtest['date'].apply(lambda d: d.hour)

tabtest['hour'] = tabtest['hour'].astype('category')

# Training arrays
y_train = tabtrain['count']
x_train = tabtrain.drop(['datetime', 'count', 'casual', 'registered', 'date'], axis=1)

# Build the test arrays
x_test = tabtest.drop(['datetime', 'date'], axis=1)

model = rf(100)

model.fit(x_train, y_train)

y_test = model.predict(x_test)
y_test = pa.DataFrame(y_test)
y_test.index = tabtest['datetime']

y_test.to_csv('csv/rf_matthias_1.csv')

Example #32
X_train = transform_features(X_train_A) - transform_features(X_train_B)
model = linear_model.LogisticRegression(fit_intercept=False)
model.fit(X_train,y_train)

## compute AuC score on the training data using Logistic Regression / Random Forest ##

# Logistic Regression
p_train = model.predict_proba(X_train)
p_train = p_train[:,1:2]
auc = roc_auc_score(y_train, p_train)  # sklearn's old auc_score is now roc_auc_score
print('AUC score = ', round(auc, 3), 'Using Logistic Regression')
scores_lr = cv.cross_val_score(model, X_train,y_train,cv=5).mean()

# Random forest
modelrf = rf(n_estimators=300, max_depth=6, max_features='sqrt', oob_score=True).fit(X_train, y_train)  # 'auto' was removed in scikit-learn 1.3
p_train2 = modelrf.predict_proba(X_train)
p_train2 = p_train2[:,1:2]
auc2 = roc_auc_score(y_train, p_train2)  # see note above: auc_score was renamed
print('AUC score = ', round(auc2, 5), 'Using Random Forest')
scores_rf = cv.cross_val_score(modelrf, X_train,y_train,cv=5).mean()


###########################
# LOADING TEST DATA
###########################
 
#ignore the test header
testfile = open('test.csv')
next(testfile)  # file.next() was Python 2 only
 
Example #33
			if totalTime < 0:
				continue
			try:
				lst = np.array([int(row[3]), int(row[9]), totalTime])
				exit.append(np.array(status[row[4]]))
				companies.append(lst)
			except ValueError:
				continue

	c = np.array(companies)[0:1700]
	e = np.array(exit)[0:1700]
	c1 = np.array(companies)[1700:]
	e1 = np.array(exit)[1700:]

	tree = rf(criterion='entropy', bootstrap=False, max_depth=5)
	tree.fit(c, e)
	print(tree.score(c, e))
	print(tree.score(c1, e1))

	testBase = "2021-03"
	testPattern = "%Y-%m"
	testCurrent = int(time.mktime(time.strptime(testBase, testPattern)))

	testBase = "2014-01"
	testPattern = "%Y-%m"
	testStart = int(time.mktime(time.strptime(testBase, testPattern)))
	test_point1 = np.array([2500000, 1, testCurrent - testStart]).reshape(1, -1)
	test_point2 = np.array([3200000, 2, testCurrent - testStart]).reshape(1, -1)
	test_point3 = np.array([40000000, 3, testCurrent - testStart]).reshape(1, -1)
	print("status = {'acquired' : 0, 'ipo' : 1,'operating' : 2, 'closed' : 3}:\n " + ' , '.join(list(map(str, tree.predict_proba(test_point1)[0]))))
Example #34
__author__ = 'Gabriel'

import sys
sys.path.append('../')

from sklearn.ensemble import RandomForestClassifier as rf
import opticalCharacterManipulation
import time
from features import *
import numpy

(X,Y,X_test,Y_test) = opticalCharacterManipulation.loadTrainAndTestRawData()
# (X,Y,X_test,Y_test) = opticalCharacterManipulation.loadTrainAndTestFeaturesData(sidePoints)
# (X,Y,X_test,Y_test) = opticalCharacterManipulation.loadTrainAndTestFeaturesData(sidePoints, gravityPoints)

start = time.time()

# Generate model
N = 200
score = 0
for _ in range(N):
    classifier = rf()
    classifier.fit(X, numpy.ravel(Y))
    score += classifier.score(X_test, Y_test)
score = round(100 * score / N, 2)
print("Score: " + str(score) + " %")
print("Elapsed time : " + str(time.time() - start))
Example #35
# Create the column containing the hour of the day.
# Chained indexing like tabtrain['hour'][i] triggers pandas' SettingWithCopyWarning,
# so derive the column in one pass instead.
tabtrain['hour'] = tabtrain['date'].apply(lambda d: d.hour)

tabtrain['hour'] = tabtrain['hour'].astype('category')

tabtest['hour'] = tabtest['date'].apply(lambda d: d.hour)

tabtest['hour'] = tabtest['hour'].astype('category')

# Training arrays
y_train = tabtrain['count']
x_train = tabtrain.drop(['datetime', 'count', 'casual', 'registered', 'date'], axis=1)

# Build the test arrays
x_test = tabtest.drop(['datetime', 'date'], axis=1)

model = rf(200)

model.fit(x_train, y_train)

y_test = model.predict(x_test)
y_test = pa.DataFrame(y_test)
y_test.index = tabtest['datetime']

y_test.to_csv('csv/rf_matthias_1.csv')

Example #36
tree_test.fit(records_train_x_final, records_train_y)

records_test_y_pred = tree_test.predict(records_test_x_final)

fpr, tpr, thresholds = roc_curve(records_test_y, records_test_y_pred)
plot_roc(fpr, tpr, "DT Grid Search (Test Set)")

# Increased again to 0.7549, showing the grid search is helpful, but not
# optimal.

export_graphviz(tree_test, out_file='tree_test.dot')


##################################
#         RANDOM FOREST          #
##################################

# Lastly, I will look at the random forest model, and see how well it predicts
# the validation set.
rforest = rf(n_estimators=100, max_features='sqrt', verbose=1, n_jobs=1)  # 'auto' was removed in scikit-learn 1.3; 'sqrt' is the classifier equivalent
rforest.fit(records_train_x_final, records_train_y)


rf_probabilities = rforest.predict_proba(records_valid_x_final)

roc_auc = roc_auc_score(records_valid_y, rf_probabilities[:,1] )
fpr, tpr, thresholds = roc_curve(records_valid_y, rf_probabilities[:,1])

plot_roc(fpr, tpr, "Random Forest")