import numpy as np
import mungetools as mg
from sklearn.ensemble import RandomForestClassifier as rfc

# load data into pandas data frames
trdata, testdata = mg.loadData()

# get the ids for the test set
testid = np.array(testdata.UserID)
testdata = testdata.drop('UserID', axis=1)

# initialize classifier; sweep tree depth and report out-of-bag accuracy
depthlist = [3, 5, 10, 15, 20, 50, 100]
for i in depthlist:
    model = rfc(n_estimators=10, oob_score=True, max_features=None, max_depth=i)
    model = model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])
    accur = model.oob_score_
    print('Out of Bag accuracy: %f \n' % accur)

# generate predictions (probability of the positive class, using the last model)
preds = np.array(model.predict_proba(testdata))[:, 1]
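# The depth sweep above prints each OOB score but keeps only the last model;
# a minimal sketch of tracking the best depth instead (this selection step is
# an assumption, not part of the original script):
best_acc = -np.inf
best_depth = None
for d in depthlist:
    m = rfc(n_estimators=10, oob_score=True, max_features=None, max_depth=d)
    m.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])
    if m.oob_score_ > best_acc:
        best_acc, best_depth = m.oob_score_, d
print('Best depth %d with OOB accuracy %f' % (best_depth, best_acc))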
import numpy as np
import mungetools as mg
from sklearn.ensemble import RandomForestClassifier as rfc

'''
Use random forest classifier to predict Titanic survivors.
Uses training data in train.csv (found in data subfolder),
predicts from test.csv, and writes out to .csv in the predictions subfolder.
As is, this gives ~77% accuracy on the test set;
it can hit ~79% with some tweaking (currently overfits).
'''

# load data into pandas data frames
trdata, testdata = mg.loadData()

# get the ids for the test set
testid = np.array(testdata.PassengerId)

# determine if each passenger has a known surviving family member
trdata, testdata = mg.addFamSurvivors(trdata, testdata)

# munge the data to generate one-hot labels for gender, titles, ticket departments
trdata = mg.mungeData(trdata)
testdata = mg.mungeData(testdata)

# initialize classifier
# (compute_importances was removed from sklearn; feature_importances_
# is always available after fitting)
model = rfc(n_estimators=1000, oob_score=True)
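# Continuation sketch: the docstring above says the script fits, predicts,
# and writes a csv to the predictions subfolder. The exact file name, column
# layout, and the label sitting in column 0 are assumptions here.
import pandas as pd

model = model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])
print('Out of Bag accuracy: %f' % model.oob_score_)
preds = model.predict(testdata)
pd.DataFrame({'PassengerId': testid,
              'Survived': preds.astype(int)}).to_csv(
    'predictions/titanic_preds.csv', index=False)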
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
# from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline
import mungetools as mg

n_estimator = 10

# X, y = make_classification(n_samples=10)
# print(X)
# print(y)
trdata = mg.loadData()
X_train, X_test, y_train, y_test = (trdata.iloc[:500, 1:], trdata.iloc[500:, 1:],
                                    trdata.iloc[:500, 0], trdata.iloc[500:, 0])

# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = (X_train.iloc[:250], X_train.iloc[250:],
                                            y_train.iloc[:250], y_train.iloc[250:])
print(X_train)
print(y_train)

# Unsupervised transformation based on totally random trees
# (the fragment cut off mid-call; n_estimators filled in from the
# n_estimator variable defined above)
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
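# Continuation sketch of the tree-embedding -> logistic-regression pipeline
# the imports suggest; this mirrors scikit-learn's "feature transformations
# with ensembles of trees" example, and the exact steps are an assumption:
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)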
        meanscore = np.mean(scorei)
        scores[counter] = meanscore
        paramholder[counter, 0] = c
        paramholder[counter, 1] = g
        # compare mean CV score (scorei may be a per-fold array)
        if meanscore > bestscore:
            bestscore = meanscore
            bestmodel = model
        counter += 1
        print('Score = %f with c: %f, g: %f' % (meanscore, c, g))
    bestc = paramholder[scores.argmax(), 0]
    bestg = paramholder[scores.argmax(), 1]
    print('Best score of %f with c: %f, g: %f' % (bestscore, bestc, bestg))
    return bestmodel

trdata = mg.loadData()
# testid = np.array(testdata.UserID)
# trdata = trdata.drop(['coursecount'], axis=1)
# testdata = testdata.drop(['UserID'], axis=1)
# print(trdata)
# print(testdata)

# initialize classifier:
# try several values of c (prediction error weight) and g (kernel width)
testc = [0.05, 0.1, 0.3, 0.6, 1, 3, 5, 10]
testg = [0, 0.01, 0.05, 0.1, 0.5, 1, 1.5]
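# The head of the grid-search function is missing from this fragment; a
# minimal sketch of how it presumably iterates, assuming an RBF SVM and
# 5-fold cross-validation (both assumptions; note that gamma=0 meant
# 'auto' in older scikit-learn, while modern versions require gamma > 0):
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

def gridsearch_sketch(trdata, testc, testg):
    bestscore, bestmodel = -np.inf, None
    for c in testc:
        for g in testg:
            model = SVC(C=c, gamma=g)
            scorei = cross_val_score(model, trdata.iloc[:, 1:],
                                     trdata.iloc[:, 0], cv=5)
            if np.mean(scorei) > bestscore:
                bestscore, bestmodel = np.mean(scorei), model
    return bestmodel

# usage: bestmodel = gridsearch_sketch(trdata, testc, testg)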
import numpy as np
import mungetools as mg
from sklearn.ensemble import RandomForestClassifier

# parse feature names out of the header lines of the feature file
# (file_data is the already-read file contents, opened earlier in the script)
features_list = []
for line in file_data:
    if '@' in line and '{' in line:
        feature = line.split()[1]
        features_list.append(feature)
# print(features_list)
features_list = np.asarray(features_list)
print(type(features_list))

input_df = mg.loadData()
X = input_df.values[:, 1:]
y = input_df.values[:, 0]

# weight one class (s == 0) at 0.75 relative to the other
survived_weight = .75
y_weights = np.array([survived_weight if s == 0 else 1 for s in y])

print("Rough fitting a RandomForest to determine feature importance...")
forest = RandomForestClassifier(oob_score=True, n_estimators=10)
forest.fit(X, y, sample_weight=y_weights)
feature_importance = forest.feature_importances_

# scale importances so the largest is 100
feature_importance = 100.0 * (feature_importance / feature_importance.max())
print(feature_importance)

# keep features whose scaled importance clears the threshold
fi_threshold = 30
important_idx = np.where(feature_importance > fi_threshold)[0]
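# A minimal sketch of the step that typically follows the thresholding above:
# keep only the important columns and refit. This next step is an assumption,
# not shown in the fragment.
X_important = X[:, important_idx]
forest = RandomForestClassifier(oob_score=True, n_estimators=100)
forest.fit(X_important, y, sample_weight=y_weights)
print('OOB score on %d important features: %f'
      % (len(important_idx), forest.oob_score_))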