Exemplo n.º 1
0
def getTestData(dict,size=None,featureIndices=None,):
    """Load ``./input/test.csv`` and return its feature matrix and ID column.

    Missing values are replaced with 0 and the frame is converted to a NumPy
    array. Column 2 is then passed through ``utils.categoricalToNumerical``
    together with ``dict`` and overwritten with element 0 of the result.

    Args:
        dict: Forwarded unchanged as the second argument of
            ``utils.categoricalToNumerical``. Presumably the category mapping
            returned by ``getTrainData`` -- confirm against ``utils``. The
            name shadows the builtin but is kept for caller compatibility.
        size: When truthy, only the first ``size`` rows are kept.
        featureIndices: Optional column indices to select (list, tuple or
            NumPy array). ``None`` or an empty sequence selects every column
            except column 0.

    Returns:
        A tuple ``(X, ids)``: ``X`` holds the selected columns and ``ids`` is
        column 0 of the loaded data.
    """
    test = pd.read_csv('./input/test.csv').fillna(0).values
    if size:
        test = test[:size, :]

    # Decide explicitly whether the caller supplied indices. A plain
    # ``if featureIndices:`` raises ValueError for a multi-element NumPy
    # array because its truth value is ambiguous.
    if featureIndices is None:
        use_given_indices = False
    else:
        try:
            use_given_indices = len(featureIndices) > 0
        except TypeError:
            # Unsized values (e.g. a scalar index) keep the old truthiness rule.
            use_given_indices = bool(featureIndices)

    if use_given_indices:
        indices = featureIndices
    else:
        # All columns except column 0 (the ID).
        indices = list(range(1, test.shape[1]))

    changed = utils.categoricalToNumerical(test[:, 2], dict)
    print("number of categories of column 2 of train and test:",changed[1])
    test[:, 2] = changed[0]

    X = test[:, indices]
    ids = test[:, 0]
    return X, ids
Exemplo n.º 2
0
def getTrainData(size=None,featureIndices=None):
    """Load ``./input/train.csv`` and return features, labels and the encoder output.

    Missing values are replaced with 0 and the frame is converted to a NumPy
    array. Column 2 is passed through ``utils.categoricalToNumerical`` and
    overwritten with element 0 of the result.

    Args:
        size: When truthy, only the first ``size`` rows are kept.
        featureIndices: Optional column indices to select (list, tuple or
            NumPy array). ``None`` or an empty sequence selects every column
            except the first (ID) and the last (response).

    Returns:
        A tuple ``(X, y, mapping)``: ``X`` holds the selected columns, ``y``
        is the last column as a list of ``np.int32`` values, and ``mapping``
        is element 2 of the ``utils.categoricalToNumerical`` result
        (presumably the category-to-number mapping -- confirm against
        ``utils``).
    """
    train = pd.read_csv('./input/train.csv').fillna(0).values
    if size:
        train = train[:size, :]

    # Decide explicitly whether the caller supplied indices. A plain
    # ``if featureIndices:`` raises ValueError for a multi-element NumPy
    # array because its truth value is ambiguous.
    if featureIndices is None:
        use_given_indices = False
    else:
        try:
            use_given_indices = len(featureIndices) > 0
        except TypeError:
            # Unsized values (e.g. a scalar index) keep the old truthiness rule.
            use_given_indices = bool(featureIndices)

    if use_given_indices:
        indices = featureIndices
    else:
        # All columns except column 0 (the ID) and the last one (the response).
        indices = list(range(1, train.shape[1] - 1))

    changed = utils.categoricalToNumerical(train[:, 2])
    # Local renamed from ``dict`` so the builtin is not shadowed.
    category_map = changed[2]
    print("number of categories of column 2 of train:",changed[1])
    train[:, 2] = changed[0]

    X = train[:, indices]
    y = [np.int32(label) for label in train[:, -1]]
    return X, y, category_map
Exemplo n.º 3
0
from sklearn.ensemble import ExtraTreesClassifier


# Script: fit an ExtraTreesClassifier on the training CSV and read back its
# per-feature importances.
train = pd.read_csv('./input/train.csv')
# print(train.isnull().sum())
# Replace missing values with 0.
train=train.fillna(0)
# print(train.isnull().sum())

# Continue with the underlying NumPy array instead of the DataFrame.
train=train.values
# train=train[0:100,:]
# Candidate feature columns: every column index except the last one, which is
# used as the target below.
indeces=(range(train.shape[1]-1))
# NOTE(review): the resulting order comes from set iteration, which is not
# guaranteed to be ascending -- confirm that later code does not rely on it.
indeces=list(set(indeces)-set([0]))# drop column 0 (the ID)
# print(indeces)
# print(train.shape[1])
# Overwrite column 2 with element 0 of the helper's result; element 1 is
# reported as the number of categories.
changed=utils.categoricalToNumerical(train[:,2])
print("number of categories of column 2:",changed[1])
train[:,2]=changed[0]

# Features are the selected columns; the target is the last column, converted
# element by element to np.int32.
X, y = train[:,indeces] , train[:,-1]
y=list(map(np.int32,y))
# print(X)
numfeatures=len(indeces)
print(type(X))
# X=X[:,indeces]
# Build a forest of 250 trees with a fixed random seed and compute the
# feature importances.
forest = ExtraTreesClassifier(n_estimators=250,random_state=0)

forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],