def getTestData(dict, size=None, featureIndices=None):
    """Load and preprocess the Kaggle-style test set from ./input/test.csv.

    Parameters
    ----------
    dict : mapping
        Category -> numeric code mapping produced by getTrainData; applied to
        column 2 so the test encoding matches the training encoding.
        (Name kept for caller compatibility even though it shadows the builtin.)
    size : int, optional
        If given, only the first `size` rows are kept (0 yields an empty set).
    featureIndices : list of int, optional
        Explicit feature column indices; defaults to every column except the
        ID column (column 0).

    Returns
    -------
    (X, ids)
        Feature matrix and the ID column of the (possibly truncated) test set.
    """
    test = pd.read_csv('./input/test.csv')
    # Missing values are zero-filled (mean imputation deliberately disabled).
    test = test.fillna(0)
    test = test.values
    # `is not None` so size=0 and an empty index list are honored instead of
    # silently falling back to the defaults (the original truthiness check
    # ignored both).
    if size is not None:
        test = test[:size, :]
    if featureIndices is not None:
        indices = featureIndices
    else:
        # All columns except ID (column 0); the test file has no response column.
        indices = list(range(1, test.shape[1]))
    # Re-encode the categorical column with the training-time mapping.
    changed = utils.categoricalToNumerical(test[:, 2], dict)
    print("number of categories of column 2 of train and test:", changed[1])
    test[:, 2] = changed[0]
    X = test[:, indices]
    ids = test[:, 0]
    return X, ids
def getTrainData(size=None, featureIndices=None):
    """Load and preprocess the training set from ./input/train.csv.

    Parameters
    ----------
    size : int, optional
        If given, only the first `size` rows are kept (0 yields an empty set).
    featureIndices : list of int, optional
        Explicit feature column indices; defaults to every column except the
        ID column (0) and the response (last column).

    Returns
    -------
    (X, y, category_map)
        Feature matrix, integer labels, and the category->code mapping for
        column 2 (pass it to getTestData so test data is encoded identically).
    """
    train = pd.read_csv('./input/train.csv')
    # Missing values are zero-filled (mean imputation deliberately disabled).
    train = train.fillna(0)
    train = train.values
    # `is not None` so size=0 and an empty index list are honored instead of
    # silently falling back to the defaults (the original truthiness check
    # ignored both).
    if size is not None:
        train = train[:size, :]
    if featureIndices is not None:
        indices = featureIndices
    else:
        # All columns except ID (column 0) and the response (last column).
        indices = list(range(1, train.shape[1] - 1))
    changed = utils.categoricalToNumerical(train[:, 2])
    # Renamed from `dict` to avoid shadowing the builtin; still returned last,
    # so callers unpacking (X, y, dict) are unaffected.
    category_map = changed[2]
    print("number of categories of column 2 of train:", changed[1])
    train[:, 2] = changed[0]
    X, y = train[:, indices], train[:, -1]
    y = list(map(np.int32, y))
    return X, y, category_map
from sklearn.ensemble import ExtraTreesClassifier train = pd.read_csv('./input/train.csv') # print(train.isnull().sum()) train=train.fillna(0) # print(train.isnull().sum()) train=train.values # train=train[0:100,:] # Build a classification task using 3 informative features indeces=(range(train.shape[1]-1)) indeces=list(set(indeces)-set([0]))#removing ID # print(indeces) # print(train.shape[1]) changed=utils.categoricalToNumerical(train[:,2]) print("number of categories of column 2:",changed[1]) train[:,2]=changed[0] X, y = train[:,indeces] , train[:,-1] y=list(map(np.int32,y)) # print(X) numfeatures=len(indeces) print(type(X)) # X=X[:,indeces] # Build a forest and compute the feature importances forest = ExtraTreesClassifier(n_estimators=250,random_state=0) forest.fit(X, y) importances = forest.feature_importances_ std = np.std([tree.feature_importances_ for tree in forest.estimators_],