# Load the raw training sample; with header=True and no schema inference,
# every column comes in as a string.
data = spark.read.csv('./data/train_sample100w.csv', header=True)

print('==PREPROCESSING== \n')

# Pull the pre-selected feature names from disk and keep only those
# columns, plus the machine identifier and the target label.
feature_frame = pd.read_csv('/Users/jaycheng/Dropbox/python/ms_comp/feature1.csv', index_col=0)
keep_cols = feature_frame.iloc[:, 0].tolist()
keep_cols += ['MachineIdentifier', 'HasDetections']
data = data.select(keep_cols)

# Impute missing values with the sentinel '-1' instead of dropping rows
# (the dropna('any') alternative was considered and rejected).
data = data.fillna('-1')

print('==StringIndexer== \n')

# Index every feature column; the identifier and the label are handled
# separately and excluded here.
ignore = ['MachineIdentifier', 'HasDetections']
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index")
    for name in data.columns
    if name not in ignore
]
data = Pipeline(stages=indexers).fit(data).transform(data)

# Fit the label indexer on the whole dataset so every label value is
# present in the resulting index.
label_indexer = StringIndexer(inputCol="HasDetections", outputCol="indexedLabel")
data = label_indexer.fit(data).transform(data)