def indexer(dataset):
    """Index the categorical 'Sex' and 'Embarked' columns into numeric
    'Gender' and 'Boarded' columns, then drop the original string columns.

    Parameters
    ----------
    dataset : pyspark.sql.DataFrame
        Frame containing 'Sex' and 'Embarked' string columns.

    Returns
    -------
    pyspark.sql.DataFrame
        Frame with numeric 'Gender' and 'Boarded' columns and without the
        raw 'Sex'/'Embarked' columns.
    """
    # handleInvalid='keep' reserves an extra index for unseen categories.
    dataset = StringIndexer(
        inputCol='Sex', outputCol='Gender',
        handleInvalid='keep').fit(dataset).transform(dataset)
    dataset = StringIndexer(
        inputCol='Embarked', outputCol='Boarded',
        handleInvalid='keep').fit(dataset).transform(dataset)
    # BUG FIX: DataFrame.drop returns a *new* frame; the original code
    # discarded the result, so 'Sex' and 'Embarked' were never removed.
    dataset = dataset.drop('Sex', 'Embarked')
    return dataset
def encode_using_one_hot(df, column_name):
    """One-hot encode the categories held in ``column_name``.

    Every distinct category becomes an indicator vector, e.g. values
    {a, b, b, c, d} -> {<1.0,0,0>, <0,1.0,0>, ...}. Suitable for
    non-ordinal categories. Rows whose value cannot be indexed are
    skipped (handleInvalid="skip"). The encoded vector column replaces
    the original column under the same name.
    """
    idx_col = 'index_' + column_name
    vec_col = 'vec_' + column_name

    # First map each category string to a numeric index ...
    indexer = StringIndexer(inputCol=column_name, outputCol=idx_col,
                            handleInvalid="skip")
    df = indexer.fit(df).transform(df)

    # ... then expand that index into a one-hot vector.
    encoder = OneHotEncoderEstimator(inputCols=[idx_col],
                                     outputCols=[vec_col])
    df = encoder.fit(df).transform(df)

    # Swap the encoded column in under the original column's name.
    return (df.drop(idx_col)
              .drop(column_name)
              .withColumnRenamed(vec_col, column_name))
def encode_using_indexer(df, column_name):
    """Replace the categories in ``column_name`` with numeric indices.

    e.g. values {a, b, b, c, d} -> {0.0, 1.0, 1.0, 2.0, 3.0}. Good for
    binary categories. Rows whose value cannot be indexed are skipped
    (handleInvalid="skip"). The indexed column takes over the original
    column's name.
    """
    idx_col = 'index_' + column_name
    model = StringIndexer(inputCol=column_name, outputCol=idx_col,
                          handleInvalid="skip").fit(df)
    indexed = model.transform(df)
    # Swap the indexed column in under the original name.
    return indexed.drop(column_name).withColumnRenamed(idx_col, column_name)
def main(config):
    """Train and evaluate a binary classifier following Spark's ML
    Pipeline recipe: https://spark.apache.org/docs/latest/ml-pipeline.html

    Parameters
    ----------
    config : dict
        Expects config['base']['train_df'] (training-data source),
        config['base']['featuresCol'] (comma-separated feature names) and
        config['base']['labelCol'] (label column name).
    """
    spark = spark_initiate()

    # Load the raw data with the explicit schema / transformer.
    raw_data = config['base']['train_df']
    structure_schema = model_structure()
    data = load_data(spark, raw_data, 'df', structure_schema)
    # data.show()
    df, cat_dict = transformer(data)
    datatype_dict = dict(df.dtypes)
    features = config['base']['featuresCol'].split(',')

    # Index every string-typed feature, then drop the raw string columns.
    list_str = []  # list of string columns
    for feature in features:
        if datatype_dict[feature] == 'string':
            list_str.append(feature)
            df = StringIndexer(inputCol=feature,
                               outputCol=feature + '_index') \
                .fit(df) \
                .transform(df)
    df = df.drop(*list_str)
    df.show()

    # BUG FIX: set('Survived') is a set of single *characters*, so the
    # label column was never removed and leaked into the feature vector.
    # Wrap the label name in a one-element set instead.
    features = list(set(df.columns) - {config['base']['labelCol']})
    assembler = VectorAssembler(inputCols=features, outputCol='features')
    df = assembler.transform(df)

    (trainingData, testData) = df.randomSplit([0.7, 0.3])

    # estimator
    model = estimators(config)
    fitted_model = model.fit(trainingData)
    testData = fitted_model.transform(testData)

    # Map to (score, label) float pairs for the RDD-based metrics API.
    predictionAndLabels = testData.select('probability', 'Survived') \
        .rdd.map(lambda x: (float(x[0][0]), float(x[1])))
    metrics = BinaryClassificationMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)
    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)
def load_csv(sc, filename='200[0-5].csv'):
    """Load flight CSV files, index the categorical columns and reorder.

    Returns a cached DataFrame restricted to FEATURE_USED, with NA rows
    dropped and the carrier/origin/dest strings replaced by numeric
    '<name>_value' columns, in the column order expected downstream.
    """
    sql_context = SQLContext(sc)
    df = sql_context.read.option('mode', 'PERMISSIVE') \
        .load(filename,
              format='com.databricks.spark.csv',
              header='true',
              nullValue='NA',
              inferSchema='true').cache()
    df = df[FEATURE_USED]
    df = df.na.drop()

    # Replace each categorical string column by a numeric index column.
    for name in ('UniqueCarrier', 'Origin', 'Dest'):
        model = StringIndexer(inputCol=name, outputCol=name + '_value').fit(df)
        df = model.transform(df).drop(name)

    # Reorder the columns into the layout callers rely on.
    ordered_columns = [
        'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime',
        'UniqueCarrier_value', 'FlightNum', 'CRSElapsedTime',
        'Origin_value', 'Dest_value', 'Distance', 'Cancelled'
    ]
    return df.select(ordered_columns)
# Index the categorical 'Sex' and 'Embarked' columns into numeric
# 'Gender' and 'Boarded' columns; 'keep' reserves an index for unseen
# categories instead of failing.
dataset = StringIndexer(inputCol='Sex', outputCol='Gender',
                        handleInvalid='keep').fit(dataset).transform(dataset)
dataset = StringIndexer(inputCol='Embarked', outputCol='Boarded',
                        handleInvalid='keep').fit(dataset).transform(dataset)
dataset.toPandas().head()

# #### Drop the redundant columns

# In[6]:

# The raw string columns are superseded by 'Gender'/'Boarded'.
dataset = dataset.drop('Sex')
dataset = dataset.drop('Embarked')
dataset.toPandas().head()

# #### Define the required features to use in the VectorAssembler
# Since we are only examining data and not making predictions, we include all columns

# In[7]:

requiredFeatures = ['Survived', 'Pclass', 'Age', 'Fare', 'Gender', 'Boarded']

# #### The VectorAssembler vectorises all the features
# The transformed data will be used for clustering

# In[8]:
from pyspark.ml.feature import VectorAssembler, StringIndexer from pyspark.ml.classification import DecisionTreeClassifier spark = SparkSession.builder.getOrCreate() df2 = spark.read.csv( 'C:/Manidhar/MachineLearningLab/datasets/titanic/train.csv', header=True) df2.show() df2.describe().show() df2.printSchema() df3 = df2.select('Pclass', 'Sex', 'Survived') df3.printSchema() df3.show() df4 = df2.filter(df2.Age > 40).select('Pclass', 'SibSp', 'Survived') df4.show() df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3) df3 = df3.drop('Sex') df3.show() df4.count() df5 = df2.filter(df2.Age > 20).select('Pclass', 'SibSp', 'Survived') df5.count() df3 = df3.select(df3.Pclass.cast('double'), df3.SibSp.cast('double'), df3.Survived.cast('double'), df3.Fare.cast('double')) df3.printSchema() df3 = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Fare'], outputCol='Features').transform(df3) df3.show() dt1 = DecisionTreeClassifier(featuresCol='Features', labelCol='Survived', maxDepth=10,
# In[90]: desidxer_df= StringIndexer(inputCol='dest', outputCol='dest_idx').fit(orgidxer_df).transform(orgidxer_df) # In[91]: desidxer_df.show(5) # In[94]: df2= desidxer_df.drop('carrier', 'origin','dest') # In[95]: df2.show(5) # In[93]: desidxer_df.show(5) # In[101]:
def main():
    """End-to-end Titanic example: load the CSV, clean and index the
    data, assemble a feature vector, train a random forest and print
    the test-set accuracy."""
    spark, sc = init_spark()

    df = (spark.read.format("csv").option(
        'header', 'true').load("C:\\sparkTmp\\titanic_train.csv"))
    df.show(5)

    # How many rows we have
    print(df.count())
    # The names of our columns
    print(df.columns)
    # Types of our columns
    print(df.dtypes)
    # BUG FIX: describe() returns a DataFrame; print() only showed its
    # repr, not the statistics. Use .show() to display the summary.
    df.describe().show()

    # Keep the modelling columns, casting numerics to float.
    dataset = df.select(
        col("Survived").cast("float"),
        col("Pclass").cast("float"),
        col("Sex"),
        col("Age").cast("float"),
        col("Fare").cast("float"),
        col("Embarked"),
    )
    dataset.show()

    # Null counts per column, before and after dropping incomplete rows.
    dataset.select(
        [count(when(isnull(c), c)).alias(c) for c in dataset.columns]).show()
    dataset = dataset.dropna(how="any")
    dataset.select(
        [count(when(isnull(c), c)).alias(c) for c in dataset.columns]).show()

    # We need to transform Sex and Embarked to numerical value;
    # 'keep' reserves an extra index for unseen categories.
    dataset = StringIndexer(
        inputCol="Sex", outputCol="Gender",
        handleInvalid="keep").fit(dataset).transform(dataset)
    dataset = StringIndexer(
        inputCol="Embarked", outputCol="Boarded",
        handleInvalid="keep").fit(dataset).transform(dataset)
    # StringIndexer transforms not just to a plain double, but preserves
    # the category labels in the column metadata; field 7 is 'Boarded',
    # the column appended last.
    print(dataset.schema.fields[7].metadata)

    dataset = dataset.drop("Sex")
    dataset = dataset.drop("Embarked")
    dataset.show()

    # Assemble the feature vector expected by the classifier.
    required_features = ["Pclass", "Age", "Fare", "Gender", "Boarded"]
    assembler = VectorAssembler(inputCols=required_features,
                                outputCol='features')
    transformed_data = assembler.transform(dataset)
    transformed_data.show()

    (training_data, test_data) = transformed_data.randomSplit([0.8, 0.2])

    rf = RandomForestClassifier(labelCol="Survived",
                                featuresCol="features",
                                maxDepth=5)
    model = rf.fit(training_data)
    predictions = model.transform(test_data)

    evaluator = MulticlassClassificationEvaluator(labelCol="Survived",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Accuracy = ", accuracy)