def string_index(spark_df):
    """Index and one-hot encode each categorical column of the data set."""
    # Assumes StringIndexer, OneHotEncoder and VectorAssembler are imported
    # from pyspark.ml.feature at the top of the script.
    for i in spark_df.columns:
        inp_col = str(i)
        out_col = str(i) + "_indexed"
        fit_on = spark_df.select(inp_col)
        df_i_indexed = StringIndexer(inputCol=inp_col, outputCol=out_col).fit(fit_on).transform(fit_on)
        indexed_col = df_i_indexed.select(out_col)
        print(i)
        indexed_col.printSchema()
        out_col_ohe = str(i) + "_encoded"
        try:
            # transform() returns a DataFrame; calling .show() on it would return None,
            # so keep the DataFrame and display the encoded column separately.
            df_i_encoded = OneHotEncoder(inputCol=out_col, outputCol=out_col_ohe).transform(df_i_indexed)
            df_i_encoded.select(out_col_ohe).show()
            # inputCols expects a list of column names, and the encoded column
            # only exists on df_i_encoded, not on the original spark_df.
            vecAssembler = VectorAssembler(inputCols=[out_col_ohe], outputCol="features")
            vecAssembler.transform(df_i_encoded)
        except Exception as e:
            # Columns that cannot be encoded are skipped rather than silently swallowed.
            print("Skipping %s: %s" % (i, e))
    return None
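# A minimal usage sketch (not from the original script): the file name
# 'train.csv', the column subset, and the SparkSession `spark` are
# illustrative assumptions. The function only prints the indexed/encoded
# schemas; it does not return a transformed DataFrame.
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

titanic_df = spark.read.csv('train.csv', header=True, inferSchema=True)
string_index(titanic_df.select('Sex', 'Embarked', 'Pclass'))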
def load_csv(sc, filename='200[0-5].csv'):
    sql_context = SQLContext(sc)
    df = sql_context.read.option('mode', 'PERMISSIVE')\
        .load(filename, format='com.databricks.spark.csv', header='true',
              nullValue='NA', inferSchema='true').cache()
    df = df[FEATURE_USED]
    df = df.na.drop()

    # turn string to index
    for col in ['UniqueCarrier', 'Origin', 'Dest']:
        df = StringIndexer(inputCol=col, outputCol=col + '_value').fit(df).transform(df)
        df = df.drop(col)

    # reordering
    df = df.select([
        'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime',
        'UniqueCarrier_value', 'FlightNum', 'CRSElapsedTime',
        'Origin_value', 'Dest_value', 'Distance', 'Cancelled'
    ])
    return df
# ---------------------------------------
# Keep SibSp and Fare in the selection, since they are cast and assembled below.
df3 = df2.select('Sex', 'Pclass', 'SibSp', 'Fare', 'Survived', 'Embarked')
df3.show()
df3.printSchema()

from pyspark.ml.feature import StringIndexer
df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
df3.groupBy('Embarked').agg({'Embarked': 'count'}).show()
df3 = StringIndexer(inputCol='Embarked', outputCol='Embarked_Transformed').fit(df3).transform(df3)
df3.show()
df3.printSchema()

df3 = df3.select(df3.Pclass.cast('double'), df3.SibSp.cast('double'),
                 df3.Survived.cast('double'), df3.Fare.cast('double'))
df3.show()
df3.printSchema()

# Vector assembler
from pyspark.ml.feature import VectorAssembler
df3 = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Fare'], outputCol='Features').transform(df3)
df3.show()

# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features', labelCol='Survived', maxDepth=10, impurity='entropy')
# 2 learning process - created a model
df3.show()
df3 = OneHotEncoder(inputCol='Embarked1', outputCol='Embarked2', dropLast=False).transform(df3)
df3.show()
# --------------------------------------------
df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
df3 = OneHotEncoder(inputCol='Gender', outputCol='Gender1', dropLast=False).transform(df3)
df3.show()

# cast to double
df3 = df3.select(df3.Pclass.cast('double'), df3.Gender1, df3.Embarked2, df3.Survived.cast('double'))
df3.printSchema()

# Vector assembler
df3 = VectorAssembler(inputCols=['Pclass', 'Gender1', 'Embarked2'], outputCol='Features').transform(df3)
df3.show(truncate=False)

training = df3
training1 = df3
training.show(n=5, truncate=False)

# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
df5 = spark.read.csv('E:/kaggle/titanic/test.csv', header=True).select('PassengerId', 'Sex', 'Pclass', 'Embarked')
df5 = StringIndexer(inputCol='Embarked', outputCol='Embarked1').fit(df5).transform(df5)
df5.show()
df5 = OneHotEncoder(inputCol='Embarked1', outputCol='Embarked2', dropLast=False).transform(df5)
df5.show()
# --------------------------------------------
df5 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df5).transform(df5)
df5 = OneHotEncoder(inputCol='Gender', outputCol='Gender1', dropLast=False).transform(df5)
df5.show()

df5 = df5.select(df5.Pclass.cast('double'), df5.Gender1, df5.Embarked2, df5.PassengerId)
df5.printSchema()

# Vector assembler
df5 = VectorAssembler(inputCols=['Pclass', 'Gender1', 'Embarked2'], outputCol='Features').transform(df5)
df5.show(truncate=False)

df5_1 = model2.transform(df5)
df5_1.show()
df5_1.select('PassengerId', 'prediction').coalesce(1).write.csv('c:/test5.csv')
# df5_1.select('PassengerId','prediction').toPandas().to_csv('c:/test5.csv')
# Make predictions on the testing data
prediction = logistic.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy('label', 'prediction').count().show()

# One-hot encoding
from pyspark.ml.feature import OneHotEncoderEstimator
onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy'])

# Fit the encoder to the data
onehot = onehot.fit(cars)

# How many category levels?
print(onehot.categorySizes)

cars = onehot.transform(cars)
cars.select('type', 'type_idx', 'type_dummy').distinct().sort('type_idx').show()

# Dense versus sparse vectors
from pyspark.mllib.linalg import DenseVector, SparseVector
DenseVector([1, 0, 0, 0, 0, 7, 0, 0])
SparseVector(8, [0, 5], [1, 7])

# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoderEstimator

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)
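# Quick illustration (not part of the original snippet): the dense and sparse
# literals above describe the same 8-element vector, which toArray() on the
# classes imported above makes explicit.
dense = DenseVector([1, 0, 0, 0, 0, 7, 0, 0])
sparse = SparseVector(8, [0, 5], [1, 7])
print(dense.toArray())   # [1. 0. 0. 0. 0. 7. 0. 0.]
print(sparse.toArray())  # [1. 0. 0. 0. 0. 7. 0. 0.]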
#df3.show(10)
#df3.schema
#df3.printSchema()
## --------------------------------------------
##df4.show()
##df4.printSchema()
##fit(si1)
# StringIndexer mapping: male = 0, female = 1
##transform

df3 = df3.select(df3.Pclass.cast('double'), df3.Gender, df3.Survived.cast('double'))
df3.printSchema()

# Vector assembler
from pyspark.ml.feature import VectorAssembler
df3 = VectorAssembler(inputCols=['Pclass'], outputCol='Features').transform(df3)
df3.show()

# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features', labelCol='Survived')
# 2 learning process - created a model
model2 = dt1.fit(df3)
# #### View the output of the KMeans model
# The prediction field denotes the cluster number

# In[15]:
clusterdData.toPandas().head()

# #### Get the average of each feature in the original data
# This is the equivalent of the cluster center when our dataset is one big cluster
# * We import all sql functions as we need the avg and count functions among others

# In[16]:
from pyspark.sql.functions import *

dataset.select(avg('Survived'), avg('Pclass'), avg('Age'),
               avg('Fare'), avg('Gender'), avg('Boarded')).toPandas()

# #### A more intuitive way to view the cluster centers in our clusterdData
# * We group by cluster ID (prediction) and compute the average of all features
# * We do a count of values in each cluster

# In[17]:
clusterdData.groupBy('prediction').agg(
    avg('Survived'), avg('Pclass'), avg('Age'), avg('Fare'),
    avg('Gender'), avg('Boarded'),
    count('prediction')).orderBy('prediction').toPandas()

# #### Examine all rows in one of the clusters

# In[18]:
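# The original cell body is not shown here; a minimal sketch of what it likely
# contains, filtering the rows of a single cluster (cluster 0 is an
# illustrative choice, not from the original notebook):
clusterdData.filter(clusterdData.prediction == 0).toPandas()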
flights = flights.dropna()
print("\nThe data contains %d records after dropping records with na values." % flights.count())

# Create an indexer for the carrier categorical feature
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flights_indexed = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)

# Check the first five records
flights_indexed.show(5)

flites = flights_indexed.select('carrier', 'org', 'org_idx')

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flites)
flights_onehot = onehot.transform(flites)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

spark.stop()
# Check schema and first rows
customers.printSchema()  # Schema is ok
customers.toPandas().head(5)

# Find missings
customers.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in customers.columns)).show()
# No missings

# Renaming the CustomerID column for future joins
customers = customers.withColumnRenamed("CustomerID", "cIDCustomer")

# DELIVERY
# Check schema and first rows
delivery.printSchema()  # Schema is ok
delivery.toPandas().head(5)

# Find missings
delivery.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in delivery.columns)).show()
# Found 780 missings in the DeliveryClass column

# Treating missing values
delivery = delivery.where(col("DeliveryClass").isNotNull())

# Encoding string columns in "Delivery"
delivery = StringIndexer(inputCol="DeliveryClass", outputCol="DeliveryClass_index").fit(delivery).transform(delivery)
delivery = StringIndexer(inputCol="DeliveryTypeName", outputCol="DeliveryTypeName_index").fit(delivery).transform(delivery)

# Renaming the SubscriptionID column for future joins
delivery = delivery.withColumnRenamed("SubscriptionID", "sID_Delivery")

# FORMULA
# Check schema and first rows
formula.printSchema()  # Schema is ok
formula.toPandas().head(5)
# Convert 'mile' to 'km' and drop the 'mile' column
flights = flights.withColumn('km', round(flights.mile * 1.60934, 0)) \
                 .drop('mile')

# Remove records with missing values in any column and get the number of remaining rows
flights = flights.dropna()
print("The data contains %d records after dropping records with na values." % flights.count())

# Create an indexer for the org categorical feature
flights_indexed = StringIndexer(
    inputCol="org", outputCol='org_idx').fit(flights).transform(flights)

# Check the first five records
#flights_indexed.show(5)

flites = flights_indexed.select('km', 'org_idx', 'duration')

# Consolidate the 'km' and 'org_idx' predictor columns into a 'features' vector
assembler = VectorAssembler(inputCols=['km', 'org_idx'], outputCol='features')
flights_assembled = assembler.transform(flites)

# Check the resulting column
flites = flights_assembled.select('duration', 'features')
#flites.distinct().show(8, truncate=False)
print("Sample model input")
print(flites.toPandas().sample(12))

# Split the data into training and testing sets
# In[95]:
df2.show(5)

# In[93]:
desidxer_df.show(5)

# In[101]:
desidxer_df.select(['air_time', 'distance', 'carrier_idx', 'origin_idx', 'dest_idx']).describe().show()

# In[103]:
# Rows where air_time is null
desidxer_df.filter(desidxer_df.air_time.isNull()).show(5)

# In[108]:
# Rows where air_time still holds the literal string 'NA'
desidxer_df.filter(desidxer_df.air_time == 'NA').show(5)

# In[110]:
def extract_features(df):
    """
    Assemble the per-user feature vector.

    Arguments:
        df: Dataframe consisting of the relevant data columns.

    Returns:
        Dataframe with the extracted features in the column "features".
    """
    feature_df = df.select("userId").distinct()
    col_names = []

    # Convert millisecond epoch timestamps into timestamp columns
    ts_dt_udf = udf(lambda x: x // 1000, LongType())
    df = df.withColumn("registration_dt", ts_dt_udf(df.registration).cast("timestamp"))
    df = df.withColumn("timestamp_dt", ts_dt_udf(df.ts).cast("timestamp"))

    # Session counts
    session_counts = df.groupby('userId').agg(
        countDistinct('sessionId').alias('session_count'))
    feature_df = feature_df.join(session_counts, on="userId")
    col_names.append("session_count")

    # Page counts
    pages = df.select('page').distinct().sort('page')
    pages_list = [r.page for r in pages.collect()]
    page_counts = df.groupby('userId').pivot('page', pages_list).count()
    # Drop the "Cancel" page column.
    # Fill NaNs with 0 - this inherently transforms the "Cancellation Confirmation"
    # column into "label", with 1 as churned and 0 as non-churned.
    page_counts = page_counts.drop("Cancel")
    page_counts = page_counts.fillna(value=0)
    page_counts = page_counts.withColumnRenamed("Cancellation Confirmation", "label")

    # Join these feature columns to our feature dataframe
    feature_df = feature_df.join(page_counts, on="userId")

    # Normalize by session counts
    cut_columns = {'userId', 'session_count', 'label'}
    remaining_cols = sorted(list(set(feature_df.columns) - cut_columns))
    for column in remaining_cols:
        feature_df = feature_df.withColumn(
            column, col(column) / feature_df.session_count)
    col_names.extend(remaining_cols)

    # Time since registration
    user_ages = df.select([
        "userId", datediff("timestamp_dt", "registration_dt")
    ]).groupBy("userId").max().select(
        "userId",
        col("max(datediff(timestamp_dt, registration_dt))").alias("age"))
    feature_df = feature_df.join(user_ages, on="userId")
    col_names.append("age")

    # Total number of events
    user_number_events = df.groupBy("userId").count().select(
        "userId", col("count").alias("num_events"))
    feature_df = feature_df.join(user_number_events, on="userId")
    col_names.append("num_events")

    # Include the device categorical variable
    device_udf = udf(
        lambda x: str(re.findall(r'\((.*?)\)', x)[0].split(";")[0].split()[0])
        if x is not None else None, StringType())
    df = df.withColumn("device", device_udf(df.userAgent))
    df_device = df.select(["userId", "device"]).distinct()
    df_device = StringIndexer(
        inputCol="device",
        outputCol="device_index").fit(df_device).transform(df_device)
    df_device = OneHotEncoderEstimator(
        inputCols=["device_index"],
        outputCols=["device_classVec"]).fit(df_device).transform(df_device)
    feature_df = feature_df.join(df_device.select("userId", "device_classVec"), on="userId")
    col_names.append("device_classVec")

    print(col_names)

    # Assemble the vector
    assembler = VectorAssembler(inputCols=col_names, outputCol='features')
    return assembler.transform(feature_df)
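# A minimal usage sketch (not from the original project): the event-log
# DataFrame `events`, the 80/20 split, and the seed are illustrative
# assumptions; the returned DataFrame carries the assembled "features"
# column plus the "label" column built inside extract_features.
features = extract_features(events)
train, test = features.randomSplit([0.8, 0.2], seed=42)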
# then drop rows with leftover na's
#df = df.na.drop(how='any')
#df.count()
# if loss is big, investigate and fill.na as needed
# otherwise, remove df2
#del(df2)

# export to csv (via coalesce)
print('\n\n\nGetting ready to write data to csv\n')
#df = df.select(float_x_vars + cat_x_vars + y_vars)
#df = df.na.drop(how='any')
#df.coalesce(1).write.csv('data/pdDataNN.csv')

# Using pandas
pdData = df.select(float_x_vars + cat_x_vars + y_vars)
pdData = pdData.na.drop(how='any')
#del(df)
#pdData.count()
# If there is a large loss, then investigate why

pdData = pdData.toPandas()
pdData.to_csv('data/pdDataNN.csv', index=False)
del pdData

spark.stop()
col_string = col_string.iloc[:, 0].tolist()
col = set(col) - set(col_a)
col_test = set(col) - set(['HasDetections'])
col = list(col)
col_test = list(col_test)
col_test.append('MachineIdentifier')

# String-typed columns (those also listed in col_string) need indexing
col_si = []
for i in col:
    for j in col_string:
        if i == j:
            col_si.append(i)
col_num = list(set(col) - set(col_si))

test = test.select(col_test)
for i in col_test:
    if i == 'MachineIdentifier':
        continue
    else:
        test = StringIndexer(inputCol=i, outputCol=i + "_index").fit(test).transform(test)

# encoder_input_col = []
# for i in col:
#     encoder_input_col.append(i + '_index')
# for i in col_num:
#     encoder_input_col.append(i)
encoder_input_col = [