def __transform_model(self):
    """Transform the three datasets: cast coordinates to double and assemble
    them into 'features' vectors."""
    logger.info("Transforming model 1...")
    self.df_cbg1 = self.df_cbg1.withColumn(
        "latitude", self.df_cbg1["latitude"].cast("double"))
    self.df_cbg1 = self.df_cbg1.withColumn(
        "longitude", self.df_cbg1["longitude"].cast("double"))
    assembler = VectorAssembler(inputCols=["latitude", "longitude"],
                                outputCol='features')
    # Skip rows with null or malformed coordinates instead of failing.
    self.df_cbg1 = assembler.setHandleInvalid("skip").transform(self.df_cbg1)
    logger.info("Done transforming!")

    logger.info("Transforming model 2...")
    self.df_cbg2 = self.df_cbg2.withColumn(
        "latitude", self.df_cbg2["latitude"].cast("double"))
    self.df_cbg2 = self.df_cbg2.withColumn(
        "longitude", self.df_cbg2["longitude"].cast("double"))
    assembler = VectorAssembler(inputCols=["latitude", "longitude"],
                                outputCol='features')
    self.df_cbg2 = assembler.setHandleInvalid("skip").transform(self.df_cbg2)
    logger.info("Done transforming!")

    logger.info("Transforming model 3...")
    self.df_cbg3 = self.df_cbg3.withColumn(
        "latitude", self.df_cbg3["latitude"].cast("double"))
    self.df_cbg3 = self.df_cbg3.withColumn(
        "longitude", self.df_cbg3["longitude"].cast("double"))
    assembler = VectorAssembler(inputCols=["latitude", "longitude"],
                                outputCol='features')
    self.df_cbg3 = assembler.setHandleInvalid("skip").transform(self.df_cbg3)
    logger.info("Done transforming!")
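# A minimal refactor sketch (not part of the original class): the three blocks
# above differ only in the dataframe they operate on, so the cast + assemble
# step could be factored into a helper. The name `_cast_and_assemble` is
# hypothetical.
from pyspark.ml.feature import VectorAssembler


def _cast_and_assemble(df):
    """Cast coordinates to double and assemble them into a 'features' vector."""
    df = df.withColumn("latitude", df["latitude"].cast("double"))
    df = df.withColumn("longitude", df["longitude"].cast("double"))
    assembler = VectorAssembler(inputCols=["latitude", "longitude"],
                                outputCol="features")
    # Drop rows with invalid coordinates rather than raising an error.
    return assembler.setHandleInvalid("skip").transform(df)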
def something(context):
    dataset = spark.read.csv(header='true', inferSchema='true',
                             path='./data.csv')
    # Reassign: withColumn returns a new DataFrame, it does not modify in place.
    dataset = dataset.withColumn("Quantity", dataset["Quantity"].cast("double")) \
        .withColumn("UnitPrice", dataset["UnitPrice"].cast("float")) \
        .withColumn("CustomerID", dataset["CustomerID"].cast("double"))

    vector_assembler = VectorAssembler(inputCols=['UnitPrice', 'CustomerID'],
                                       outputCol='features')
    vectorized_dataset = vector_assembler.setHandleInvalid("skip").transform(dataset)
    vectorized_dataset = vectorized_dataset.select(['features', 'Quantity'])

    lr = LinearRegression(featuresCol='features', labelCol='Quantity',
                          maxIter=10, regParam=0.3, elasticNetParam=0.8)
    splits = vectorized_dataset.randomSplit([0.7, 0.3])
    train_df = splits[0]
    test_df = splits[1]

    lr_model = lr.fit(train_df)
    print("Coefficients: " + str(lr_model.coefficients))
    print("Intercept: " + str(lr_model.intercept))

    training_summary = lr_model.summary
    print("RMSE: %f" % training_summary.rootMeanSquaredError)
    print("r2: %f" % training_summary.r2)

    lr_predictions = lr_model.transform(test_df)
    lr_predictions.select("prediction", "Quantity", "features").show(5)
    lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                       labelCol="Quantity", metricName="r2")
    print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

    lr_model.save('/tmp/regressor.model')
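# A usage sketch (assumed, not part of the original module): the model saved by
# something() can be reloaded later for scoring. `new_data` is a hypothetical
# DataFrame containing the same 'features' column.
from pyspark.ml.regression import LinearRegressionModel

reloaded_model = LinearRegressionModel.load('/tmp/regressor.model')
reloaded_model.transform(new_data).select("prediction").show(5)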
def _sklearn2spark(self, features, labels=None, multi_input=False):
    features_names = []
    if multi_input:
        dataset = pandas.DataFrame()
        c = 0
        for i, feature in enumerate(features):
            feature = feature.toarray().T.tolist() if issparse(feature) else feature.T.tolist()
            for f in feature:
                dataset['features_%s' % c] = f
                features_names.append('features_%s' % c)
                c += 1
        dataset['categorical_label'] = labels if labels is not None else [''] * features.shape[0]
    else:
        dataset = pandas.DataFrame({
            'features': features.tolist(),
            'categorical_label': labels if labels is not None else [''] * features.shape[0]
        })

    spark_dataset_with_list = self.spark_session.createDataFrame(dataset)

    if multi_input:
        # Join all features columns
        assembler = VectorAssembler(inputCols=features_names, outputCol='features')
        assembler.setHandleInvalid('skip')
        spark_dataset_with_list = assembler.transform(spark_dataset_with_list)
        # Join all labels columns
        onehotencoder = OneHotTransformer(output_dim=len(np.unique(labels)),
                                          input_col='categorical_label',
                                          output_col='ohe_label')
        spark_dataset_with_list = onehotencoder.transform(spark_dataset_with_list)

    list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
    if multi_input:
        spark_dataset = spark_dataset_with_list.select(
            list_to_vector_udf(spark_dataset_with_list['features']).alias('features'),
            spark_dataset_with_list['ohe_label'].alias('categorical_label')
        )
    else:
        spark_dataset = spark_dataset_with_list.select(
            list_to_vector_udf(spark_dataset_with_list['features']).alias('features'),
            spark_dataset_with_list['categorical_label']
        )
    return spark_dataset
def main(sqlContext):
    company_df = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(
        './linear_regression_fortune500/src/inputData/Fortune_500.csv')
    company_df = company_df.withColumn(
        'Number of Employees', regexp_replace('Number of Employees', ',', ''))
    company_df = company_df.withColumn(
        'Number of Employees', company_df['Number of Employees'].cast("int"))
    company_df.show(10)
    company_df.cache()
    company_df.printSchema()
    company_df.describe().toPandas().transpose()

    vectorAssembler = VectorAssembler(
        inputCols=['Rank', 'Number of Employees'], outputCol='features')
    tcompany_df = vectorAssembler.setHandleInvalid("keep").transform(company_df)
    tcompany_df = tcompany_df.select(['features', 'Number of Employees'])
    tcompany_df.show(10)

    splits = tcompany_df.randomSplit([0.7, 0.3])
    train_df = splits[0]
    test_df = splits[1]

    lr = LinearRegression(featuresCol='features', labelCol='Number of Employees',
                          maxIter=10, regParam=0.3, elasticNetParam=0.8)
    lr_model = lr.fit(train_df)
    print("Coefficients: " + str(lr_model.coefficients))
    print("Intercept: " + str(lr_model.intercept))

    trainingSummary = lr_model.summary
    print("RMSE on training data: %f" % trainingSummary.rootMeanSquaredError)
    print("r2 on training data: %f" % trainingSummary.r2)
    train_df.describe().show()

    lr_predictions = lr_model.transform(test_df)
    lr_predictions.select("prediction", "Number of Employees", "features").show()

    test_result = lr_model.evaluate(test_df)
    print("Root Mean Squared Error (RMSE) on test data = %g" %
          test_result.rootMeanSquaredError)
    print("numIterations: %d" % trainingSummary.totalIterations)
    print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
    trainingSummary.residuals.show()
def analysing_emissions_data(spark, co2_emisssion_data):
    # Create the feature vector used as input to the ML models.
    vecAssembler = VectorAssembler(inputCols=['change_in_emissions_scaled'],
                                   outputCol="features")
    # Add the feature vector to our Spark dataframe, skipping invalid rows.
    co2_emisssion_data = vecAssembler.setHandleInvalid("skip").transform(
        co2_emisssion_data)
    # Create the KMeans estimator (7 clusters).
    kmeans = KMeans(k=7)
    # Run the clustering.
    model = kmeans.fit(co2_emisssion_data.select('features'))
    # Add the column of predicted clusters to our dataframe.
    co2_emisssion_data = model.transform(co2_emisssion_data)
    return co2_emisssion_data.drop("features")
def generate_clasters(self, df):
    # Assemble the features vector.
    vec_assembler = VectorAssembler(
        inputCols=[columns_names.gender, columns_names.education,
                   columns_names.age, columns_names.longitude,
                   columns_names.latitude],
        outputCol="features")
    df = vec_assembler.setHandleInvalid("skip").transform(df)

    # Train a k-means model.
    kmeans = KMeans().setK(5).setInitMode("k-means||").setSeed(1).setFeaturesCol("features")
    # print(kmeans.explainParams())
    model = kmeans.fit(df)

    # Make predictions.
    predictions = model.transform(df)
    return predictions.drop(predictions["features"])
data["trip_duration"] < 22 * 60 * 60).filter( data["pickup_longitude"] <= -73.75).filter( data["pickup_longitude"] >= -74.03).filter( data["dropoff_longitude"] <= -73.75).filter( data["dropoff_longitude"] >= -74.03).filter( data["pickup_latitude"] <= 40.85).filter( data["pickup_latitude"] >= 40.63).filter( data["dropoff_latitude"] <= 40.85).filter( data["dropoff_latitude"] >= 40.63) #data.printSchema() assembler = VectorAssembler().setInputCols([ "vendor_id", "pickup_longitude", "pickup_latitude", "pickup_hour", "pickup_month", "dropoff_longitude", "dropoff_latitude", "trip_distance", "passenger_count" ]).setOutputCol("features") df = assembler.setHandleInvalid("skip").transform(data).select( "trip_duration", "features") featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=30).fit(df) d = featureIndexer.transform(df) trainTest = d.randomSplit([0.8, 0.2]) traindf = trainTest[0] testdf = trainTest[1] # Model dtr = DecisionTreeRegressor(featuresCol="indexedFeatures", labelCol="trip_duration", impurity="variance") # choices of tuning parameters
# column features
vecAssembler = VectorAssembler(inputCols=[
    "pickup_longitude", "pickup_latitude", "dropoff_longitude",
    "dropoff_latitude", "passenger_count"
], outputCol="features")
new_df = vecAssembler.transform(df)
new_df.count()

# It's vital to delete null rows because Spark doesn't accept them.

# In[ ]:

# Delete null rows
new_df = vecAssembler.setHandleInvalid("skip").transform(df)
new_df.show()

# ## Train Model
# We use a Linear Regression algorithm to train the model. It's necessary to
# measure the time the model spends training on the data, which is one of the
# main purposes of this project.

# In[11]:

# Fit the model
start_time = datetime.now()
lrModel = lr.fit(new_df.select('label', 'features'))
time_elapsed = datetime.now() - start_time
print(
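# The `lr` estimator fitted above is not defined in this excerpt; a plausible
# construction (an assumption, not the notebook's actual cell) would be a plain
# LinearRegression using the default 'label' and 'features' column names.
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol='features', labelCol='label', maxIter=10)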
spark = SparkSession.builder.appName('LineerRegresyon').getOrCreate()

veri = spark.read.csv('Ecommerce Customers.csv', inferSchema=True, header=True)
veri.printSchema()
veri.show()
veri.head()
veri.show()

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[
    'Avg Session Length', 'Time on App', 'Time on Website',
    'Length of Membership'
], outputCol='features')
VeriVec = assembler.setHandleInvalid("skip").transform(veri)
VeriVec.show()
VeriVec.printSchema()

SonVeri = VeriVec.select('features', 'Yearly Amount Spent')
egitimVeri, testVeri = SonVeri.randomSplit([0.6, 0.4])
egitimVeri.show()

lr = LinearRegression(labelCol='Yearly Amount Spent')
lrModel = lr.fit(egitimVeri)
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients,
                                              lrModel.intercept))

sonuclar = lrModel.evaluate(testVeri)
sonuclar.residuals.show()
print("RMSE: {}".format(sonuclar.rootMeanSquaredError))
print("MSE: {}".format(sonuclar.meanSquaredError))
"won_baftas", "nominated_baftas", "actor_id_0", "actor_id_1", "actor_id_2", "actor_id_3", "rating_indexed", "genre_indexed", "country_indexed", "youtube_view_count", "youtube_engagement_score", "youtube_positive_engagement_score", ] assembler = VectorAssembler(inputCols=feature_cols, outputCol="features") assembled_df = assembler.setHandleInvalid("skip").transform(data) # Split the data into training and test sets (30% held out for testing) (training_data, test_data) = assembled_df.randomSplit([0.7, 0.3], seed=1234) num_folds = 5 evaluator = MulticlassClassificationEvaluator(labelCol="success", predictionCol="prediction", metricName="accuracy") # Train a RandomForest model. rf = RandomForestClassifier(labelCol="success", featuresCol="features", numTrees=500)
spark

# [3]
dataset = spark.read.format("csv").option("header", "true").load(
    "/Users/sabarnikundu/Downloads/failure_rate.csv")
dataset.show()

# [4]
dataset = dataset.withColumn("annual_failure_rate",
                             dataset["annual_failure_rate"].cast("float"))

# [5]
# Convert annual_failure_rate into a feature column so that we can pass a
# vector input to the KMeans method.
Assembler = VectorAssembler(inputCols=["annual_failure_rate"],
                            outputCol="features")
dataset = Assembler.setHandleInvalid("skip").transform(dataset).na.drop()

# [6]
mod = []
wssses = []
kval = [4, 5, 6, 7]
for val in kval:
    print('Value of K = ', val)
    kmeans = KMeans().setK(val).setSeed(1)
    model = kmeans.fit(dataset)
    mod.append(model)
    # Evaluate clustering by computing Within Set Sum of Squared Errors.
    wssse = model.computeCost(dataset)
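# The loop body above is cut off after computeCost. A hedged sketch of an
# alternative evaluation using ClusteringEvaluator (the Spark 3.x replacement
# for the deprecated computeCost); the silhouette-based choice of k below is an
# assumption, not the notebook's original logic.
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(featuresCol="features", metricName="silhouette")
scores = []
for val, model in zip(kval, mod):
    predictions = model.transform(dataset)
    scores.append(evaluator.evaluate(predictions))
best_k = kval[scores.index(max(scores))]  # higher silhouette is better
print("Best k by silhouette:", best_k)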
c = df.schema.names
c = [
    x for x in c
    if x != "date" and x != "longitude" and x != "latitude"
    and x != "cumLag" and "lag-" not in x
]

# In[ ]:

from pyspark.ml.feature import VectorAssembler, StandardScaler

assembler = VectorAssembler(inputCols=c, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)
df1 = assembler.setHandleInvalid("skip").transform(df)
df1.printSchema()
print("df1 count at this point is ", df1.count())

scalarModel = scaler.fit(df1)
df1 = scalarModel.transform(df1)

from pyspark.ml.feature import PCA

pca = PCA(k=40, inputCol="scaledFeatures", outputCol="pcaFeatures")
model = pca.fit(df1)
result = model.transform(df1).select('date', 'latitude', 'longitude',
                                     'pcaFeatures')

# In[ ]:

result = result.coalesce(200)
result.write.parquet(
    "s3a://dse-cohort5-group5/wildfire_capstone/integratedData/completePCA",
    inferSchema='TRUE')
trns_df.printSchema()
trns_df.head(10)
trns_df.columns

# from pyspark.sql.functions import col
# trns_df = trns_df.withColumn('Total Gas', col('Total Gas').cast('Int'))
# trns_df.printSchema()

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[
    'Time DGOA to problem', 'kVA (1)', 'kVA (2)', 'Age at DGOA test', 'H2',
    'CH4', 'CO', 'CO2', 'C2H6', 'C2H4', 'C2H2', 'Total Gas'
], outputCol='features')
output_data = assembler.setHandleInvalid('skip').transform(trns_df)
output_data.printSchema()

train_data, test_data = output_data.randomSplit([.8, .2], seed=1234)

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol='Failure', featuresCol='features',
                        maxIter=10, regParam=.3)
train_model = lr.fit(train_data)
predictions = train_model.transform(test_data)
predictions.printSchema()
def DataPreparation():
    # Create the Spark session.
    spark = SparkSession.builder.appName('SistemaDeDeteccion').master(
        "local[*]").getOrCreate()
    # Load the dataset.
    data = spark.read.csv("Burnout_Data.csv", header=True, inferSchema=True)
    # Keep the columns with importance p > 1 according to the component analysis.
    data = data.select('Tiempo_PlazaActual', 'EstadoCivil', 'Burnout_Antes',
                       'Hora_Social', 'Horas_Cuidados', 'Calorias', 'Peso',
                       'Contrato_Adjunto', 'Musica', 'Sexo', 'Estudias',
                       'Sales_Social', 'Edad', 'Estado_Animo',
                       'Tiempo_Vida_Laboral', 'Hijos', 'Lectura',
                       'Hora_Gratificante', 'Horas_Activ_Fisica')
    # Store the column names in a variable.
    cols = data.columns

    # Import the libraries needed to convert categorical data into values the
    # algorithms can handle, i.e. to turn it into numbers.
    from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

    categoricalColumns = [
        'Contrato_Adjunto', 'Musica', 'Sexo', 'Estudias', 'Sales_Social',
        'Edad', 'Estado_Animo', 'Lectura', 'EstadoCivil'
    ]
    # This variable holds each of the steps to be applied later by the Pipeline.
    stages = []
    # Index each of the categorical variables in the list.
    for categoricalCol in categoricalColumns:
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + 'Index')
        # Once indexed, OneHotEncoderEstimator assigns a number to each value
        # of the categorical variable.
        encoder = OneHotEncoderEstimator(
            inputCols=[stringIndexer.getOutputCol()],
            outputCols=[categoricalCol + "classVec"])
        # Store this step in stages, telling it to keep invalid values.
        stages += [stringIndexer.setHandleInvalid("keep"), encoder]

    # Index the variable we want to predict, Burnout_Antes (values VERDADERO
    # and FALSO), as the label.
    label_stringIdx = StringIndexer(inputCol="Burnout_Antes", outputCol="label")
    # Store this step in stages, telling it to keep invalid values.
    stages += [label_stringIdx.setHandleInvalid("keep")]

    numericCols = [
        'Tiempo_PlazaActual', 'Hora_Social', 'Horas_Cuidados', 'Calorias',
        'Peso', 'Tiempo_Vida_Laboral', 'Hijos', 'Hora_Gratificante',
        'Horas_Activ_Fisica'
    ]
    # With the categorical variables turned into numbers, we can build a vector
    # joining them with the numeric variables.
    assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
    # This step produces the "features" column: a vector holding the numeric
    # and categorical variables.
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    # Store this step in stages, telling it to keep invalid values.
    stages += [assembler.setHandleInvalid("keep")]

    from pyspark.ml import Pipeline
    # Initialize our Pipeline and pass it the list of steps it must run,
    # stored in the stages variable.
    pipeline = Pipeline(stages=stages)
    # Fit and apply the model, i.e. run the data preprocessing.
    pipelineModel = pipeline.fit(data)
    data = pipelineModel.transform(data)
    # Save this model, because to predict we will need to apply this same
    # preprocessing to new data.
    path = 'modelo_Pipeline'
    os.mkdir(path)
    pipelineModel.save(os.path.join(path, 'Pipeline'))

    # Select the label and features columns, plus cols, which holds the columns
    # from before the preprocessing.
    selectedCols = ['label', 'features'] + cols
    data = data.select(selectedCols)

    # For training and testing, use randomSplit to divide the dataset into
    # 70% training and 30% testing.
    train, test = data.randomSplit([0.7, 0.3])
    # Print the number of rows in each split and return them for use by the
    # algorithms.
    print("Training Dataset Count: " + str(train.count()))
    print("Test Dataset Count: " + str(test.count()))
    return train, test
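# A minimal usage sketch (assumed; the classifier actually used downstream is
# not shown in this excerpt): feed the returned splits to a binary classifier
# trained on the indexed label.
from pyspark.ml.classification import LogisticRegression

train, test = DataPreparation()
clf = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
model = clf.fit(train)
model.transform(test).select('label', 'prediction').show(5)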
sqlContext = SQLContext(sc)
print("Data is loading...")
data = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true', inferschema='true').load(args.input)
data = data.withColumn("Open", data["Open"].cast(DoubleType()))
data = data.withColumn("Date", data["Date"].cast(DoubleType()))
data = data.withColumn("High", data["High"].cast(DoubleType()))
data = data.withColumn("Low", data["Low"].cast(DoubleType()))
data = data.withColumn("Close", data["Close"].cast(DoubleType()))
data = data.withColumn("Adj Close", data["Adj Close"].cast(DoubleType()))
data = data.withColumn("Volume", data["Volume"].cast(DoubleType()))

vectorAssembler = VectorAssembler(
    inputCols=['Date', 'Open', 'High', 'Low', 'Close', 'Volume'],
    outputCol='features')
vdata = vectorAssembler.setHandleInvalid("skip").transform(data).select(
    ['features', 'Adj Close'])

splits = vdata.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

print("MLlib is calling...")
start_time = time.time()
lr = LinearRegression(featuresCol='features', labelCol='Adj Close',
                      maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
# print("Coefficients: " + str(lr_model.coefficients))
# print("Intercept: " + str(lr_model.intercept))
end_time = time.time()

result_str = f"Core: {args.core} - Input: {args.input} - Size: {(os.path.getsize(args.input)/(1024*1024)):.02f} MB - Elapsed time:{(end_time-start_time):.02f} sec"
print(result_str)
with open("result.txt", "a+") as result_file:
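    # A plausible continuation (assumed; the original script is cut off here):
    # append the timing summary line to the results file.
    result_file.write(result_str + "\n")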