Example #1
    def __transform_model(self):
        """Train the ALS model with the current dataset
        """
        logger.info("Transforming model 1...")
        self.df_cbg1 = self.df_cbg1.withColumn(
            "latitude", self.df_cbg1["latitude"].cast("double"))
        self.df_cbg1 = self.df_cbg1.withColumn(
            "longitude", self.df_cbg1["longitude"].cast("double"))
        assembler = VectorAssembler(inputCols=["latitude", "longitude"],
                                    outputCol='features')
        self.df_cbg1 = assembler.setHandleInvalid("skip").transform(
            self.df_cbg1)
        logger.info("Done transforming!")

        logger.info("Transforming model 2...")
        self.df_cbg2 = self.df_cbg2.withColumn(
            "latitude", self.df_cbg2["latitude"].cast("double"))
        self.df_cbg2 = self.df_cbg2.withColumn(
            "longitude", self.df_cbg2["longitude"].cast("double"))
        assembler = VectorAssembler(inputCols=["latitude", "longitude"],
                                    outputCol='features')
        self.df_cbg2 = assembler.setHandleInvalid("skip").transform(
            self.df_cbg2)
        logger.info("Done transforming!")

        logger.info("Transforming model 3...")
        self.df_cbg3 = self.df_cbg3.withColumn(
            "latitude", self.df_cbg3["latitude"].cast("double"))
        self.df_cbg3 = self.df_cbg3.withColumn(
            "longitude", self.df_cbg3["longitude"].cast("double"))
        assembler = VectorAssembler(inputCols=["latitude", "longitude"],
                                    outputCol='features')
        self.df_cbg3 = assembler.setHandleInvalid("skip").transform(
            self.df_cbg3)
        logger.info("Done transforming!")
Example #2
def something(context):
    dataset = spark.read.csv(header='true',
                             inferSchema='true',
                             path='./data.csv')
    dataset.withColumn("Quantity", dataset["Quantity"].cast("double")) \
        .withColumn("UnitPrice", dataset["UnitPrice"].cast("float")) \
        .withColumn("CustomerID", dataset["CustomerID"].cast("double"))
    vector_assembler = VectorAssembler(inputCols=['UnitPrice', 'CustomerID'],
                                       outputCol='features')
    vectorized_dataset = vector_assembler.setHandleInvalid("skip").transform(
        dataset)
    vectorized_dataset = vectorized_dataset.select(['features', 'Quantity'])
    lr = LinearRegression(featuresCol='features',
                          labelCol='Quantity',
                          maxIter=10,
                          regParam=0.3,
                          elasticNetParam=0.8)
    splits = vectorized_dataset.randomSplit([0.7, 0.3])
    train_df = splits[0]
    test_df = splits[1]
    lr_model = lr.fit(train_df)
    print("Coefficients: " + str(lr_model.coefficients))
    print("Intercept: " + str(lr_model.intercept))
    training_summary = lr_model.summary
    print("RMSE: %f" % training_summary.rootMeanSquaredError)
    print("r2: %f" % training_summary.r2)
    lr_predictions = lr_model.transform(test_df)
    lr_predictions.select("prediction", "Quantity", "features").show(5)
    lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                       labelCol="Quantity",
                                       metricName="r2")
    print("R Squared (R2) on test data = %g" %
          lr_evaluator.evaluate(lr_predictions))
    lr_model.save('/tmp/regressor.model')
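    # The saved regressor can later be reloaded with the matching model loader;
    # a minimal hedged sketch using the same path as above:
    from pyspark.ml.regression import LinearRegressionModel
    restored_model = LinearRegressionModel.load('/tmp/regressor.model')
    print("Reloaded coefficients: " + str(restored_model.coefficients))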
Example #3
    def _sklearn2spark(self, features, labels=None, multi_input=False):
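        """Convert scikit-learn style features (and optional labels) into a
        Spark DataFrame with a dense 'features' vector column and a
        'categorical_label' column; with multi_input=True the feature blocks
        are assembled together and the labels are one-hot encoded.
        """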
        features_names = []
        if multi_input:
            dataset = pandas.DataFrame()
            c = 0
            for i, feature in enumerate(features):
                feature = feature.toarray().T.tolist() if issparse(feature) else feature.T.tolist()
                for f in feature:
                    dataset['features_%s' % c] = f
                    features_names.append('features_%s' % c)
                    c += 1
            dataset['categorical_label'] = labels if labels is not None else [''] * features.shape[0]
        else:
            dataset = pandas.DataFrame(
                {'features': features.tolist(),
                 'categorical_label': labels if labels is not None else [''] * features.shape[0]
                 })

        spark_dataset_with_list = self.spark_session.createDataFrame(dataset)
        if multi_input:
            # Join all features columns
            assembler = VectorAssembler(inputCols=features_names, outputCol='features')
            assembler.setHandleInvalid('skip')
            spark_dataset_with_list = assembler.transform(spark_dataset_with_list)
            # Join all labels columns
            onehotencoder = OneHotTransformer(output_dim=len(np.unique(labels)), input_col='categorical_label',
                                              output_col='ohe_label')
            spark_dataset_with_list = onehotencoder.transform(spark_dataset_with_list)

        list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())

        if multi_input:
            spark_dataset = spark_dataset_with_list.select(
                list_to_vector_udf(spark_dataset_with_list['features']).alias('features'),
                spark_dataset_with_list['ohe_label'].alias('categorical_label')
            )
        else:
            spark_dataset = spark_dataset_with_list.select(
                list_to_vector_udf(spark_dataset_with_list['features']).alias('features'),
                spark_dataset_with_list['categorical_label']
            )

        return spark_dataset
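A hedged usage sketch for the single-input path; the owning object (here called converter), its spark_session attribute, and the array shapes are assumptions:

import numpy as np

X = np.random.rand(10, 4)                    # 10 samples, 4 features
y = ['a', 'b'] * 5                           # categorical labels
spark_df = converter._sklearn2spark(X, y)    # converter: hypothetical instance
spark_df.show(5)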
Example #4
def main(sqlContext):
    company_df = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(
            './linear_regression_fortune500/src/inputData/Fortune_500.csv')
    company_df = company_df.withColumn(
        'Number of Employees', regexp_replace('Number of Employees', ',', ''))
    company_df = company_df.withColumn(
        'Number of Employees', company_df['Number of Employees'].cast("int"))
    company_df.show(10)

    company_df.cache()
    company_df.printSchema()

    company_df.describe().toPandas().transpose()

    vectorAssembler = VectorAssembler(
        inputCols=['Rank', 'Number of Employees'], outputCol='features')
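    # Note: 'Number of Employees' is used both as an input feature here and as
    # the label below, so the regression effectively sees its own target.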
    tcompany_df = vectorAssembler.setHandleInvalid("keep").transform(
        company_df)
    tcompany_df = tcompany_df.select(['features', 'Number of Employees'])
    tcompany_df.show(10)

    splits = tcompany_df.randomSplit([0.7, 0.3])
    train_df = splits[0]
    test_df = splits[1]

    lr = LinearRegression(featuresCol='features',
                          labelCol='Number of Employees',
                          maxIter=10,
                          regParam=0.3,
                          elasticNetParam=0.8)
    lr_model = lr.fit(train_df)
    print("Coefficients: " + str(lr_model.coefficients))
    print("Intercept: " + str(lr_model.intercept))

    trainingSummary = lr_model.summary
    print("RMSE on training data: %f" % trainingSummary.rootMeanSquaredError)
    print("r2 on training data: %f" % trainingSummary.r2)

    train_df.describe().show()

    lr_predictions = lr_model.transform(test_df)
    lr_predictions.select("prediction", "Number of Employees",
                          "features").show()

    test_result = lr_model.evaluate(test_df)
    print("Root Mean Squared Error (RMSE) on test data = %g" %
          test_result.rootMeanSquaredError)

    print("numIterations: %d" % trainingSummary.totalIterations)
    print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
    trainingSummary.residuals.show()
Example #5
def analysing_emissions_data(spark, co2_emisssion_data):

    # create the feature vector column that will be fed to the ML models
    vecAssembler = VectorAssembler(inputCols=['change_in_emissions_scaled'],
                                   outputCol="features")

    # adding the feature vector to our spark dataframe
    co2_emisssion_data = vecAssembler.setHandleInvalid("skip").transform(
        co2_emisssion_data)

    # creating Kmeans object (7 clusters)
    kmeans = KMeans(k=7)

    # clustering operation
    model = kmeans.fit(co2_emisssion_data.select('features'))

    # adding column of predicted clusters to our dataframe
    co2_emisssion_data = model.transform(co2_emisssion_data)

    return co2_emisssion_data.drop("features")
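Cluster quality is not assessed here; inside the function, before the 'features' column is dropped, a silhouette score could be computed, as in this minimal hedged sketch:

    from pyspark.ml.evaluation import ClusteringEvaluator

    # Silhouette over the clustered rows, using the default 'features' and
    # 'prediction' columns produced above.
    silhouette = ClusteringEvaluator().evaluate(co2_emisssion_data)
    print("Silhouette with squared euclidean distance =", silhouette)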
Example #6
    def generate_clasters(self, df):
        # assemble the features vector
        vec_assembler = VectorAssembler(
            inputCols=[columns_names.gender,
                       columns_names.education,
                       columns_names.age,
                       columns_names.longitude,
                       columns_names.latitude],
            outputCol="features")

        df = vec_assembler.setHandleInvalid("skip").transform(df)

        # Trains a k-means model.
        kmeans = KMeans().setK(5).setInitMode("k-means||").setSeed(1).setFeaturesCol("features")
        #print(kmeans.explainParams())

        model = kmeans.fit(df)

        # Make predictions
        predictions = model.transform(df)

        return predictions.drop(predictions["features"])
Example #7
        data["trip_duration"] < 22 * 60 * 60).filter(
            data["pickup_longitude"] <= -73.75).filter(
                data["pickup_longitude"] >= -74.03).filter(
                    data["dropoff_longitude"] <= -73.75).filter(
                        data["dropoff_longitude"] >= -74.03).filter(
                            data["pickup_latitude"] <= 40.85).filter(
                                data["pickup_latitude"] >= 40.63).filter(
                                    data["dropoff_latitude"] <= 40.85).filter(
                                        data["dropoff_latitude"] >= 40.63)
    #data.printSchema()
    assembler = VectorAssembler().setInputCols([
        "vendor_id", "pickup_longitude", "pickup_latitude", "pickup_hour",
        "pickup_month", "dropoff_longitude", "dropoff_latitude",
        "trip_distance", "passenger_count"
    ]).setOutputCol("features")
    df = assembler.setHandleInvalid("skip").transform(data).select(
        "trip_duration", "features")

    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=30).fit(df)
    d = featureIndexer.transform(df)
    trainTest = d.randomSplit([0.8, 0.2])
    traindf = trainTest[0]
    testdf = trainTest[1]

    # Model
    dtr = DecisionTreeRegressor(featuresCol="indexedFeatures",
                                labelCol="trip_duration",
                                impurity="variance")

    # choices of tuning parameters
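    # A hedged sketch of one possible tuning setup for the tree above, using
    # ParamGridBuilder and CrossValidator (grid values and fold count are
    # assumptions, not from the original snippet):
    from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
    from pyspark.ml.evaluation import RegressionEvaluator

    paramGrid = ParamGridBuilder() \
        .addGrid(dtr.maxDepth, [5, 10, 15]) \
        .addGrid(dtr.minInstancesPerNode, [1, 5]) \
        .build()
    cv = CrossValidator(estimator=dtr,
                        estimatorParamMaps=paramGrid,
                        evaluator=RegressionEvaluator(labelCol="trip_duration"),
                        numFolds=3)
    cvModel = cv.fit(traindf)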
Example #8
# assemble the feature columns into a single vector
vecAssembler = VectorAssembler(inputCols=[
    "pickup_longitude", "pickup_latitude", "dropoff_longitude",
    "dropoff_latitude", "passenger_count"
],
                               outputCol="features")
new_df = vecAssembler.transform(df)
new_df.count()

# It's vital to handle rows with nulls because VectorAssembler rejects them by default (handleInvalid='error').

# In[ ]:

# Skip rows that contain nulls while assembling the features
new_df = vecAssembler.setHandleInvalid("skip").transform(df)
new_df.show()

# ## Train Model

# We use a Linear Regression algorithm to train the model. Measuring how long the model takes to train is one of the main purposes of this project.

# In[11]:
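# `lr` is defined in an earlier cell that is not shown here; a minimal hedged
# definition consistent with the fit call below (hyperparameters are assumptions):
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol='features', labelCol='label', maxIter=10)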

# Fit the model
start_time = datetime.now()

lrModel = lr.fit(new_df.select('label', 'features'))

time_elapsed = datetime.now() - start_time
print("Training time elapsed: {}".format(time_elapsed))
Example #9
spark = SparkSession.builder.appName('LineerRegresyon').getOrCreate()
veri = spark.read.csv('Ecommerce Customers.csv', inferSchema=True, header=True)
veri.printSchema()
veri.show()
veri.head()
veri.show()
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[
    'Avg Session Length', 'Time on App', 'Time on Website',
    'Length of Membership'
],
                            outputCol='features')
VeriVec = assembler.setHandleInvalid("skip").transform(veri)
VeriVec.show()
VeriVec.printSchema()

SonVeri = VeriVec.select('features', 'Yearly Amount Spent')
egitimVeri, testVeri = SonVeri.randomSplit([0.6, 0.4])
egitimVeri.show()

lr = LinearRegression(labelCol='Yearly Amount Spent')
lrModel = lr.fit(egitimVeri)
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients,
                                              lrModel.intercept))
sonuclar = lrModel.evaluate(testVeri)
sonuclar.residuals.show()
print("RMSE: {}".format(sonuclar.rootMeanSquaredError))
print("MSE: {}".format(sonuclar.meanSquaredError))
Example #10
    "won_baftas",
    "nominated_baftas",
    "actor_id_0",
    "actor_id_1",
    "actor_id_2",
    "actor_id_3",
    "rating_indexed",
    "genre_indexed",
    "country_indexed",
    "youtube_view_count",
    "youtube_engagement_score",
    "youtube_positive_engagement_score",
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled_df = assembler.setHandleInvalid("skip").transform(data)

# Split the data into training and test sets (30% held out for testing)
(training_data, test_data) = assembled_df.randomSplit([0.7, 0.3], seed=1234)

num_folds = 5

evaluator = MulticlassClassificationEvaluator(labelCol="success",
                                              predictionCol="prediction",
                                              metricName="accuracy")

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="success",
                            featuresCol="features",
                            numTrees=500)
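# num_folds, evaluator and rf are declared above but the snippet stops before
# they are combined; a hedged sketch of one plausible cross-validation setup
# (the parameter grid is an assumption):
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

param_grid = ParamGridBuilder().addGrid(rf.maxDepth, [5, 10]).build()
cv = CrossValidator(estimator=rf,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=num_folds)
cv_model = cv.fit(training_data)
print("Test accuracy:", evaluator.evaluate(cv_model.transform(test_data)))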
Example #11
spark


#[3]
dataset = spark.read.format("csv").option("header", "true").load("/Users/sabarnikundu/Downloads/failure_rate.csv")
dataset.show()

#[4]
dataset = dataset.withColumn("annual_failure_rate", dataset["annual_failure_rate"].cast("float"))


#[5]
#Convert annual_failure_rate into a feature column so that we can pass a vector input to the KMeans method
Assembler = VectorAssembler(inputCols=["annual_failure_rate"], outputCol="features")
dataset = Assembler.setHandleInvalid("skip").transform(dataset).na.drop()

#[6]
mod = []
wssses = []
kval = [4,5,6,7]
for val in kval:
    
    print('Value of K = ', val)
    
    kmeans = KMeans().setK(val).setSeed(1)
    model = kmeans.fit(dataset)
    mod.append(model)

    # Evaluate clustering by computing Within Set Sum of Squared Errors.
    wssse = model.computeCost(dataset)
    wssses.append(wssse)
Example #12
c = df.schema.names
c = [
    x for x in c if x != "date" and x != "longitude" and x != "latitude"
    and x != "cumLag" and "lag-" not in x
]

# In[ ]:

from pyspark.ml.feature import VectorAssembler, StandardScaler
assembler = VectorAssembler(inputCols=c, outputCol="features")
scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=True)
df1 = assembler.setHandleInvalid("skip").transform(df)
df1.printSchema()
print("df1 count at this point is ", df1.count())
scalarModel = scaler.fit(df1)
df1 = scalarModel.transform(df1)
from pyspark.ml.feature import PCA
pca = PCA(k=40, inputCol="scaledFeatures", outputCol="pcaFeatures")
model = pca.fit(df1)
result = model.transform(df1).select('date', 'latitude', 'longitude',
                                     'pcaFeatures')

# In[ ]:

result = result.coalesce(200)
result.write.parquet(
    "s3a://dse-cohort5-group5/wildfire_capstone/integratedData/completePCA",
                         inferSchema='TRUE')
trns_df.printSchema()
trns_df.head(10)
trns_df.columns
#from pyspark.sql.functions import col
#trns_df=trns_df.withColumn('Total Gas',col('Total Gas').cast('Int'))
#trns_df.printSchema()

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=[
    'Time DGOA to problem', 'kVA (1)', 'kVA (2)', 'Age at DGOA test', 'H2',
    'CH4', 'CO', 'CO2', 'C2H6', 'C2H4', 'C2H2', 'Total Gas'
],
                            outputCol='features')
output_data = assembler.setHandleInvalid('skip').transform(trns_df)
output_data.printSchema()

train_data, test_data = output_data.randomSplit([.8, .2], seed=1234)

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol='Failure',
                        featuresCol='features',
                        maxIter=10,
                        regParam=.3)

train_model = lr.fit(train_data)

predictions = train_model.transform(test_data)
predictions.printSchema()
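# A hedged sketch of scoring the logistic-regression predictions above with
# area under ROC (only labelCol='Failure' is taken from the snippet; the rest
# are library defaults):
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol='Failure',
                                          rawPredictionCol='rawPrediction')
print("Test AUC:", evaluator.evaluate(predictions))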
Example #14
def DataPreparation():
    spark = SparkSession.builder.appName('SistemaDeDeteccion').master(
        "local[*]").getOrCreate()  # Create the Spark session
    data = spark.read.csv("Burnout_Data.csv", header=True,
                          inferSchema=True)  # Load the dataset
    data = data.select('Tiempo_PlazaActual', 'EstadoCivil', 'Burnout_Antes',
                       'Hora_Social', 'Horas_Cuidados', 'Calorias', 'Peso',
                       'Contrato_Adjunto', 'Musica', 'Sexo', 'Estudias',
                       'Sales_Social', 'Edad', 'Estado_Animo',
                       'Tiempo_Vida_Laboral', 'Hijos', 'Lectura',
                       'Hora_Gratificante', 'Horas_Activ_Fisica')
    # Keep only the columns with importance p>1 according to the component analysis
    cols = data.columns  # Store the column names in a variable

    from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler  # import the classes needed to convert categorical data
    # into values the algorithms can work with, i.e. numbers
    categoricalColumns = [
        'Contrato_Adjunto', 'Musica', 'Sexo', 'Estudias', 'Sales_Social',
        'Edad', 'Estado_Animo', 'Lectura', 'EstadoCivil'
    ]
    stages = []  # this list collects each step so they can later be applied in the Pipeline
    for categoricalCol in categoricalColumns:  # index each of the categorical variables in the list
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + 'Index')
        encoder = OneHotEncoderEstimator(
            inputCols=[stringIndexer.getOutputCol()],
            outputCols=[categoricalCol + "classVec"])
        # once indexed, OneHotEncoderEstimator assigns each value of the categorical variable a number
        stages += [stringIndexer.setHandleInvalid("keep"), encoder]
        # Add this step to stages, telling the indexer to keep invalid values

    label_stringIdx = StringIndexer(
        inputCol="Burnout_Antes", outputCol="label"
    )  # Index the variable we want to predict, Burnout_Antes, whose values
    # are VERDADERO and FALSO, as the label
    stages += [label_stringIdx.setHandleInvalid("keep")]
    # Add this step to stages, telling the indexer to keep invalid values

    numericCols = [
        'Tiempo_PlazaActual', 'Hora_Social', 'Horas_Cuidados', 'Calorias',
        'Peso', 'Tiempo_Vida_Laboral', 'Hijos', 'Hora_Gratificante',
        'Horas_Activ_Fisica'
    ]
    # With the categorical variables turned into numbers, we can build a single vector by joining them with the numeric variables.
    assemblerInputs = [c + "classVec"
                       for c in categoricalColumns] + numericCols
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol="features")
    # this step produces the "features" column: a vector holding the numeric and categorical variables.
    stages += [assembler.setHandleInvalid("keep")]
    # Add this step to stages, telling the assembler to keep invalid values

    from pyspark.ml import Pipeline
    pipeline = Pipeline(stages=stages)
    # Initialize our Pipeline and pass it the list of steps to execute, stored in the stages variable.
    pipelineModel = pipeline.fit(data)
    data = pipelineModel.transform(data)
    # Fit and run the model, i.e. the data preprocessing.
    path = 'modelo_Pipeline'
    os.mkdir(path)
    pipelineModel.save(os.path.join(path, 'Pipeline'))
    # Save this model, because to make predictions we must apply the same model to the new data
    selectedCols = ['label', 'features'] + cols
    data = data.select(selectedCols)
    # Select the label and features columns, plus cols, which holds the columns from before the preprocessing

    train, test = data.randomSplit([0.7, 0.3])
    # For training and testing we use randomSplit to divide the dataset into 70% training and 30% test
    print("Training Dataset Count: " + str(train.count()))
    print("Test Dataset Count: " + str(test.count()))
    # print the row count of each split and return them for use by the training algorithms.
    return train, test
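A hedged sketch of how the returned splits might be consumed; the classifier choice and its parameters are assumptions, not part of the original function:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

train, test = DataPreparation()
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
lr_model = lr.fit(train)
predictions = lr_model.transform(test)
print("Test AUC:", BinaryClassificationEvaluator().evaluate(predictions))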
Example #15
sqlContext = SQLContext(sc)

print("Data is loading...")
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(args.input)


data = data.withColumn("Open", data["Open"].cast(DoubleType()))
data = data.withColumn("Date", data["Date"].cast(DoubleType()))
data = data.withColumn("High", data["High"].cast(DoubleType()))
data = data.withColumn("Low", data["Low"].cast(DoubleType()))
data = data.withColumn("Close", data["Close"].cast(DoubleType()))
data = data.withColumn("Adj Close", data["Adj Close"].cast(DoubleType()))
data = data.withColumn("Volume", data["Volume"].cast(DoubleType()))

vectorAssembler = VectorAssembler(inputCols = ['Date','Open','High', 'Low', 'Close', 'Volume'], outputCol = 'features')
vdata=vectorAssembler.setHandleInvalid("skip").transform(data).select(['features','Adj Close'])
splits = vdata.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

print("MLlib is calling...")
start_time = time.time()
lr = LinearRegression(featuresCol = 'features', labelCol='Adj Close', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
#print("Coefficients: " + str(lr_model.coefficients))
#print("Intercept: " + str(lr_model.intercept))
end_time = time.time()
result_str = f"Core: {args.core} - Input: {args.input} - Size: {(os.path.getsize(args.input)/(1024*1024)):.02f} MB - Elapsed time:{(end_time-start_time):.02f} sec"
print(result_str)

with open(f"result.txt", "a+") as result_file: