# Real examples



users_noscaled = users_addedmonths

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler

# Create the vector assembler
assembler = VectorAssembler(
  inputCols=users_noscaled.columns[7:], outputCol='assembled_col'
)

# Create the scaler
scaler = MinMaxScaler(
  inputCol="assembled_col", outputCol="assembled_col_norm"
)

# Build an assembled vector column in the dataframe
assembled = assembler.transform(users_noscaled)

# Fit the scaler model
scaler_model = scaler.fit(assembled)

# Apply the model to the transformed dataframe
users_wscaled = scaler_model.transform(assembled)
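# Optional follow-up sketch (not part of the original snippet, assumes Spark 3.0+):
# unpack the scaled vector back into one numeric column per input feature using
# pyspark.ml.functions.vector_to_array, reusing the assembler inputs from above.
from pyspark.ml.functions import vector_to_array

scaled_input_cols = users_noscaled.columns[7:]
users_wscaled_flat = users_wscaled.withColumn(
  "norm_arr", vector_to_array("assembled_col_norm")
)
users_wscaled_flat = users_wscaled_flat.select(
  "*",
  *[users_wscaled_flat["norm_arr"][i].alias(c + "_norm")
    for i, c in enumerate(scaled_input_cols)]
).drop("norm_arr")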

## Example No. 2
    def scaleVecCol(self, columns, nameOutputCol):
        """
        This function groups the specified columns into a single vector column and then
        scales them. The scaling procedure is Spark's default MinMax scaling (see the example
        below).

        +---------+----------+
        |Price    |AreaLiving|
        +---------+----------+
        |1261706.9|16        |
        |1263607.9|16        |
        |1109960.0|19        |
        |978277.0 |19        |
        |885000.0 |19        |
        +---------+----------+

                    |
                    |
                    |
                    V
        +----------------------------------------+
        |['Price', 'AreaLiving']                 |
        +----------------------------------------+
        |[0.1673858972637624,0.5]                |
        |[0.08966137157852398,0.3611111111111111]|
        |[0.11587093205757598,0.3888888888888889]|
        |[0.1139820728616421,0.3888888888888889] |
        |[0.12260126542983639,0.4722222222222222]|
        +----------------------------------------+
        only showing top 5 rows

        """

        # Check that the columns argument is a string or a list:
        self.__assertTypeStrOrList(columns, "columns")

        # Check that the columns to be processed are present in the dataframe
        self.__assertColsInDF(columnsProvided=columns, columnsDF=self.__df.columns)

        # Check that the nameOutputCol argument is a string:
        self.__assertTypeStr(nameOutputCol, "nameOutputCol")

        # Assembler that packs the chosen columns into a single vector column:
        vecAssembler = VectorAssembler(inputCols=columns, outputCol="features_assembler")
        # Model for scaling feature column:
        mmScaler = MinMaxScaler(inputCol="features_assembler", outputCol=nameOutputCol)
        # DataFrame with the features_assembler column
        tempDF = vecAssembler.transform(self.__df)
        # Fitting scaler model with transformed dataframe
        model = mmScaler.fit(tempDF)

        exprs = list(filter(lambda x: x not in columns, self.__df.columns))

        exprs.extend([nameOutputCol])

        self.__df = model.transform(tempDF).select(*exprs)
        self.__addTransformation()  # checkpoint in case

        return self
# Notebook fragment: assumes an active SparkSession `spark` and pyspark.ml.linalg.Vectors.
dataFrame = spark.createDataFrame([(
    0,
    Vectors.dense([1.0, 0.1, -1.0]),
), (
    1,
    Vectors.dense([2.0, 1.1, 1.0]),
), (
    2,
    Vectors.dense([3.0, 10.1, 3.0]),
)], ["id", "features"])

# In[20]:

dataFrame.show()

# In[21]:

scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# In[22]:

scaler

# In[23]:

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(dataFrame)

# In[24]:

scalerModel

# In[25]:
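# This cell's contents are not shown in the capture; a plausible completion
# (an assumption, mirroring the complete MinMaxScaler example later in this
# collection) would apply the fitted model and show the result:
scaledData = scalerModel.transform(dataFrame)
scaledData.select("features", "scaledFeatures").show()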
## Example No. 4
df_transformed.show()

logger.error("#### before spliting")

train, test = df_transformed.randomSplit(
    [float(sys.argv[1]), float(sys.argv[2])], seed=7)
#train60,test40 = df_transformed.randomSplit([0.6,0.4],seed=7)
#train70,test30 = df_transformed.randomSplit([0.7, 0.3], seed=7)
#train80,test20 = df_transformed.randomSplit([0.8,0.2],seed=7)
#train90,test10 = df_transformed.randomSplit([0.9,0.1],seed=7)

logger.error("#### after split")
logger.error("#### Random Forest")

from pyspark.ml.classification import RandomForestClassifier
minmax = MinMaxScaler(inputCol="features", outputCol="normFeatures")
rf = RandomForestClassifier(featuresCol='normFeatures', labelCol='label')

stages2 = []
#stages += string_indexer
#stages += one_hot_encoder
#stages2 += [vector_assembler]
stages2 += [minmax]
stages2 += [rf]

from pyspark.ml import Pipeline

pipeline2 = Pipeline().setStages(stages2)

rf_model60 = pipeline2.fit(train)
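# Hedged follow-up sketch (not in the original snippet): evaluate the fitted
# pipeline on the held-out split, using the same "label" column as above.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = rf_model60.transform(test)
evaluator = MulticlassClassificationEvaluator(
    labelCol='label', predictionCol='prediction', metricName='accuracy')
logger.error("#### test accuracy: {}".format(evaluator.evaluate(predictions)))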
## Example No. 5
    # Step - 4: Make vectors from the dataframe's columns using the VectorAssembler
    assembler = VectorAssembler(inputCols=[
        "pclass_imputed", "sibsp_imputed", "parch_imputed",
        "sexIndexed_imputed", "embarkedIndexed_imputed", "age_imputed",
        "fare_imputed"
    ],
                                outputCol="unscaled_features")

    # Step - 5: Define Polynomial Expansion with degree=2
    polyExpansion = PolynomialExpansion(degree=2,
                                        inputCol="unscaled_features",
                                        outputCol="polyFeatures")

    # Step - 6: Define Scaler
    scaler = MinMaxScaler(inputCol="polyFeatures", outputCol="unnorm_features")

    # Step - 7: Define Normalizer
    normalizer = Normalizer(p=1.0,
                            inputCol="unnorm_features",
                            outputCol="features")

    # Step - 8: Set up the Decision Tree Classifier
    trainer = DecisionTreeClassifier(labelCol="survived",
                                     featuresCol="features")

    # Step - 9: Build the Pipeline
    pipeline = Pipeline(stages=[
        sexIndexer,
        embarkedIndexer,
        imputer,
## Example No. 6
def main():
    args = parse_arguments()
    setup_spark()
    df = read_file(args)

    # transform
    assembler = VectorAssembler(inputCols=["hour"], outputCol="hour_vector")
    df = assembler.transform(df)
    indexers = [StringIndexer(inputCol=column, outputCol=column + "_idx").fit(df)
                for column in list(set(df.columns) - set(['id',
                                                          'device_ip',
                                                          'hour',
                                                          'click',
                                                          'hour_vector',
                                                          'device_id',
                                                          'device_model',
                                                          'site_domain',
                                                          'site_id',
                                                          'app_id',
                                                          'c14',
                                                          'app_domain',
                                                          'c17',
                                                          'c20']))] \
               + [MinMaxScaler(inputCol='hour_vector', outputCol='hour_scalar').fit(df)]
    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)

    func = udf(lambda v: float(v[0]), FloatType())

    df = df.withColumn('hour_std', func('hour_scalar'))
    df = df[[w for w in list(df.columns) if 'idx' in w] + ['hour_std', 'click']].cache()

    # to pandas and make config
    config_pd = df.agg(*(countDistinct(col(c)).alias(c) for c in df.columns)).toPandas()
    multi_index = []
    for c in config_pd.columns:
        if '_idx' in c:
            multi_index.append('sparse')
        elif c == 'click':
            multi_index.append('label')
        else:
            multi_index.append('dense')
    config_pd.columns = pd.MultiIndex.from_tuples(zip(multi_index, config_pd.columns))
    s = config_pd.iloc[0]
    dic = {l: s.xs(l).to_dict() for l in s.index.levels[0]}

    if not os.path.exists(args.output_path):
        os.system('mkdir {}'.format(args.output_path))
    with open(os.path.join(args.output_path, 'config.yaml'), 'w', encoding="utf-8") as fw:
        yaml.dump(dic, fw, default_flow_style=False, indent=4)


    # stats count
    total_num = df.count()
    pos_num = df.filter(df.click == 1).count()
    neg_num = df.filter(df.click != 1).count()
    print('#'*20)
    print('raw total_num:{} pos_num:{} neg_num:{}'.format(total_num,
                                                          pos_num,
                                                          neg_num))

    # sample
    pos_df = df[df.click == 1]
    neg_df = df[df.click != 1].sample(False, 0.5, seed=1234)
    df = pos_df.union(neg_df)
    print('union total_num:{} pos_num:{} neg_num:{}'.format(df.count(),
                                                            pos_df.count(),
                                                            neg_df.count()))
    print('#'*20)
    # split dataset
    train_df, val_df = df.randomSplit([0.9, 0.1])
    train_df.repartition(1).write.json(os.path.join(args.output_path, 'train'))
    val_df.repartition(1).write.json(os.path.join(args.output_path, 'val'))
# COMMAND ----------

# Import the RandomForestClassifier algorithm
from pyspark.ml.classification import RandomForestClassifier

# Assemble the following feature columns into one vector for the training data
assembler = VectorAssembler(inputCols=[
    "display_id", "document_id", "platform", "ad_id", "campaign_id",
    "advertiser_id"
],
                            outputCol="normfeatures")
#assembler = VectorAssembler(inputCols = ["clicked"],outputCol="label")

# Normalize the feature data
minMax = MinMaxScaler(inputCol=assembler.getOutputCol(), outputCol="nfeatures")

# Wrap the normalized feature data in a vector column
featVect = VectorAssembler(inputCols=["nfeatures"], outputCol="features")

# Train the classification model with the Random Forest algorithm
dt = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            impurity="gini",
                            featureSubsetStrategy="auto",
                            numTrees=10,
                            maxDepth=30,
                            maxBins=128,
                            seed=1234)

# The following command will create a pipeline with the different stages
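# The pipeline-creation command itself does not appear in this capture; a plausible
# version (an assumption), chaining the stages defined above on a hypothetical
# training DataFrame `train_df`, would be:
#
#   from pyspark.ml import Pipeline
#   pipeline = Pipeline(stages=[assembler, minMax, featVect, dt])
#   model = pipeline.fit(train_df)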
df.cache()

print("Creating Splits")
train, test = df.randomSplit([0.7, 0.3])

print("Selected Features Count: {0}".format(len(feature_cols)))
print("Selected Features: {0}".format(feature_cols))

print("Building Pipeline")
categorical_hasher = FeatureHasher(inputCols=categorical_cols,
                                   outputCol="categorical_features",
                                   categoricalCols=categorical_cols)
continuous_vector = VectorAssembler(inputCols=continuous_cols,
                                    outputCol="continuous_vector")
scaler = MinMaxScaler(min=0.0,
                      max=1.0,
                      inputCol=continuous_vector.getOutputCol(),
                      outputCol="continuous_features")
features = VectorAssembler(inputCols=feature_cols, outputCol="features")
bayes = NaiveBayes(smoothing=1.0,
                   featuresCol="features",
                   labelCol="HasDetections",
                   predictionCol="prediction",
                   modelType="multinomial")
pipeline = Pipeline(
    stages=[categorical_hasher, continuous_vector, scaler, features, bayes])
evaluator = MulticlassClassificationEvaluator(labelCol="HasDetections",
                                              predictionCol="prediction",
                                              metricName="accuracy")

print("Configuring CrossValidation")
params = ParamGridBuilder() \
## Example No. 9
def train_scaler(df, inputCol, outputCol):
    scaler = MinMaxScaler(inputCol=inputCol, outputCol=outputCol)
    return scaler.fit(df)
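# Minimal usage sketch for this helper (illustrative only; the tiny DataFrame and
# column names below are assumptions, not part of the original snippet):
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
demo_df = spark.createDataFrame(
    [(0, Vectors.dense([1.0, 10.0])), (1, Vectors.dense([2.0, 20.0]))],
    ["id", "features"])
demo_scaler_model = train_scaler(demo_df, inputCol="features", outputCol="scaled_features")
demo_scaler_model.transform(demo_df).show()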
## Example No. 10
df_cluster_pop = df_cluster_raw_pop.select(
    ith("features", lit(0)).alias('lon'),
    ith("features", lit(1)).alias('lat'),
    ith("features", lit(2)).alias('pop'))
df_cluster_granny = df_cluster_raw_granny.select(
    ith("features", lit(0)).alias('lon'),
    ith("features", lit(1)).alias('lat'),
    ith("features", lit(2)).alias('granny'))

# Iterating over columns to be scaled
for i in ["pop", "granny"]:
    # VectorAssembler Transformation - Converting column to vector type
    assembler = VectorAssembler(inputCols=[i], outputCol=i + "_Vect")

    # MinMaxScaler Transformation
    scaler = MinMaxScaler(inputCol=i + "_Vect", outputCol=i + "_Scaled")

    # Pipeline of VectorAssembler and MinMaxScaler
    pipeline = Pipeline(stages=[assembler, scaler])

    # Fitting pipeline on dataframe
    df_cluster = pipeline.fit(df_cluster).transform(df_cluster).withColumn(
        i + "_Scaled", unlist(i + "_Scaled")).drop(i + "_Vect")

df_cluster = df_cluster.select(df_cluster.lon, df_cluster.lat,
                               df_cluster.pop_Scaled.alias('pop'),
                               df_cluster.granny_Scaled.alias('granny'))

for row in df_cluster.collect():
    feature = {
        "type": "Feature",
## Example No. 11
    s = x[1] + y[1]
    return (z, s)

spark = SparkSession \
    .builder \
    .appName("KMeans") \
    .config("spark.some.config.option", "Angadpreet-KMeans") \
    .getOrCreate()
today = dt.datetime.today()

# Getting the data structure and scaling
spark_df = sc.parallelize(
    spark.read.json("Data/yelp_academic_dataset_user.json").select(
        "review_count", "average_stars", "yelping_since").rdd.map(lambda x: (x[
            0], x[1], (today - par.parse(x[2])).days)).collect()[:1200])
scaler = MinMaxScaler(inputCol="_1",\
         outputCol="scaled_1")
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(
    lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vec_df = spark.createDataFrame(
    scalerModel.transform(trial_df).select("scaled_1").rdd.map(
        lambda x: (float(x[0][0]), float(x[0][1]), float(x[0][2]))))

# Create a RowMatrix from the transpose of the scaled data
spark_df = spark.createDataFrame(vec_df.toPandas().transpose()).rdd
vector_df = sc.parallelize(spark_df.map(lambda s: Vectors.dense(s)).collect())
mat = RowMatrix(vector_df)
bun = mat.rows.collect()
num_clusters = 4

pre = sc.parallelize(mat.columnSimilarities().entries.map(
## Example No. 12
               row["max_active"], row["std_active"], row["min_idle"],
               row["mean_idle"], row["max_idle"], row["std_idle"],
               row["sflow_fpackets"], row["sflow_fbytes"],
               row["sflow_bpackets"], row["sflow_bbytes"], row["fpsh_cnt"],
               row["bpsh_cnt"], row["furg_cnt"], row["burg_cnt"],
               row["total_fhlen"], row["total_bhlen"], row["dscp"]
           ]))
    return obj


fluxoRDD4 = fluxoDF.rdd.map(transformaVar)

fluxoDF = spSession.createDataFrame(fluxoRDD4, ["rotulo", "atributos"])

scaler = MinMaxScaler(inputCol="atributos",
                      outputCol="scaledFeatures",
                      min=0.0,
                      max=1.0)
scalerModel = scaler.fit(fluxoDF)
scaledData = scalerModel.transform(fluxoDF)

# Creating the model
#rfClassifer = RandomForestClassifier(labelCol = "rotulo", featuresCol = "scaledFeatures", probabilityCol = "probability", numTrees=20)
layers = [38, 5, 4, 2]
mlpClassifer = MultilayerPerceptronClassifier(labelCol="rotulo",
                                              featuresCol="scaledFeatures",
                                              maxIter=100,
                                              layers=layers,
                                              blockSize=128,
                                              seed=1234)
modelo = mlpClassifer.fit(scaledData)
## Example No. 13
                                outputCol="features")

    assembled_train = assembler.transform(train_data)
    assembled_train.select("features", "PSSM_central_1_I").show(truncate=False)
    training_set = assembled_train.select("features", "PSSM_central_1_I")

    # Split the data
    train_final, test_final = training_set.randomSplit([0.80, 0.20], seed=13)
    train_final.describe().show()
    test_final.describe().show()
    train_final = train_final.selectExpr("PSSM_central_1_I as label",
                                         "features as features")
    test_final = test_final.selectExpr("PSSM_central_1_I as label",
                                       "features as features")

    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(train_final)
    scaledTData = scalerModel.transform(train_final)
    scaledTData = scaledTData.select("label", "scaledFeatures")
    scaledTData = scaledTData.selectExpr("label as label",
                                         "scaledFeatures as features")

    scalerModel = scaler.fit(test_final)
    scaledFData = scalerModel.transform(test_final)
    scaledFData = scaledFData.select("label", "scaledFeatures")
    scaledFData = scaledFData.selectExpr("label as label",
                                         "scaledFeatures as features")

    # Classifier 2
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
## Example No. 14
# (u'Cops (1922)', 5.46740481439733)

# We noticed that our top-ranked movies have ratings higher than 5. This makes sense, as there is no ceiling
# implied in our algorithm and one can imagine that certain combinations of factors would combine to create
# “better than anything you’ve seen yet” ratings.
# Nevertheless, we may have to constrain our ratings to a 1-5 range.

### SCALE PREDICTED RATINGS WITHIN DEFINED BOUNDS
new_user_recommendations_formatted_RDD_DF = new_user_recommendations_formatted_RDD.toDF(
    ['movie', "rating"])
to_vector = udf(lambda a: Vectors.dense(a), VectorUDT())
new_user_recommendations_formatted_RDD_DF = new_user_recommendations_formatted_RDD_DF.select(
    "movie",
    to_vector("rating").alias("rating"))
scaler = MinMaxScaler(inputCol="rating",
                      outputCol="scaled_rating",
                      min=1,
                      max=5)
model = scaler.fit(new_user_recommendations_formatted_RDD_DF)
new_user_recommendations_formatted_RDD_DF_scaled = model.transform(
    new_user_recommendations_formatted_RDD_DF)
print("Features scaled to range: [%f, %f]" %
      (scaler.getMin(), scaler.getMax()))
# Features scaled to range: [1.000000, 5.000000]

new_user_recommendations_formatted_RDD_DF_scaled.select(
    "rating", "scaled_rating").show()
# +--------------------+--------------------+
# |              rating|       scaled_rating|
# +--------------------+--------------------+
# |[1.8833597779874536]| [2.810434087306585]|
# |[2.4494414977308594]|[3.0641436235844264]|
## Example No. 15
               "bars_confidence_max", "beats_confidence_max", "bars_start_max",
               "segments_confidence_max", "segments_loudness_max_time_max",
               "tatums_confidence_max",
               "bars_confidence_min",
               "beats_confidence_min", "bars_start_min",
               "segments_confidence_min", "segments_loudness_max_time_min",
               "tatums_confidence_min"]

    assembler = VectorAssembler(inputCols=columns, outputCol="raw_features").setHandleInvalid("skip")

    df_scale = assembler.transform(feature_selector).select('label', 'raw_features')
    # Most classifiers use some form of a distance calculation and each numeric feature tends to have different
    # ranges, some more broad than others. Scaling these features helps ensure that each feature’s contribution is
    # weighted proportionally.
    # https://albertdchiu.medium.com/a-step-by-step-example-in-binary-classification-5dac0f1ba2dd
    scaler = MinMaxScaler(inputCol="raw_features", outputCol="scaled_features")
    scalerModel = scaler.fit(df_scale)
    df_scale = scalerModel.transform(df_scale).select('label', 'scaled_features').persist(
        pyspark.StorageLevel.DISK_ONLY)

    print("\n\nSanity check counter ", df_scale.count())
    total_count = df_scale.count()
    zero_counter = df_scale.filter(col('label') == 0).count()
    ones_counter = df_scale.filter(col('label') == 1).count()
    print("Count 1s :", ones_counter)
    print("Count 0s", zero_counter)
    print("Sanity check sum 1s and 0s", zero_counter + ones_counter)

    # Not the best weight method. The
    # if(zero_counter > ones_counter):
    #     print("More zeros!")
f27 = f26.drop('Cat9_1')
f28 = f27.drop('Cat10_1')
f29 = f28.drop('Cat11_1')
f30 = f29.drop('Cat12_1')
f31 = f30.drop('NVCat_1')
df2 = f31.selectExpr("Claim_Amount as label","feature as features")


assembler1 = VectorAssembler(
    inputCols=["label"],
    outputCol="label1")

output1 = assembler1.transform(df2)
output1 = output1.cache()
f32 = output1.drop("label")
scaler = MinMaxScaler(inputCol="label1", outputCol="label")
scalerModel = scaler.fit(f32)
scaledData = scalerModel.transform(f32)
element = udf(lambda v: float(v[0]), FloatType())
new = scaledData.withColumn('label', element('label'))
(trainingData, testData) = new.randomSplit([0.7, 0.3], 50)
lr = LinearRegression(featuresCol="features", labelCol="label", maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(trainingData)
trainingSummary = lrModel.summary
print("Question 2.2(a).............")
print("RMSE for training data: %f" % trainingSummary.rootMeanSquaredError)
predict = lrModel.transform(testData)
evaluator_rmse = RegressionEvaluator\
      (labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predict)
print("RMSE for test data = %g " % rmse)
## Example No. 17
    def scale_vec_col(self, columns, name_output_col):
        """
        This function groups the specified columns into a single vector column and then
        scales them. The scaling procedure is Spark's default MinMax scaling (see the example
        below).

        +---------+----------+
        |Price    |AreaLiving|
        +---------+----------+
        |1261706.9|16        |
        |1263607.9|16        |
        |1109960.0|19        |
        |978277.0 |19        |
        |885000.0 |19        |
        +---------+----------+

                    |
                    |
                    |
                    V
        +----------------------------------------+
        |['Price', 'AreaLiving']                 |
        +----------------------------------------+
        |[0.1673858972637624,0.5]                |
        |[0.08966137157852398,0.3611111111111111]|
        |[0.11587093205757598,0.3888888888888889]|
        |[0.1139820728616421,0.3888888888888889] |
        |[0.12260126542983639,0.4722222222222222]|
        +----------------------------------------+
        only showing top 5 rows

        """

        # Check that the columns argument is a string or a list:
        self._assert_type_str_or_list(columns, "columns")

        # Check that the columns to be processed are present in the dataframe
        self._assert_cols_in_df(columns_provided=columns,
                                columns_df=self._df.columns)

        # Check that the name_output_col argument is a string:
        self._assert_type_str(name_output_col, "name_output_col")

        # Assembler that packs the chosen columns into a single vector column:
        vec_assembler = VectorAssembler(inputCols=columns,
                                        outputCol="features_assembler")
        # Model for scaling feature column:
        mm_scaler = MinMaxScaler(inputCol="features_assembler",
                                 outputCol=name_output_col)
        # DataFrame with the features_assembler column
        temp_df = vec_assembler.transform(self._df)
        # Fitting scaler model with transformed dataframe
        model = mm_scaler.fit(temp_df)

        exprs = list(filter(lambda x: x not in columns, self._df.columns))

        exprs.extend([name_output_col])

        self._df = model.transform(temp_df).select(*exprs)
        self._add_transformation()  # checkpoint in case

        return self
## Example No. 18
    spark = (SparkSession.builder.appName(
        "test app").enableHiveSupport().getOrCreate())

    start_date = "2004-02-10 03:12:39"
    end_date = '2004-02-20 03:12:39'

    all_data = spark.sql("select * from demo.bearing where idx_date >= '%s' and idx_date < '%s'" \
                         % (start_date, end_date))
    columns = all_data.columns

    # create scaled data
    tmp_data = all_data.rdd.map(lambda x:
                                (x[0], Vectors.dense(x[1:]))).collect()
    scale_df = spark.createDataFrame(tmp_data, ['idx_date', '_features'])

    scaler = MinMaxScaler(inputCol="_features", outputCol="features")
    scalerModel = scaler.fit(scale_df)
    scaledData = scalerModel.transform(scale_df)

    train_data = scaledData.select(
        "idx_date", "features").filter("idx_date <= '2004-02-15 12:52:39'")
    test_data = scaledData.select("idx_date", "features").filter("idx_date >= '%s'" % start_date) \
        .filter("idx_date <= '%s'" % end_date)

    iforest = IForest(contamination=0.1,
                      maxFeatures=1.0,
                      maxSamples=256,
                      bootstrap=True)
    model = iforest.fit(train_data)
    model.hasSummary
    summary = model.summary
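    # Hedged follow-up sketch (column names are an assumption based on the
    # spark-iforest examples, which expose "anomalyScore" and "prediction"):
    # score the held-out window and inspect the rows flagged as anomalies.
    predictions = model.transform(test_data)
    predictions.filter(predictions.prediction == 1.0) \
        .select("idx_date", "anomalyScore", "prediction").show(10)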
df.select([count(when(col(c).isNull(), c)).alias(c)
           for c in df.columns]).show()

# Normalize Data

# Normalize columns
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# MinMaxScaler Transformation
assembler = VectorAssembler(
    inputCols=["ups", "downs", "authorlinkkarma", "authorkarma"],
    outputCol="vector").setParams(handleInvalid="skip")
scaler = MinMaxScaler(min=0.0,
                      max=1.0,
                      inputCol="vector",
                      outputCol="vector_scaled")

pipeline = Pipeline(stages=[assembler, scaler])

scalerModel = pipeline.fit(df)
scaledData = scalerModel.transform(df)

# vector_scaled is our normalized data:
vectorData = scaledData.select("vector", "vector_scaled")

# when using azure databricks, use this call to visualize the data
#display(scaledData.select("vector_scaled"))

# Data Sampling for Experimentation
df = spark.read.parquet(dbfs_mnt_processed + 'redditcomments/')
## Example No. 20
input_labelled_points = input.map(convert_to_labeled_point)
print '**************Converted to labeled point************* \n', input_labelled_points.take(
    5)
'''
Part 3
- Choose two features and generate a heat map for each feature on grey scale and shows variation of each feature across 40 sample instances.
- Normalize features between 0 and 1 with 1 representing darkest shade in heat map.
Hint: https://spark.apache.org/docs/latest/ml-features.html#minmaxscaler
'''

lines = input.map(lambda line: line.split(','))
transformed = lines.map(lambda line: (line[0], Vectors.dense(line[1:])))
labelled_dataframe = sqlContext.createDataFrame(transformed,
                                                ["label", "features"])
scalar = MinMaxScaler(inputCol="features", outputCol="features_scaled")
scalar_mod = scalar.fit(labelled_dataframe.limit(40))
scaled_data = scalar_mod.transform(labelled_dataframe)
print '******Scaled Features******* : \n', scaled_data.show(5, False)

heat1 = np.asarray(
    labelled_dataframe.rdd.map(
        lambda f: (float(f.features[1]), float(f.features[2]))).take(40))
plt.imshow(heat1, cmap='gray')
plt.show()

heat2 = np.asarray(
    scaled_data.rdd.map(lambda f:
                        (float(f.features[1]), float(f.features[2]))).take(40))
plt.imshow(heat2, cmap='gray')
plt.show()
## Example No. 21
File: app.py, Project: mledl/BDMA_HW
# Data preprocessing
df = prepocess_data(df)

# Calculating statistics
global_active_power_stat = get_basic_statistics(df, "Global_active_power")
global_reactive_power_stat = get_basic_statistics(df, "Global_reactive_power")
voltage_stat = get_basic_statistics(df, "Voltage")
global_intensity_stat = get_basic_statistics(df, "Global_intensity")

# Calculating Min-max normalization
assembler = VectorAssembler(inputCols=df.columns[0:], outputCol="features")
df_2 = assembler.transform(df)

scaler = MinMaxScaler(min=0,
                      max=1,
                      inputCol='features',
                      outputCol='features_minmax')
scaler_model = scaler.fit(df_2)
df_3 = scaler_model.transform(df_2)

# Transforming Dense vector to dataframe
min_max_df = df_3.rdd.map(
    lambda x: [float(y) for y in x['features_minmax']]).toDF(df.columns[0:])

# create files and print
print_statistics([
    global_active_power_stat, global_reactive_power_stat, voltage_stat,
    global_intensity_stat
], min_max_df)

# for local testing
## Example No. 22
print('Training dataset size: {}'.format(train_dataCount))
print('Validation dataset size: {}'.format(validationDataCount))
print('Test dataset size: {}'.format(test_dataCount))

print('Training + Validation + Test  = {}'.format(train_dataCount +
                                                  validationDataCount +
                                                  test_dataCount))

####################################################################################
## part 2
print('*' * 100)
print('Part 2 - Train the model and evaluate on validation dataset \n')

# data processing pipeline
assembler = VectorAssembler(inputCols=features, outputCol='unscaledFeatures')
minMaxScaler = MinMaxScaler(inputCol='unscaledFeatures', outputCol='features')
stages = [assembler, minMaxScaler]
pipeline = Pipeline(stages=stages)

procPipeline = pipeline.fit(train_data)
train_data = procPipeline.transform(train_data)
validationData = procPipeline.transform(validationData)
test_data = procPipeline.transform(test_data)

train_data = train_data.select('label', 'features')
validationData = validationData.select('label', 'features')
test_data = test_data.select('label', 'features')

# train model and evaluate on validation data
lr = LinearRegression(maxIter=100)
model = lr.fit(train_data)
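# Hedged follow-up sketch (not in the original capture): score the validation
# split and report RMSE on the 'label'/'features' columns selected above.
from pyspark.ml.evaluation import RegressionEvaluator

valPredictions = model.transform(validationData)
evaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction',
                                metricName='rmse')
print('Validation RMSE: {:.4f}'.format(evaluator.evaluate(valPredictions)))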
## Example No. 23
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("FMRegressorExample") \
        .getOrCreate()

    # $example on$
    # Load and parse the data file, converting it to a DataFrame.
    data = spark.read.format("libsvm").load(
        "data/mllib/sample_libsvm_data.txt")

    # Scale features.
    featureScaler = MinMaxScaler(inputCol="features",
                                 outputCol="scaledFeatures").fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a FM model.
    fm = FMRegressor(featuresCol="scaledFeatures", stepSize=0.001)

    # Create a Pipeline.
    pipeline = Pipeline(stages=[featureScaler, fm])

    # Train model.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)
## Example No. 24
        inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip"
    ).transform(df)
    temp_normalized_vector_col = temp_col_name(assembled)

    trained_parameters = load_trained_parameters(
        trained_parameters, {"input_column": input_column, "min": min, "max": max,}
    )

    scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters(
        trained_parameters, MinMaxScalerModel, "scaler_model"
    )

    if scaler_model is None:
        scaler = MinMaxScaler(
            inputCol=temp_vector_col,
            outputCol=temp_normalized_vector_col,
            min=parse_parameter(float, min, "min", 0.0),
            max=parse_parameter(float, max, "max", 1.0),
        )
        scaler_model = fit_and_save_model(trained_parameters, "scaler_model", scaler, assembled_wo_nans)

    output_df = transform_using_trained_model(scaler_model, assembled, scaler_model_loaded)

    # convert the resulting vector back to numeric
    temp_flattened_vector_col = temp_col_name(output_df)
    output_df = output_df.withColumn(temp_flattened_vector_col, vector_to_array(temp_normalized_vector_col))

    # keep only the final scaled column.
    output_column = input_column if output_column is None or not output_column else output_column
    output_column_value = sf.col(temp_flattened_vector_col)[0].alias(output_column)
    output_df = output_df.withColumn(output_column, output_column_value)
    final_columns = list(dict.fromkeys((list(df.columns) + [output_column])))
## Example No. 25
  .getOrCreate()

df = sql.read \
 .format("csv") \
 .option("sep", ",") \
 .option("inferSchema", "true") \
 .option("header", "true") \
 .load(train_path)

# Datetime

dt_trans = DateColumns(inputCol="click_time")
dt_ass = VectorAssembler(inputCols=dt_trans.getOutputColumns(),
                         outputCol="dt_cols",
                         handleInvalid="skip")
dt_minmax = MinMaxScaler(inputCol="dt_cols", outputCol="dt_scaled")

dt_pipeline = Pipeline(stages=[dt_trans, dt_ass, dt_minmax])

cond_cols = ["cond_app", "cond_device", "cond_os", "cond_channel"]

cond_app = Conditional(inputCol=TARGET,
                       groupByCol=["app"],
                       outputCol="cond_app")
cond_device = Conditional(inputCol=TARGET,
                          groupByCol=["device"],
                          outputCol="cond_device")
cond_os = Conditional(inputCol=TARGET, groupByCol=["os"], outputCol="cond_os")
cond_channel = Conditional(inputCol=TARGET,
                           groupByCol=["channel"],
                           outputCol="cond_channel")
## Example No. 26
from pyspark.ml.feature import MinMaxScaler
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local", "samp")
sqlContext = SQLContext(sc)
data = sqlContext.read.format("libsvm").load(r"D:\Spark\spark-1.6.1-bin-hadoop2.6\data\mllib\sample_libsvm_data.txt")
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(data)
scaledData = scalerModel.transform(data)
scaledData.show()
"""OUTPUT

+-----+--------------------+--------------------+
|label|            features|      scaledFeatures|
+-----+--------------------+--------------------+
|  0.0|(692,[127,128,129...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[158,159,160...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[124,125,126...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[152,153,154...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[151,152,153...|[0.5,0.5,0.5,0.5,...|
|  0.0|(692,[129,130,131...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[158,159,160...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[99,100,101,...|[0.5,0.5,0.5,0.5,...|
|  0.0|(692,[154,155,156...|[0.5,0.5,0.5,0.5,...|
|  0.0|(692,[127,128,129...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[154,155,156...|[0.5,0.5,0.5,0.5,...|
|  0.0|(692,[153,154,155...|[0.5,0.5,0.5,0.5,...|
|  0.0|(692,[151,152,153...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[129,130,131...|[0.5,0.5,0.5,0.5,...|
|  0.0|(692,[154,155,156...|[0.5,0.5,0.5,0.5,...|
#df = scaler.transform(df)

df.select(["id", "ScaledNumFeatures"]).where(df.Id == "512").collect()

# # Question 4
# Using the StandardScaler method (scaling both the mean and the standard deviation), what's the normalized value for question Id = 512?

# In[27]:

scaler2 = StandardScaler(inputCol="TitleAndBodyLengthVector",
                         outputCol="ScaledNumFeatures2",
                         withStd=True)
scalerModel = scaler2.fit(df)
df = scalerModel.transform(df)
df.select(["id", "ScaledNumFeatures2"]).where(df.Id == "512").collect()

# # Question 5
# Using the MinMaxScaler method, what's the normalized value for question Id = 512?

# In[29]:

from pyspark.ml.feature import MinMaxScaler
scaler3 = MinMaxScaler(inputCol="TitleAndBodyLengthVector",
                       outputCol="ScaledNumFeatures3")
scalerModel3 = scaler3.fit(df)
df = scalerModel3.transform(df)

df.select(["id", "ScaledNumFeatures3"]).where(df.Id == "512").collect()

# In[ ]:
## Example No. 28
            #'ARRIVAL_TIME',
            'ARRIVAL_DELAY',
            #'DIVERTED',
            #'CANCELLED',
            #'CANCELLATION_REASON',
            #'AIR_SYSTEM_DELAY',
            #'SECURITY_DELAY',
            #'AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY'
  ],
    outputCol="features")




# Normalize the features
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")

# Apply PCA to reduce dimensionality
pca = PCA(k=7, inputCol="scaled_features", outputCol="pcaFeatures")

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 
from pyspark.ml.evaluation import RegressionEvaluator 
from pyspark.ml.regression import LinearRegression,DecisionTreeRegressor,GeneralizedLinearRegression

# Build a helper that generates a dictionary of models and a dictionary of ParamGridBuilders,
# respectively, so that they can be iterated over with their corresponding parameters.
def define_hyper_params():

    # Create a dictionary of the models
    modelo = {'dt': DecisionTreeRegressor(featuresCol="pcaFeatures",labelCol='DEPARTURE_DELAY'),
## Example No. 29
# def quintile_agg(df_in,gr,colm):
#     qua=df_in.groupBy(gr).agg(*[mean(F.col(i)) for i in colm]).sort(F.col(gr))
#     return qua
# quintile_grouped = quintile_agg(df_input,grp,num_cols)
# quintile_grouped.show(5)
# # quintile_grouped.toPandas().to_csv('quintile_grouped.csv',index=False)#output_dir+'quintile_grouped.csv')

## prepare the data as dense vectors
from pyspark.ml.linalg import Vectors
def transData(data):
    return data.rdd.map(lambda r: [r[0],Vectors.dense(r[1:])]).toDF(['CustomerID','rfm']) #Return a new RDD by applying a function to each element of this RDD.
transformed=transData(rfm)
transformed.show(5)
## normalization
from pyspark.ml.feature import MinMaxScaler
scaler = MinMaxScaler(inputCol="rfm",outputCol="features")
scalerModel = scaler.fit(transformed)
scaledData = scalerModel.transform(transformed)
scaledData.show(5,False) # results will not be truncated
scalerModel.save('filepath/scaling')
###ML
## find optimal parameter
from pyspark.ml.clustering import KMeans
cost = np.zeros(10)
for k in range(2,10):
    kmeans = KMeans().setK(k)\
        .setSeed(1) \
        .setFeaturesCol("features")\
        .setPredictionCol("cluster")
    model = kmeans.fit(scaledData)
    cost[k] = model.computeCost(scaledData)
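# Follow-up sketch (assumes matplotlib is available): plot the elbow curve from
# the costs computed above to help choose the number of clusters.
import matplotlib.pyplot as plt

plt.plot(range(2, 10), cost[2:10], marker='o')
plt.xlabel('k')
plt.ylabel('within-set sum of squared errors')
plt.show()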
## Example No. 30
    def preprocess_data(self):
        rawDataDF = GetData().get_input_data()

        assembler = VectorAssembler(inputCols=["age"], outputCol="features")
        outputDF = assembler.transform(rawDataDF)

        outputDF = outputDF.drop('age')

        scaler = MinMaxScaler(inputCol="features", outputCol="scaled_age")
        scalerModel = scaler.fit(outputDF.select("features"))
        scaledDF = scalerModel.transform(outputDF)
        scaledDF = scaledDF.drop('features')

        udf1 = udf(lambda x: float(x[0]), FloatType())
        scaledDF = scaledDF.withColumn("scaled_age", udf1(col('scaled_age')))

        indexer = StringIndexer(inputCol="sex", outputCol="indexed_sex")
        indexedDF = indexer.fit(scaledDF).transform(scaledDF)
        indexedDF = indexedDF.drop('sex')

        indexer = StringIndexer(inputCol="address",
                                outputCol="indexed_address")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('address')

        indexer = StringIndexer(inputCol="Pstatus",
                                outputCol="indexed_Pstatus")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('Pstatus')

        indexer = StringIndexer(inputCol="famsize",
                                outputCol="indexed_famsize")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('famsize')

        indexer = StringIndexer(inputCol="guardian",
                                outputCol="indexed_guardian")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('guardian')

        indexer = StringIndexer(inputCol="schoolsup",
                                outputCol="indexed_schoolsup")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('schoolsup')

        indexer = StringIndexer(inputCol="famsup", outputCol="indexed_famsup")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('famsup')

        indexer = StringIndexer(inputCol="romantic",
                                outputCol="indexed_romantic")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('romantic')

        indexer = StringIndexer(inputCol="internet",
                                outputCol="indexed_internet")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('internet')

        indexer = StringIndexer(inputCol="higher", outputCol="indexed_higher")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('higher')

        indexer = StringIndexer(inputCol="nursery",
                                outputCol="indexed_nursery")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('nursery')

        indexer = StringIndexer(inputCol="activities",
                                outputCol="indexed_activities")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('activities')

        indexer = StringIndexer(inputCol="Mjob", outputCol="indexed_Mjob")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('Mjob')

        indexer = StringIndexer(inputCol="Fjob", outputCol="indexed_Fjob")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('Fjob')

        indexer = StringIndexer(inputCol="paid", outputCol="indexed_paid")
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop('paid')
        indexedDF = indexedDF.drop("school", 'reason')

        return indexedDF
## Example No. 31
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import MinMaxScaler
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="MinMaxScalerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    dataFrame = sqlContext.read.format("libsvm").load(
        "data/mllib/sample_libsvm_data.txt")

    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MinMaxScalerModel
    scalerModel = scaler.fit(dataFrame)

    # rescale each feature to range [min, max].
    scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    # $example off$

    sc.stop()
## Example No. 32
    def __init__(self, inputCol, outputCol, s_min=0, s_max=0):
        self.mmModel = MinMaxScaler(inputCol=inputCol, outputCol=outputCol)
        self.mmModel.setMin(s_min)
        self.mmModel.setMax(s_max)
        self.in_column = inputCol
## Example No. 34
    # Rename the target column and group the feature columns into a single vector column.
    train_set = train_set.withColumnRenamed("_c631", "label")
    assembler = VectorAssembler(
        inputCols=['_c186', '_c245', '_c459', '_c221', '_c490', '_c429'],
        outputCol='features')
    train_set = assembler.transform(train_set)

    # The same for the test set
    test_set = test_set.withColumnRenamed("_c631", "label")
    assembler = VectorAssembler(
        inputCols=['_c186', '_c245', '_c459', '_c221', '_c490', '_c429'],
        outputCol='features')
    test_set = assembler.transform(test_set)

    # Add a column with the features scaled between 0 and 1
    scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
    scalerModelTrain = scaler.fit(train_set)
    train_set = scalerModelTrain.transform(train_set)
    scalerModelTest = scaler.fit(test_set)
    test_set = scalerModelTrain.transform(test_set)

    ###### Training the models
    ### Logistic regression 1
    # Training
    lr = LogisticRegression(maxIter=10,
                            regParam=0.3,
                            elasticNetParam=0.8,
                            family="binomial")
    lrModel_1 = lr.fit(train_set)

    # ROC curve
## Example No. 35
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("MinMaxScalerExample")\
        .getOrCreate()

    # $example on$
    dataFrame = spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -1.0]),),
        (1, Vectors.dense([2.0, 1.1, 1.0]),),
        (2, Vectors.dense([3.0, 10.1, 3.0]),)
    ], ["id", "features"])

    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MinMaxScalerModel
    scalerModel = scaler.fit(dataFrame)

    # rescale each feature to range [min, max].
    scaledData = scalerModel.transform(dataFrame)
    print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
    scaledData.select("features", "scaledFeatures").show()
    # $example off$

    spark.stop()
bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler
maScaler = MaxAbsScaler().setInputCol("features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct