def process(spark, train_data, test_data):
    df_train = spark.read.parquet(train_data)
    df_test = spark.read.parquet(test_data)

    features = VectorAssembler(inputCols=df_train.columns[1:-1],
                               outputCol='features')
    evaluator = RegressionEvaluator(labelCol='ctr',
                                    predictionCol='prediction',
                                    metricName='rmse')
    lr_model_base = LinearRegression(labelCol='ctr', **LR_PARAMS_BASE)
    lr_model_to_tune = LinearRegression(labelCol='ctr')

    lr_param_grid = ParamGridBuilder() \
        .addGrid(lr_model_to_tune.maxIter, [5, 10, 20, 40, 50]) \
        .addGrid(lr_model_to_tune.regParam, [0.4, 0.1, 0.01, 0.001]) \
        .addGrid(lr_model_to_tune.fitIntercept, [False, True]) \
        .addGrid(lr_model_to_tune.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) \
        .build()

    tvs = TrainValidationSplit(estimator=lr_model_to_tune,
                               estimatorParamMaps=lr_param_grid,
                               evaluator=evaluator,
                               trainRatio=0.8)

    pipeline_model_base = Pipeline(
        stages=[features, lr_model_base]).fit(df_train)
    prediction_base = pipeline_model_base.transform(df_test)
    rmse_base = evaluator.evaluate(prediction_base)
    print(f'Base lr model params: {LR_PARAMS_BASE}')
    print(f'RMSE at base lr model = {rmse_base}')

    print('Tuning lr model...')
    pipeline_model_tuned = Pipeline(stages=[features, tvs]).fit(df_train)
    prediction_tuned = pipeline_model_tuned.transform(df_test)
    rmse_tuned = evaluator.evaluate(prediction_tuned)

    model_java_obj = pipeline_model_tuned.stages[-1].bestModel._java_obj
    lr_params_tuned = {
        'maxIter': model_java_obj.getMaxIter(),
        'regParam': model_java_obj.getRegParam(),
        'elasticNetParam': model_java_obj.getElasticNetParam(),
        'fitIntercept': model_java_obj.getFitIntercept()
    }

    print(f'Tuned lr model params: {lr_params_tuned}')
    print(f'RMSE at tuned lr model = {rmse_tuned}')

    if rmse_tuned < rmse_base:
        pipeline_model_tuned.write().overwrite().save(MODEL_PATH)
        print(f'Tuned model has better RMSE value')
    else:
        pipeline_model_base.write().overwrite().save(MODEL_PATH)
        print(f'Base model has better RMSE value')
    print(f'Model saved at "{MODEL_PATH}"')

    spark.stop()
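This example assumes imports and module-level constants (LR_PARAMS_BASE, MODEL_PATH) that are not shown; a minimal sketch of what they might look like, with hypothetical values:

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Hypothetical values -- the real constants live outside this snippet
LR_PARAMS_BASE = {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.0}
MODEL_PATH = 'models/ctr_lr_pipeline'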
Example #2
def transform_data_in_pipeline(df):
    """

    :param df:
    :return:
    """

    # Initialise pipeline variables
    stages = []
    assembler_inputs = []

    # Assemble features vector from Spark dataframe fields
    assembler = VectorAssembler(
        inputCols=['x', 'y', 'star_rating_number', 'avg_adr'],
        outputCol='features')
    stages += [assembler]
    assembler_inputs += [assembler.getOutputCol()]

    # Apply standard scaling to unit standard deviation (note: StandardScaler
    # does not centre about the mean unless withMean=True)
    scaler = StandardScaler(inputCol=assembler.getOutputCol(),
                            outputCol='scaledFeatures')
    stages += [scaler]
    assembler_inputs += [scaler.getOutputCol()]

    # Execute the pipeline
    pipeline_model = Pipeline() \
        .setStages(stages) \
        .fit(df)

    # Return the dataframe with the additional transformed features vector
    return pipeline_model.transform(df)
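A minimal usage sketch, assuming an active SparkSession and hypothetical row values:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample_df = spark.createDataFrame(
    [(1.0, 2.0, 4, 120.5), (0.5, 3.5, 5, 210.0)],
    ['x', 'y', 'star_rating_number', 'avg_adr'])
transformed_df = transform_data_in_pipeline(sample_df)
transformed_df.select('features', 'scaledFeatures').show(truncate=False)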
def date_conversion():
    df = sql.read.csv("./run/date_test_res.csv", inferSchema=True, header=True)
    datetime_formatting = DatetimeFormatting()
    model = Pipeline(stages=[datetime_formatting]).fit(df)
    res = model.transform(df)
    print("resulted_df")
    print(res.show())
def remove_skewness():
    df = sql.read.csv("./run/file1.csv", inferSchema=True, header=True)

    min_skewness = MinimizeSkewness(['Purpose'])
    model = Pipeline(stages=[min_skewness]).fit(df)
    res = model.transform(df)
    res.show()
Example #5
def process_df(df):
    time_seq.append(['start process-df', time.time()])
    model = Pipeline(stages=[
        RegexTokenizer(pattern=" ",
                       inputCol="instruments",
                       outputCol="instruments_tokenized",
                       minTokenLength=1),
        NGram(n=1,
              inputCol="instruments_tokenized",
              outputCol="instruments_ngrams"),
        HashingTF(inputCol="instruments_ngrams",
                  outputCol="instruments_vectors"),
        MinHashLSH(inputCol="instruments_vectors",
                   outputCol="instruments_lsh",
                   numHashTables=10)
    ]).fit(df)

    df_hashed = model.transform(df)
    df_matches = model.stages[-1].approxSimilarityJoin(df_hashed, df_hashed, 0.5, distCol="distance") \
        .filter("datasetA.filename != datasetB.filename AND datasetA.filename < datasetB.filename") \
        .select(f.col('datasetA.filename').alias('filename_A'),
                f.col('datasetB.filename').alias('filename_B'),
                f.col('distance'))
    time_seq.append(['process-df df_matches', time.time()])
    write_df_to_pgsql(df_matches, 'filepair_similarity_run3')
    time_seq.append(['write pgsql', time.time()])
    print('time_seq', time_seq)
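write_df_to_pgsql is a project-specific helper that is not shown; a hedged sketch of what it might do with Spark's JDBC writer (connection settings are placeholders):

def write_df_to_pgsql(df, table_name):
    # Placeholder connection settings -- replace with the real PostgreSQL instance
    df.write \
        .format('jdbc') \
        .option('url', 'jdbc:postgresql://localhost:5432/mydb') \
        .option('dbtable', table_name) \
        .option('user', 'spark') \
        .option('password', 'secret') \
        .mode('overwrite') \
        .save()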
Example #6
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    # The fitted LogisticRegressionModel is the third pipeline stage
    lrModel = model.stages[2]
    print(lrModel)  # summary only
Example #7
def train_als(ratings_data, split_prop, max_iter, reg_param, rank, cold_start_strategy):
    seed = 42

    spark = pyspark.sql.SparkSession.builder.getOrCreate()

    ratings_df = spark.read.parquet(ratings_data)
    (training_df, test_df) = ratings_df.randomSplit([split_prop, 1 - split_prop], seed=seed)
    training_df.cache()
    test_df.cache()

    mlflow.log_metric("training_nrows", training_df.count())
    mlflow.log_metric("test_nrows", test_df.count())

    print("Training: {0}, test: {1}".format(training_df.count(), test_df.count()))

    als = (
        ALS()
        .setUserCol("userId")
        .setItemCol("movieId")
        .setRatingCol("rating")
        .setPredictionCol("predictions")
        .setMaxIter(max_iter)
        .setSeed(seed)
        .setRegParam(reg_param)
        .setColdStartStrategy(cold_start_strategy)
        .setRank(rank)
    )

    als_model = Pipeline(stages=[als]).fit(training_df)

    reg_eval = RegressionEvaluator(predictionCol="predictions", labelCol="rating", metricName="mse")

    predicted_test_df = als_model.transform(test_df)

    test_mse = reg_eval.evaluate(predicted_test_df)
    train_mse = reg_eval.evaluate(als_model.transform(training_df))

    print("The model had a MSE on the test set of {0}".format(test_mse))
    print("The model had a MSE on the (train) set of {0}".format(train_mse))
    mlflow.log_metric("test_mse", test_mse)
    mlflow.log_metric("train_mse", train_mse)
    mlflow.spark.log_model(als_model, "als-model")
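A hedged usage sketch wrapping the call in an MLflow run (the parquet path and hyperparameter values are placeholders):

import mlflow

with mlflow.start_run():
    train_als(
        ratings_data='data/ratings.parquet',  # placeholder path
        split_prop=0.8,
        max_iter=10,
        reg_param=0.1,
        rank=12,
        cold_start_strategy='drop')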
def remove_url_duplication():
    df = sql.read.csv("./run/date_test_res.csv", inferSchema=True, header=True)

    url_duplication = RemovingDuplicationUrl()
    model = Pipeline(stages=[url_duplication]).fit(df)
    result = model.transform(df)

    result.toPandas().to_csv('./run/pipeline_url.csv')

    print("resulted_df")
    print(result.show())
Example #9
def main():
    input_dataset = sys.argv[1]
    output_dir = sys.argv[2]

    start_time = time.time()

    #stackoverflow_df = sqlContext.read.csv("../Datasource/stackOverFlow_ID_Title_SMALL.csv", header=True).toDF('id', 'text')

    stackoverflow_df = sqlContext.read.csv(input_dataset,
                                           header=True).toDF('id', 'text')

    # stackoverflow_df.show()

    # stackoverflow_df.head(10).show()

    # stack_df = stack_rdd.toDF(['id','text'])

    # stackoverflow_df.show()

    # stackoverflow_df.printSchema()

    model = Pipeline(stages=[
        RegexTokenizer(
            pattern="", inputCol="text", outputCol="tokens", minTokenLength=1),
        NGram(n=3, inputCol="tokens", outputCol="ngrams"),
        HashingTF(inputCol="ngrams", outputCol="vectors"),
        MinHashLSH(
            inputCol="vectors", outputCol="lsh"
        )  #MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=5)
    ]).fit(stackoverflow_df)

    db_hashed = model.transform(stackoverflow_df)

    # db_hashed.show()
    # query_hashed = model.transform(query)

    # db_hashed.show()
    # query_hashed.show()

    #res = model.stages[-1].approxSimilarityJoin(db_hashed, db_hashed, 0.90).filter("datasetA.id < datasetB.id")

    res = model.stages[-1].approxSimilarityJoin(db_hashed, db_hashed,
                                                0.70).filter("distCol > 0")

    #print res

    #print res.count()

    res.show()

    elapsed_time = time.time() - start_time

    print('Elapsed Time ==> ', elapsed_time)
Example #10
    def model(self, pandas, column):
        rdd = self.sqlctx.createDataFrame(pandas.astype(str))
        model = Pipeline(stages=[
            Tokenizer(inputCol=column, outputCol="tokens"),
            StopWordsRemover(inputCol='tokens',
                             outputCol="tokens_stop",
                             stopWords=self.STOP_WORDS),
            HashingTF(inputCol="tokens_stop", outputCol="vectors")
        ]).fit(rdd)

        db_1 = model.transform(rdd)
        db_1.cache()
        return db_1
Example #11
    def test_serialize_to_bundle(self):
        string_map = StringMap(
            labels={'a': 1.0},
            inputCol='key_col',
            outputCol='value_col',
        )
        pipeline = Pipeline(stages=[string_map]).fit(self.input)
        serialization_dataset = pipeline.transform(self.input)

        jar_file_path = _serialize_to_file(pipeline, serialization_dataset)
        deserialized_pipeline = _deserialize_from_file(jar_file_path)

        result = deserialized_pipeline.transform(self.input)
        expected = StringMapTest.spark.createDataFrame([['a', 'b', 1.0]], OUTPUT_SCHEMA)
        assert_df(expected, result)
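assert_df is a test helper defined elsewhere; a hedged sketch of the kind of comparison it might perform:

def assert_df(expected, actual):
    # Compare schemas and (order-insensitive) row contents of two small DataFrames
    assert expected.schema == actual.schema
    assert sorted(expected.collect()) == sorted(actual.collect())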
def columns_same_value():
    try:
        df = sql.read.csv("./run/rem_test.csv", inferSchema=True, header=True)

        columns_with_same_val = ColumnsDroppingSameValue()
        model = Pipeline(stages=[columns_with_same_val]).fit(df)
        result = model.transform(df)

        result.toPandas().to_csv('./run/pipeline_same_value.csv')
        df.show()
        print("#####################")
        print("resulted_df")
        result.show()
    except Exception as e:
        logger.error(e)
Example #13
def Indexer(spark,
            train_address,
            val_address,
            tst_address,
            repartition_size=10000):

    df_train = spark.read.parquet(train_address)
    df_val = spark.read.parquet(val_address)
    df_test = spark.read.parquet(tst_address)

    # user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_numeric").fit(df_train)
    # track_indexer = StringIndexer(inputCol="track_id", outputCol="track_id_numeric").fit(df_train.union(df_val))

    # df_train = user_indexer.transform(df_train)
    # df_train = track_indexer.transform(df_train)
    # df_val = user_indexer.transform(df_val)
    # df_val = track_indexer.transform(df_val)
    # df_train = df_train.select("user_id_numeric","track_id_numeric","count")
    # df_val = df_val.select("user_id_numeric","track_id_numeric","count")
    # df_test = user_indexer.transform(df_test)
    # df_test = track_indexer.transform(df_test)

    user_indexer = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_numeric")
    track_indexer = StringIndexer(inputCol="track_id",
                                  outputCol="track_id_numeric")

    # fit df_train only
    model = Pipeline(stages=[user_indexer, track_indexer]).fit(df_train)
    df_train, df_val, df_test = [
        model.transform(x) for x in (df_train, df_val, df_test)
    ]

    df_train = df_train.select("user_id_numeric", "track_id_numeric", "count")
    df_val = df_val.select("user_id_numeric", "track_id_numeric", "count")
    df_test = df_test.select("user_id_numeric", "track_id_numeric", "count")

    # df_train = df_train.repartition(repartition_size,"user_id_numeric","track_id_numeric")
    # df_val = df_val.repartition(repartition_size,"user_id_numeric","track_id_numeric")
    # df_test = df_test.repartition(repartition_size,"user_id_numeric","track_id_numeric")

    # df_train.write.parquet("./train_formatted.parquet", mode='overwrite')
    # df_val.write.parquet("./val_formatted.parquet", mode='overwrite')
    # df_test.write.parquet("./test_formatted.parquet", mode='overwrite')

    print('Indexer succeeded.')
    return df_train, df_val, df_test
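Because the indexers are fit on df_train only, user or track ids that appear only in the validation or test sets will fail at transform time by default; a hedged variant that skips such rows instead:

user_indexer = StringIndexer(inputCol="user_id",
                             outputCol="user_id_numeric",
                             handleInvalid="skip")
track_indexer = StringIndexer(inputCol="track_id",
                              outputCol="track_id_numeric",
                              handleInvalid="skip")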
Example #14
def main():
    potential_clones = sys.argv[1]
    outDir = sys.argv[2]

    start_time = time.time()

    potential_clones = '../Datasource/pc.xml'
    output_csv = 'csvCodes.csv'
    df = convertAndSaveAsCSV(potential_clones, output_csv, True)

    # spark context
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)
    spark_df = sqlContext.createDataFrame(df)

    transformed_spark_df = spark_df.rdd.map(distributedSourceTransform)

    pysparkdf_transformedClones = transformed_spark_df.toDF(
        ['filepath', 'startline', 'endline', 'source'])

    #pysparkdf_transformedClones.show()

    model = Pipeline(stages=[
        RegexTokenizer(pattern=" ",
                       inputCol="source",
                       outputCol="tokens",
                       minTokenLength=1),
        NGram(n=3, inputCol="tokens", outputCol="ngrams"),
        HashingTF(inputCol="ngrams", outputCol="vectors", numFeatures=262144),
        MinHashLSH(
            inputCol="vectors", outputCol="lsh", numHashTables=105
        )  #MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=5)
    ]).fit(pysparkdf_transformedClones)

    hashed_clones = model.transform(pysparkdf_transformedClones)

    clone_pairs = model.stages[-1].approxSimilarityJoin(
        hashed_clones, hashed_clones, 0.70).filter("distCol > 0")

    clone_pairs.show()

    elapsed_time = time.time() - start_time

    print('Elapsed Time ==> ', elapsed_time)
Example #15
def main(argv):
    spark = SparkSession.builder \
        .master("local[*]") \
        .config("spark.driver.memory", "4g") \
        .config("spark.executor.memory", "1g") \
        .getOrCreate()

    features_df = ParquetDataFrame(
        f'data/processed/{Phase.train.name}/features', spark)
    test_data_frac = 0.1
    test_features_df, train_features_df = features_df.randomSplit(
        [test_data_frac, 1 - test_data_frac])
    label_col = 'duration_min'
    model = Pipeline(stages=[
        StringIndexer(inputCol='pickup_cell_6',
                      handleInvalid='keep',
                      outputCol='pickup_cell_6_idx'),
        StringIndexer(inputCol='dropoff_cell_6',
                      handleInvalid='keep',
                      outputCol='dropoff_cell_6_idx'),
        VectorAssembler(inputCols=[
            'pickup_cell_6_idx', 'dropoff_cell_6_idx', 'distance', 'month',
            'day_of_month', 'day_of_week', 'hour', 'requests_pickup_cell',
            'requests_dropoff_cell'
        ],
                        outputCol="features"),
        DecisionTreeRegressor(
            maxDepth=7, featuresCol='features', labelCol=label_col)
    ]).fit(train_features_df)

    model_path = 'model/trip_duration_min'
    print(f'Saving model to {model_path}')
    model.write().overwrite().save(model_path)
    print(f'Model saved...')

    model = PipelineModel.load(model_path)
    predictions_df = model.transform(test_features_df)
    mae_cv = RegressionEvaluator(labelCol=label_col,
                                 metricName='mae').evaluate(predictions_df)
    print(f'Mean absolute error: {mae_cv}')

    spark.stop()
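ParquetDataFrame and Phase are project-specific helpers that are not shown; hedged, minimal stand-ins might look like:

from enum import Enum

class Phase(Enum):
    train = 'train'
    test = 'test'

def ParquetDataFrame(path, spark):
    # Thin wrapper that reads a parquet dataset into a DataFrame
    return spark.read.parquet(path)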
def remove_cols_containing_nan():
    try:

        logger.debug("this is debug")
        df = sql.read.csv("./run/column_rem.csv",
                          inferSchema=True,
                          header=True)

        col_contains_nan = ColumnsDroppingContainsNan()
        model = Pipeline(stages=[col_contains_nan]).fit(df)
        result = model.transform(df)

        result.toPandas().to_csv('./run/pipeline_nan_value.csv')
        df.show()
        print("#####################")
        print("resulted_df")
        result.show()

    except Exception as e:
        logger.error(e)
Example #17
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(maxIter=10,
                             regParam=0.3,
                             elasticNetParam=0.8,
                             family="multinomial",
                             labelCol="indexedLabel",
                             featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[vector, labelIndex, clf]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    # The fitted LogisticRegressionModel is the third pipeline stage
    lrModel = model.stages[2]
    print(lrModel)  # summary only
sw_filter = StopWordsRemover()\
  .setStopWords(stop_words)\
  .setCaseSensitive(False)\
  .setInputCol("words")\
  .setOutputCol("filtered")

from pyspark.ml.feature import CountVectorizer

# remove words that appear in fewer than 5 documents (minDF=5)
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17)\
  .setInputCol("filtered")\
  .setOutputCol("tf")

# we now create a pipelined transformer
cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(review)
cv_pipeline.transform(review).show(5)

from pyspark.ml.feature import IDF
idf = IDF().\
    setInputCol('tf').\
    setOutputCol('tfidf')

idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(review)

tfidf_df = idf_pipeline.transform(review)

tfidf_df.show(10)
#training_df, validation_df, testing_df = review.randomSplit([0.6, 0.3, 0.1], seed=0)
#[training_df.count(), validation_df.count(), testing_df.count()]
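This snippet assumes a tokenizer stage that produces the 'words' column consumed by sw_filter, plus a review DataFrame and a stop_words list; a hedged sketch of those inputs (the review text column name is an assumption):

from pyspark.ml.feature import Tokenizer, StopWordsRemover

tokenizer = Tokenizer(inputCol='review_text', outputCol='words')
stop_words = StopWordsRemover.loadDefaultStopWords('english')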
Example #19
# A linear regression object
regression = LinearRegression(labelCol='duration')

--------------------------------------------------
# Exercise_2 
# Import class for creating a pipeline
from pyspark.ml import Pipeline

# Construct a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Train the pipeline on the training data
pipeline = pipeline.fit(flights_train)

# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

--------------------------------------------------
# Exercise_3 
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")
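A hedged continuation of Exercise 3, assembling these stages with a logistic regression classifier (the training DataFrame name and label column are assumptions):

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

logistic = LogisticRegression(labelCol='label', featuresCol='features')
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])
pipeline_model = pipeline.fit(sms_train)  # hypothetical training DataFrame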
Example #20
def Indexer(spark, train_address, val_address, tst_address):

    beg = time()
    df_train = spark.read.parquet(train_address)
    df_val = spark.read.parquet(val_address)
    df_test = spark.read.parquet(tst_address)

    print('File Reading Finished')

    # subsample
    # here we include the last 110K users (for validation/testing)
    # want to include all 110K users at the end and randomly draw 10% others
    subsample_frac = 0.1
    all_user_ids = [
        row['user_id']
        for row in df_train.select('user_id').distinct().collect()
    ]
    val_user_ids = [
        row['user_id']
        for row in df_val.select('user_id').distinct().collect()
    ]
    test_user_ids = [
        row['user_id']
        for row in df_test.select('user_id').distinct().collect()
    ]
    train_user_ids = list(
        set(all_user_ids) - set(val_user_ids) - set(test_user_ids))
    selected_train_ids = sample(train_user_ids,
                                round(len(train_user_ids) * 0.2))

    # >>> len(all_user_ids)
    # 1129318
    # >>> len(val_user_ids)
    # 10000
    # >>> len(test_user_ids)
    # 100000
    # >>> len(train_user_ids)
    # 1019318

    df_train = df_train.where(
        df_train.user_id.isin(selected_train_ids + val_user_ids +
                              test_user_ids))

    print('Sampling Finished')

    user_indexer = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_numeric")
    track_indexer = StringIndexer(inputCol="track_id",
                                  outputCol="track_id_numeric")

    model = Pipeline(stages=[user_indexer, track_indexer]).fit(
        df_train.union(df_val).union(df_test))
    df_train, df_val, df_test = [
        model.transform(x) for x in (df_train, df_val, df_test)
    ]

    df_train = df_train.select("user_id_numeric", "track_id_numeric", "count")
    df_val = df_val.select("user_id_numeric", "track_id_numeric", "count")
    df_test = df_test.select("user_id_numeric", "track_id_numeric", "count")

    print('Formatting Finished')

    df_train.write.parquet("./train_formatted.parquet", mode='overwrite')
    df_val.write.parquet("./val_formatted.parquet", mode='overwrite')
    df_test.write.parquet("./test_formatted.parquet", mode='overwrite')
    end = time()
    print('Indexer and Subsampler succeeded. Took %f s' % (end - beg))
    return
Example #21
assembler = VectorAssembler(inputCols=['weight_kg', 'cyl', 'type_dummy'], outputCol='features')

# Split the data into training and testing sets
kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23)

# Create a linear regression model (fitted as part of the pipeline below)
regression = LinearRegression(labelCol='consumption')

# Combine steps into a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# run fit on training data
pipeline = pipeline.fit(kars_train)

# Make predictions on the testing data
prediction = pipeline.transform(kars_test)

# Compare predictions against the known consumption values
prediction.groupBy("consumption", 'prediction').count().show(8)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(pipeline.stages[REGRESSION_STAGE].coefficients))
print("Intercept: %s" % str(pipeline.stages[REGRESSION_STAGE].intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = pipeline.stages[REGRESSION_STAGE].summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

spark.stop()
Example #22
mlSourceDF = featureeddf
mlSourceDF.printSchema()
mlSourceDF = mlSourceDF.fillna(0, subset=[x for x in mlSourceDF.columns if 'Lag' in x])
# after creating all lag features, drop rows with NA in the key columns
# (avoids errors in StringIndexer)
mlSourceDF = mlSourceDF.na.drop(subset=["ServerIP", "SessionStartHourTime"])
# indexing
columnsForIndex = ['dayofweek', 'ServerIP', 'year', 'month', 'weekofyear', 'dayofmonth', 'hourofday', 
                     'Holiday', 'BusinessHour', 'Morning']

mlSourceDF = mlSourceDF.fillna(0, subset=columnsForIndex)

sIndexers = [StringIndexer(inputCol=x, outputCol=x + '_indexed').setHandleInvalid("skip") for x in columnsForIndex]
indexModel = Pipeline(stages=sIndexers).fit(mlSourceDF)
mlSourceDF = indexModel.transform(mlSourceDF)
# save model for operationalization
indexModel.write().overwrite().save(stringIndexModelFile)

# encoding for categorical features
catVarNames = [x + '_indexed' for x in columnsForIndex]

columnOnlyIndexed = [catVarNames[i] for i in range(len(catVarNames)) if len(indexModel.stages[i].labels) < 2]
columnForEncode = [catVarNames[i] for i in range(len(catVarNames)) if len(indexModel.stages[i].labels) >= 2]

info['columnOnlyIndexed'] = columnOnlyIndexed
info['columnForEncode'] = columnForEncode

# save info to blob storage
write_blob(info, infoFile, storageContainer, storageAccount, storageKey)
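The snippet stops after saving the index model; a hedged sketch of the encoding step its comment anticipates, assuming the Spark 2.x OneHotEncoderEstimator API:

from pyspark.ml.feature import OneHotEncoderEstimator

encoder = OneHotEncoderEstimator(
    inputCols=columnForEncode,
    outputCols=[c + '_encoded' for c in columnForEncode],
    dropLast=True)
mlSourceDF = encoder.fit(mlSourceDF).transform(mlSourceDF)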
                                  outputCol="features")

############ Classifiers
rfC = RandomForestClassifier(labelCol="Survived",
                             featuresCol="features",
                             numTrees=300,
                             maxDepth=5)
gbtC = GBTClassifier(labelCol="Survived", featuresCol="features", maxIter=50)

pipeline = Pipeline().setStages([
    sex_stringIndexer, age_discretizer, fare_discretizer,
    embarked_stringIndexer, embarked_encoder, assembler, rfC
]).fit(train_df)

##### Applying pipeline
train_piped = pipeline.transform(train_df)
test_piped = pipeline.transform(test_df)

############################################### Feature importances
print("\n----------- Feature importances")
rfCmodel = pipeline.stages[6]
for feature_name, feature_importance in sorted(zip(
        features_column, rfCmodel.featureImportances),
                                               key=lambda x: -x[1]):
    print("%20s: %s" % (feature_name, feature_importance))

############################################## Exporting
df_predictions = test_piped.select("prediction").toPandas().reset_index()
df_predictions['index'] = df_predictions['index'] + 892
df_predictions.columns = ['PassengerId', 'Survived']
Example #24
tree = DecisionTreeClassifier(labelCol='Survived')
rf = RandomForestClassifier(labelCol='Survived')

# 4. Create pipeline
from pyspark.ml import Pipeline
#pipeline = Pipeline(stages=[indexer, onehot, assembler, tree])
pipeline = Pipeline(stages=[
    title_extractor, indexer1, indexer2, indexer3, onehot, assembler, rf
])

# 5. Fit the model
pipeline = pipeline.fit(passengers_train)

# 6. Make predictions
#from pyspark.ml.evaluation import BinaryClassificationEvaluator
prediction = pipeline.transform(passengers_train)
prediction.show(5)
prediction.select('Survived', 'prediction', 'probability').show(5, False)

# Create a confusion matrix
prediction.groupBy('Survived', 'prediction').count().show()

TP = prediction.filter('Survived == 1 AND prediction == 1').count()
TN = prediction.filter('Survived == 0 AND prediction == 0').count()
FP = prediction.filter('Survived == 0 AND prediction == 1').count()
FN = prediction.filter('Survived == 1 AND prediction == 0').count()

# Compute accuracy
accuracy = (TP + TN) / (TP + TN + FP + FN)
print('Accuracy is %f' % accuracy)
Example #25
df.show()
df.cache()

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, NGram, HashingTF, MinHashLSH
import pyspark.sql.functions as f

model = Pipeline(stages=[
    RegexTokenizer(
        pattern="", inputCol="title", outputCol="tokens", minTokenLength=1),
    NGram(n=3, inputCol="tokens", outputCol="ngrams"),
    HashingTF(inputCol="ngrams", outputCol="vectors"),
    MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=10)
]).fit(df)

df_hashed = model.transform(df)

df_matches = model.stages[-1].approxSimilarityJoin(df_hashed, df_hashed, 0.9)

#show all matches (including duplicates)
df_matches.select(
    f.col('datasetA.id').alias('id_A'),
    f.col('datasetB.id').alias('id_B'), f.col('distCol')).show()

#show non-duplicate matches
df_matches.select(
    f.col('datasetA.id').alias('id_A'),
    f.col('datasetB.id').alias('id_B'),
    f.col('distCol')).filter('id_A < id_B').show()
rf_pipeline = Pipeline(stages=[va, rf]).fit(training_df)


# In[26]:

from pyspark.ml.evaluation import BinaryClassificationEvaluator


# In[27]:

bce = BinaryClassificationEvaluator()


# In[28]:

bce.evaluate(lr_pipeline.transform(validation_df))


# In[29]:

bce.evaluate(rf_pipeline.transform(validation_df))


# In[30]:

lr_model = lr_pipeline.stages[-1]


# In[31]:

pd.DataFrame(list(zip(airlineCleanDF.columns[12:19], lr_model.coefficients.toArray())),
             columns=['feature', 'coefficient'])
Example #27
    def train(
        self,
        df: DataFrame,
        params_map: Optional[Dict[str, List[Any]]] = None,
        num_folds: Optional[int] = 10,
        collect_sub_models: Optional[bool] = False,
        return_cv: Optional[bool] = False
    ) -> Union[PipelineModel, Tuple[PipelineModel, CrossValidatorModel]]:
        """
        Train model.

        Params
        ------
        df: Spark DataFrame
            Input train data

        params_map: Optional[Dict[str, List[Any]]] (default=None)
            Parameters mapping to grid search over

        num_folds: Optional[int] (default=10)
            Number of cross-validation folds

        collect_sub_models: Optional[bool] (default=False)
            Collect models per fold per parameter
            combination

        return_cv: Optional[bool] (default=False)
            Additionally return the CrossValidatorModel
            object or not

        Returns
        -------
            self: PipelineModel
                The (best) model trained on df.
            cv_model: Optional[CrossValidatorModel]
                The CrossValidatorModel object.
        """
        # get input features
        binary, numeric, categorical = self._get_features(df)

        # convert categorical to numeric labels
        indexed_cols = [f'{c}_idx' for c in categorical]
        indexers = [
            StringIndexer(inputCol=col, outputCol=idx)
            for col, idx in zip(categorical, indexed_cols)
        ]
        self.features = binary + numeric + indexed_cols
        self.logger.info(f'Final model features list: {self.features}')

        # assemble features into feature vector
        assembler = VectorAssembler(inputCols=self.features,
                                    outputCol=self.estimator.getFeaturesCol())
        p = Pipeline(stages=indexers + [assembler]).fit(df)
        self.logger.info('Index and vector assemble features')
        df = p.transform(df)\
            .select(self.estimator.getFeaturesCol(), self.estimator.getLabelCol())

        # if provided, set estimator params map
        if params_map:
            self.params_map = params_map

        # run cross-validation and choose the best set of parameters
        self.logger.info('Start Cross Validation')
        cv_params = {
            'estimator': self.estimator,
            'estimatorParamMaps': self.__params_grid,
            'evaluator': self.evaluator,
            'numFolds': num_folds,
            'collectSubModels': collect_sub_models
        }
        cv_model = CrossValidator(**cv_params).fit(df)

        # set the best model
        p.stages.append(cv_model.bestModel)
        self.best_model = p
        self.logger.info(
            f'Set the best model with best params: {self.best_params}')

        if return_cv:
            return self.best_model, cv_model
        else:
            return self.best_model
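The __params_grid attribute is built elsewhere in the class; a hedged sketch of how a params_map of parameter names to value lists might be turned into an estimator param grid:

from pyspark.ml.tuning import ParamGridBuilder

def build_params_grid(estimator, params_map):
    # Resolve each string name (e.g. 'regParam') to the estimator's Param object
    builder = ParamGridBuilder()
    for name, values in params_map.items():
        builder = builder.addGrid(estimator.getParam(name), values)
    return builder.build()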
    # define numerical assembler first for scaling
    numericalAssembler = VectorAssembler(inputCols=numericalColumnsImputed,
                                         outputCol='numerical_cols_imputed')
    stages += [numericalAssembler]

    # define the standard scaler stage for the numerical columns
    scaler = StandardScaler(inputCol='numerical_cols_imputed',
                            outputCol="numerical_cols_imputed_scaled")
    stages += [scaler]  # already a list so no need for brackets

    # Perform assembly stage to bring together features
    assemblerInputs = [c + "classVec" for c in categoricalColumns
                       ] + ["numerical_cols_imputed_scaled"]
    # features contains everything, one hot encoded and numerical
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol="features")
    stages += [assembler]

    # define the model stage at the end of the pipeline
    lr = LogisticRegression(labelCol="label",
                            featuresCol="features",
                            maxIter=10)
    stages += [lr]

    # Random train test split with seed
    (trainingData, testData) = data.randomSplit([0.7, 0.3], seed=100)

    # Define the entire pipeline and fit on the train data and transform on the test data
    clfPipeline = Pipeline().setStages(stages).fit(trainingData)
    clfPipeline.transform(testData)
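The earlier stages of this pipeline (imputation and the categorical encoding that produces the 'classVec' columns) are not shown; a hedged sketch of what they might look like, with hypothetical column names:

from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoderEstimator

# Hypothetical column lists -- defined earlier in the original script
categoricalColumns = ['Sex', 'Embarked']
numericalColumns = ['Age', 'Fare']
numericalColumnsImputed = [c + '_imputed' for c in numericalColumns]

stages = []
for c in categoricalColumns:
    indexer = StringIndexer(inputCol=c, outputCol=c + 'Index', handleInvalid='keep')
    encoder = OneHotEncoderEstimator(inputCols=[c + 'Index'], outputCols=[c + 'classVec'])
    stages += [indexer, encoder]

imputer = Imputer(inputCols=numericalColumns, outputCols=numericalColumnsImputed)
stages += [imputer]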
Example #29
    "select * from cours_spark.meteoMensuelle order by 1").cache()

modelA = VectorAssembler().\
                setInputCols(
                       ['Janvier', 'Fevrier', 'Mars',
                        'Avril', 'Mai', 'Juin', 'Juillet',
                        'Aout', 'Septembre', 'Octobre',
                        'Novembre', 'Decembre']).\
                 setOutputCol('variables')

modelN = StandardScaler().\
                setInputCol("variables").\
                setOutputCol("vNormalisees").\
                setWithStd(True).\
                setWithMean(False)

modelACP = PCA().\
            setInputCol("vNormalisees").\
            setOutputCol("vACP").\
            setK(2)

modelKM = KMeans().setK(7).\
                   setFeaturesCol("vACP").\
                   setPredictionCol("vKM")

modelPipe = Pipeline(stages=[modelA, modelN, modelACP, modelKM]).fit(donnees)

donneesKM = modelPipe.transform(donnees)

donneesKM.select("Ville", "vKM").show(5)
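A hedged follow-up that scores the clustering with Spark's ClusteringEvaluator on the PCA features and KMeans assignments produced above:

from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(featuresCol="vACP", predictionCol="vKM",
                                metricName="silhouette")
print("Silhouette:", evaluator.evaluate(donneesKM))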
Example #30
class Pipeline:

    def __init__(self, spark, train_date, score_date, train_pct, validate_pct=0.):
        assert (0. <= train_pct <= 1.)
        assert (0. <= validate_pct <= 1.)
        assert (train_pct + validate_pct <= 1.)

        self.spark = spark
        self.train_date = train_date
        self.score_date = score_date
        self.train_pct = train_pct
        self.validate_pct = validate_pct

    def _load_feature_df(self, feature_date, training):
        phase = 'training' if training else 'scoring'
        print('[ ' + str(datetime.utcnow()) + ' ] : Loading ' + phase + ' feature data')

        table_suffix = 't365d' if training else 'scoring'
        features_df = self.spark.sql('select * from grp_gdoop_clv_db.keep_cdf_final_features_' + table_suffix) \
            .filter(F.col('record_date') == feature_date) \
            .drop('record_date', 'zip_code_cat_x')

        if training:
            target_df = self.spark.sql('select * from grp_gdoop_clv_db.ce_keep_deact_target') \
                .filter(F.col('record_date') == feature_date) \
                .select('consumer_id', 'deactivated')

            final = features_df \
                .join(target_df, features_df.consumer_id == target_df.consumer_id, how='left') \
                .drop(target_df.consumer_id)

            return final

        else:
            return features_df

    def _train_validate_split(self, features_df):
        print('[ ' + str(datetime.utcnow()) + ' ] : Splitting training data into model training and validation data')
        splits = features_df.randomSplit([self.train_pct, self.validate_pct, 1 - self.train_pct - self.validate_pct])
        return splits[0], splits[1]

    def _make_feature_list(self, all_cols, cat_cols, indexers):
        features = list(filter(lambda x: x.endswith('_x') and not x.endswith('_cat_x'), all_cols))
        for i, col in enumerate(cat_cols):
            for label in indexers[i].labels:
                features.append(col + '_' + re.sub(r'\W+', '_', str(label).strip()))
        self.feature_list = features

    def _one_hot_encode_pl(self, train_raw):
        print('[ ' + str(datetime.utcnow()) + ' ] : Creating feature engineering pipeline')
        all_cols = train_raw.columns
        cat_cols = list(filter(lambda x: x.endswith('_cat_x'), all_cols))

        indexers = [StringIndexer(inputCol=c, outputCol=c.replace('_cat_x', '_index'),
                                  handleInvalid='keep') for c in cat_cols]
        one_hots = [OneHotEncoderEstimator(inputCols=[c.replace('_cat_x', '_index')],
                                           outputCols=[c.replace('_cat_x', '_vec_x')], handleInvalid='keep',
                                           dropLast=False) for c in cat_cols]

        self.one_hot_plm = MLPipeline(stages=indexers + one_hots).fit(train_raw)
        self._make_feature_list(all_cols, cat_cols, self.one_hot_plm.stages[:len(cat_cols)])

    def _assemble_features(self, raw_df, data_type):
        print('[ ' + str(datetime.utcnow()) + ' ] : Feature engineering ' + data_type + ' data')
        cat_cols = list(filter(lambda x: x.endswith('_cat_x'), raw_df.columns))
        df = self.one_hot_plm.transform(raw_df) \
            .drop(*cat_cols)

        features = list(filter(lambda x: x.endswith('_x'), df.columns))
        assembler = VectorAssembler(inputCols=features, outputCol='features', handleInvalid='keep')
        return assembler.transform(df)

    def _training_data(self, validate_model):
        train_features_df = self._load_feature_df(self.train_date, True)
        train_raw, validate_raw = self._train_validate_split(train_features_df)
        train_raw.cache()

        # Create one-hot encoding pipeline that will be applied to all DFs
        self._one_hot_encode_pl(train_raw)

        train_df = self._assemble_features(train_raw, 'model training').cache()
        train_raw.unpersist()

        if validate_model:
            validate_raw.cache()
            validate_df = self._assemble_features(validate_raw, 'model validation').cache()
            validate_raw.unpersist()
        else:
            validate_df = None

        return train_df, validate_df

    def _scoring_data(self):
        score_features_df = self._load_feature_df(self.score_date, False).cache()
        score_df = self._assemble_features(score_features_df, 'scoring').cache()
        score_features_df.unpersist()
        return score_df

    def run(self, validate_model, score_active_users):
        print('\nDATA PIPELINE\n')
        train_df, validate_df = self._training_data(validate_model)
        if score_active_users:
            score_df = self._scoring_data()
        else:
            score_df = None
        return {'training': train_df, 'validation': validate_df, 'scoring': score_df, 'features': self.feature_list}

    def __repr__(self):
        return '<Pipeline(train_date={0}, score_date={1})>'.format(self.train_date, self.score_date)

    def __str__(self):
        return '<Pipeline(train_date={0}, score_date={1})>'.format(self.train_date, self.score_date)
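A hedged usage sketch of this class (dates and split fractions are placeholders):

from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
pipeline = Pipeline(spark, train_date='2019-01-01', score_date='2019-06-01',
                    train_pct=0.7, validate_pct=0.1)
datasets = pipeline.run(validate_model=True, score_active_users=True)
train_df = datasets['training']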
onehot = onehot.fit(cars_train)
cars_train = onehot.transform(cars_train)
cars_train = assemble.transform(cars_train)
# Fit model to training data
regression = regression.fit(cars_train)
# Testing data
cars_test = indexer.transform(cars_test)
cars_test = onehot.transform(cars_test)
cars_test = assemble.transform(cars_test)
# Make predictions on testing data
predictions = regression.transform(cars_test)
# Cars model: Pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[indexer, onehot, assemble, regression])
pipeline = pipeline.fit(cars_train)
predictions = pipeline.transform(cars_test)
# Cars model: stages
# The LinearRegression object (fourth stage -> index 3)
print(pipeline.stages[3].intercept)
print(pipeline.stages[3].coefficients)

# Convert categorical strings to index values
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode index values
onehot = OneHotEncoderEstimator(
    inputCols=['org_idx', 'dow'],
    outputCols=['org_dummy', 'dow_dummy']
)

# Assemble predictors into a single column