Example No. 1
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
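    # The test above relies on a _compare_pipelines helper that is not shown
    # here; a minimal sketch of such a helper (an assumption about its body,
    # checking only UIDs and stage counts) might look like this.
    def _compare_pipelines(self, expected, actual):
        self.assertEqual(expected.uid, actual.uid)
        expected_stages = (expected.getStages()
                           if hasattr(expected, "getStages") else expected.stages)
        actual_stages = (actual.getStages()
                         if hasattr(actual, "getStages") else actual.stages)
        self.assertEqual(len(expected_stages), len(actual_stages))
        for e_stage, a_stage in zip(expected_stages, actual_stages):
            self.assertEqual(e_stage.uid, a_stage.uid)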
Example No. 2
    def process(time, rdd):
        print("========= %s =========" % str(time))

        try:
            # Get the singleton instance of SparkSession
            spark = getSparkSessionInstance(rdd.context.getConf())

            # Convert RDD[String] to RDD[Row] to DataFrame
            rowRdd = rdd.map(lambda w: Row(title=w[1]))
            wordsDataFrame = spark.createDataFrame(rowRdd)


            
            # Load the saved pipeline model and score the micro-batch
            model = PipelineModel.load('kmeans')
            prediction = model.transform(wordsDataFrame).select("6_kmeans")
            prediction.show(5)
        except Exception:
            # Skip empty or malformed batches instead of crashing the stream
            pass
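# A minimal sketch (not part of the original snippet) of how process() is
# typically attached to a DStream; the source path, app name, and 10-second
# batch interval below are assumptions.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="title-clustering")
ssc = StreamingContext(sc, 10)
lines = ssc.textFileStream("./incoming")
lines.foreachRDD(process)  # process(time, rdd) runs once per micro-batch
ssc.start()
ssc.awaitTermination()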
Example No. 3
    def test_pipeline_persistence(self):
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            pl = Pipeline(stages=[tf, pca])
            model = pl.fit(df)
            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self.assertEqual(loaded_pipeline.uid, pl.uid)
            self.assertEqual(len(loaded_pipeline.getStages()), 2)

            [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
            self.assertIsInstance(loaded_tf, HashingTF)
            self.assertEqual(loaded_tf.uid, tf.uid)
            param = loaded_tf.getParam("numFeatures")
            self.assertEqual(loaded_tf.getOrDefault(param), tf.getOrDefault(param))

            self.assertIsInstance(loaded_pca, PCA)
            self.assertEqual(loaded_pca.uid, pca.uid)
            self.assertEqual(loaded_pca.getK(), pca.getK())

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            [model_tf, model_pca] = model.stages
            [loaded_model_tf, loaded_model_pca] = loaded_model.stages
            self.assertEqual(model_tf.uid, loaded_model_tf.uid)
            self.assertEqual(model_tf.getOrDefault(param), loaded_model_tf.getOrDefault(param))

            self.assertEqual(model_pca.uid, loaded_model_pca.uid)
            self.assertEqual(model_pca.pc, loaded_model_pca.pc)
            self.assertEqual(model_pca.explainedVariance, loaded_model_pca.explainedVariance)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example No. 4
def main(spark):
    '''

    Parameters
    ----------
    spark : SparkSession object
    '''

    # File names
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'

    # Reading the parquet files
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')

    w = Window.partitionBy("user_id")

    def z_score(c, w):
        return (col(c) - mean(c).over(w)) / stddev(c).over(w)

    test_z = test.select("user_id", "track_id", "count",
                         z_score("count", w).alias("count2"))
    test_z.createOrReplaceTempView('test_z')
    test = spark.sql(
        'SELECT user_id, track_id, COALESCE(count2,count) AS count FROM test_z'
    )
    test.createOrReplaceTempView('test')
    print('Test Z created')

    # Creating the train sample
    # All validation and test users from train, and 10% of the rest of the train

    train_sample = spark.read.parquet(
        'hdfs:/user/dev241/extension3_zscores.parquet')
    print("Training sample loaded")

    StringIndexer = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = StringIndexer.transform(test)
    train_idx = StringIndexer.transform(train_sample)

    #change to best
    rank = 78
    alpha = 14.287069059772636
    reg = 0.41772043857578584

    #model
    als = ALS(rank=rank,
              alpha=alpha,
              regParam=reg,
              userCol="user_idx",
              itemCol="track_idx",
              ratingCol="count",
              coldStartStrategy="drop",
              implicitPrefs=True)
    model = als.fit(train_idx)
    print("Model fit for Ext3 done")
    model.save("Extension3(z_score)")
    print("Model save for Ext3 done")

    #test ranking metrics
    test_idx = test_idx.select('user_idx', 'track_idx', 'count')
    test_users = test_idx.select('user_idx').distinct()
    test_comb = test_idx.groupBy('user_idx').agg(
        F.collect_set('track_idx').alias('test_labels'))
    track_number = 500
    rec_test = model.recommendForUserSubset(test_users, track_number)
    join = test_comb.join(rec_test, test_comb.user_idx == rec_test.user_idx)
    predictionAndLabels = join.rdd.map(lambda r: (
        [track.track_idx for track in r.recommendations], r.test_labels))
    metrics = RankingMetrics(predictionAndLabels)
    mavgp = metrics.meanAveragePrecision
    print("Ext 3 Test mean Average Precision : ", mavgp)
    pass
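# A small, self-contained illustration (not from the original script) of what
# the z_score window expression computes, shown on a toy DataFrame; the demo
# session and values here are assumptions.
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, mean, stddev

demo_spark = SparkSession.builder.getOrCreate()
demo = demo_spark.createDataFrame(
    [(1, 10), (1, 20), (1, 30), (2, 5), (2, 7)], ["user_id", "count"])
w = Window.partitionBy("user_id")
demo.select("user_id", "count",
            ((col("count") - mean("count").over(w)) /
             stddev("count").over(w)).alias("count_z")).show()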
Example No. 5
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml import PipelineModel

app = Flask(__name__)




# get spark session
spark = SparkSession.builder \
    .master("local") \
    .appName("Sparkify") \
    .getOrCreate()

# load model
model = PipelineModel.load('../model/classifier')


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    
    # extract data needed for visuals
    # get spark context
    sc = SparkContext.getOrCreate()
    
    # create spark dataframe to predict customer churn using the model
    #[gender, level, days_active, location, avgSongs, avgEvents, thumbsup, thumbsdown, add_friend]
    gender = ''
    level = 0 
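    # Hypothetical sketch only (the route above is truncated): assemble a
    # single-row DataFrame matching the model's input columns and score it.
    # The feature names come from the comment above; the values are made up.
    sample = spark.createDataFrame(
        [(gender, level, 30, 'CA', 25.0, 120.0, 10, 2, 3)],
        ["gender", "level", "days_active", "location", "avgSongs",
         "avgEvents", "thumbsup", "thumbsdown", "add_friend"])
    pred = model.transform(sample).select("prediction").first()["prediction"]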
## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores the pipeline
## to a local file in the current directory. Where HDFS & Hadoop are available, it stores the pipeline in the HDFS home
## directory of the current user. Absolute paths can be used as well. The same holds for the model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")




##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model, hamThreshold = 0.5):
    smsTextDF = spark.createDataFrame([(smsText,)], ["text"]) # create one element tuple
    prediction = model.transform(smsTextDF)
    return prediction.select("prediction_output.p1").first()["p1"] > hamThreshold


print(isSpam("Michal, h2oworld party tonight in MV?", loaded_model))
Example No. 7
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# Boilerplate Spark stuff:
spark = SparkSession\
    .builder\
    .appName("diabetes_patient_readmission_onlinescore")\
    .getOrCreate()
sc = spark.sparkContext



# Load the patient discharge file, apply the trained model, and predict readmissions
test = spark.table("diabetic_data_original_parquet")
modelPath = "hdfs:///tmp/diabetes/diabetic_data_model/"


from pyspark.ml import PipelineModel
sameModel = PipelineModel.load(modelPath)
predictions = sameModel.transform(test)
predictions.select('encounter_id','patient_nbr','prediction').filter('prediction = 1').show(50)
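# A quick follow-up one might add (not in the original snippet): count how
# many encounters the model flags as readmissions.
n_readmits = predictions.filter('prediction = 1').count()
print("Predicted readmissions: %d" % n_readmits)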
"""
验证模型
"""

start=time.time()
# 再次对测试集的数据进行词转向量的转化
test_set = model.transform(test_set)
# 再次将多列数据转化为单列的向量列(决策树可以识别的类型)
# test_set = assembler.transform(test_set)
# 再次使用cv_pipelineModel进行验证,把在pipeline中的所有transform都执行一遍(???)
bestDt = DecisionTreeClassifier(labelCol="label", featuresCol="features", impurity="entropy", maxDepth=bestModel.depth, maxBins=32)
dt_pipeline = Pipeline(stages=[assembler, bestDt])  # (???)
dt_model = dt_pipeline.fit(manbing)
dt_model.write().overwrite().save("./models/courage_dtmodel/")
sameDTModel = PipelineModel.load("./models/courage_dtmodel/")

predictions = sameDTModel.transform(test_set)  # (???)
# Evaluate the predictions with the evaluator to obtain the AUC
auc = evaluator.evaluate(predictions)  # (???)
print("auc="+str(auc))
acc = predictions.filter(predictions['label'] == predictions['prediction']).count() / float(predictions.count())
print("acc="+str(acc))
end = time.time()
print("预测用时:{}".format(end-start))

"""
auc=0.9834579598810581
acc=0.9598010774968918
"""
Example No. 9
from pyspark.ml import PipelineModel

newsModel = PipelineModel.load(
    "hdfs://localhost:19000/user/Waseem/bestPipeline")
sportsModel = PipelineModel.load(
    "hdfs://localhost:19000/user/Waseem/sportsPipeline")
model = [newsModel, sportsModel]
Example No. 10
sentenceDataFrame = spark.createDataFrame([(0, 1, 2), (0, 1, 2), (1, 1, 2)],
                                          ["label", "a", 'b'])
from pyspark.sql import functions

df = sentenceDataFrame.withColumn('c', functions.lit(np.nan))
df.show()

#############
# Test the pipeline
#############
from pyspark.ml import Pipeline, PipelineModel, Transformer

blankTransformer = BlankTransformer(inputCols=["a", "b", "c"],
                                    outputCols=["a_1", "b_1", "c_1"])

p = Pipeline(stages=[blankTransformer])
# df = spark.sparkContext.parallelize([(1, None), (2, 1.0), (3, 0.5)]).toDF(["key", "value"])
pm = p.fit(df)
pm.transform(df).show()

###########################
# Test saving the PipelineModel and loading it back
############################
pm.write().overwrite().save('./test/test.model')
pm2 = PipelineModel.load('./test/test.model')
print('matches?',
      pm2.stages[0].extractParamMap() == pm.stages[0].extractParamMap())
print(pm2.stages[0].extractParamMap())
pm2.transform(df).show()
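# BlankTransformer is not defined in the snippet above; a minimal sketch of a
# custom pass-through Transformer with that interface (names and behavior are
# assumptions) could look like this.
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCols, HasOutputCols
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
import pyspark.sql.functions as F


class BlankTransformer(Transformer, HasInputCols, HasOutputCols,
                       DefaultParamsReadable, DefaultParamsWritable):
    """Copies each input column to the matching output column unchanged."""

    @keyword_only
    def __init__(self, inputCols=None, outputCols=None):
        super(BlankTransformer, self).__init__()
        kwargs = self._input_kwargs
        self._set(**kwargs)

    def _transform(self, df):
        for in_col, out_col in zip(self.getInputCols(), self.getOutputCols()):
            df = df.withColumn(out_col, F.col(in_col))
        return df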
def transform(df, model_path, prediction_column):
	pipeline_model = PipelineModel.load(model_path)
	predictions = pipeline_model.transform(df)
	predictions = predictions.drop(*["features", "rawPrediction", "probability", "categorical_features", "continuous_features", "continuous_vector"])
	predictions = predictions.withColumnRenamed("prediction", prediction_column)
	return predictions
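# Hypothetical usage of the helper above; the model path and column name are
# assumptions, and `df` is any DataFrame with the columns the pipeline expects.
scored = transform(df, "hdfs:///models/churn_pipeline", "churn_prediction")
scored.show(5)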
from pyspark.ml import PipelineModel
from pyspark.sql.types import ArrayType, FloatType
import pyspark.sql.functions as F

pipelinePath = "./LDA-pipeline-model"

pipeline_model = PipelineModel.load(pipelinePath)

# 5. check the topic distribution among dataset
df_with_topics = pipeline_model.transform(df).select("tweet_text",
                                                     "topicDistribution")
to_array = F.udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
df_with_topics_toArray = df_with_topics.select(
    "tweet_text",
    to_array("topicDistribution").alias("topicDistributionArray"))

df_with_topics_final = df_with_topics_toArray.select(
    "tweet_text" +
    [(F.col("topicDistributionArray")[i]).alias("topic_" + str(i))
     for i in range(10)])

# Sum each topic column (10 topics, matching the range used above) and display
df_with_topics_final.agg(
    *[F.sum(F.col("topic_" + str(i))) for i in range(10)]).show()
Example No. 13
# Keep the version portion of lmp_version

dataset = dataset.withColumn('lmp_version_split',
                             F.split(F.col('lmp_version'), "-").getItem(0))

# Split nap, uap and lap out of the address column

dataset = dataset.withColumn('nap', dataset.address.substr(1, 5))
dataset = dataset.withColumn('uap', dataset.uap_lap.substr(1, 2))
dataset = dataset.withColumn('lap', dataset.uap_lap.substr(4, 11))

# StringIndexer

string_indexer_model_path = "{}/data/stringIndexerModel.bin".format(base_path)
string_indexer = PipelineModel.load(string_indexer_model_path)
dataset = string_indexer.transform(dataset)

# MinMaxScaler

minMaxScaler_model_path = "{}/data/minMaxScalerModel.bin".format(base_path)
minMaxScaler = PipelineModel.load(minMaxScaler_model_path)
dataset = minMaxScaler.transform(dataset)

# OneHotEncoding

ohe_model_path = "{}/data/oheModel.bin".format(base_path)
ohe = PipelineModel.load(ohe_model_path)
dataset = ohe.transform(dataset)

# VectorAssembler
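# The snippet is cut off at the VectorAssembler step; a typical continuation
# might look like the sketch below, where the input column names are pure
# assumptions (the real ones depend on the earlier pipeline stages).
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["lmp_version_split_index", "nap_index", "uap_index", "lap_index"],
    outputCol="features")
dataset = assembler.transform(dataset)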
Example No. 14
import subprocess
from pyspark.mllib.evaluation import MulticlassMetrics
from elasticsearch import Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
indexName="twitternb"
indexName2="twitterlr"
typeName1="NaiveB"
typeName2="logisticR"
# import pyspark.sql.Row
# import pyspark.implicits._
sc = SparkContext()
sqlContext = SQLContext(sc)
consumer = KafkaConsumer('twitter',
                         group_id='my-group',
                         bootstrap_servers=['localhost:9092'])
nbModel = PipelineModel.load("APJ180001_nb.model")
lrModel = PipelineModel.load("APJ180001_lr.model")

evaluator = MulticlassClassificationEvaluator()
count = 0
sum = 0
avg = 0
sum2 = 0
count2 = 0
labels = {}
index = 0

accidentalTweetsNB = open("accidentalTweetsNB.txt", 'a+')
accidentalTweetsLR = open("accidentalTweetsLR.txt", 'a+')

for message in consumer:
Example No. 15
def construct_component_from_pipe_identifier(
        language,
        nlp_ref,
        nlu_ref,
        path=None,
        is_licensed=False):  # -> NLUPipeline
    """
    creates a list of components from a Spark NLP Pipeline reference
    1. download pipeline
    2. unpack pipeline to annotators and create list of nlu components
    3. return list of nlu components
    :param is_licensed: Whether the pipe is licensed or not
    :param nlu_ref: Nlu ref that points to this pipe
    :param language: language of the pipeline
    :param nlp_ref: Reference to a spark nlp pretrained pipeline
    :param path: Load component_list from HDD
    :return: Each element of the Spark NLP pipeline wrapped as a NLU component inside a list
    """
    if 'language' in nlp_ref:
        # special edge case for lang detectors
        language = 'xx'
    if path is None:
        if is_licensed:
            pipe = PretrainedPipeline(nlp_ref,
                                      lang=language,
                                      remote_loc='clinical/models')
        else:
            pipe = PretrainedPipeline(nlp_ref, lang=language)
        iterable_stages = pipe.light_model.pipeline_model.stages
    else:
        pipe = LightPipeline(PipelineModel.load(path=path))
        iterable_stages = pipe.pipeline_model.stages
    constructed_components = []
    os_annos = AnnoClassRef.get_os_pyclass_2_anno_id_dict()
    hc_annos = AnnoClassRef.get_hc_pyclass_2_anno_id_dict()
    ocr_annos = AnnoClassRef.get_ocr_pyclass_2_anno_id_dict()
    for jsl_anno_object in iterable_stages:
        anno_class_name = type(jsl_anno_object).__name__
        logger.info(
            f"Extracting model from Spark NLP pipeline: obj= {jsl_anno_object} class_name = {anno_class_name} and creating Component"
        )
        if anno_class_name in os_annos.keys():
            jsl_anno_id = os_annos[anno_class_name]
            nlu_component = ComponentMap.os_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref,
                                       language, True, Licenses.open_source)
            constructed_components.append(nlu_component)
        elif anno_class_name in hc_annos.keys():
            # Licensed HC
            jsl_anno_id = hc_annos[anno_class_name]
            nlu_component = ComponentMap.hc_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref,
                                       language, True, Licenses.hc)
            constructed_components.append(nlu_component)
        elif anno_class_name in ocr_annos:
            # Licensed OCR (WIP)
            jsl_anno_id = ocr_annos[anno_class_name]
            nlu_component = ComponentMap.ocr_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref,
                                       language, True, Licenses.ocr)
            constructed_components.append(nlu_component)
        else:
            raise ValueError(
                f'Could not find matching nlu component for annotator class = {anno_class_name}'
            )
        if None in constructed_components or len(constructed_components) == 0:
            raise Exception(
                f"Failure inferring type anno_class={anno_class_name} ")
    return ComponentUtils.set_storage_ref_attribute_of_embedding_converters(
        PipeUtils.set_column_values_on_components_from_pretrained_pipe(
            constructed_components, nlp_ref, language, path))
Example No. 16
run_mode = 'batch'  # 'batch' or 'streaming'
outputs = 'outputs.json'
#################################################

if len(sys.argv) != 2:
    print("Usage: <models folder> ")
    exit(1)

models_folder = sys.argv[1]

spark = SparkSession.builder.master('local[{}]'.format(spark_threads)).appName(
    'local-testing-pyspark-context').getOrCreate()

# Load the model
print("Loading model...")
model = PipelineModel.load(models_folder + '/' + ml_model_name)
print("Done.")

#with open(models_folder + '/' + input_dtypes, 'r') as f:
#    types = {x:np.dtype(y) for x, y in json.load(f).items()}


def read_data():
    print("Reading data")
    df = pd.read_json(stage2_outputs)
    if sample_size:
        df = df.sample(sample_size)
    print("Size of test data is : ", len(df))
    return df

Example No. 17
             header=True,
             maxFilesPerTrigger=1)\
    .load()

# if the numeric LotFrontage field contains a null, the whole row comes through as null,
# so we declare a string type in the schema, then cast to Float and fill the null values
# visually there is no Id with a null value, so where they appear from later is unclear
test = test_start\
    .withColumn("LotFrontage", F.expr("CAST(LotFrontage as FLOAT)"))\
    .na.fill({"LotFrontage": 60.0, "Id": 0})

out = console_output(test.select("Id", "LotFrontage", "LotArea"), 100)
out.stop()

# the model takes quite a while to load
pipeline_model = PipelineModel.load("my_GB_model8_ob")
"""
/cassandra/bin/cqlsh 10.0.0.18 — start cqlsh

# create the keyspace
#CREATE  KEYSPACE  lesson8
#   WITH REPLICATION = {
#      'class' : 'SimpleStrategy', 'replication_factor' : 1 } ;

use lesson8;

DROP TABLE houses_price_prediction;
CREATE TABLE IF NOT EXISTS houses_price_prediction
(Id int primary key, 
SalePrice int);
"""
Example No. 18
def main(sentiment_input, user_input, review_input, model_input,
         output_folder):
    # read input files
    df_sentiment = spark.read.csv(sentiment_input, header=True)
    df_user = spark.read.parquet(user_input)
    df_review = spark.read.parquet(review_input)

    # get 50 users
    df_50_users = df_user.limit(50)

    # cross join user and business
    df_usr_bus_all = df_50_users \
                    .crossJoin(df_sentiment) \
                    .where(df_sentiment['ZipCode'].isNotNull()) \
                            .select(
                                df_sentiment['BusinessID'], \
                                df_user['UserID'], \
                                df_user['UserName'], \
                                df_user['ReviewCount'].alias('UserReviewCount'), \
                                df_user['AverageStars'].alias('UserAverageStars'), \
                                functions.lit(0).alias('ReviewStars'), \
                                functions.dayofyear(functions.current_date()).alias('ReviewDayOfYear'), \
                                df_sentiment['Name'].alias('BusinessName'), \
                                df_sentiment['ZipCode'].alias('BusinessPostalCode'), \
                                df_sentiment['ZipCode'].substr(1, 3).alias('BusinessNeighborhood'), \
                                df_sentiment['Latitude'].cast(types.FloatType()), \
                                df_sentiment['Longitude'].cast(types.FloatType()), \
                                df_sentiment['avg_neg'].cast(types.FloatType()).alias('AverageNegative'), \
                                df_sentiment['avg_neu'].cast(types.FloatType()).alias('AverageNeutral'), \
                                df_sentiment['avg_pos'].cast(types.FloatType()).alias('AveragePositive'), \
                                df_sentiment['avg_composite_score'].cast(types.FloatType()).alias('AverageComposite'))

    # left join with reviews
    df_joined = df_usr_bus_all.join(df_review, ['BusinessID', 'UserID'], 'left_outer') \
                            .select(df_review['ReviewID'], \
                                    df_usr_bus_all['BusinessID'], \
                                    df_usr_bus_all['UserID'], \
                                    df_usr_bus_all['UserName'], \
                                    df_usr_bus_all['UserReviewCount'], \
                                    df_usr_bus_all['UserAverageStars'], \
                                    df_usr_bus_all['ReviewStars'], \
                                    df_usr_bus_all['ReviewDayOfYear'], \
                                    df_usr_bus_all['BusinessName'], \
                                    df_usr_bus_all['BusinessPostalCode'], \
                                    df_usr_bus_all['BusinessNeighborhood'], \
                                    df_usr_bus_all['Latitude'], \
                                    df_usr_bus_all['Longitude'], \
                                    df_usr_bus_all['AverageNegative'], \
                                    df_usr_bus_all['AverageNeutral'], \
                                    df_usr_bus_all['AveragePositive'], \
                                    df_usr_bus_all['AverageComposite'])

    # get restaurants that user has not visited
    df_not_visited_rests = df_joined.where(df_joined['ReviewID'].isNull())

    # load the model
    loaded_model = PipelineModel.load(model_input)

    # use the model to make predictions
    predictions = loaded_model.transform(df_not_visited_rests)
    predictions_init = predictions.select(predictions['BusinessID'], \
                                          predictions['BusinessName'], \
                                          predictions['BusinessPostalCode'], \
                                          predictions['BusinessNeighborhood'], \
                                          predictions['UserID'], \
                                          predictions['UserName'], \
                                          predictions['UserReviewCount'], \
                                          predictions['UserAverageStars'], \
                                          predictions['ReviewDayOfYear'], \
                                          predictions['prediction'].alias('PredictedReviewStar'), \
                                          predictions['Latitude'], \
                                          predictions['Longitude'], \
                                          predictions['AverageNegative'], \
                                          predictions['AverageNeutral'], \
                                          predictions['AveragePositive'], \
                                          predictions['AverageComposite'])

    # change scores > 5 to 5 and < 0 to 0
    predictions_final = predictions_init.withColumn('FinalStar', \
                                                        functions.when(predictions_init["PredictedReviewStar"] >= 5, 5) \
                                                        .otherwise(functions.when(predictions_init["PredictedReviewStar"] <= 0, 0) \
                                                        .otherwise(predictions_init['PredictedReviewStar'])))

    # partition By user
    window = Window.partitionBy(predictions_final['UserID']).orderBy(
        predictions_final['FinalStar'].desc())

    # get top 10 scores for each user based on partition
    prediction_to_save = predictions_final.select(
        '*',
        functions.row_number().over(window).alias('rank')).filter(
            col('rank') <= 10)

    # save predictions to output
    prediction_to_save.coalesce(1).write.csv(output_folder + '/TestModel',
                                             header=True)
Example No. 19
from pyspark import SparkContext
from pyspark.ml import PipelineModel
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from nltk.corpus import stopwords


if __name__ == '__main__':

    sc = SparkContext.getOrCreate()
    pipeline = PipelineModel.load('hdfs:///model_lr')

    file_path = '/data/twitter_data/z_sample.csv'

    # create a spark context
    sc = SparkContext.getOrCreate()

    # create a sql spark context
    sql = SQLContext(sc)

    # defining a schema for the data
    schema = StructType([
        StructField('polarity', IntegerType(), True),
        StructField('id', StringType(), True),
        StructField('date', StringType(), True),
        StructField('query', StringType(), True),
        StructField('user', StringType(), True),
        StructField('text', StringType(), True)
    ])
    useless_columns = ['id', 'date', 'query', 'user']
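    # Rough sketch of the likely continuation (an assumption; the original
    # snippet is truncated here): read the CSV with the schema, drop the
    # unused columns, and score the text with the loaded pipeline.
    df = sql.read.csv(file_path, schema=schema)
    df = df.drop(*useless_columns)
    pipeline.transform(df).select('text', 'prediction').show(5)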
Example No. 20
def main(file1, file2, input_model, u_id, sim_bus_limit=3):
    data = spark.read.parquet(file1)
    data.createOrReplaceTempView('review')
    df_business = spark.read.parquet(file2)
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("score", IntegerType(), True),
        StructField("input_business_id", StringType(), True)
    ])

    similar_businesses_df = spark.createDataFrame([], schema)
    df = data.select('business_id', 'text')
    #df_review = df.groupby('business_id').agg(functions.collect_set('text')).show(100)
    review_rdd = df.rdd.map(tuple).reduceByKey(operator.add)
    review_df = spark.createDataFrame(review_rdd).withColumnRenamed(
        '_1', 'business_id').withColumnRenamed('_2', 'text')

    # create text preprocessing pipeline
    # Build the pipeline
    # tokenize review
    regexTokenizer = RegexTokenizer(gaps=False,
                                    pattern=r'\w+',
                                    inputCol='text',
                                    outputCol='text_token')
    #yelpTokenDF = regexTokenizer.transform(review_df)

    # filter stopwords
    stopWordsRemover = StopWordsRemover(inputCol='text_token',
                                        outputCol='nonstopwrd')
    #yelp_remove_df = stopWordsRemover.transform(yelpTokenDF)

    # TF
    countVectorizer = CountVectorizer(inputCol='nonstopwrd',
                                      outputCol='raw_features',
                                      minDF=2)
    #yelp_CountVec = cv.transform(yelp_remove_df)

    # IDF
    idf = IDF(inputCol="raw_features", outputCol="idf_vec")
    word2Vec = Word2Vec(vectorSize=500,
                        minCount=5,
                        inputCol='nonstopwrd',
                        outputCol='word_vec',
                        seed=123)
    #vectorAssembler = VectorAssembler(inputCols=['idf_vec', 'word_vec'], outputCol='comb_vec')
    pipeline = Pipeline(stages=[
        regexTokenizer, stopWordsRemover, countVectorizer, idf, word2Vec
    ])
    #pipeline_model = pipeline.fit(review_df)
    #pipeline_model.write().overwrite().save('content_userid')

    pipeline_model = PipelineModel.load(input_model)
    reviews_by_business_df = pipeline_model.transform(review_df)
    all_business_vecs = reviews_by_business_df.select(
        'business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()
    usr_rev_bus = spark.sql(
        'SELECT distinct business_id FROM review where stars >= 3.0 and user_id = "{}"'
        .format(u_id))

    bus_list = [i for i in usr_rev_bus.collect()]

    for b_id in bus_list:
        input_vec = [(r[1]) for r in all_business_vecs if r[0] == b_id[0]][0]
        similar_business_rdd = sc.parallelize(
            (i[0], float(CosineSim(input_vec, i[1])))
            for i in all_business_vecs)
        similar_business_df = spark.createDataFrame(
            similar_business_rdd).withColumnRenamed(
                '_1', 'business_id').withColumnRenamed('_2', 'score').orderBy(
                    "score", ascending=False)
        similar_business_df = similar_business_df.filter(
            col("business_id") != b_id[0]).limit(10)
        similar_business_df = similar_business_df.withColumn(
            'input_business_id', lit(b_id[0]))
        # accumulate restaurants similar to the ones this user reviewed
        similar_businesses_df = similar_businesses_df.union(similar_business_df)
    result = similar_businesses_df
    result.cache()
    # filter out those have been reviewd before by the user
    d = [i[0] for i in usr_rev_bus.collect()]
    df_1 = result.filter(~(col('business_id').isin(d))).select(
        'business_id', 'score')
    #df_1= result.join(usr_rev_bus, 'business_id', 'left_outer').where(col("usr_rev_bus.business_id").isNull()).select([col('result.business_id'),col('result.score')])
    df_2 = df_1.orderBy("score", ascending=False).limit(sim_bus_limit)
    df_result = df_business.join(df_2, 'business_id',
                                 'right').select('business_id', 'score',
                                                 'name', 'categories',
                                                 'latitude', 'longitude')
    df_result.show()
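# CosineSim is used above but not defined in the snippet; a minimal sketch
# consistent with how it is called on two Spark ML vectors (an assumption):
import numpy as np

def CosineSim(vec1, vec2):
    a, b = np.asarray(vec1.toArray()), np.asarray(vec2.toArray())
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    return float(a.dot(b)) / denom if denom else 0.0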
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml import PipelineModel, Pipeline
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField, SelectField, FloatField
from wtforms.widgets import html5


app = Flask(__name__)

# Initiate SparkSession
spark = SparkSession.builder \
    .master('local') \
    .appName('sparkify') \
    .getOrCreate()

# load model
model_gbt = PipelineModel.load("../model/sparkify_model")

# Load dataframe
df_ML = spark.read.parquet("../model/sparkify.parquet")
df_pd = df_ML.toPandas()  
location_list = df_pd.location_first.unique().tolist()
location_list = [ (location,location) for location in location_list]

# Graph 1
graph1 = df_pd.groupby('gender')['churn'].value_counts(normalize=True).unstack()
g1_name1 = str(graph1.columns[0])
g1_name2 = str(graph1.columns[1])
g1_x = graph1.index
g1_y1 = graph1.values.T[0]
g1_y2 = graph1.values.T[1]
# Graph 2
Example No. 22
import numpy as np
from flask import Flask, request, abort

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
from pyspark.ml import PipelineModel
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import StringIndexerModel

app = Flask(__name__)

sc = SparkContext('local')
sqlContext = SQLContext(sc)
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
lr_model = PipelineModel.load('/spark_models/lr_model')
bc_model = PipelineModel.load('/spark_models/bc_model')
mc_model = PipelineModel.load('/spark_models/mc_model')

# https://stackoverflow.com/questions/45885044/getting-labels-from-stringindexer-stages-within-pipeline-in-spark-pyspark
mc_classes = classes = {
    x._java_obj.getOutputCol(): x.labels
    for x in mc_model.stages if isinstance(x, StringIndexerModel)
}
mc_classes = mc_classes['label']


@app.route('/lr')
def linear_regression():
    try:
        df = sqlContext.createDataFrame(
Example No. 23
def load_lda_model(spark):
    register_remove_punctuation_udf(spark)
    ldaPipelineModel = PipelineModel.load(
        "s3://aws-emr-resources-257018485161-us-east-1/ldaPipelineModel")
    #ldaPipelineModel.stages[0] = SQLTransformer(statement="SELECT jokeID, clean_text_udf(raw_text) text FROM __THIS__")
    return ldaPipelineModel
Example No. 24
def load_model(building_id, meter):

    model_path = "output/als_model_{0}_{1}".format(building_id, meter)
    return PipelineModel.load(model_path)
Example No. 25
def cargar_juez(path, tipo, mongo_uri=None):
    if tipo == 1 and mongo_uri:
        df = spark_session().read.json(path + "_trainingset")
        df.rdd.map(lambda t: t.asDict()).saveToMongoDB(mongo_uri)
    return PipelineModel.load(path)
Example No. 26
    lambda x: datetime.strftime(
        datetime.strptime(x,'%a %b %d %H:%M:%S +0000 %Y'), '%Y-%m-%d %H:%M:%S'
        )
    )
df = df.withColumn("created_at", date_process(df.created_at))

################# Pre-processing the data
pre_process = udf(
    lambda x: re.sub(r'[^A-Za-z\n ]|(http\S+)|(www.\S+)', '', \
        x.lower().strip()).split(), ArrayType(StringType())
    )
df = df.withColumn("cleaned_data", pre_process(df.message)).dropna()

################# Passing into ml pipeline
model_path = SRC_DIR.joinpath('models')
pipeline_model = PipelineModel.load(str(model_path))

prediction  = pipeline_model.transform(df)

'''
In the source data, positive tweets are labelled 4 and negative tweets 0;
after indexing, positive maps to 0.0 and negative maps to 1.0.
'''

prediction = prediction \
    .select(prediction.cleaned_data, prediction.created_at, \
         prediction.timestamp, prediction.message, prediction.prediction)

# print(prediction.schema)
################# Write to Delta
Example No. 27
from flask import Flask, jsonify, render_template, request
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel


import json

MASTER = 'local'
APPNAME = 'simple-ml-serving'
MODEL_PATH = 'file:///home/cdsw/cdsw-simple-serving-python/model/spark-model'

spark = SparkSession.builder.master(MASTER).appName(APPNAME).getOrCreate()
model = PipelineModel.load(MODEL_PATH)


def classify(input):
  #target_columns = input.columns + ["prediction"]
  target_columns = ["prediction"]
  return model.transform(input).select(target_columns).collect()

# webapp
app = Flask(__name__)


@app.route('/api/predict', methods=['POST'])
def predict():
  input_df = spark.sparkContext.parallelize([request.json]).toDF()
  output = classify(input_df)
  return jsonify(input=request.json, prediction=output)

@app.route('/')
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml import PipelineModel
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder \
      .appName("Telco Customer Churn") \
      .master("local[*]") \
      .getOrCreate()
  
model = PipelineModel.load("models/spark/mlp") 

features = ["intl_plan", "account_length", "number_vmail_messages", "total_day_calls",
                        "total_day_charge", "total_eve_calls", "total_eve_charge",
                        "total_night_calls", "total_night_charge", "total_intl_calls", 
                        "total_intl_charge","number_customer_service_calls"]
def predict(args):
  account=args["feature"].split(",")
  feature = spark.createDataFrame([account[:1] + list(map(float,account[1:12]))], features)
  result=model.transform(feature).collect()[0].prediction
  return {"result" : result}

#features = ["intl_plan_indexed","account_length", "number_vmail_messages", "total_day_calls",
#                     "total_day_charge", "total_eve_calls", "total_eve_charge",
#                     "total_night_calls", "total_night_charge", "total_intl_calls", 
#                    "total_intl_charge","number_customer_service_calls"
predict({
  "feature": "no, 128, 25, 256, 110, 197.4, 50, 244.7, 91, 10, 5, 1"
}) 
Example No. 29
print(str(edate))

# paragraph 3 - send start email

send_mail(
    "Email Cadence Model Scoring Starting",
    "*****@*****.**",
    "Email Cadence Model Scoring: " + str(today),
)

# paragraph 4 - loading model from file

try:
    # load model from exported file
    lrModel = PipelineModel.load(
        "/user/datascience/test_db.db/features/lrModel")

    print("Model Loaded")
except Exception as e:
    print(str(e))
    errormsg = errormsg + "Model Failed Loading\n"

# paragraph 5 - data config

###SCORING
print("Scoring")

try:
    ##change date as parameter to take in current date (or date of score)
    scoring_query = "select * from test.features where ds = '" + str(
        edate) + "'"
Example No. 30
## Create the pipeline by defining all the stages
pipeline = Pipeline(
    stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores the pipeline
## to a local file in the current directory. Where HDFS & Hadoop are available, it stores the pipeline in the HDFS home
## directory of the current user. Absolute paths can be used as well. The same holds for the model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")


##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model):
    smsTextDF = spark.createDataFrame([(smsText, )],
                                      ["text"])  # create one element tuple
    prediction = model.transform(smsTextDF)
    return prediction.select("prediction").first()["prediction"] == "spam"


isSpamMsg = isSpam("Michal, h2oworld party tonight in MV?", loaded_model)
assert not isSpamMsg
Example No. 31
# finding spark
#import findspark
#findspark.init('/home/pfcor/spark-2.1.0-bin-hadoop2.7')

# misc
import datetime as dt
timestamp = dt.datetime.strftime(dt.datetime.now(), '%Y-%m-%d')

# init
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("BANK_MODELO").getOrCreate()

# loading the model
from pyspark.ml import PipelineModel
pipelineModel = PipelineModel.load(
    'hdfs://elephant:8020/user/labdata/model/bank-pipeline-model-res/')
#pipelineModel = PipelineModel.load('model/bank-pipeline-model-res/')

# loading the data
#data = spark.read.csv(
#    'data/new-data.csv',
#    sep=';',
#    header=True,
#    inferSchema=True
#)
data = spark.read.csv("hdfs://elephant:8020/user/labdata/new-data.csv",
                      header=True,
                      sep=";",
                      inferSchema=True)
data = data.selectExpr(
    *["`{}` as {}".format(col, col.replace('.', '_')) for col in data.columns])
Example No. 32
def getSparkSessionInstance(sparkConf):
    if ('sparkSessionSingletonInstance' not in globals()):
        globals()['sparkSessionSingletonInstance'] = SparkSession\
            .builder\
            .config(conf=sparkConf)\
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']
if __name__ == "__main__":
	# Spark Context
	sc = SparkContext("local[2]", appName="StreamingReviews")
	sc.setLogLevel("ERROR")
	# Update Stream every 10 seconds
	ssc = StreamingContext(sc,10)
	# Load Model 
	lr_model = PipelineModel.load('./Model')
	#Create DStream from data source
	lines = ssc.textFileStream('./Test')
	#Transformations and actions on DStream
	text = lines.map(lambda x: x[1:-1])
	def process(time, rdd):
		print("========= %s =========" % str(time))
		try:
			# Get the singleton instance of SparkSession
			spark = getSparkSessionInstance(rdd.context.getConf())
			# Remove Header
			head = rdd.first()
			rdd = rdd.filter(lambda x: x != head)
			# Convert RDD[String] to RDD[Row] to DataFrame
			rowRdd = rdd.map(lambda w: Row(text=w.encode('utf-8')))
			# Create new Data Frame
Example No. 33
def infant_survival_ml():
	spark = SparkSession.builder.appName('infant-survival-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_ft.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Create a model.
	logistic = ml_classification.LogisticRegression(maxIter=10, regParam=0.01, labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Run the pipeline and estimate the model.
	model = pipeline.fit(births_train)
	test_model = model.transform(births_test)

	print(test_model.take(1))

	# Evaluate the performance of the model.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')
	print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

	# Save the Pipeline definition.
	pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
	pipeline.write().overwrite().save(pipelinePath)

	# Load the Pipeline definition.
	loadedPipeline = Pipeline.load(pipelinePath)
	loadedPipeline.fit(births_train).transform(births_test).take(1)

	# Save the PipelineModel.
	modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
	model.write().overwrite().save(modelPath)

	# Load the PipelineModel.
	loadedPipelineModel = PipelineModel.load(modelPath)
	test_reloadedModel = loadedPipelineModel.transform(births_test)

	print(test_reloadedModel.take(1))
Example No. 34
		# spark._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider","org.apache.hadoop.fs.s3a.BasicAWSCredentialsProvider")
		spark._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.cn-northwest-1.amazonaws.com.cn")

	return spark



if __name__ == '__main__':
	spark = prepare()

	# 1. load the data
	df_result = load_training_data(spark)
	df_validate = df_result #.select("id", "label", "features").orderBy("id")

	# 2. load model
	model = PipelineModel.load("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/rf")

	# 3. compute accuracy on the test set
	predictions = model.transform(df_validate)
	evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
	accuracy = evaluator.evaluate(predictions)
	print("Test Error = %g " % (1.0 - accuracy))
	print("Test set accuracy = " + str(accuracy))

	# 4. Test with Pharbers defined methods
	result = predictions
	# result.printSchema()
	result = result.withColumn("JACCARD_DISTANCE_MOLE_NAME", result.JACCARD_DISTANCE[0]) \
				.withColumn("JACCARD_DISTANCE_DOSAGE", result.JACCARD_DISTANCE[1]) \
				.drop("JACCARD_DISTANCE", "features", "indexedFeatures").drop("rawPrediction", "probability")
	# result.orderBy("id").repartition(1).write.mode("overwrite").csv("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/tmp/result")
Example No. 35
#   Run App
#
################################################################################################

if __name__ == "__main__":
    
    from pyspark.sql import SparkSession
    from pyspark.ml import PipelineModel
    
    spark = SparkSession \
        .builder \
        .config("spark.driver.allowMultipleContexts", "true") \
        .appName("pyspark_nfl_app") \
        .getOrCreate()
    
    model_pass = PipelineModel.load('/assets/static/assets/nfl_model_pass')
    model_run  = PipelineModel.load('/assets/static/assets/nfl_model_run')    
    #model_pass = PipelineModel.load('./static/assets/nfl_model_pass')
    #model_run  = PipelineModel.load('./static/assets/nfl_model_run')
    
    #app.run(debug=True, threaded=False, host='0.0.0.0', port=4444)
    app.run(threaded=False, host='0.0.0.0', port=4444)



'''

0   Date
1   GameID
2   Drive
3   qtr