Example No. 1
import base64
import sys

import pandas as pd
from joblib import dump, load
from pickle import dumps, loads
from sklearn.linear_model import LogisticRegression

from pyspark import keyword_only
from pyspark.ml import Estimator, Model, Pipeline
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.feature import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel('WARN')

from model import pipeline

# Read the training data (script arguments: <train_path> <model_path>)
df = spark.read.json(sys.argv[1])

# Fit the pipeline defined in model.py
pipeline_model = pipeline.fit(df)

# Persist the fitted PipelineModel, overwriting any previous save
pipeline_model.write().overwrite().save(sys.argv[2])
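
Once saved, the fitted model can be loaded back with PipelineModel.load and applied to new data. A minimal sketch, reusing the save path from above; the test file name is illustrative:

from pyspark.ml import PipelineModel

# Reload the fitted pipeline from the path it was saved to
pipeline_model = PipelineModel.load(sys.argv[2])

# Score new records with the same feature transformations and model
predictions = pipeline_model.transform(spark.read.json('test.json'))
predictions.show()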
Example No. 2
import logging
import sys

logging.basicConfig(level=logging.INFO)

#
# Read script arguments
#
try:
    train_path = sys.argv[1]
    model_path = sys.argv[2]
except IndexError:
    logging.critical("Need to pass both train dataset path and model path")
    sys.exit(1)

logging.info(f"TRAIN_PATH {train_path}")
logging.info(f"MODEL_PATH {model_path}")

#
# model importing
#
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel('WARN')

from model import pipeline

#
# training
#
dataset = spark.read.json(train_path, multiLine=True)
dataset_cleaned = dataset.select("id", "reviewText", "overall")
pipeline_model = pipeline.fit(dataset_cleaned)

pipeline_model.write().overwrite().save(model_path)
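
The bare sys.argv handling above can be made more explicit with the standard-library argparse module. A minimal sketch; the description and help strings are illustrative:

import argparse

parser = argparse.ArgumentParser(description='Fit the model pipeline on a JSON dataset')
parser.add_argument('train_path', help='path to the training dataset (JSON)')
parser.add_argument('model_path', help='directory to save the fitted PipelineModel')
args = parser.parse_args()

# argparse prints a usage message and exits with a non-zero code
# on its own when a required argument is missing
train_path, model_path = args.train_path, args.model_path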
Example No. 3
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel('WARN')

from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.ml import Pipeline
import sys
from model import pipeline, sklearn_est, vectorToArray, predict
import pickle

train_path = sys.argv[1]
pipeline_path = sys.argv[2]
model_path = sys.argv[3]

# Fit the Spark feature pipeline and apply it to the training data
train = spark.read.json(train_path)
pipeline_model = pipeline.fit(train)
train = pipeline_model.transform(train)

# Materialize the feature vectors as plain arrays so they can be handed to sklearn
train = train.withColumn("features_array", vectorToArray("features")).localCheckpoint()
df = train.select('label', 'features_array').toPandas()

# Train the scikit-learn estimator on the driver
sklearn_est = sklearn_est.fit(df['features_array'].tolist(), df['label'].tolist())

# Persist the Spark pipeline and the sklearn model separately
pipeline_model.write().overwrite().save(pipeline_path)

with open(model_path, "wb") as f:
    pickle.dump(sklearn_est, f)
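
For inference, both artifacts have to be restored: the Spark pipeline with PipelineModel.load and the scikit-learn estimator with pickle. A minimal sketch under the same assumptions as above (vectorToArray comes from the project's model module, and pipeline_path/model_path are the save paths from the script; the test file name is illustrative):

import pickle
from pyspark.ml import PipelineModel

# Restore the Spark feature pipeline and the pickled sklearn model
pipeline_model = PipelineModel.load(pipeline_path)
with open(model_path, 'rb') as f:
    sklearn_est = pickle.load(f)

# Featurize new data with Spark, then predict on the driver with sklearn
test = pipeline_model.transform(spark.read.json('test.json'))
test = test.withColumn('features_array', vectorToArray('features'))
pdf = test.select('features_array').toPandas()
preds = sklearn_est.predict(pdf['features_array'].tolist())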
Example No. 4
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel('WARN')

from model import pipeline
import os
import sys

path_to_data = sys.argv[1]
model_path = sys.argv[2]

# Read the training data
data = spark.read.json(path_to_data)
data = data.select('reviewText', 'overall')

# Fit the model
pipeline_model = pipeline.fit(data)

# Save the model
pipeline_model.write().overwrite().save(model_path)
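
To sanity-check the fitted pipeline, part of the data can be held out and scored with a PySpark evaluator. A minimal sketch; the 'label' and 'prediction' column names are assumptions about what the pipeline in model.py produces and should be adjusted to the actual schema:

from pyspark.ml.evaluation import RegressionEvaluator

# Hold out 20% of the data for evaluation (column names are assumed)
train_df, test_df = data.randomSplit([0.8, 0.2], seed=42)
scored = pipeline.fit(train_df).transform(test_df)

evaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction',
                                metricName='rmse')
print('RMSE:', evaluator.evaluate(scored))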