Example #1
def main(base_path):

    APP_NAME = 'train_model'
    # create a SparkSession only if one does not already exist (e.g. in a notebook)
    try:
        spark
    except NameError:
        import findspark
        findspark.init()
        import pyspark
        spark = pyspark.sql.SparkSession.builder \
            .master("local[4]") \
            .appName(APP_NAME) \
            .getOrCreate()

    PROJECT_HOME = "/home/ubuntu/capstone/"

    # load the data with the desired schema
    from pyspark.sql.types import *
    from pyspark.sql.functions import col, udf
    schema = StructType([
        StructField('NewIndex', IntegerType(), False),
        StructField('Index', IntegerType(), False),
        StructField('ItemID', IntegerType(), False),
        StructField('Sentiment', IntegerType(), False),
        StructField('SentimentSource', StringType(), False),
        StructField('SentimentText', StringType(), False),
    ])

    filename = PROJECT_HOME + "nokaggle-Sentiment-Analysis-Dataset.csv"
    users = spark.read.csv(filename, header=True, schema=schema)

    #model training stages
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import HashingTF, IDF, IDFModel, StopWordsRemover
    from pyspark.ml.classification import RandomForestClassifier
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit

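    # NOTE: `preprocess` is not defined in this snippet; the sketch below is an
    # assumed minimal tokenizer, added so the UDF can run end to end.
    import re

    def preprocess(text):
        # hypothetical cleanup: lowercase, drop URLs and @mentions, keep word tokens
        if text is None:
            return []
        text = re.sub(r'(https?://\S+|@\w+)', ' ', text.lower())
        return re.findall(r'[a-z]+', text)
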
    pp_udf = udf(preprocess, ArrayType(StringType()))
    words = users.withColumn('Words', pp_udf(users.SentimentText))

    #remove stop words
    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    removed = remover.transform(words)

    #rename column and select columns
    filtered = removed.select('ItemID',
                              col('Sentiment').alias('label'), 'SentimentText',
                              'Words', 'filtered')

    #perform term frequency
    hashingTF = HashingTF(inputCol="filtered",
                          outputCol="rawFeatures",
                          numFeatures=200)
    featurized = hashingTF.transform(filtered)

    #perform idf
    featurized.cache()
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    model = idf.fit(featurized)
    result = model.transform(featurized)

    #save idf and idf model
    idf_path = PROJECT_HOME + 'tmp/idf'
    idf.save(idf_path)
    idfmodel_path = PROJECT_HOME + 'tmp/idfmodel'
    model.save(idfmodel_path)
    #load via following
    #loadedModel = IDFModel.load(idfmodel_path)

    # fit a single random forest for reference (this model is not used below)
    rf = RandomForestClassifier(numTrees=100, labelCol="label", seed=42)
    rf_model = rf.fit(result)

    #Prepare Train Test Split
    train, test = result.randomSplit([0.8, 0.2], seed=42)

    # Configure an ML pipeline. Since hashingTF and idf were already applied
    # above, the pipeline consists of a single stage: the RandomForestClassifier.
    rf = RandomForestClassifier(labelCol="label", seed=42)
    pipeline = Pipeline(stages=[rf])

    # grid search (one value per parameter here; try several in practice)
    paramGrid = ParamGridBuilder().addGrid(rf.numTrees,
                                           [100]).addGrid(rf.maxDepth,
                                                          [5]).build()

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=3)  # use 3+ folds in practice

    # Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(train)

    # Make predictions on test documents; cvModel uses the best model found during cross-validation.
    prediction = cvModel.transform(test)
    selected = prediction.select("SentimentText", "probability", "prediction")
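    # sanity check (added for illustration): peek at a few predictions
    selected.show(5, truncate=False)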

    # average cross-validation metric for each parameter combination
    print(cvModel.avgMetrics)
    # test score (reuse the predictions already computed on the test set)
    evaluator = BinaryClassificationEvaluator()
    print("Test Score is {}".format(evaluator.evaluate(prediction)))
    print(cvModel.getEstimatorParamMaps())

    #save model
    cv_path = PROJECT_HOME + 'tmp/cv'
    cvModel.bestModel.save(cv_path)
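    # load later via:
    # from pyspark.ml import PipelineModel
    # loadedModel = PipelineModel.load(cv_path)


if __name__ == '__main__':
    # assumed entry point: main() accepts base_path but does not use it in this
    # snippet; the path mirrors PROJECT_HOME above
    main('/home/ubuntu/capstone/')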