def make_pipeline():
    """Build an (unfitted) three-stage text-classification pipeline:
    tokenizer -> HashingTF -> LogisticRegression."""
    tok = Tokenizer(inputCol="text", outputCol="words")
    term_freq = HashingTF(inputCol=tok.getOutputCol(), outputCol="features")
    classifier = LogisticRegression(maxIter=10, regParam=0.001)
    return Pipeline(stages=[tok, term_freq, classifier])
def fit_kmeans(spark, products_df):
    """Fit a text-clustering pipeline (tokenize -> stop words -> TF -> IDF ->
    normalize -> KMeans) on product titles.

    :param spark: SparkSession (unused directly; kept for interface compatibility)
    :param products_df: DataFrame with a string 'title' column
    :return: (fitted PipelineModel, DataFrame with cluster predictions)
    """
    # Stage output columns keep their original 1-based positional prefix
    # ("1_tokenizer", ...) so generated column names stay unique; the manual
    # step-counter bookkeeping was dropped as noise.
    tokenizer = Tokenizer(inputCol="title", outputCol="1_tokenizer")
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                 outputCol="2_stopwords")
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol="3_tf",
                   numFeatures=16)
    idf = IDF(inputCol=tf.getOutputCol(), outputCol="4_idf")
    normalizer = Normalizer(inputCol=idf.getOutputCol(),
                            outputCol="5_normalizer")
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(),
                    predictionCol="6_kmeans", k=2, seed=20)
    kmeans_pipeline = Pipeline(
        stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])
    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    # BUG FIX: plain save() raises if "./kmeans" already exists; overwrite so
    # the function can be re-run. The whole fitted pipeline is saved in a folder.
    model.write().overwrite().save("./kmeans")
    return model, words_prediction
def pipe_line():
    """End-to-end demo: build a tokenizer/HashingTF/LogisticRegression
    pipeline, fit it on a toy labeled corpus, and print probability and
    prediction for unlabeled test documents."""
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    training = spark.createDataFrame(
        [(0, "a b c d e spark", 1.0),
         (1, "b d", 0.0),
         (2, "spark f g h", 1.0),
         (3, "hadoop mapreduce", 0.0)], ["id", "text", "label"])
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    # Fit the pipeline to training documents.
    model = pipeline.fit(training)
    # Prepare test documents, which are unlabeled (id, text) tuples.
    test = spark.createDataFrame(
        [(4, "spark i j k"), (5, "l m n"),
         (6, "spark hadoop spark"), (7, "apache hadoop")], ["id", "text"])
    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(test)
    selected = prediction.select("id", "text", "probability", "prediction")
    for row in selected.collect():
        # BUG FIX: the original unpacked into a variable named `prediction`,
        # clobbering the prediction DataFrame; use a distinct name.
        rid, text, prob, pred = row
        print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), pred))
    spark.stop()
def test_pipeline(dataset_text):
    """Autologging should report the stage hierarchy for flat and nested
    pipelines, record their params/tags/artifacts, and log a model that
    round-trips by uid."""
    mlflow.pyspark.ml.autolog()

    tok = Tokenizer(inputCol="text", outputCol="words")
    term_freq = HashingTF(inputCol=tok.getOutputCol(), outputCol="features")
    clf = LogisticRegression(maxIter=2, regParam=0.001)
    flat_pipeline = Pipeline(stages=[tok, term_freq, clf])
    inner = Pipeline(stages=[term_freq, clf])
    nested = Pipeline(stages=[tok, inner])

    # Hierarchy is flat for the plain pipeline and recursive for the nested one.
    assert _get_pipeline_stage_hierarchy(flat_pipeline) == {
        flat_pipeline.uid: [tok.uid, term_freq.uid, clf.uid]
    }
    assert _get_pipeline_stage_hierarchy(nested) == {
        nested.uid: [tok.uid, {inner.uid: [term_freq.uid, clf.uid]}]
    }

    for estimator in (flat_pipeline, nested):
        with mlflow.start_run() as run:
            model = estimator.fit(dataset_text)
        run_id = run.info.run_id
        run_data = get_run_data(run_id)
        assert run_data.params == truncate_param_dict(
            stringify_dict_values(_get_instance_param_map(estimator)))
        assert run_data.tags == get_expected_class_tags(estimator)
        assert MODEL_DIR in run_data.artifacts
        loaded = load_model_by_run_id(run_id)
        assert loaded.uid == model.uid
        assert run_data.artifacts == ["model", "pipeline_hierarchy.json"]
def test_pipeline(dataset_text):
    # Autologging integration test: fits a flat and a nested pipeline and
    # checks the logged estimator hierarchy, params, tags and artifacts.
    mlflow.pyspark.ml.autolog()
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=2, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    inner_pipeline = Pipeline(stages=[hashingTF, lr])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])
    for estimator in [pipeline, nested_pipeline]:
        with mlflow.start_run() as run:
            model = estimator.fit(dataset_text)
            # Logged hierarchy artifact must match freshly generated metadata.
            # NOTE(review): artifact is read inside the active run — presumably
            # load_json_artifact resolves against the current run; confirm.
            estimator_info = load_json_artifact("estimator_info.json")
            metadata = _gen_estimator_metadata(estimator)
            assert metadata.hierarchy == estimator_info["hierarchy"]
            uid_to_indexed_name_map = metadata.uid_to_indexed_name_map
        run_id = run.info.run_id
        run_data = get_run_data(run_id)
        # Params are logged under indexed stage names, truncated/stringified.
        assert run_data.params == truncate_param_dict(
            stringify_dict_values(
                _get_instance_param_map(estimator, uid_to_indexed_name_map)))
        assert run_data.tags == get_expected_class_tags(estimator)
        assert MODEL_DIR in run_data.artifacts
        # The logged model must round-trip with the same uid.
        loaded_model = load_model_by_run_id(run_id)
        assert loaded_model.uid == model.uid
        assert run_data.artifacts == ["estimator_info.json", "model"]
def get_data_transformers(): """ Creates Data Transformers :return: tokenizer, hasher, classifier :rtype: Tokenizer, HashingTF, MultilayerPerceptronClassifier """ # Tokenizer : Splits each name into words tokenizer = Tokenizer(inputCol="name", outputCol="words") # HashingTF : builds term frequency feature vectors from text data hasher = HTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=8) """ specify layers for the neural network: input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes) """ # Network params maxIter = 20 layers = 8, 5, 4, 5, 2 blockSize = 128 seed = 1234 # Creating the trainer and set its parameters classifier = MultilayerPerceptronClassifier(maxIter=maxIter, layers=layers, blockSize=blockSize, seed=seed) return tokenizer, hasher, classifier
def main():
    """Train a toy text classifier and print predictions for unlabeled
    documents.

    BUG FIX: the original used Python 2 long literals (0L, 1L, ...), which
    are a syntax error on Python 3; plain ints behave identically. The unused
    `LabeledDocument` Row local was removed.
    """
    spark = SparkSession.builder.appName("MLpipeline").getOrCreate()
    # Prepare training documents from a list of (id, text, label) tuples.
    training = spark.createDataFrame(
        [(0, "a b c d e spark", 1.0),
         (1, "b d", 0.0),
         (2, "spark f g h", 1.0),
         (3, "hadoop mapreduce", 0.0)], ["id", "text", "label"])
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    # Fit the pipeline to training documents.
    model = pipeline.fit(training)
    # Prepare test documents, which are unlabeled (id, text) tuples.
    test = spark.createDataFrame(
        [(4, "spark i j k"), (5, "l m n"),
         (6, "mapreduce spark"), (7, "apache hadoop")], ["id", "text"])
    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(test)
    selected = prediction.select("id", "text", "prediction")
    for row in selected.collect():
        print(row)
def main():
    """Compare an untuned text-classification pipeline against a
    cross-validated one and save both evaluation scores to `output`
    (module-level path), one score per line."""
    # Read training/testing data as DataFrames (paths come from module scope).
    sql_context = SQLContext(sc)
    train_df = sql_context.read.parquet(training_input)
    test_df = sql_context.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # Baseline: fixed hyper-parameters, no tuning.
    baseline_tf = HashingTF(inputCol=tokenizer.getOutputCol(),
                            outputCol="features", numFeatures=1000)
    baseline_lr = LogisticRegression(maxIter=20, regParam=0.1)
    baseline_model = Pipeline(
        stages=[tokenizer, baseline_tf, baseline_lr]).fit(train_df)
    baseline_score = evaluator.evaluate(baseline_model.transform(test_df))

    # Tuned: grid-search numFeatures and regParam with 2-fold cross validation.
    tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)
    grid = (ParamGridBuilder()
            .addGrid(tf.numFeatures, [1000, 5000, 10000])
            .addGrid(lr.regParam,
                     [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
            .build())
    cv = CrossValidator(estimator=Pipeline(stages=[tokenizer, tf, lr]),
                        estimatorParamMaps=grid,
                        evaluator=evaluator,
                        numFolds=2)
    cv_model = cv.fit(train_df)
    # cv_model transforms with the best model found by the grid search.
    best_score = evaluator.evaluate(cv_model.transform(test_df))

    report = str(baseline_score) + '\n' + str(best_score)
    sc.parallelize([report]).saveAsTextFile(output)
def pipeline(cleaned_dataframe, stopwordlist=None):
    """Tokenize the 'Text' column, remove stop words, and count-vectorize.

    :param cleaned_dataframe: DataFrame with a 'Text' string column
    :param stopwordlist: optional custom stop-word list; defaults to the
        StopWordsRemover built-in list when falsy
    :return: (transformed DataFrame with a 'features' column,
              fitted CountVectorizer vocabulary)
    """
    tok = Tokenizer(inputCol="Text", outputCol="Text_tokens")
    remover_kwargs = dict(inputCol=tok.getOutputCol(),
                          outputCol="Text_tokens_stopped")
    if stopwordlist:
        remover_kwargs["stopWords"] = stopwordlist
    remover = StopWordsRemover(**remover_kwargs)
    counter = CountVectorizer(inputCol=remover.getOutputCol(),
                              outputCol="features")
    fitted = Pipeline(stages=[tok, remover, counter]).fit(cleaned_dataframe)
    # The CountVectorizer is the last stage; expose its learned vocabulary.
    return fitted.transform(cleaned_dataframe), fitted.stages[-1].vocabulary
def TrainModel():
    """Fit a tokenizer/HashingTF/LogisticRegression pipeline and return the
    fitted PipelineModel.

    NOTE(review): reads `train_data` from enclosing (module/global) scope —
    confirm it is defined before this is called.
    """
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    # Fit the pipeline to train_data documents.
    model = pipeline.fit(train_data)
    return model
def train_svm_idf(sqlContext, df):
    """Train a linear SVM on TF-IDF features of the 'body' column (80/20
    split) and print the AUC plus raw labels/predictions for both splits.

    :param sqlContext: unused directly; kept for interface compatibility
    :param df: DataFrame with 'body' (string) and 'label' (numeric) columns
    """
    train_split, test_split = df.randomSplit([0.8, 0.2])

    tok = Tokenizer(inputCol="body", outputCol="words")
    term_freq = HashingTF(numFeatures=2000, inputCol=tok.getOutputCol(),
                          outputCol="rawFeatures")
    idf = IDF(inputCol=term_freq.getOutputCol(), outputCol="features")
    svm = LinearSVC(featuresCol="features", labelCol="label")
    model = Pipeline(stages=[tok, term_freq, idf, svm]).fit(train_split)

    test_df = model.transform(test_split)
    train_df = model.transform(train_split)
    test_df.show()
    train_df.show()

    # rawPredictionCol="prediction",  (left over from earlier experimentation)
    evaluator = BinaryClassificationEvaluator(labelCol="label")
    train_metrix = evaluator.evaluate(train_df)
    test_metrix = evaluator.evaluate(test_df)

    def _column(frame, name):
        # Pull one column out of a DataFrame as a plain Python list.
        return frame.select(name).rdd.map(lambda row: row[name]).collect()

    test_p = _column(test_df, "prediction")
    test_l = _column(test_df, "label")
    train_p = _column(train_df, "prediction")
    train_l = _column(train_df, "label")

    print("\n\n\n\n")
    print("-" * 15 + " OUTPUT " + "-" * 15)
    print()
    print("confusion matrix for trainning data")
    print(train_metrix)
    print("train label")
    print(train_l)
    print("train prediction")
    print(train_p)
    print("-" * 30)
    print()
    print("confusion matrix for testing data")
    print(test_metrix)
    print("test label")
    print(test_l)
    print("test prediction")
    print(test_p)
    print("-" * 30)
    print("\n\n\n\n")
def __init__(self, data):
    """Fit a TF-IDF + logistic-regression text classifier on `data`
    (expects 'text' and 'label' columns) and keep the fitted model."""
    tok = Tokenizer(inputCol="text", outputCol="words")
    term_freq = HashingTF(inputCol=tok.getOutputCol(), outputCol="rawFeatures")
    idf = IDF(inputCol=term_freq.getOutputCol(), outputCol="features")
    stages = [tok, term_freq, idf, LogisticRegression()]
    self.model = Pipeline(stages=stages).fit(data)
def benchmark_body_pipeline(cleaned_dataframe, stopwordlist=None):
    """NLP pipeline: tokenize 'Text', drop stop words, count-vectorize, and
    weight by IDF.

    :param cleaned_dataframe: DataFrame with a 'Text' string column
    :param stopwordlist: optional custom stop-word list; falls back to the
        StopWordsRemover default when falsy
    :return: (transformed DataFrame with TF-IDF in 'features',
              fitted CountVectorizer vocabulary)
    """
    tok = Tokenizer(inputCol="Text", outputCol="Text_tokens")
    if stopwordlist:
        remover = StopWordsRemover(inputCol=tok.getOutputCol(),
                                   outputCol="Text_tokens_stopped",
                                   stopWords=stopwordlist)
    else:
        remover = StopWordsRemover(inputCol=tok.getOutputCol(),
                                   outputCol="Text_tokens_stopped")
    counter = CountVectorizer(inputCol=remover.getOutputCol(),
                              outputCol="Text_counts_raw")
    idf = IDF(inputCol=counter.getOutputCol(), outputCol="features")
    fitted = Pipeline(
        stages=[tok, remover, counter, idf]).fit(cleaned_dataframe)
    transformed = fitted.transform(cleaned_dataframe)
    # The CountVectorizer is second-to-last; expose its learned vocabulary.
    return transformed, fitted.stages[-2].vocabulary
def main():
    """Compare an untuned text pipeline with a cross-validated one and write
    both areaUnderROC scores to `output` (one per line).

    NOTE(review): `sc`, `training_input`, `testing_input` and `output` come
    from module scope — confirm they are defined before this runs.
    """
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()
    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(),
                                   outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning,
                                         lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)
    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)
    # for cross validation: grid over numFeatures and regParam, 2 folds
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=20)
    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                        evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)
    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)
    # Persist both scores (baseline first, tuned second).
    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
def apply(configProperties):
    """Build an (unfitted) tokenizer -> HashingTF -> LogisticRegression
    pipeline whose LR hyper-parameters are read from `configProperties`
    (expects "maxIter" and "regParam" keys)."""
    # Reading configs from config properties.
    max_iter = int(configProperties.get("maxIter"))
    reg_param = float(configProperties.get("regParam"))
    # numFeaturesVal: Int = configProperties.get("numFeatures").get.toInt
    tok = Tokenizer(inputCol="text", outputCol="words")
    term_freq = HashingTF(inputCol=tok.getOutputCol(), outputCol="features")
    classifier = LogisticRegression(maxIter=max_iter, regParam=reg_param)
    return Pipeline(stages=[tok, term_freq, classifier])
def test_should_log_model(dataset_binomial, dataset_multinomial, dataset_text):
    # Verifies _should_log_model for plain, OneVsRest, flat-pipeline and
    # nested-pipeline models, then re-checks each against a patched allowlist.
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()
    ova1 = OneVsRest(classifier=lor)
    with mlflow.start_run():
        mlor_model = lor.fit(dataset_multinomial)
    assert _should_log_model(mlor_model)
    with mlflow.start_run():
        ova1_model = ova1.fit(dataset_multinomial)
    assert _should_log_model(ova1_model)
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=2)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    with mlflow.start_run():
        pipeline_model = pipeline.fit(dataset_text)
    assert _should_log_model(pipeline_model)
    nested_pipeline = Pipeline(
        stages=[tokenizer, Pipeline(stages=[hashingTF, lr])])
    with mlflow.start_run():
        nested_pipeline_model = nested_pipeline.fit(dataset_text)
    assert _should_log_model(nested_pipeline_model)
    # Patch the allowlist so LogisticRegressionModel is NOT allowed; only the
    # LinearRegression model should then be loggable.
    with mock.patch(
            "mlflow.pyspark.ml._log_model_allowlist",
            {
                "pyspark.ml.regression.LinearRegressionModel",
                "pyspark.ml.classification.OneVsRestModel",
                "pyspark.ml.pipeline.PipelineModel",
            },
    ), mock.patch("mlflow.pyspark.ml._logger.warning") as mock_warning:
        lr = LinearRegression()  # NOTE: shadows the earlier HashingTF-era `lr`
        with mlflow.start_run():
            lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        with mlflow.start_run():
            lor_model = lor.fit(dataset_binomial)
        assert not _should_log_model(lor_model)
        mock_warning.called_once_with(
            _get_warning_msg_for_skip_log_model(lor_model))
        # Composite models are rejected because they contain a
        # LogisticRegression model that is no longer allowlisted.
        assert not _should_log_model(ova1_model)
        assert not _should_log_model(pipeline_model)
        assert not _should_log_model(nested_pipeline_model)
def pipeline(self):
    """Return an (unfitted) message-classification pipeline:
    tokenizer -> HashingTF -> IDF -> LogisticRegression."""
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import HashingTF, IDF
    from pyspark.ml.feature import Tokenizer
    from pyspark.ml.classification import LogisticRegression

    tok = Tokenizer(inputCol="message", outputCol="words")
    term_freq = HashingTF(inputCol=tok.getOutputCol(),
                          outputCol="tempfeatures")
    idf = IDF(inputCol=term_freq.getOutputCol(), outputCol="features")
    classifier = LogisticRegression()
    return Pipeline(stages=[tok, term_freq, idf, classifier])
def tune_pyspark_als(train_file, test_file, param_grid):
    """
    Tune ALS hyper parameters using 3-fold cross validation (RMSE).

    NOTE: `train_file` and `test_file` are currently unused — the data comes
    from make_datasets() round-tripped through "actual_train.csv". They are
    kept to preserve the public signature.

    Args:
        train_file (string): path to train csv file (unused, see NOTE)
        test_file (string): path to test csv file (unused, see NOTE)
        param_grid: hyper parameters to try for tuning, e.g.
            param_grid = ParamGridBuilder() \
                .addGrid(als.rank, [1, 5, 10, 15]) \
                .addGrid(als.maxIter, [24]) \
                .addGrid(als.regParam, [.01]) \
                .build()

    Returns:
        pyspark.ml.tuning.CrossValidatorModel: model fitted with the best
        parameter combination found on the grid.
    """
    spark = SparkSession.builder.appName('Sample').getOrCreate()
    # (Removed a dead tokenizer/HashingTF/LogisticRegression pipeline that was
    # never referenced by the cross validator below.)
    als = ALS(userCol="User", itemCol="Movie", ratingCol="rating",
              nonnegative=True, implicitPrefs=False)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid,
                        evaluator=evaluator, numFolds=3)

    data_train, data_test, data_actual_train, data_actual_predict = \
        make_datasets()
    data_actual_train.to_csv("actual_train.csv")
    data = spark.read.format("csv").option("header", "true")\
        .load("actual_train.csv")
    # Cast the rating triple to integers; drop the pandas index column "_c0".
    data = data.withColumn("User", data["User"].cast(IntegerType()))
    data = data.withColumn("Movie", data["Movie"].cast(IntegerType()))
    data = data.withColumn("rating", data["rating"].cast(IntegerType()))
    data = data.drop('_c0')

    cvModel = cv.fit(data)
    return cvModel
def getPipeline(self, df):
    """Fit and return a tag-classification PipelineModel trained on `df`
    (expects a 'tag_features' string column)."""
    # notify pipeline
    self.success('Initializing ML Pipeline ...')
    # tokenize tag_features -> hash to term-frequency vectors -> logistic regression
    tok = Tokenizer(inputCol='tag_features', outputCol='words')
    term_freq = HashingTF(inputCol=tok.getOutputCol(), outputCol='features')
    classifier = LogisticRegression(maxIter=10, regParam=0.01)
    return Pipeline(stages=[tok, term_freq, classifier]).fit(df)
def untest_save_load1(self):
    # Disabled test (the "untest_" prefix keeps the test runner from
    # collecting it): fits a small text-classification pipeline, saves the
    # fitted model to disk, reloads it, and prints the reloaded model.
    # sc = sparkEngine.getSparkSession()
    sc = SparkSession.builder.master("local").appName(
        "localtest").getOrCreate()
    training = sc.createDataFrame([(0, "a b c d e spark", 1.0),
                                   (1, "b d", 0.0),
                                   (2, "spark f g h", 1.0),
                                   (3, "hadoop mapreduce", 0.0)],
                                  ["id", "text", "label"])
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    model = pipeline.fit(training)
    # NOTE(review): save() fails if the target directory already exists —
    # consider model.write().overwrite().save(...) if this test is revived.
    model.save("/var/lib/ml-app/test_save1")
    saved_model = model.load("/var/lib/ml-app/test_save1")
    print(saved_model)
def __init__(self):
    """Start a Spark session sized for large jobs, train the question
    classifier from data/questions.json, and keep the fitted model."""
    builder = SparkSession.builder.appName("classification")
    # Generous executor/driver memory plus off-heap storage for training.
    for key, value in (("spark.executor.memory", "70g"),
                       ("spark.driver.memory", "50g"),
                       ("spark.memory.offHeap.enabled", True),
                       ("spark.memory.offHeap.size", "16g")):
        builder = builder.config(key, value)
    self.spark = builder.getOrCreate()

    # Explicit schema: (id, text, label), all non-nullable.
    schema = StructType([
        StructField('id', LongType(), False),
        StructField('text', StringType(), False),
        StructField('label', DoubleType(), False),
    ])
    training = self.spark.read.format("json").load("data/questions.json",
                                                   schema=schema)

    tok = Tokenizer(inputCol="text", outputCol="words")
    term_freq = HashingTF(inputCol=tok.getOutputCol(), outputCol="features")
    classifier = LogisticRegression(maxIter=10, regParam=0.001)
    self.model = Pipeline(stages=[tok, term_freq, classifier]).fit(training)
    logging.info("Classification initialized...")
def main(spark, numTopics): jokesDF = spark.read.schema( StructType([ StructField("jokeID", IntegerType(), False), StructField("raw_text", StringType(), False), ])).csv("s3://aws-emr-resources-257018485161-us-east-1/jokes_3.csv", header="true") #jokesDF = jokesDF.withColumn("text", clean_text_udf("raw_text")) (training, test) = jokesDF.randomSplit([0.8, 0.2]) register_remove_punctuation_udf(spark) stopwords = spark.sparkContext.textFile( "s3://aws-emr-resources-257018485161-us-east-1/stopwords").collect() tokenizer = Tokenizer(inputCol="text", outputCol="tokens") remover = StopWordsRemover(stopWords=stopwords, inputCol=tokenizer.getOutputCol(), outputCol="filtered") vectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features", minDF=2) lda = LDA(k=numTopics) pipeline = Pipeline(stages=[ SQLTransformer( statement= "SELECT jokeID, remove_punctuation_udf(raw_text) text FROM __THIS__" ), tokenizer, remover, vectorizer, lda ]) model = pipeline.fit(training) model.write().overwrite().save( "s3://aws-emr-resources-257018485161-us-east-1/ldaPipelineModel") prediction = model.transform(test) prediction.show()
def test_get_params_to_log(spark_session):  # pylint: disable=unused-argument
    # Checks get_params_to_log for a plain estimator, a OneVsRest meta
    # estimator, and flat/nested pipelines (stage naming, nesting, and
    # pass-through of nested classifier params).
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = get_params_to_log(lor)
    assert (
        lor_params["maxIter"] == 3
        and not lor_params["standardization"]
        # unset params fall back to their defaults
        and lor_params["family"] == lor.getOrDefault(lor.family)
    )
    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = get_params_to_log(ova)
    assert (
        ova_params["classifier"] == "LogisticRegression"
        and ova_params["labelCol"] == "abcd"
        # nested classifier params are logged as "<ClassName>.<param>"
        and ova_params["LogisticRegression.maxIter"] == 3
        and ova_params["LogisticRegression.family"] == lor.getOrDefault(lor.family)
    )
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner_pipeline = Pipeline(stages=[hashingTF, ova])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])
    pipeline_params = get_params_to_log(pipeline)
    nested_pipeline_params = get_params_to_log(nested_pipeline)
    assert pipeline_params["stages"] == ["Tokenizer", "HashingTF", "OneVsRest"]
    # Inner pipelines are referenced by an indexed name ("Pipeline_2") and
    # their stages expanded under that key.
    assert nested_pipeline_params["stages"] == ["Tokenizer", "Pipeline_2"]
    assert nested_pipeline_params["Pipeline_2.stages"] == ["HashingTF", "OneVsRest"]
    assert nested_pipeline_params["OneVsRest.classifier"] == "LogisticRegression"
    for params_to_test in [pipeline_params, nested_pipeline_params]:
        assert (
            params_to_test["Tokenizer.inputCol"] == "text"
            and params_to_test["Tokenizer.outputCol"] == "words"
        )
        assert params_to_test["HashingTF.outputCol"] == "features"
        assert params_to_test["OneVsRest.classifier"] == "LogisticRegression"
        assert params_to_test["LogisticRegression.maxIter"] == 3
def main():
    """Train a text classifier on 20news_train.parquet, tune it with 2-fold
    cross validation, and print areaUnderROC for both the untuned and tuned
    models on 20news_test.parquet.

    BUG FIX: the original used Python 2 `print` statements, which are a
    syntax error on Python 3; they are now print() calls.

    :return: None (prints both scores to the console)
    """
    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the untuned baseline pipeline to training data.
    model = pipeline.fit(trainDF)

    # Grid over numFeatures and regParam, tuned with 2-fold cross validation.
    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, numFeatures) \
        .addGrid(lr.regParam, regParam).build()
    cv = CrossValidator().setEstimator(pipeline) \
        .setEvaluator(BinaryClassificationEvaluator()) \
        .setEstimatorParamMaps(paramGrid).setNumFolds(2)

    # Evaluate both models on the held-out test data.
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()
    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
class BaselinePipelineEngine(PipelineEngine):
    # Baseline review-classification engine: BeautifulSoup parsing ->
    # tokenize -> hashing TF -> IDF -> logistic regression, with a small
    # hyper-parameter grid for tuning.

    @keyword_only
    def __init__(self, cv):
        super(BaselinePipelineEngine, self).__init__(cv)
        # Candidate hyper-parameter values for the grid search.
        self.hashing_tf_map = [pow(2, 20)]
        self.lr_map = [0.1, 0.01]
        # ORDER MATTERS: _build_stages() creates the self.bs_parser /
        # self.tokenizer / ... attributes referenced by Pipeline(...) below.
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=[self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr])
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        # Create each stage as an attribute and return them in pipeline order.
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]

    def _build_param_grid(self):
        # Grid over HashingTF feature count and LR regularization strength.
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
        return param_grid_builder.build()
def train_validate(self, df):
    """Train a one-vs-rest text classifier on 70% of `df`, print the test
    error measured on the remaining 30%, and return the fitted model."""
    # Split the data into training and test sets (30% held out for testing).
    (training, test) = df.randomSplit([0.7, 0.3])

    # Feature stages: tokenize -> drop stop words -> hash to 10k features.
    tok = Tokenizer(inputCol="text", outputCol="words")
    remover = StopWordsRemover(inputCol=tok.getOutputCol(),
                               outputCol="filtered")
    term_freq = HashingTF(numFeatures=10000,
                          inputCol=remover.getOutputCol(),
                          outputCol="features")

    # One-vs-rest wrapper around a binary logistic-regression base classifier.
    base_classifier = LogisticRegression(maxIter=10, tol=1E-6,
                                         fitIntercept=True)
    ovr = OneVsRest(classifier=base_classifier)

    # Fit the pipeline to the training documents.
    model = Pipeline(stages=[tok, remover, term_freq, ovr]).fit(training)

    # Score the held-out split and report classification error.
    prediction = model.transform(test)
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)
    print("Test Error : " + str(1 - accuracy))
    return model
def test_get_instance_param_map(spark_session):  # pylint: disable=unused-argument
    # Checks _get_instance_param_map for a plain estimator, a OneVsRest meta
    # estimator, and flat/nested pipelines; nested params are keyed by the
    # child stage's uid rather than its class name.
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = _get_instance_param_map(lor)
    # unset params fall back to their defaults
    assert (lor_params["maxIter"] == 3 and not lor_params["standardization"]
            and lor_params["family"] == lor.getOrDefault(lor.family))
    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = _get_instance_param_map(ova)
    # nested classifier params appear as "<uid>.<param>"
    assert (ova_params["classifier"] == lor.uid
            and ova_params["labelCol"] == "abcd"
            and ova_params[f"{lor.uid}.maxIter"] == 3
            and ova_params[f"{lor.uid}.family"] == lor.getOrDefault(lor.family))
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner_pipeline = Pipeline(stages=[hashingTF, ova])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])
    pipeline_params = _get_instance_param_map(pipeline)
    nested_pipeline_params = _get_instance_param_map(nested_pipeline)
    assert pipeline_params["stages"] == [tokenizer.uid, hashingTF.uid, ova.uid]
    # Nested pipelines appear as {inner_uid: [stage uids...]} inside "stages".
    assert nested_pipeline_params["stages"] == [
        tokenizer.uid,
        {
            inner_pipeline.uid: [hashingTF.uid, ova.uid]
        },
    ]
    for params_to_test in [pipeline_params, nested_pipeline_params]:
        assert (params_to_test[f"{tokenizer.uid}.inputCol"] == "text"
                and params_to_test[f"{tokenizer.uid}.outputCol"] == "words")
        assert params_to_test[f"{hashingTF.uid}.outputCol"] == "features"
        assert params_to_test[f"{ova.uid}.classifier"] == lor.uid
        assert params_to_test[f"{lor.uid}.maxIter"] == 3
# Prepare training documents, which are labeled. LabeledDocument = Row('id', 'text', 'label') training = sqlCtx.inferSchema( sc.parallelize([(0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), (3L, "hadoop mapreduce", 0.0)]) .map(lambda x: LabeledDocument(*x))) # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer() \ .setInputCol("text") \ .setOutputCol("words") hashingTF = HashingTF() \ .setInputCol(tokenizer.getOutputCol()) \ .setOutputCol("features") lr = LogisticRegression() \ .setMaxIter(10) \ .setRegParam(0.01) pipeline = Pipeline() \ .setStages([tokenizer, hashingTF, lr]) # Fit the pipeline to training documents. model = pipeline.fit(training) # Prepare test documents, which are unlabeled. Document = Row('id', 'text') test = sqlCtx.inferSchema( sc.parallelize([(4L, "spark i j k"), (5L, "l m n"),
def test_save_load_pipeline_estimator(self):
    # Round-trips a TrainValidationSplit and its fitted model through
    # save/load for both a flat and a nested pipeline estimator, verifying
    # that uids and stage structure survive persistence.
    temp_path = tempfile.mkdtemp()
    training = self.spark.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
        (4, "b spark who", 1.0),
        (5, "g d a y", 0.0),
        (6, "spark fly", 1.0),
        (7, "was mapreduce", 0.0),
    ], ["id", "text", "label"])
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(5)
    lr2 = LogisticRegression().setMaxIter(10)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    # Grid varies both a feature param and the OneVsRest base classifier.
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100]) \
        .addGrid(ova.classifier, [lr1, lr2]) \
        .build()
    tvs = TrainValidationSplit(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=MulticlassClassificationEvaluator())
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)

    # Run train validation split, and choose the best set of parameters.
    tvsModel = tvs.fit(training)

    # test save/load of TrainValidationSplitModel
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
    self.assertEqual(len(loadedModel.bestModel.stages),
                     len(tvsModel.bestModel.stages))
    for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                          tvsModel.bestModel.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)

    # Test nested pipeline
    nested_pipeline = Pipeline(stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
    tvs2 = TrainValidationSplit(estimator=nested_pipeline,
                                estimatorParamMaps=paramGrid,
                                evaluator=MulticlassClassificationEvaluator())
    tvs2Path = temp_path + "/tvs2"
    tvs2.save(tvs2Path)
    loadedTvs2 = TrainValidationSplit.load(tvs2Path)
    self.assert_param_maps_equal(loadedTvs2.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loadedTvs2.getEstimator().uid, tvs2.getEstimator().uid)

    # Run train validation split, and choose the best set of parameters.
    tvsModel2 = tvs2.fit(training)
    # test save/load of TrainValidationSplitModel (nested estimator)
    tvsModelPath2 = temp_path + "/tvsModel2"
    tvsModel2.save(tvsModelPath2)
    loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
    self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
    # The inner pipeline model is stage 1 of the outer best model.
    loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
    original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
    self.assertEqual(loaded_nested_pipeline_model.uid,
                     original_nested_pipeline_model.uid)
    self.assertEqual(len(loaded_nested_pipeline_model.stages),
                     len(original_nested_pipeline_model.stages))
    for loadedStage, originalStage in zip(loaded_nested_pipeline_model.stages,
                                          original_nested_pipeline_model.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)
# Sentiment-analysis pipeline over a local CSV: tokenize -> remove stop words ->
# bigrams -> TF -> IDF -> L1 normalize -> Naive Bayes, then fit on the cast
# training columns. The trailing `"""` opens a commented-out param-grid section
# that continues beyond this excerpt (the string is not closed here), so no
# text may be appended after it. NOTE(review): the grid references `lr`, which
# is itself commented out above -- presumably left over from an earlier
# LogisticRegression variant; confirm against the full source.
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder sc = SparkContext("local", "Simple App") spark = SparkSession.builder.master("local").appName("Word Count").config( "spark.some.config.option", "some-value").getOrCreate() df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv', header=True) df = df.select(df['ItemID'], df['SentimentText'], df['label']) training = df.selectExpr("cast(itemID as int) id", "SentimentText", "cast(label as int) label") tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words") remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered") ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams") hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures") idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures") normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol="features", p=1.0) #lr = LogisticRegression(maxIter=10, regParam=0.001) nb = NaiveBayes(smoothing=1.0) pipeline = Pipeline( stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb]) model = pipeline.fit(training) """ paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build()
# Basic text-classification demo: tokenize four labelled toy documents, hash
# the tokens to 1000-dimensional term-frequency vectors, and fit a logistic
# regression classifier; prediction on the test set follows this excerpt.
spark = (
    SparkSession.builder
    .appName("SimpleTextClassificationPipeline")
    .getOrCreate()
)

# Labelled training documents: (id, text, label).
train_rows = [
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
]
training = spark.createDataFrame(train_rows, ["id", "text", "label"])

# Three-stage pipeline: tokenizer -> hashingTF -> lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(numFeatures=1000,
                      inputCol=tokenizer.getOutputCol(),
                      outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to the training documents.
model = pipeline.fit(training)

# Unlabelled test documents: (id, text).
test_rows = [
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop"),
]
test = spark.createDataFrame(test_rows, ["id", "text"])
# Make predictions on test documents and print columns of interest.
for f in filters: s = f(s) return s # %% dataSet = dataSet.withColumn('cleanReview', cleanText(F.col('reviews'))).filter(F.col('cleanReview') != '') dataSet.show() # %% trainDF, testDF = dataSet.randomSplit([0.8, 0.2]) # trainDF.show() # testDF.show() # %% tokenizer = Tokenizer(inputCol="cleanReview", outputCol="tokens") word2vec = Word2Vec(vectorSize=200, minCount=10, numPartitions=10, inputCol=tokenizer.getOutputCol(), outputCol="features") pipeline = Pipeline(stages=[tokenizer, word2vec]) pipelineModel = pipeline.fit(trainDF) # %% pTrainDF = pipelineModel.transform(trainDF) pTestDF = pipelineModel.transform(testDF) # %% pTrainDF = pTrainDF.withColumn('class', pTrainDF['class'].cast(IntegerType())) pTestDF = pTestDF.withColumn('class', pTestDF['class'].cast(IntegerType())) # %% rForest = RandomForestClassifier(labelCol='class', featuresCol='features') rForestModel = rForest.fit(pTestDF)
# Python 2 / Spark 1.x driver: load crawled HTML pages into a DataFrame, join
# them with the training CSV, fit a tokenizer -> hashingTF -> logistic
# regression pipeline, and score the test CSV. Collapsed onto two physical
# lines in this excerpt (a `'''` string spans the line break), so commentary
# can only be added here.
# NOTE(review): `rf` is assigned twice and never used -- the pipeline only uses
# `lr` -- and the second assignment passes labelCol="features", which looks
# like a mistake (labelCol should name the label column); confirm before
# reusing either classifier.
def main(args): textFiles = sc.wholeTextFiles(maindir + '4').map(readContents) #print "READ second {} check ".format(textFiles.take(10)) ''' filter the rows based on all the index available in training file else drop http://stackoverflow.com/questions/24718697/pyspark-drop-rows ''' htmldf = sqlContext.createDataFrame(textFiles) htmldf.cache() traindf = getCleanedRDD(maindir + 'train_v2.csv', ["id", "images", "links", "text", "label"], htmldf) traindf.write.save(maindir+"output/train_4.parquet", format="parquet") # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") lr = LogisticRegression(maxIter=20, regParam=0.01) rf = GBTClassifier(maxIter=30, maxDepth=4, labelCol="label") rf = RandomForestClassifier(labelCol="features", numTrees=3, maxDepth=4) #https://databricks.com/blog/2015/07/29/new-features-in-machine-learning-pipelines-in-spark-1-4.html #http://spark.apache.org/docs/latest/api/python/pyspark.ml.html #w2v = Word2Vec(inputCol="text", outputCol="w2v") rfc = RandomForestClassifier(labelCol="label", numTrees=3, maxDepth=4) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) # Fit the pipeline to training documents. model = pipeline.fit(traindf) print '-----------------------------------------------------------------------------' testdf = getCleanedRDD(maindir + 'test.csv', ["id", "images", "links", "text", "label"], htmldf) #print testdf.count() # Make predictions on test documents and print columns of interest. prediction = model.transform(testdf) #print('prediction', prediction) ''' pand = prediction.toPandas() pand.to_csv('testpanda.csv', sep='\t', encoding='utf-8') print "Done!!! 
CSV" ''' #prediction.select('id','probability','prediction').write.format('com.databricks.spark.csv').option("header", "true").save(maindir + 'output/result_lr0.csv') # ('prediction', DataFrame[id: string, images: bigint, links: bigint, text: string, label: double, # words: array<string>, features: vector, rawPrediction: vector, probability: vector, prediction: double]) ''' #write in scala selected = prediction.select("id", "probability", "prediction") for row in selected.collect(): print row ''' sc.stop()
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Train a text classifier on the 20-newsgroups parquet data and evaluate it
# with area under the ROC curve; a cross-validated variant follows below.
conf = SparkConf().setAppName("MLPipeline")
sc = SparkContext(conf=conf)

# Read the training data as a DataFrame and keep it cached -- it is reused
# below for the cross-validated model.
sqlCt = SQLContext(sc)
trainDF = sqlCt.read.parquet("20news_train.parquet")
trainDF.cache()

# Three-stage pipeline: tokenizer -> hashingTF (1000 features) -> lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                      outputCol="features", numFeatures=1000)
lr = LogisticRegression(maxIter=20, regParam=0.1)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to the training data.
model = pipeline.fit(trainDF)

# Evaluate the fitted model on the held-out test data (also cached for the
# cross-validation pass).
testDF = sqlCt.read.parquet("20news_test.parquet")
testDF.cache()
prediction = model.transform(testDF)
evaluator = BinaryClassificationEvaluator()
areaUnderROC = evaluator.evaluate(prediction)

# MODEL SELECTION WITH CROSS VALIDATION
# Parameter grid for cross validation: numFeatures and regParam.
if __name__ == "__main__": sc = SparkContext(appName="SimpleTextClassificationPipeline") sqlContext = SQLContext(sc) # Prepare training documents, which are labeled. LabeledDocument = Row("id", "text", "label") training = sc.parallelize([(0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), (3L, "hadoop mapreduce", 0.0)]) \ .map(lambda x: LabeledDocument(*x)).toDF() # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") lr = LogisticRegression(maxIter=10, regParam=0.01) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) # Fit the pipeline to training documents. model = pipeline.fit(training) # Prepare test documents, which are unlabeled. Document = Row("id", "text") test = sc.parallelize([(4L, "spark i j k"), (5L, "l m n"), (6L, "mapreduce spark"), (7L, "apache hadoop")]) \ .map(lambda x: Document(*x)).toDF()
def _run_test_save_load_pipeline_estimator(self, LogisticRegressionCls):
    """Shared body for pipeline-estimator save/load tests, parameterised by
    the LogisticRegression implementation under test. Exercises
    CrossValidator persistence for both a flat and a nested Pipeline."""
    temp_path = tempfile.mkdtemp()
    training = self.spark.createDataFrame(
        [
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0),
            (4, "b spark who", 1.0),
            (5, "g d a y", 0.0),
            (6, "spark fly", 1.0),
            (7, "was mapreduce", 0.0),
        ],
        ["id", "text", "label"],
    )
    # Pipeline under test: tokenizer -> hashingTF -> one-vs-rest classifier.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    ova = OneVsRest(classifier=LogisticRegressionCls())
    lr1 = LogisticRegressionCls().setMaxIter(5)
    lr2 = LogisticRegressionCls().setMaxIter(10)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    paramGrid = (
        ParamGridBuilder()
        .addGrid(hashingTF.numFeatures, [10, 100])
        .addGrid(ova.classifier, [lr1, lr2])
        .build()
    )
    crossval = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator(),
        numFolds=2,  # use 3+ folds in practice
    )

    # Round-trip the unfitted CrossValidator.
    cv_path = temp_path + "/cv"
    crossval.save(cv_path)
    loaded_cv = CrossValidator.load(cv_path)
    self.assert_param_maps_equal(loaded_cv.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loaded_cv.getEstimator().uid, crossval.getEstimator().uid)

    # Fit, then round-trip the fitted CrossValidatorModel.
    cv_model = crossval.fit(training)
    cv_model_path = temp_path + "/cvModel"
    cv_model.save(cv_model_path)
    loaded_model = CrossValidatorModel.load(cv_model_path)
    self.assertEqual(loaded_model.bestModel.uid, cv_model.bestModel.uid)
    self.assertEqual(len(loaded_model.bestModel.stages), len(cv_model.bestModel.stages))
    for loaded_stage, original_stage in zip(
        loaded_model.bestModel.stages, cv_model.bestModel.stages
    ):
        self.assertEqual(loaded_stage.uid, original_stage.uid)

    # Repeat the same checks with a nested pipeline as the estimator.
    nested_pipeline = Pipeline(stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
    crossval2 = CrossValidator(
        estimator=nested_pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator(),
        numFolds=2,  # use 3+ folds in practice
    )
    cv2_path = temp_path + "/cv2"
    crossval2.save(cv2_path)
    loaded_cv2 = CrossValidator.load(cv2_path)
    self.assert_param_maps_equal(loaded_cv2.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loaded_cv2.getEstimator().uid, crossval2.getEstimator().uid)

    cv_model2 = crossval2.fit(training)
    cv_model2_path = temp_path + "/cvModel2"
    cv_model2.save(cv_model2_path)
    loaded_model2 = CrossValidatorModel.load(cv_model2_path)
    self.assertEqual(loaded_model2.bestModel.uid, cv_model2.bestModel.uid)
    loaded_nested = loaded_model2.bestModel.stages[1]
    original_nested = cv_model2.bestModel.stages[1]
    self.assertEqual(loaded_nested.uid, original_nested.uid)
    self.assertEqual(len(loaded_nested.stages), len(original_nested.stages))
    for loaded_stage, original_stage in zip(loaded_nested.stages, original_nested.stages):
        self.assertEqual(loaded_stage.uid, original_stage.uid)
# Fragment of a sentiment-classification script: finishes assembling the test
# set (rows of `positive` not in the training sample, via a left-outer join
# and null-Flag filter), builds a Tokenizer -> HashingTF(20000) ->
# LogisticRegression pipeline with setter-style configuration, fits it on
# `training`, and then starts an ad-hoc test document. The `testBody = """...`
# string literal opens near the end and runs past this excerpt (the HTML lines
# below are its content), so nothing may be inserted after this point.
# NOTE(review): the beginning of this script (definitions of `positive`,
# `posTrainTmp1`, `negativeTest`, `training`) is above this excerpt.
positiveTrainTmp = posTrainTmp1.select(posTrainTmp1.Id, posTrainTmp1.Flag) positiveTest = positive.join( positiveTrainTmp, positive.Id == positiveTrainTmp.Id, "LeftOuter").\ filter("Flag is null").\ select(positive.Id, positive.Text, positive.Label) testing = negativeTest.unionAll(positiveTest) # CREATE MODEL numFeatures = 20000 numEpochs = 20 regParam = 0.02 tokenizer = Tokenizer().setInputCol("Text").setOutputCol("Words") hashingTF = HashingTF().setNumFeatures(numFeatures).\ setInputCol(tokenizer.getOutputCol()).setOutputCol("Features") lr = LogisticRegression().setMaxIter(numEpochs).setRegParam(regParam).\ setFeaturesCol("Features").setLabelCol("Label").\ setRawPredictionCol("Score").setPredictionCol("Prediction") pipeline = Pipeline().setStages([tokenizer, hashingTF, lr]) # this comand takes a time model = pipeline.fit(training) testTitle = "Easiest way to merge a release into one JAR file" testBody = """Is there a tool or script which easily merges a bunch of href="http://en.wikipedia.org/wiki/JAR_%28file_format%29" >JAR</a> files into one JAR file? A bonus would be to easily set the main-file manifest and make it executable. I would like to run it with something like: </p>

<blockquote>
 <p>java -jar rst.jar</p>
</blockquote>

<p>
def cleanLower(doc): return doc.replace("<br /><br />"," ").lower() rdd = labeledRdd.map(lambda doc : (cleanLower(doc[0]),doc[1])) print "Text is cleaned" sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(rdd, ['review', 'label']) dfTrain, dfTest = df.randomSplit([0.8,0.2]) print "Random split is done" tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words') hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='reviews_tf') idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf") string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed') dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10) pipeline = Pipeline(stages=[tokenizer, hashing_tf, idf, string_indexer, dt]) evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision') # grid=(ParamGridBuilder() # .baseOn([evaluator.metricName,'precision']) # .addGrid(dt.maxDepth, [10,20])
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

# In[17]:
# Labelled training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame(
    [
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
    ],
    ["id", "text", "label"],
)

# In[18]:
# Three-stage pipeline: tokenizer -> hashingTF -> logistic regression.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# In[19]:
# Fit the pipeline to the training documents.
model = pipeline.fit(training)

# In[20]:
# Unlabelled test documents from a list of (id, text) tuples.
test = spark.createDataFrame(
    [
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop"),
    ],
    ["id", "text"],
)
##Split training and testing (trainingData, testData) = smsDf.randomSplit([0.9, 0.1]) print trainingData.count() print testData.count() testData.collect() #Setup pipeline from pyspark.ml.classification import NaiveBayes, NaiveBayesModel from pyspark.ml import Pipeline from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.ml.feature import IDF tokenizer = Tokenizer(inputCol="message", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), \ outputCol="tempfeatures") idf=IDF(inputCol=hashingTF.getOutputCol(), outputCol="features") nbClassifier=NaiveBayes() pipeline = Pipeline(stages=[tokenizer, hashingTF, \ idf, nbClassifier]) nbModel=pipeline.fit(trainingData) prediction=nbModel.transform(testData) #prediction.where(prediction.prediction == 1.0).show() prediction.groupBy("label","prediction").count().show()
# Fragment of a Python 2 document-classification script (the py2-only
# tuple-unpacking `lambda (doc, lines):` dates it). The first four statements
# are the body of a truncated `for` loop over category directories whose
# header is above this excerpt: each iteration records the label, reads the
# category's files, and appends (formatted_text, label) pairs to `docs`. The
# rest samples a train/test split, builds Row DataFrames, and fits a
# tokenizer -> hashingTF -> IDF -> lr pipeline, printing the first test
# prediction. NOTE(review): `docs.subtract(training_rows)` relies on exact
# row equality to derive the test set -- confirm that is intended.
distinct_labels[curr_cat] = category_dir next_docs = sc.wholeTextFiles(('/').join([input_dir, category_dir])) docs = docs.union(next_docs.map(lambda (doc, lines): (format_text(lines), float(curr_cat)))) curr_cat += 1 training_rows = docs.sample(False, train_fraction) testing_rows = docs.subtract(training_rows) # Prepare training and test documents, which are labeled. LabeledDocument = Row("text", "label") train = training_rows.map(lambda x: LabeledDocument(*x)).toDF() test = testing_rows.map(lambda x: LabeledDocument(*x)).toDF() # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures") #outputCol="features") idf = IDF(inputCol="rawFeatures", outputCol="features") lr = LogisticRegression(maxIter=1000, regParam=0.001) #pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) p0 = Pipeline(stages=[tokenizer, hashingTF, idf ,lr]) #m0 = p0.fit(train) #pipeline = Pipeline(stages=[m0, lr]) pipeline = p0 # Fit the pipeline to training documents. model = pipeline.fit(train) print('\n\n --------------- RESULT ----------------------\n\n') print(model.transform(test).head()) print('\n\n ---------------------------------------------\n\n')
# (Tail of a truncated French comment from the previous line: "... maybe
# pipelines are feasible. To be seen.")
# First, apply the already-fitted transformers to the test set one stage at a
# time, by hand.
df_test_words = tokenizer.transform(dfTest)
df_test_tf = htf.transform(df_test_words)
df_test_tfidf = idfModel.transform(df_test_tf)
df_test_final = string_indexer_model.transform(df_test_tfidf)

# Predictions from the manually chained stages.
df_test_pred = dt_model.transform(df_test_final)
df_test_pred.select('review', 'target_indexed', 'prediction', 'probability').show(5)

# Now the same flow expressed as a very basic Pipeline.
from pyspark.ml import Pipeline

# Instantiate all the Estimators and Transformers the pipeline needs.
tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words')
hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(),
                       outputCol='reviews_tf', numFeatures=10000)
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(),
                            labelCol=string_indexer.getOutputCol(),
                            maxDepth=10)

# Build the Pipeline, fit it on the training split, and score the test split.
pipeline = Pipeline(stages=[tokenizer, hashing_tf, idf, string_indexer, dt])
pipeline_model = pipeline.fit(dfTrain)
df_test_pred = pipeline_model.transform(dfTest)
df_test_pred.select('review', 'target_indexed', 'prediction', 'probability').show()

# An automatic tool for computing the correct-classification rate; again, not
# very useful in practice.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator