def test_nested_pipeline_persistence(self):
    """
    Pipeline[HashingTF, Pipeline[PCA]]
    """
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        p0 = Pipeline(stages=[pca])
        pl = Pipeline(stages=[tf, p0])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def train_lg(training_data, collection):
    # Configure an ML pipeline, which consists of the following stages: hashingTF, idf, and lr.
    hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    pipeline1 = Pipeline(stages=[hashingTF, idf])

    # Fit pipeline1 to the training documents.
    model1 = pipeline1.fit(training_data)

    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline2 = Pipeline(stages=[model1, lr])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline2,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)

    # Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(training_data)

    # model_path = os.path.join(models_dir, time.strftime("%Y%m%d-%H%M%S") + '_'
    #                           + collection["Id"] + '_'
    #                           + collection["name"])
    # cvModel.save(sc, model_path)
    return cvModel
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
def model(classifiers, training, testing, week):
    results = ""
    timing = []

    for classifier in classifiers:
        timeStart = time.time()

        clf = get_classifier(classifier)

        labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
        model = pipeline.fit(training)

        prediction = model.transform(testing)

        metrics = BinaryClassificationMetrics(prediction.select("label", "prediction").rdd)
        results = results + "new," + classifier + "," + week + "," \
            + str(metrics.areaUnderROC) + "," + str(metrics.areaUnderPR) + "\n"
        timing.append(time.time() - timeStart)

    return results, timing
def main(sc, spark):
    # Load the Corpus
    corpus = load_corpus(sc, spark)

    # Create the vector/cluster pipeline
    pipeline = Pipeline(stages=[
        Tokenizer(inputCol="text", outputCol="tokens"),
        Word2Vec(vectorSize=7, minCount=0, inputCol="tokens", outputCol="vecs"),
        BisectingKMeans(k=10, featuresCol="vecs", maxIter=10),
    ])

    # Fit the model
    model = pipeline.fit(corpus)
    corpus = model.transform(corpus)

    # Evaluate clustering.
    bkm = model.stages[-1]
    cost = bkm.computeCost(corpus)
    sizes = bkm.summary.clusterSizes

    # TODO: compute cost of each cluster individually

    # Get the text representation of each cluster.
    wvec = model.stages[-2]
    table = [["Cluster", "Size", "Terms"]]
    for ci, c in enumerate(bkm.clusterCenters()):
        ct = wvec.findSynonyms(c, 7)
        size = sizes[ci]
        terms = " ".join([row.word for row in ct.take(7)])
        table.append([ci, size, terms])

    # Print Results
    print(tabulate(table))
    print("Sum of square distance to center: {:0.3f}".format(cost))
def testLogisticMLPipeline1(self):
    training = sqlCtx.createDataFrame([
        ("a b c d e spark", 1.0),
        ("b d", 2.0),
        ("spark f g h", 1.0),
        ("hadoop mapreduce", 2.0),
        ("b spark who", 1.0),
        ("g d a y", 2.0),
        ("spark fly", 1.0),
        ("was mapreduce", 2.0),
        ("e spark program", 1.0),
        ("a e c l", 2.0),
        ("spark compile", 1.0),
        ("hadoop software", 2.0)
    ], ["text", "label"])
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
    lr = LogisticRegression(sqlCtx)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    model = pipeline.fit(training)
    test = sqlCtx.createDataFrame([
        ("spark i j k", 1.0),
        ("l m n", 2.0),
        ("mapreduce spark", 1.0),
        ("apache hadoop", 2.0)], ["text", "label"])
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator()
    score = evaluator.evaluate(predictionAndLabels)
    self.failUnless(score == 1.0)
def main(input_file):
    # Load and parse the data file, converting it to a DataFrame.
    data = MLUtils.loadLabeledPoints(sc, input_file)

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=10).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train model. This also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse))

    rfModel = model.stages[1]
    print(rfModel)  # summary only
def test_nnclassifier_in_pipeline(self):
    if self.sc.version.startswith("1"):
        from pyspark.mllib.linalg import Vectors
        df = self.sqlContext.createDataFrame(
            [(Vectors.dense([2.0, 1.0]), 1.0),
             (Vectors.dense([1.0, 2.0]), 2.0),
             (Vectors.dense([2.0, 1.0]), 1.0),
             (Vectors.dense([1.0, 2.0]), 2.0),
             ], ["features", "label"])

        scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled")
        model = Sequential().add(Linear(2, 2))
        criterion = ClassNLLCriterion()
        classifier = NNClassifier(model, criterion, MLlibVectorToTensor([2])) \
            .setBatchSize(4) \
            .setLearningRate(0.01).setMaxEpoch(1).setFeaturesCol("scaled")

        pipeline = Pipeline(stages=[scaler, classifier])

        pipelineModel = pipeline.fit(df)
        res = pipelineModel.transform(df)
        assert type(res).__name__ == 'DataFrame'
def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    lp_data = get_labeled_points(start1, end2, df, sc, sql_context)
    print lp_data.count()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)
    label2index = {}
    for each in sorted(set([(i[0], i[1])
                            for i in td.select(td.label, td.indexedLabel).distinct().collect()]),
                       key=lambda x: x[0]):
        label2index[int(each[0])] = int(each[1])
    print label2index

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(lp_data)

    rf = get_model()

    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

    lp_train = lp_data.filter(lp_data.date3 < end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)

    lp_check = lp_data.filter(lp_data.date2 > start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)

    if is_pred:
        predictions = predictions.filter(predictions.is_labeled == 0) \
            .filter(predictions.date2 == get_cur()).sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())
        for each in predictions.take(10):
            print each
def RunRandomForest(tf, ctx):
    sqlContext = SQLContext(ctx)
    rdd = tf.map(parseForRandomForest)
    # The schema is encoded in a string.
    schema = ['genre', 'track_id', 'features']
    # Apply the schema to the RDD.
    songDF = sqlContext.createDataFrame(rdd, schema)

    # Register the DataFrame as a table.
    songDF.registerTempTable("genclass")
    labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

    trainingData, testData = songDF.randomSplit([0.8, 0.2])

    labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

    rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
    # rfc = SVMModel([.5, 10, 20], 5)
    # rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

    pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
    model = pipeline.fit(trainingData)

    predictions = model.transform(testData)
    predictions.show()

    evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
    accuracy = evaluator.evaluate(predictions)

    print 'Accuracy of RandomForest = ', accuracy * 100
    print "Test Error = ", (1.0 - accuracy) * 100
def test_cv_lasso_with_mllib_featurization(self):
    data = [('hi there', 0.0),
            ('what is up', 1.0),
            ('huh', 1.0),
            ('now is the time', 5.0),
            ('for what', 0.0),
            ('the spark was there', 5.0),
            ('and so', 3.0),
            ('were many socks', 0.0),
            ('really', 1.0),
            ('too cool', 2.0)]
    data = self.sql.createDataFrame(data, ["review", "rating"])

    # Feature extraction using MLlib
    tokenizer = Tokenizer(inputCol="review", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20000)
    pipeline = Pipeline(stages=[tokenizer, hashingTF])
    data = pipeline.fit(data).transform(data)
    df = self.converter.toPandas(data.select(data.features.alias("review"), "rating"))

    pipeline = SKL_Pipeline([
        ('lasso', SKL_Lasso(max_iter=1))
    ])
    parameters = {
        'lasso__alpha': (0.001, 0.005, 0.01)
    }

    grid_search = GridSearchCV(self.sc, pipeline, parameters)
    skl_gs = grid_search.fit(df.review.values, df.rating.values)
    assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
def textPredict(request):
    """6. Text clustering and popularity prediction."""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    """Process the dataset and build feature vectors."""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)

    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)

    """Train the decision-tree model."""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)

    """Test the model."""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)

    """Score user-supplied data: a single news item."""
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """Evaluate the model."""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request, {'resultList': resultList})
def sparking_your_interest():
    df = SQLContext.read.json('speeches_dataset.json')
    df_fillna = df.fillna("")
    print(df_fillna.count())
    print(df_fillna.printSchema())

    df_utf = call_utf_encoder(df)
    df_cleaned = call_para_cleanup(df_utf)
    print(df_cleaned)
    df_with_bigrams = call_ngrams(df_cleaned, 2)
    df_with_trigrams = call_ngrams(df_with_bigrams, 3)
    df_with_4grams = call_ngrams(df_with_trigrams, 4)
    # Pass the matching n for the 5-gram and 6-gram columns (the original repeated 4).
    df_with_5grams = call_ngrams(df_with_4grams, 5)
    df_with_6grams = call_ngrams(df_with_5grams, 6)
    df_with_vocab_score = call_speech_vocab(df_with_6grams)

    df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score, 100, '2grams')
    df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors, 100, '3grams')
    df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors, 100, '4grams')

    assembler = VectorAssembler(
        inputCols=["2gramsfeatures", "2gramsfeatures", "2gramsfeatures", "vocab_score"],
        outputCol="features")
    assembler_output = assembler.transform(df_with_4grams_idf_vectors)
    output = assembler_output.selectExpr('speaker', 'speech_id', 'para_cleaned_text', 'features')
    print(output.show())
    print(output.count())

    output_tordd = output.rdd
    train_rdd, test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
    train_df = train_rdd.toDF()
    test_df = test_rdd.toDF()
    print(train_df)
    print(test_df)
    print('Train DF - Count: ')
    print(train_df.count())
    print('Test DF - Count: ')
    print(test_df.count())

    print("Initializing RF Model")
    labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=1000,
                                featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
    pipeline = Pipeline(stages=[labelIndexer, rf])
    model = pipeline.fit(output)
    print("Completed RF Model")

    predictions = model.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction",
                                                  metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    rfModel = model.stages[1]
    print(rfModel)  # summary only
    print("Predictions: ")
    print(predictions.show())
def model(classifier, ftrain, fvalid, fprediction):
    startTime = time.time()

    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')

    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)

    # is needed to join columns
    valid = indexData(valid)
    rawValid = indexData(rawValid)

    classifiers = {
        "RandomForestClassifier": RFC
    }

    clf = classifiers[classifier]()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)

    predictions = model.transform(valid)

    # write to file:
    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")

    output = (subsetValidData
              .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
              .drop("index")
              .drop("index"))

    lines = output.map(toCSVLine)
    lines.saveAsTextFile('output')

    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print "Test Error = %g" % (1.0 - accuracy)

    executionTime = time.time() - startTime
    row = classifier + ',' + str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
def event_pipeline(dataset):
    """
    """
    EventCodeI = StringIndexer(inputCol="EventCode", outputCol="EventCodeI")
    EventBaseCodeI = StringIndexer(inputCol="EventBaseCode", outputCol="EventBaseCodeI")
    EventRootCodeI = StringIndexer(inputCol="EventRootCode", outputCol="EventRootCodeI")

    assembler = VectorAssembler(inputCols=["IsRootEvent", "EventCodeI", "EventBaseCodeI", "EventRootCodeI",
                                           "QuadClass", "GoldsteinScale", "NumMentions", "NumSources",
                                           "NumArticles", "AvgTone"],
                                outputCol="features")

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=310)

    pipeline = Pipeline(stages=[EventCodeI, EventBaseCodeI, EventRootCodeI, assembler, featureIndexer])

    model = pipeline.fit(dataset)
    output = model.transform(dataset)

    data = output.map(lambda row: LabeledPoint(row[0], row[-1])).cache()
    print "Data:"
    print data.take(1)
    return data
def group(self):
    reTokenizer = RegexTokenizer(inputCol=self.query_colname, outputCol="words", minTokenLength=2)  # , pattern='\W'
    hashingTF = HashingTF(numFeatures=self.num_features, inputCol="words", outputCol="tf")

    if self.idf == True:
        idf = IDF(minDocFreq=self.min_doc_freq, inputCol="tf", outputCol="idf")
        kmeans = KMeans(featuresCol="idf", predictionCol="cluster_id", k=self.n)
        pipeline = Pipeline(stages=[reTokenizer, hashingTF, idf, kmeans])
    else:
        kmeans = KMeans(featuresCol="tf", predictionCol="cluster_id", k=self.n)
        pipeline = Pipeline(stages=[reTokenizer, hashingTF, kmeans])

    model = pipeline.fit(self.df)
    prediction = model.transform(self.df)
    return prediction
def getPipeline(self, df):
    # notify pipeline
    self.success('Initializing ML Pipeline ...')

    # initialize our tokenizer; we're going to tokenize the features
    tokenizer = Tokenizer(inputCol='tag_features', outputCol='words')
    # convert the tokenized data to vectorized data
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='features')
    # initialize the logistic regression algorithm
    lr = LogisticRegression(maxIter=10, regParam=0.01)

    # create / initialize the ml pipeline
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # fit the pipeline on our training dataframe
    model = pipeline.fit(df)

    return model
def event_pipeline(dataset):
    EventCodeI = StringIndexer(inputCol="EventCode", outputCol="EventCodeI")
    EventCodeV = OneHotEncoder(dropLast=True, inputCol="EventCodeI", outputCol="EventCodeV")

    EventRootCodeI = StringIndexer(inputCol="EventRootCode", outputCol="EventRootCodeI")
    EventRootCodeV = OneHotEncoder(dropLast=True, inputCol="EventRootCodeI", outputCol="EventRootCodeV")

    EventBaseCodeI = StringIndexer(inputCol="EventBaseCode", outputCol="EventBaseCodeI")
    EventBaseCodeV = OneHotEncoder(dropLast=True, inputCol="EventBaseCodeI", outputCol="EventBaseCodeV")

    assembler = VectorAssembler(inputCols=["IsRootEvent", "EventCodeV", "EventBaseCodeV", "EventRootCodeV",
                                           "QuadClass", "GoldsteinScale", "NumMentions", "NumSources",
                                           "NumArticles", "AvgTone"],
                                outputCol="features")

    pipeline = Pipeline(stages=[EventCodeI, EventCodeV, EventRootCodeI, EventRootCodeV,
                                EventBaseCodeI, EventBaseCodeV, assembler])

    model = pipeline.fit(dataset)
    output = model.transform(dataset)
    data = output.map(lambda row: LabeledPoint(row[0], row[-1])).toDF().cache()
    return data
def test_pipeline_persistence(self):
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        pl = Pipeline(stages=[tf, pca])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self.assertEqual(loaded_pipeline.uid, pl.uid)
        self.assertEqual(len(loaded_pipeline.getStages()), 2)

        [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
        self.assertIsInstance(loaded_tf, HashingTF)
        self.assertEqual(loaded_tf.uid, tf.uid)
        param = loaded_tf.getParam("numFeatures")
        self.assertEqual(loaded_tf.getOrDefault(param), tf.getOrDefault(param))

        self.assertIsInstance(loaded_pca, PCA)
        self.assertEqual(loaded_pca.uid, pca.uid)
        self.assertEqual(loaded_pca.getK(), pca.getK())

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        [model_tf, model_pca] = model.stages
        [loaded_model_tf, loaded_model_pca] = loaded_model.stages
        self.assertEqual(model_tf.uid, loaded_model_tf.uid)
        self.assertEqual(model_tf.getOrDefault(param), loaded_model_tf.getOrDefault(param))
        self.assertEqual(model_pca.uid, loaded_model_pca.uid)
        self.assertEqual(model_pca.pc, loaded_model_pca.pc)
        self.assertEqual(model_pca.explainedVariance, loaded_model_pca.explainedVariance)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def build_decision_tree(sqlContext, features, interested):
    print '-----------------------------------------'
    data = sqlContext.createDataFrame(
        [Row(label=interested[i], features=Vectors.dense(features[i])) for i in xrange(len(features))])
    data.printSchema()
    data.show(5)
    print 'created data frame'

    # Index the label column & adding metadata.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
    print 'created label indexer'

    # Mark the features with < 4 distinct values as categorical
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

    # Split the data into training and test sets
    (trainingData, testData) = data.randomSplit([0.8, 0.2])

    # Train a DecisionTree model
    dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    # dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    # dt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

    # Chain the indexers together with DecisionTree
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

    # Train the model
    model = pipeline.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) & compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    precision = evaluator.evaluate(predictions)

    treeModel = model.stages[2]
    return (1 - precision, model)
def test_featurizer_in_pipeline(self):
    """
    Tests that featurizer fits into an MLlib Pipeline.
    Does not test how good the featurization is for generalization.
    """
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName=self.name)
    lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
    pipeline = Pipeline(stages=[featurizer, lr])

    # add arbitrary labels to run logistic regression
    # TODO: it's weird that the test fails on some combinations of labels. check why.
    label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
    train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["filePath"]))

    lrModel = pipeline.fit(train_df)
    # see if we at least get the training examples right.
    # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
    pred_df_collected = lrModel.transform(train_df).collect()
    for row in pred_df_collected:
        self.assertEqual(int(row.prediction), row.label)
def test_pipeline(self):
    dataset = MockDataset()
    estimator0 = MockEstimator()
    transformer1 = MockTransformer()
    estimator2 = MockEstimator()
    transformer3 = MockTransformer()
    pipeline = Pipeline(stages=[estimator0, transformer1, estimator2, transformer3])
    pipeline_model = pipeline.fit(dataset, {estimator0.fake: 0, transformer1.fake: 1})
    model0, transformer1, model2, transformer3 = pipeline_model.stages
    self.assertEqual(0, model0.dataset_index)
    self.assertEqual(0, model0.getFake())
    self.assertEqual(1, transformer1.dataset_index)
    self.assertEqual(1, transformer1.getFake())
    self.assertEqual(2, dataset.index)
    self.assertIsNone(model2.dataset_index, "The last model shouldn't be called in fit.")
    self.assertIsNone(transformer3.dataset_index, "The last transformer shouldn't be called in fit.")
    dataset = pipeline_model.transform(dataset)
    self.assertEqual(2, model0.dataset_index)
    self.assertEqual(3, transformer1.dataset_index)
    self.assertEqual(4, model2.dataset_index)
    self.assertEqual(5, transformer3.dataset_index)
    self.assertEqual(6, dataset.index)
def main():
    '''
    takes one input argument :: Location of the directory for training and test data files.
    :return: Print output on console for the area under the ROC curve.
    '''
    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures).addGrid(lr.regParam, regParam).build()

    cv = CrossValidator().setEstimator(pipeline).setEvaluator(BinaryClassificationEvaluator()).setEstimatorParamMaps(paramGrid).setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()

    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print evaluator.evaluate(prediction)
    print evaluator.evaluate(prediction_cv)
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
def build_ngrams_wocs(inputCol=["Text", "Sentiment"], n=3):
    tokenizer = [Tokenizer(inputCol="Text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=5460, inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [IDF(inputCol="{0}_tf".format(i),
               outputCol="{0}_tfidf".format(i), minDocFreq=5)
           for i in range(1, n + 1)]

    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
        outputCol="features"
    )]
    label_stringIdx = [StringIndexer(inputCol="Sentiment", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler + label_stringIdx + lr)

pipeline = Pipeline(stages=[tokenizer, ngrams, cv, idf, assembler, label_stringIdx])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentences")

tokenizer = Tokenizer() \
    .setInputCols(["sentences"]) \
    .setOutputCol("token")

normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normal")

word_embeddings = WordEmbeddingsModel.pretrained() \
    .setInputCols(["document", "normal"]) \
    .setOutputCol("embeddings")

nlpPipeline = Pipeline(stages=[
    document_assembler,
    sentenceDetector,
    tokenizer,
    normalizer,
    word_embeddings,
])

pipelineModel = nlpPipeline.fit(df)

### LightPipeline(someTrainedPipeline).annotate(someStringOrArray)
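A minimal sketch of the LightPipeline call hinted at in the comment above, assuming the Spark NLP (sparknlp) package and the fitted pipelineModel from this snippet; sample_text is an illustrative placeholder input.

from sparknlp.base import LightPipeline

# Wrap the fitted PipelineModel for fast, driver-local annotation of small inputs.
light = LightPipeline(pipelineModel)

# annotate() accepts a single string or a list of strings and returns a dict of
# annotations keyed by the pipeline's output column names.
sample_text = "John Snow Labs provides Spark NLP."  # hypothetical example input
annotations = light.annotate(sample_text)
print(annotations["token"])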
    for indexer in indexers
]

assembler_onehot = ft.VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders],
    outputCol="onehot_features")

# scaler
assembler_numeric = ft.VectorAssembler(inputCols=numeric_features,
                                       outputCol="numeric_features")
std_scaler = ft.StandardScaler(inputCol="numeric_features",
                               outputCol="numeric_features_scaled")

assembler_final = ft.VectorAssembler(
    inputCols=['onehot_features', 'numeric_features_scaled'],
    outputCol="final_features")

pca_model = ft.PCA(k=6, inputCol="final_features", outputCol="pca_features")

pipeline = Pipeline(stages=indexers + encoders + [
    assembler_onehot, assembler_numeric, std_scaler, assembler_final, pca_model
])

preprocess_model = pipeline.fit(df)
scaledData = preprocess_model.transform(df)

# Save and load the model
from pyspark.ml import PipelineModel

outpath = "/dbfs/classification_models/model-maxDepth{}-maxBins{}".format(
    MAXDEPTH, MAXBINS)
pipelineModel.write().overwrite().save(outpath)
model_in = PipelineModel.load(outpath)
resultF = result.select("result", "OpenStatus_cat")
resultF.show()

# In[16]:

final_data = resultF.select('result', 'OpenStatus_cat')
train_data, test_data = final_data.randomSplit([0.7, 0.3])
train_data.describe().show()

# In[17]:

dt = DecisionTreeClassifier(labelCol="OpenStatus_cat", featuresCol="result")
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(train_data)

# In[18]:

predictions = model.transform(test_data)
predictions.select("prediction", "OpenStatus_cat", "result").show(5)

evaluator = MulticlassClassificationEvaluator(labelCol="OpenStatus_cat",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[0]
encoded_cols = ["job", "marital", "education", "month", "pdays"]
indexers = [
    StringIndexer().setInputCol(col).setOutputCol(col + "_labeled")
    for col in labeled_cols + encoded_cols
]

# One Hot Encoders
encoders = [
    OneHotEncoder(inputCol=col + "_labeled", outputCol=col + "_encoded")
    for col in encoded_cols
]

# Create the preprocessing timeline
input_stages = assemblers + scalers + indexers + encoders
pipeline = Pipeline(stages=input_stages)
preprocessor = pipeline.fit(df)
df = preprocessor.transform(df)

# Remove unnecessary columns
labeled_cols += [s + "_labeled" for s in encoded_cols]
feature_list += [s + "_vec" for s in feature_list]
df = df.drop(*feature_list, *labeled_cols, *encoded_cols, "features")

# Write the pre-processed data to a csv file
df.write.csv("bank-processed.csv", header=True)

# Train test split
(trainingData, testData) = df.randomSplit([0.8, 0.2])

# Create a vector for all the features
features = [col for col in df.columns if col != "y_labeled"]
def run_pipeline(target, feature, num, cat, n_feat):
    train = spark.table('predictmodel_train')
    train.persist()

    stages = []
    indexer = [StringIndexer(inputCol=s, outputCol=s + '_indexed', handleInvalid='keep') for s in cat]
    encoder = [OneHotEncoderEstimator(inputCols=[s + '_indexed'], outputCols=[s + "_encoded"], handleInvalid='keep') for s in cat]
    imputer = [Imputer(strategy='mean', inputCols=num, outputCols=num)]
    num_assmblr = [VectorAssembler(inputCols=[n], outputCol=n + '_vect') for n in num]
    num_scaler = [MinMaxScaler(inputCol=n + '_vect', outputCol=n + "_scaled") for n in num]

    # pipeline_num = Pipeline(stages=indexer + encoder + imputer + num_assmblr + num_scaler)
    # train = pipeline_num.fit(train).transform(train)
    # print("*** show encoded categorical variables ....")
    # train.select(*[s+'_encoded' for s in cat]).show(10, truncate=False)
    # unpack_list = F.udf(lambda x: round(float(list(x)[0]),3), DoubleType())
    # for n in num:
    #     train = train.withColumn(n+"_scaled", unpack_list(n+"_scaled")).drop(n+"_vect")
    # print("*** show scaled numeric variables ....")
    # train.select(*[n+'_scaled' for n in num]).summary("count", "min", "25%", "75%", "max").show(10, truncate=False)
    # assembler = VectorAssembler(inputCols=[num_scaler.getOutputCol()] + [s+"_encoded" for s in cat], outputCol=feature)

    assembler = VectorAssembler(inputCols=[n + '_scaled' for n in num] + [s + "_encoded" for s in cat], outputCol=feature)

    target_indexed = target + "_indx"
    labelIndexer = StringIndexer(inputCol=target, outputCol=target_indexed, handleInvalid='keep')

    model = clf_rf(feature, target_indexed)
    # model = clf_gbt(feature, target)
    # model = clf_lr(feature, target)

    validator = _val(target_indexed, model)

    stages += [assembler, labelIndexer, validator]
    print('*** stages are created and now are running... ***')

    pipeline = Pipeline(stages=indexer + encoder + imputer + num_assmblr + num_scaler + stages)
    pipeline_model = pipeline.fit(train)
    last_stage = pipeline_model.stages[-1]

    transformedData = pipeline_model.transform(train)
    transformedData.write.mode('overwrite').saveAsTable('us_marketing_usecase.transformedData')
    print('*** transformed data is saved for modeling... ***')
    # print('*** Transformed training set ***')
    # cols = num + cat
    # transformedData.select(target_indexed,feature,*cols).show(10, truncate=False)

    # =============================================================================
    # RandomForest/GradientBoosting
    # =============================================================================
    print('*** Model performance ***')
    evaluate(transformedData, target_indexed)

    print('*** Feature Importances ***')
    featImp = last_stage.bestModel.featureImportances
    print('*** Show important ' + str(n_feat) + ' features ***')
    list_extract = []
    for i in transformedData.schema['features'].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + transformedData.schema['features'].metadata["ml_attr"]["attrs"][i]
    pd.set_option('display.max_rows', 500)
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featImp[x])
    selected_feat = varlist.sort_values('score', ascending=False)
    print(selected_feat.iloc[0:n_feat, :])

    # Get the best hyperparameters:
    print('MaxDepth: ' + str(last_stage.bestModel._java_obj.getMaxDepth()))
    print('NumTrees: ' + str(last_stage.bestModel.getNumTrees))

    # =============================================================================
    # Logistic Regression
    # =============================================================================
    # print('*** Model performance ***')
    # evaluate(transformedData,target)
    #
    # print('*** Model feature attributes ***')
    # trainingSummary = last_stage.bestModel.summary
    # trainingSummary.roc.show()
    # print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

    # =============================================================================
    # Prediction and Evaluation
    # =============================================================================
    predicted = predict(pipeline_model, target_indexed)
    evaluate(predicted, target_indexed)

    train.unpersist()
def main():
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    parser.add_argument("--s3_input_bucket", type=str, help="s3 input bucket")
    parser.add_argument("--s3_input_key_prefix", type=str, help="s3 input key prefix")
    parser.add_argument("--s3_output_bucket", type=str, help="s3 output bucket")
    parser.add_argument("--s3_output_key_prefix", type=str, help="s3 output key prefix")
    args = parser.parse_args()

    spark = SparkSession.builder.appName("PySparkApp").getOrCreate()

    # This is needed to save RDDs, which is the only way to write nested Dataframes into CSV format
    spark.sparkContext._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter")

    # Defining the schema corresponding to the input data. The input data does not contain headers.
    schema = StructType([
        StructField("sex", StringType(), True),
        StructField("length", DoubleType(), True),
        StructField("diameter", DoubleType(), True),
        StructField("height", DoubleType(), True),
        StructField("whole_weight", DoubleType(), True),
        StructField("shucked_weight", DoubleType(), True),
        StructField("viscera_weight", DoubleType(), True),
        StructField("shell_weight", DoubleType(), True),
        StructField("rings", DoubleType(), True)
    ])

    # Downloading the data from S3 into a Dataframe
    total_df = spark.read.csv(('s3://' + os.path.join(
        args.s3_input_bucket, args.s3_input_key_prefix, 'abalone.csv')),
        header=False, schema=schema)

    # StringIndexer on the sex column, which has categorical values
    sex_indexer = StringIndexer(inputCol="sex", outputCol="indexed_sex")

    # one-hot-encoding is performed on the string-indexed sex column (indexed_sex)
    sex_encoder = OneHotEncoder(inputCol="indexed_sex", outputCol="sex_vec")

    # vector-assembler brings all the features into a 1D vector so we can save easily into CSV format
    assembler = VectorAssembler(inputCols=[
        "sex_vec", "length", "diameter", "height", "whole_weight",
        "shucked_weight", "viscera_weight", "shell_weight"
    ], outputCol="features")

    # The pipeline comprises the steps added above
    pipeline = Pipeline(stages=[sex_indexer, sex_encoder, assembler])

    # This step trains the feature transformers
    model = pipeline.fit(total_df)

    # This step transforms the dataset with information obtained from the previous fit
    transformed_total_df = model.transform(total_df)

    # Split the overall dataset into 80-20 training and validation
    (train_df, validation_df) = transformed_total_df.randomSplit([0.8, 0.2])

    # Convert the train dataframe to RDD to save in CSV format and upload to S3
    train_rdd = train_df.rdd.map(lambda x: (x.rings, x.features))
    train_lines = train_rdd.map(csv_line)
    train_lines.saveAsTextFile('s3://' + os.path.join(
        args.s3_output_bucket, args.s3_output_key_prefix, 'train'))

    # Convert the validation dataframe to RDD to save in CSV format and upload to S3
    validation_rdd = validation_df.rdd.map(lambda x: (x.rings, x.features))
    validation_lines = validation_rdd.map(csv_line)
    validation_lines.saveAsTextFile('s3://' + os.path.join(
        args.s3_output_bucket, args.s3_output_key_prefix, 'validation'))
indexers = [StringIndexer(inputCol=x, outputCol=x + '_tmp') for x in cols_now]
encoders = [
    OneHotEncoder(dropLast=False, inputCol=x + "_tmp", outputCol=y)
    for x, y in zip(cols_now, cols_now1)
]
tmp = [[i, j] for i, j in zip(indexers, encoders)]
tmp = [i for sublist in tmp for i in sublist]

assembler_features = VectorAssembler(inputCols=cols_now, outputCol='features')
labelIndexer = StringIndexer(inputCol='binary_response', outputCol='label')
tmp += [assembler_features, labelIndexer]

pipeline = Pipeline(stages=tmp)
allData = pipeline.fit(joinrdd).transform(joinrdd)
allData.cache()
trainingData, testData = allData.randomSplit([0.8, 0.2], seed=0)

# COMMAND ----------

# COMMAND ----------

# Decision Tree
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
model = dt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)
# We can now define our classifier and pipeline. With this done, we can split our labeled data
# into train and test sets and fit a model.
#
# To train the decision tree, give it the feature vector column and the label column.
#
# Pipeline is defined by stages. Index plan column, label column, create vectors, then define the decision tree.

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier

classifier = DecisionTreeClassifier(labelCol='label', featuresCol='features')

pipeline = Pipeline(
    stages=[plan_indexer, label_indexer, assembler, classifier])

(train, test) = churn_data.randomSplit([0.7, 0.3])

model = pipeline.fit(train)

# ## Model Evaluation
#
# The most important question to ask:
#
# Is my predictor better than random guessing?
#
# How do we quantify that?
# Measure the area under the ROC curve, abbreviated to AUROC.
#
# Plots True Positive Rate vs False Positive Rate for a binary classification system.
#
# [More Info](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)
#
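A minimal sketch of how that AUROC measurement could be computed with BinaryClassificationEvaluator, assuming the fitted model and the test split defined above.

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out set with the fitted pipeline.
predictions = model.transform(test)

# areaUnderROC is the default metric; rawPredictionCol defaults to "rawPrediction".
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
auroc = evaluator.evaluate(predictions)
print("AUROC = {:.3f}".format(auroc))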
from pyspark.ml.pipeline import Pipeline

labelPredDF1 = labelPredDF.withColumn(
    "proportion",
    (col("pos")) / (col("neg") + col("neu") / 3 + .000000001))  # When include neu in numerator, dominates it

trainDF, testDF = labelPredDF1.randomSplit([.8, .2], seed=42)

rf = RFormula(formula="label ~ neg + pos + neu + compound")
lr = LogisticRegression(fitIntercept=True)
ir = IsotonicRegression(featuresCol='proportion', predictionCol='prediction', isotonic=True)

pipeline = Pipeline(stages=[ir])
pipelineModel = pipeline.fit(trainDF)
testPredDF = pipelineModel.transform(testDF)

# COMMAND ----------

display(testPredDF)

# COMMAND ----------

# DBTITLE 1,Logistic Regression Model
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import IsotonicRegression
from pyspark.ml.pipeline import Pipeline

trainDF, testDF = labelPredDF.randomSplit([.8, .2], seed=42)
conf = SparkConf().setAppName("pet_adoption").setMaster("yarn")
sc = SparkContext(conf=conf)
# sc = init_nncontext("HowCute_train")
sqlCtx = SQLContext(sc)

df = sqlCtx.read.csv('hdfs:///project_data/pets/train/train.csv', header=True,
                     inferSchema='True').drop('Name').drop('State')
df_test = sqlCtx.read.csv('hdfs:///project_data/pets/train/train.csv', header=True,
                          inferSchema='True').drop('Name').drop('State')
# spark = SparkSession.builder.appName("pet_adoption").getOrCreate()

## pandas frame is easier to read
# df_pd.drop('Name', axis=1, inplace=True)

input_cols = [a for a, b in df.dtypes if b == 'int']

indexers = [StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
            for column in ["AdoptionSpeed"]]
pipeline = Pipeline(stages=indexers)
df = pipeline.fit(df).transform(df)
df_test = pipeline.fit(df_test).transform(df_test)

feature = VectorAssembler(inputCols=input_cols, outputCol="features")
feature_vector = feature.transform(df)
feature_vector_test = feature.transform(df_test)

(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2], seed=11)
testData.printSchema()
# testData.show(10)

lr = DecisionTreeClassifier(labelCol="AdoptionSpeed_index", featuresCol="features")
lrModel = lr.fit(trainingData)
lrModel.write().overwrite().save("hdfs:///treemodelofcsv")

modelloaded = DecisionTreeClassificationModel.load("hdfs:///treemodelofcsv")
lr_prediction = modelloaded.transform(testData)
# lr_prediction.select("prediction", "Survived", "features").show()
        subsamplingRate=1.0)

rf = RandomForestClassifier(featuresCol="features", labelCol="label",
                            predictionCol="prediction", probabilityCol="probability",
                            rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32,
                            minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
                            cacheNodeIds=False, checkpointInterval=10, impurity="gini",
                            numTrees=20, featureSubsetStrategy="auto", seed=12345)

pipe = Pipeline(stages=[featurizer, gb])
pipe_model = pipe.fit(train_df)

predictions = pipe_model.transform(test_df)
predictions.select("filePath", "prediction").show(10, False)

predictionAndLabels = predictions.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Training set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
#ZEND
# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    rid, text, prob, prediction = row  # type: ignore
    print(
        "(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction)
    )
exploder = TileExploder()

# To "vectorize" the band columns we use the SparkML `VectorAssembler`
assembler = VectorAssembler() \
    .setInputCols(bandColNames) \
    .setOutputCol("features")

# Configure our clustering algorithm
k = 5
kmeans = KMeans().setK(k)

# Combine the two stages
pipeline = Pipeline().setStages([exploder, assembler, kmeans])

# Compute clusters
model = pipeline.fit(joinedRF)

# Run the data through the model to assign cluster IDs to each
clustered = model.transform(joinedRF)
clustered.show(8)

# If we want to inspect the model statistics, the SparkML API requires us to go
# through this unfortunate contortion:
clusterResults = list(
    filter(lambda x: str(x).startswith('KMeans'), model.stages))[0]

# Compute sum of squared distances of points to their nearest center
metric = clusterResults.computeCost(clustered)
print("Within set sum of squared errors: %s" % metric)

tlm = joinedRF.tileLayerMetadata()
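As a side note, a simpler way to reach the same fitted KMeansModel (a sketch assuming the three-stage pipeline above, where k-means is the final stage) is to index into model.stages directly instead of filtering by class name.

# The fitted KMeansModel is the last stage of the PipelineModel.
kmeans_model = model.stages[-1]
metric = kmeans_model.computeCost(clustered)  # same within-set sum of squared errors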
# compose a pipeline that includes feature transform, pretrained model and Logistic Regression
transformer = ChainedPreprocessing([
    RowToImageFeature(), ImageResize(256, 256), ImageCenterCrop(224, 224),
    ImageChannelNormalize(123.0, 117.0, 104.0), ImageMatToTensor(), ImageFeatureToTensor()
])

preTrainedNNModel = NNModel(Model.loadModel(model_path), transformer) \
    .setFeaturesCol("image") \
    .setPredictionCol("embedding")

lrModel = Sequential().add(Linear(1000, 2)).add(LogSoftMax())
classifier = NNClassifier(lrModel, ClassNLLCriterion(), SeqToTensor([1000])) \
    .setLearningRate(0.003).setBatchSize(40).setMaxEpoch(20).setFeaturesCol("embedding")

pipeline = Pipeline(stages=[preTrainedNNModel, classifier])

catdogModel = pipeline.fit(trainingDF)
predictionDF = catdogModel.transform(validationDF).cache()
predictionDF.show()

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictionDF)

# expected error should be less than 10%
print("Test Error = %g " % (1.0 - accuracy))
trainer = ADAG(keras_model=model, worker_optimizer='adam', loss='categorical_crossentropy',
               num_workers=1, batch_size=100, communication_window=5,
               num_epoch=50, features_col="matrix", label_col="label_encoded")
trained_model = trainer.train(training_set)

from distkeras.predictors import *
from distkeras.transformers import *
from distkeras.evaluators import *
from distkeras.utils import *

print("Training time: " + str(trainer.get_training_time()))
print("Accuracy: " + str(evaluate_accuracy(trained_model, test_set)))
print("Number of parameter server updates: " + str(trainer.parameter_server.num_updates))

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[string_indexer, scaler, trainer_model])

from pyspark.mllib.evaluation import MulticlassMetrics

fitted_pipeline = pipeline.fit(dataset_train)  # Fit model to data

prediction = fitted_pipeline.transform(dataset_train)  # Evaluate on train data.
# prediction = fitted_pipeline.transform(test_df)  # <-- The same code evaluates test data.
pnl = prediction.select("index_category", "prediction")
pnl.show(100)

prediction_and_label = pnl.map(lambda row: (row.index_category, row.prediction))
metrics = MulticlassMetrics(prediction_and_label)
print(metrics.precision())
stop_words_remover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(stop_words)

# bag of words count
count_vectors = CountVectorizer(inputCol="filtered", outputCol="features",
                                vocabSize=10000, minDF=5)

label_string_index = StringIndexer(inputCol="category", outputCol="label")
label_string_index.setHandleInvalid("keep")

pipeline = Pipeline(stages=[
    regex_tokenizer, stop_words_remover, count_vectors, label_string_index
])

(training_data, test_data) = df.randomSplit([0.8, 0.2], seed=100)

pipeline_fit = pipeline.fit(training_data)
pipeline_fit.save("lr_pipeline")
training_data_set = pipeline_fit.transform(training_data)
training_data_set.show(5)

# stages = pipeline_fit.stages
# vec = [s for s in stages if isinstance(s, CountVectorizerModel)]
# v1 = vec[0].vocabulary
# print(len(v1))

print("Training: " + str(training_data_set.count()))
print("Test: " + str(test_data.count()))

lr = LogisticRegression(maxIter=100, regParam=0.2, elasticNetParam=0)
lr_model = lr.fit(training_data_set)
).toDF()

df_new = sc.parallelize(
    [
        Row(p=u'p1', owner=u'u1', f1=0.1, f2=0.3, f3=0.5),
        Row(p=u'p2', owner=u'u1', f1=0.3, f2=0.5, f3=0.5),
        Row(p=u'p3', owner=u'u1', f1=0.6, f2=0.6, f3=0.9),
        Row(p=u'p4', owner=u'u1', f1=0.8, f2=0.1, f3=0.6),
        Row(p=u'p5', owner=u'u1', f1=0.0, f2=0.2, f3=0.2),
        Row(p=u'p1', owner=u'u2', f1=0.0, f2=0.4, f3=0.1),
        Row(p=u'p2', owner=u'u2', f1=0.3, f2=0.7, f3=0.4),
        Row(p=u'p3', owner=u'u2', f1=0.4, f2=0.6, f3=0.6),
        Row(p=u'p4', owner=u'u2', f1=0.6, f2=0.1, f3=0.7),
        Row(p=u'p5', owner=u'u2', f1=0.0, f2=0.0, f3=0.8),
    ]
).toDF()

owner_training = df_training.where(col('owner') == 'u1')
owner_new = df_new.where(col('owner') == 'u1')

label_indexer = StringIndexer(inputCol="status", outputCol="indexedStatus")
assembler = VectorAssembler(inputCols=['f1', 'f2', 'f3'], outputCol='features')
rf = RandomForestClassifier(labelCol="indexedStatus", featuresCol="features")

pipeline = Pipeline(stages=[label_indexer, assembler, rf])
model = pipeline.fit(owner_training)

predictions = model.transform(owner_new)
predictions.show()
stages += [stringIndexer, encoder]

label_stringIdx = StringIndexer(inputCol='base_plan_id', outputCol='label', handleInvalid='skip')
stages += [label_stringIdx]

assemblerInputs = [c + "stringEnc" for c in string_cols] + numeric_cols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

import time
start_time = time.time()

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(final_data)
df = pipelineModel.transform(final_data)

end_time = time.time()
print("total time taken for Pipeline loop in seconds: ", end_time - start_time)

selectedCols = ['label', 'features'] + final_data.columns
df = df.select(selectedCols)
# df.printSchema()

# ## Random Forest Classification
from pyspark.ml.classification import RandomForestClassifier

### MinMax Scaling
from pyspark.ml.feature import MinMaxScaler
vectorAssembler = VectorAssembler()\
    .setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])\
    .setOutputCol("features")

# COMMAND ----------

from pyspark.ml import Pipeline

transformationPipeline = Pipeline()\
    .setStages([indexer, encoder, vectorAssembler])

# COMMAND ----------

fittedPipeline = transformationPipeline.fit(trainDataFrame)

# COMMAND ----------

transformedTraining = fittedPipeline.transform(trainDataFrame)

# COMMAND ----------

from pyspark.ml.clustering import KMeans

kmeans = KMeans()\
    .setK(20)\
    .setSeed(1)
# Create 'features' vector: 'km', 'org_dummy', 'dow_dummy'
assembler = VectorAssembler(inputCols=['km', 'org_dummy', 'dow_dummy'], outputCol='features')

# Split the data into training and testing sets
flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23)

# Create a regression object and train on training data
regression = LinearRegression(labelCol="duration")

# Combine steps into a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# run fit on training data
pipeline = pipeline.fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
predictions = pipeline.transform(flights_test)
print(predictions.toPandas().sample(12))

# Calculate the RMSE
print("\nRMSE", RegressionEvaluator(labelCol="duration").evaluate(predictions))

# Print the coefficients and intercept for linear regression
print("\nCoefficients: %s" % str(pipeline.stages[REGRESSION_STAGE].coefficients))
print("Intercept: %s" % str(pipeline.stages[REGRESSION_STAGE].intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = pipeline.stages[REGRESSION_STAGE].summary
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stop_remove = StopWordsRemover(inputCol='token_text', outputCol='stop_token')
count_vec = CountVectorizer(inputCol='stop_token', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')
ham_spam_to_numeric = StringIndexer(inputCol='class', outputCol='label')

from pyspark.ml.feature import VectorAssembler

clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes()

from pyspark.ml import Pipeline

data_prep_pipe = Pipeline(stages=[ham_spam_to_numeric, tokenizer,
                                  stop_remove, count_vec, idf, clean_up])

cleaner = data_prep_pipe.fit(data)
clean_data = cleaner.transform(data)
clean_data.columns
"""
['class', 'text', 'length', 'label', 'token_text',
 'stop_token', 'c_vec', 'tf_idf', 'features']
"""
clean_data = clean_data.select('label', 'features')
############# Model training

###### Words to vectors with Word2Vec
from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline

w2v = Word2Vec(vectorSize=100, minCount=0, inputCol="filtro_conectores", outputCol="vectores")
redesSociales_word2vec_modelo = w2v.fit(redesSociales_df)
redesSociales_df = redesSociales_word2vec_modelo.transform(redesSociales_df)
redesSociales_df.select("filtro_conectores", "vectores").orderBy(rand()).show(5)

modelo_sentimiento = clasificador_rl_pipeline.fit(
    aprendizajemaquina_entrenamiento_df)

prediccion_sentimiento_redesSociales_df = modelo_sentimiento.transform(
    redesSociales_df)


def prediccionLiteral(column):
    # Map the numeric prediction to a sentiment label ("Bueno" = good, "Malo" = bad).
    if column == 1.0:
        return "Bueno"
    else:
        return "Malo"


prediccionLiteral_udf = udf(prediccionLiteral)

prediccion_sentimiento_redesSociales_df = prediccion_sentimiento_redesSociales_df \
    .withColumn('sentimiento', prediccionLiteral_udf(prediccion_sentimiento_redesSociales_df.prediccion))
    'age', 'fnlwgt', 'capital-gain', 'educational-num',
    'capital-loss', 'hours-per-week'
]
assemblerInputs = [c + 'classVec' for c in CATE_FEATURES] + CONTI_FEATURES
assemblerInputs

# 4) assemble the steps
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
stages += [assembler]

# create a pipeline
# TODO: other examples split train/test first and then transform
#       (that seems more correct; look into it further)
df_remove.show()
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df_remove)
model = pipelineModel.transform(df_remove)
model.take(1)
'''
Out[116]:
[Row(age=25, age_square=625.0, workclass='Private', fnlwgt=226802, education='11th',
     educational-num=7, marital-status='Never-married', occupation='Machine-op-inspct',
     relationship='Own-child', race='Black', gender='Male', capital-gain=0, capital-loss=0,
     hours-per-week=40, native-country='United-States', label='<=50K', workclassIndex=0.0,
     workclassclassVec=SparseVector(8, {0: 1.0}), educationIndex=5.0,
     educationclassVec=SparseVector(15, {5: 1.0}), marital-statusIndex=1.0,
interactor.fit(df_train).transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000)

stages = [interactor, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)
model = pipeline.fit(df_train)

predictions = model.transform(df_test)
predictions.cache()
predictions.show()

from pyspark.ml.evaluation import BinaryClassificationEvaluator

ev = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", metricName="areaUnderROC")
print(ev.evaluate(predictions))

spark.stop()
# Create a DecisionTreeRegressor
dt = DecisionTreeRegressor(maxDepth=8)
dt.setLabelCol("TOTAL_BENEFICIARY_AMT")\
  .setPredictionCol("Predicted_EXP")\
  .setFeaturesCol("features")\
  .setMaxBins(10000)

# Create a Pipeline
dtPipeline = Pipeline()
# Set the stages of the Pipeline
dtPipeline.setStages([vectorizer, dt])

model = dtPipeline.fit(train_data)
train_data_output = model.transform(train_data)

from pyspark.ml.evaluation import RegressionEvaluator

# Create an R^2 evaluator using the label and predicted columns
regEval = RegressionEvaluator(predictionCol="Predicted_EXP",
                              labelCol="TOTAL_BENEFICIARY_AMT",
                              metricName="r2")

# Run the evaluator on the DataFrame
r2 = regEval.evaluate(train_data_output)
print("R^2 on training data: %.2f" % r2)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# We can reuse the RegressionEvaluator, regEval, to judge the models produced during cross-validation
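# A minimal cross-validation sketch (not in the original snippet), completing the ParamGridBuilder /
# CrossValidator imports above. The grid values for maxDepth and the number of folds are assumptions
# chosen purely for illustration.
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [4, 6, 8])
             .build())

crossval = CrossValidator(estimator=dtPipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=regEval,
                          numFolds=3)

# Pick the best pipeline found on the training data and score it the same way as above
cvModel = crossval.fit(train_data)
print("Best R^2 on training data: %.2f" % regEval.evaluate(cvModel.transform(train_data)))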
def multiclass(algorithms, dataset):
    # create label
    indexer = StringIndexer(inputCol='genre', outputCol='label')
    # https://spark.apache.org/docs/2.1.0/ml-features.html | https://stackoverflow.com/questions/36942233/apply-stringindexer-to-several-columns-in-a-pyspark-dataframe

    # set up pipeline
    pipeline = Pipeline(stages=[indexer])
    df = pipeline.fit(dataset).transform(dataset)

    # split training / test
    training, test = df.randomSplit([0.7, 0.3])

    # get class distribution
    class_dist = (
        df
        .groupBy(['genre', 'label'])
        .count()
        .withColumn('fraction', F.when(F.col('count') < 5000, 1).otherwise(5000 / F.col('count')))
        .orderBy('label')
    )

    # create dictionary of fraction of each class
    fractions = dict()
    for row in class_dist.collect():
        fractions[row.label] = row.fraction

    # down-sampling audio features using fractions
    training_bal = training.sampleBy('label', fractions, seed=1)

    # print head
    fmt = '{:>10}|{:>10}|{:>10}|{:>10}|{}'
    print(fmt.format('accuracy', 'precision', 'recall', 'f1', 'algorithm'))

    # iterate algorithms
    for name, normal, cv in algorithms:
        # train
        model_nor = normal.fit(training)
        model_bal = normal.fit(training_bal)
        model_ovr = OneVsRest(classifier=normal).fit(training)

        # predict
        predict_nor = model_nor.transform(test)
        predict_bal = model_bal.transform(test)
        predict_ovr = model_ovr.transform(test)
        metrics = [('normal', predict_nor), ('balance', predict_bal), ('one-vs-rest', predict_ovr)]

        # cv
        if cv is not None:
            model_cv = cv.fit(training)
            predict_cv = model_cv.transform(test)
            metrics.append(('cv', predict_cv))

        # metrics
        for mtype, d in metrics:
            eval_multi = MulticlassClassificationEvaluator()
            accuracy = eval_multi.evaluate(d, {eval_multi.metricName: 'accuracy'})
            precision = eval_multi.evaluate(d, {eval_multi.metricName: 'weightedPrecision'})
            recall = eval_multi.evaluate(d, {eval_multi.metricName: 'weightedRecall'})
            f1 = eval_multi.evaluate(d, {eval_multi.metricName: 'f1'})
            fmt = '{:>10.3f}|{:>10.3f}|{:>10.3f}|{:>10.3f}|{}'
            print(fmt.format(accuracy, precision, recall, f1, name + ' (' + mtype + ')'))
"VectorC1", "Vector_banner_pc", "Vector_site_category", "Vector_app_category", "Vector_device_type", "Vector_device_conn_type", "VectorC15", "VectorC16", "VectorC18", "VectorC19", "VectorC21" ], outputCol="VectoredFeatures") # Using pipeline pipelineTmp = Pipeline(stages=[ C1Indexer, BannerPcIndexer, SiteCategoryIndexer, AppCategoryIndexer, DeviceTypeIndexer, DeviceConnTypeIndexer, C15Indexer, C16Indexer, C18Indexer, C19Indexer, C21Indexer, C1Encoder, BannerPcEncoder, SiteCategoryEncoder, AppCategoryEncoder, DeviceTypeEncoder, DeviceConnTypeEncoder, C15Encoder, C16Encoder, C18Encoder, C19Encoder, C21Encoder, FeatureAssembler ]) modelTmp = pipelineTmp.fit(schemaClick) tmp = modelTmp.transform(schemaClick).select("click", "VectoredFeatures") tmp.registerTempTable("CLICK") # Selecting click and VectoredFeatures from Table "CLICK" and creating new dataFrame as results results = sqlContext.sql("SELECT click, VectoredFeatures from CLICK") results.show() # Creating label points for attributes click and VectoredFeatures click_transformed = results.select( 'click', 'VectoredFeatures').rdd.map(lambda row: LabeledPoint( float(row.click), Vectors.dense((row.VectoredFeatures).toArray()))) click_transformed.take(2) #Divide the data into training and test sets weights = [.8, .2]
class_indexer = StringIndexer(inputCol="C4", outputCol="label")

# Read in data for sensitivity analysis
test_data = sql_context.read.load('tests/resources/iris_test_data.csv',
                                  format='com.databricks.spark.csv',
                                  header='false',
                                  inferSchema='true')

# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="features", labelCol="label")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[assembler, class_indexer, dt])

# Train model. This also runs the indexer.
model = pipeline.fit(data)

# Get our data_info frame, courtesy of PSAML
cols_to_analyze = ['C0', 'C1', 'C2', 'C3']
data_info = psaml.make_data_info(sql_context, test_data, cols_to_analyze, 'C4')

# Make predictions.
predictions = psaml.do_continuous_input_analysis(sc, model, 5, 5, data_info)

# Select example rows to display.
# predictions.show()  # opt param: number of records to show

fig = plotly.tools.make_subplots(rows=len(cols_to_analyze), cols=1)
sql_context.registerDataFrameAsTable(predictions, "predictions")
# define stage 1: tokenize the tweet text
stage_1 = RegexTokenizer(inputCol='tweet', outputCol='tokens', pattern='\\W')
# define stage 2: remove the stop words
stage_2 = StopWordsRemover(inputCol='tokens', outputCol='filtered_words')
# define stage 3: create a word vector of the size 100
stage_3 = Word2Vec(inputCol='filtered_words', outputCol='vector', vectorSize=100)
# define stage 4: Logistic Regression Model
model = LogisticRegression(featuresCol='vector', labelCol='positive')

# setup the pipeline
pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, model])

# fit the pipeline model with the training data
pipelineFit = pipeline.fit(training_set)
modelSummary = pipelineFit.stages[-1].summary
modelSummary.accuracy


def get_prediction_json(key, rdd):
    print("********************")
    # index into the (key, value) pair; tuple unpacking in lambdas is Python 2 only
    tweet = rdd.map(lambda kv: json.loads(kv[1])).map(
        lambda json_object: json_object["text"])
    tweetstr = tweet.collect()
    if not tweetstr:
        print("No Tweet")
        return
    print("********************")
# convert features into vector
assembler = (
    VectorAssembler()
    .setInputCols([x for x in features.columns if x.startswith('method')])
    .setOutputCol('features')
)

# normalize vector
scaler = StandardScaler(inputCol='features', outputCol='scfeatures', withStd=True, withMean=False)

# set up pipeline
pipeline = Pipeline(stages=[assembler, scaler])
dataset = pipeline.fit(features).transform(features)
# model = Pipeline(stages=[assembler, scaler]).fit(features).transform(features)

# print table
dataset.select(['track_id', 'genre', 'features', 'scfeatures']).show(3, 30)
# +------------------+--------------+------------------------------+------------------------------+
# |          track_id|         genre|                      features|                    scfeatures|
# +------------------+--------------+------------------------------+------------------------------+
# |TRAAABD128F429CF47|      Pop_Rock|[0.1308,9.587,459.9,27280.0...|[2.022118802771498,2.624321...|
# |TRAAADT12903CCC339|Easy_Listening|[0.08392,7.541,423.7,36060....|[1.2973716355396339,2.06425...|
# |TRAAAEF128F4273421|      Pop_Rock|[0.1199,9.381,474.5,26990.0...|[1.8536089025405398,2.56793...|
# +------------------+--------------+------------------------------+------------------------------+
# only showing top 3 rows
rdd = data.filter(lambda row: row != header)
r = rdd.mapPartitions(lambda x: csv.reader(x))
r = r.map(lambda x: (processTweet(x[3]), int(x[1])))
r = r.map(lambda x: Row(sentence=x[0], label=int(x[1])))
df = spark.createDataFrame(r).orderBy(rand()).limit(500000)

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="base_words")
hashingTF = HashingTF(numFeatures=10000, inputCol="base_words", outputCol="features")
lr = LogisticRegression(maxIter=10000, regParam=0.001, elasticNetParam=0.0001)

pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])

splits = df.randomSplit([0.6, 0.4], 223)
trainSet = splits[0]
testSet = splits[1]

lrModel = pipeline.fit(trainSet)
lrResult = lrModel.transform(testSet)

testSet.show(truncate=False)
lrResult.show(truncate=False)

# simple accuracy: fraction of rows where the predicted label matches the true label
avg = lrResult.where('label == prediction').count() / lrResult.count()
print(avg)
def main(params):
    filename = params['filename']
    savedmodelName = params['modelname']

    def myFunc(input):
        lines = input.split("\n")
        for line in lines:
            parts = line.split(";")
            Category = parts[-1]
            Sentence = parts[1]
            # strip URLs, retweet markers, and @mentions from the tweet text
            # (the 's' is made optional so plain http URLs are stripped as well)
            url_pattern = re.compile(r'(http[s]?://[\w./]+)*')
            rt_pattern = re.compile('RT @\w+: ')
            r_pattern = re.compile('@\w+ ')
            Sentence = r_pattern.sub(
                r'', rt_pattern.sub(r'', url_pattern.sub(r'', Sentence))).replace(
                    '\n', ' ').strip()
            return (Category, Sentence)

    file = sc.textFile("4CVTweets/" + filename)
    lines = file.map(myFunc)
    sentenceDataFrame = spark.createDataFrame(lines, ["label", "sentence"])
    (trainingData, testData) = sentenceDataFrame.randomSplit([0.7, 0.3])

    df = spark.createDataFrame([(0, "NO"), (1, "crash"), (2, "fire"), (3, "shooting")],
                               ["id", "label"])

    # start building the pipeline
    # No: 0, Crash: 1, Fire: 2, Shooting: 3
    indexer = StringIndexer(inputCol="label", outputCol="categoryIndex")
    indexer.fit(df)
    tokenizer = RegexTokenizer(pattern="\\w+", inputCol="sentence", outputCol="words", gaps=False)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
    # Compute the Inverse Document Frequency (IDF) given a collection of documents.
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

    # Using randomForest
    rf = RandomForestClassifier(labelCol="categoryIndex", featuresCol="features",
                                numTrees=100, maxDepth=10)
    # mlr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8, family="multinomial", featuresCol="features", labelCol="categoryIndex")
    # Naive Bayes
    nb = NaiveBayes(labelCol="categoryIndex", featuresCol="features", smoothing=1)
    # converter = IndexToString(inputCol="prediction", outputCol="originalCategory")

    pipeline = Pipeline(stages=[indexer, tokenizer, remover, hashingTF, idf, nb])
    model = pipeline.fit(trainingData)

    # Evaluate the offline model, starting with accuracy on the training data
    predictionsForTraining = model.transform(trainingData)
    predictionsForTraining.show(100, False)

    joindf = spark.createDataFrame([(0.0, "NO"), (1.0, "crash"), (2.0, "fire"), (3.0, "shooting")],
                                   ["prediction", "Predictlabel"])
    innerjoin = predictionsForTraining.join(
        joindf, joindf.prediction == predictionsForTraining.prediction).drop(joindf.prediction)
    # innerjoin.select("label", "categoryIndex", "prediction", "Predictlabel").show(1000, False)
    innerjoin.select("label", "Predictlabel").show(1000, False)

    evaluator1 = MulticlassClassificationEvaluator(labelCol="categoryIndex",
                                                   predictionCol="prediction",
                                                   metricName="accuracy")
    accuracy = evaluator1.evaluate(predictionsForTraining)
    print("Training Accuracy = %g " % (accuracy))
    print("Train Error = %g " % (1.0 - accuracy))

    predictions = model.transform(testData)
    evaluator2 = MulticlassClassificationEvaluator(labelCol="categoryIndex",
                                                   predictionCol="prediction",
                                                   metricName="accuracy")
    accuracy = evaluator2.evaluate(predictions)
    print("Test Accuracy = %g " % (accuracy))
    print("Test Error = %g " % (1.0 - accuracy))

    savePath = "tmp/pipeline/" + savedmodelName
    model.write().overwrite().save(savePath)
    print("model for Location", savedmodelName, "saved successfully.")
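# A minimal usage sketch (not part of the original function): the saved PipelineModel can later be
# reloaded and applied to new tweets. The model name and the example sentences below are
# hypothetical, included only to illustrate the load/transform round trip.
from pyspark.ml import PipelineModel

reloaded = PipelineModel.load("tmp/pipeline/" + "someModelName")  # hypothetical saved model name
new_tweets = spark.createDataFrame([("NO", "traffic is moving normally downtown"),
                                    ("crash", "two cars collided at the highway exit")],
                                   ["label", "sentence"])
reloaded.transform(new_tweets).select("sentence", "prediction").show(truncate=False)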
# Q2
# (a)
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

user_stringIdx = StringIndexer(inputCol="USER_ID", outputCol="USER_INDEX")
song_stringIdx = StringIndexer(inputCol="SONG_ID", outputCol="SONG_INDEX")
pipeline1 = Pipeline(stages=[user_stringIdx, song_stringIdx])

# Fit the pipeline to training
pipelineFit = pipeline1.fit(user_song)
dataset1 = pipelineFit.transform(user_song)
trainingData1, testData1 = split_Data(dataset1, 10)

als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True,
          userCol="USER_INDEX", itemCol="SONG_INDEX", ratingCol="COUNT")
ALSModel = als.fit(trainingData1)

# Generate top 10 recommendations for a handful of selected users (5 distinct users taken here)
users = testData1.select(["USER_INDEX"]).distinct().limit(5)
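# A minimal continuation sketch (not in the original snippet): produce the top-10 recommendations
# for the selected users with recommendForUserSubset, available on fitted ALS models in Spark 2.3+.
userRecs = ALSModel.recommendForUserSubset(users, 10)
userRecs.show(truncate=False)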
REDDIT_AUG = "swift://reddit3.sjc01/RC_2015-08"
REDDIT_SEPT = "swift://reddit3.sjc01/RC_2015-09"

if __name__ == "__main__":
    # Configure Spark
    sc = SparkContext(appName=APP_NAME)
    sqlContext = SQLContext(sc)

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # prepare Reddit json files as SQL DataFrames for pyspark.ml
    aug_comments = sqlContext.read.json(REDDIT_AUG)
    sep_comments = sqlContext.read.json(REDDIT_SEPT)

    # transform data for log_reg model by dividing karma score by 1000
    # error: Classification labels should be in {0 to 8114} Found 2576839 invalid labels.
    training = aug_comments.select('id', 'body',
                                   (aug_comments.score / 1000.0).cast("double").alias('label'))
    test = sep_comments.select('id', 'body')
    test_actual = sep_comments.select('id', (sep_comments.score / 1000.0).alias('actual'))

    model = pipeline.fit(training)
    prediction = model.transform(test)
    # keep the comment body (the tokenizer's input column) alongside the prediction
    selected = prediction.select("id", "body", "prediction").join(
        test_actual, prediction.id == test_actual.id)
    selected.write.format('json').save("hdfs://master/usr/hadoop/karma_predictions")
    sc.stop()
multiclass_dataset = multiclass_dataset.drop("genre_id")
df = multiclass_dataset
cols = df.columns

stages = []
label_stringIdx = StringIndexer(inputCol='int_genre_id', outputCol='label')
stages += [label_stringIdx]

numericCols = multiclass_dataset.schema.names[0:-1]
assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
stages += [assembler]

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)
selectedCols = ['label', 'features'] + cols

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = df.randomSplit([0.7, 0.3])

# Train a Random Forest model.
# (the StringIndexer above writes the indexed genre to 'label', so that is the label column used here)
rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                            numTrees=100, maxDepth=10, impurity="entropy")
rf = OneVsRest(classifier=rf)

# Chain RF in a Pipeline
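# A minimal continuation sketch (not in the original snippet) for the trailing "Chain RF in a
# Pipeline" comment: wrap the OneVsRest-wrapped random forest in its own Pipeline, fit it on the
# training split, and report accuracy. The evaluator settings are assumptions for illustration only.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf_pipeline = Pipeline(stages=[rf])
rf_model = rf_pipeline.fit(trainingData)
rf_predictions = rf_model.transform(testData)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
print("OneVsRest RF accuracy:", evaluator.evaluate(rf_predictions))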