def test_nested_pipeline_persistence(self):
    """
    Pipeline[HashingTF, Pipeline[PCA]]
    """
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        p0 = Pipeline(stages=[pca])
        pl = Pipeline(stages=[tf, p0])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def train_lg(training_data, collection):
    # Configure an ML pipeline, which consists of the following stages: hashingTF, idf, and lr.
    hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    pipeline1 = Pipeline(stages=[hashingTF, idf])

    # Fit pipeline1 to the training documents.
    model1 = pipeline1.fit(training_data)

    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline2 = Pipeline(stages=[model1, lr])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline2,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)

    # Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(training_data)

    # model_path = os.path.join(models_dir, time.strftime("%Y%m%d-%H%M%S") + '_'
    #                           + collection["Id"] + '_'
    #                           + collection["name"])
    # cvModel.save(sc, model_path)

    return cvModel
def main(sc, spark):
    # Load the Corpus
    corpus = load_corpus(sc, spark)

    # Create the vector/cluster pipeline
    pipeline = Pipeline(stages=[
        Tokenizer(inputCol="text", outputCol="tokens"),
        Word2Vec(vectorSize=7, minCount=0, inputCol="tokens", outputCol="vecs"),
        BisectingKMeans(k=10, featuresCol="vecs", maxIter=10),
    ])

    # Fit the model
    model = pipeline.fit(corpus)
    corpus = model.transform(corpus)

    # Evaluate clustering.
    bkm = model.stages[-1]
    cost = bkm.computeCost(corpus)
    sizes = bkm.summary.clusterSizes

    # TODO: compute cost of each cluster individually

    # Get the text representation of each cluster.
    wvec = model.stages[-2]
    table = [["Cluster", "Size", "Terms"]]
    for ci, c in enumerate(bkm.clusterCenters()):
        ct = wvec.findSynonyms(c, 7)
        size = sizes[ci]
        terms = " ".join([row.word for row in ct.take(7)])
        table.append([ci, size, terms])

    # Print Results
    print(tabulate(table))
    print("Sum of square distance to center: {:0.3f}".format(cost))
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
def testLogisticMLPipeline1(self):
    training = sqlCtx.createDataFrame([
        ("a b c d e spark", 1.0),
        ("b d", 2.0),
        ("spark f g h", 1.0),
        ("hadoop mapreduce", 2.0),
        ("b spark who", 1.0),
        ("g d a y", 2.0),
        ("spark fly", 1.0),
        ("was mapreduce", 2.0),
        ("e spark program", 1.0),
        ("a e c l", 2.0),
        ("spark compile", 1.0),
        ("hadoop software", 2.0)
    ], ["text", "label"])
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
    lr = LogisticRegression(sqlCtx)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    model = pipeline.fit(training)
    test = sqlCtx.createDataFrame([
        ("spark i j k", 1.0),
        ("l m n", 2.0),
        ("mapreduce spark", 1.0),
        ("apache hadoop", 2.0)], ["text", "label"])
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator()
    score = evaluator.evaluate(predictionAndLabels)
    self.failUnless(score == 1.0)
def test_nnclassifier_in_pipeline(self):
    if self.sc.version.startswith("1"):
        from pyspark.mllib.linalg import Vectors
        df = self.sqlContext.createDataFrame(
            [(Vectors.dense([2.0, 1.0]), 1.0),
             (Vectors.dense([1.0, 2.0]), 2.0),
             (Vectors.dense([2.0, 1.0]), 1.0),
             (Vectors.dense([1.0, 2.0]), 2.0),
             ], ["features", "label"])
        scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled")
        model = Sequential().add(Linear(2, 2))
        criterion = ClassNLLCriterion()
        classifier = NNClassifier(model, criterion, MLlibVectorToTensor([2])) \
            .setBatchSize(4) \
            .setLearningRate(0.01).setMaxEpoch(1).setFeaturesCol("scaled")

        pipeline = Pipeline(stages=[scaler, classifier])

        pipelineModel = pipeline.fit(df)
        res = pipelineModel.transform(df)
        assert type(res).__name__ == 'DataFrame'
def model(classifiers, training, testing, week):
    results = ""
    timing = []

    for classifier in classifiers:
        timeStart = time.time()

        clf = get_classifier(classifier)

        labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
        model = pipeline.fit(training)

        prediction = model.transform(testing)

        metrics = BinaryClassificationMetrics(prediction.select("label", "prediction").rdd)
        results = results + "new," + classifier + "," + week + "," \
            + str(metrics.areaUnderROC) + "," + str(metrics.areaUnderPR) + "\n"
        timing.append(time.time() - timeStart)

    return results, timing
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    gbtModel = model.stages[2]
    print(gbtModel)  # summary only
def main(input_file):
    # Load and parse the data file, converting it to a DataFrame.
    data = MLUtils.loadLabeledPoints(sc, input_file)

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 10 distinct values are treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=10).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train model. This also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse))

    rfModel = model.stages[1]
    print(rfModel)  # summary only
def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    lp_data = get_labeled_points(start1, end2, df, sc, sql_context)
    print lp_data.count()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)
    label2index = {}
    for each in sorted(set([(i[0], i[1])
                            for i in td.select(td.label, td.indexedLabel).distinct().collect()]),
                       key=lambda x: x[0]):
        label2index[int(each[0])] = int(each[1])
    print label2index

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(lp_data)

    rf = get_model()

    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

    lp_train = lp_data.filter(lp_data.date3 < end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)

    lp_check = lp_data.filter(lp_data.date2 > start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)

    if is_pred:
        predictions = predictions.filter(predictions.is_labeled == 0) \
            .filter(predictions.date2 == get_cur()) \
            .sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())
        for each in predictions.take(10):
            print each
def RunRandomForest(tf, ctx):
    sqlContext = SQLContext(ctx)
    rdd = tf.map(parseForRandomForest)
    # The schema is encoded in a string.
    schema = ['genre', 'track_id', 'features']
    # Apply the schema to the RDD.
    songDF = sqlContext.createDataFrame(rdd, schema)

    # Register the DataFrame as a table.
    songDF.registerTempTable("genclass")

    labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

    trainingData, testData = songDF.randomSplit([0.8, 0.2])

    labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

    rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
    # rfc = SVMModel([.5, 10, 20], 5)
    # rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

    pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
    model = pipeline.fit(trainingData)

    predictions = model.transform(testData)
    predictions.show()

    evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
    accuracy = evaluator.evaluate(predictions)
    print 'Accuracy of RandomForest = ', accuracy * 100
    print "Test Error = ", (1.0 - accuracy) * 100
def test_cv_lasso_with_mllib_featurization(self):
    data = [('hi there', 0.0),
            ('what is up', 1.0),
            ('huh', 1.0),
            ('now is the time', 5.0),
            ('for what', 0.0),
            ('the spark was there', 5.0),
            ('and so', 3.0),
            ('were many socks', 0.0),
            ('really', 1.0),
            ('too cool', 2.0)]
    data = self.sql.createDataFrame(data, ["review", "rating"])

    # Feature extraction using MLlib
    tokenizer = Tokenizer(inputCol="review", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20000)
    pipeline = Pipeline(stages=[tokenizer, hashingTF])
    data = pipeline.fit(data).transform(data)
    df = self.converter.toPandas(data.select(data.features.alias("review"), "rating"))

    pipeline = SKL_Pipeline([
        ('lasso', SKL_Lasso(max_iter=1))
    ])
    parameters = {
        'lasso__alpha': (0.001, 0.005, 0.01)
    }

    grid_search = GridSearchCV(self.sc, pipeline, parameters)
    skl_gs = grid_search.fit(df.review.values, df.rating.values)

    assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
def textPredict(request):
    """6. Text clustering and popularity prediction"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Process the dataset and build the feature vectors
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)

    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)

    # Train the decision tree model
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)

    # Test the model
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Test on user data: a single news item
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    # Evaluate the model
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request, {'resultList': resultList})
def sparking_your_interest():
    df = SQLContext.read.json('speeches_dataset.json')
    df_fillna = df.fillna("")
    print(df_fillna.count())
    print(df_fillna.printSchema())

    df_utf = call_utf_encoder(df)
    df_cleaned = call_para_cleanup(df_utf)
    print(df_cleaned)

    df_with_bigrams = call_ngrams(df_cleaned, 2)
    df_with_trigrams = call_ngrams(df_with_bigrams, 3)
    df_with_4grams = call_ngrams(df_with_trigrams, 4)
    df_with_5grams = call_ngrams(df_with_4grams, 4)
    df_with_6grams = call_ngrams(df_with_5grams, 4)
    df_with_vocab_score = call_speech_vocab(df_with_6grams)

    df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score, 100, '2grams')
    df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors, 100, '3grams')
    df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors, 100, '4grams')

    assembler = VectorAssembler(
        inputCols=["2gramsfeatures", "2gramsfeatures", "2gramsfeatures", "vocab_score"],
        outputCol="features")
    assembler_output = assembler.transform(df_with_4grams_idf_vectors)
    output = assembler_output.selectExpr('speaker', 'speech_id', 'para_cleaned_text', 'features')
    print(output.show())
    print(output.count())

    output_tordd = output.rdd
    train_rdd, test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
    train_df = train_rdd.toDF()
    test_df = test_rdd.toDF()
    print(train_df)
    print(test_df)
    print('Train DF - Count: ')
    print(train_df.count())
    print('Test DF - Count: ')
    print(test_df.count())

    print("Initializing RF Model")
    labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=1000,
                                featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
    pipeline = Pipeline(stages=[labelIndexer, rf])
    model = pipeline.fit(output)
    print("Completed RF Model")

    predictions = model.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction",
                                                  metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    rfModel = model.stages[1]
    print(rfModel)  # summary only
    print("Predictions: ")
    print(predictions.show())
def model(classifier, ftrain, fvalid, fprediction):
    startTime = time.time()

    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')

    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)

    # is needed to join columns
    valid = indexData(valid)
    rawValid = indexData(rawValid)

    classifiers = {
        "RandomForestClassifier": RFC
    }

    clf = classifiers[classifier]()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)

    predictions = model.transform(valid)

    # write to file:
    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")

    output = (subsetValidData
              .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
              .drop("index")
              .drop("index"))

    lines = output.map(toCSVLine)
    lines.saveAsTextFile('output')

    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print "Test Error = %g" % (1.0 - accuracy)

    executionTime = time.time() - startTime
    row = classifier + ',' + str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
def event_pipeline(dataset):
    """
    """
    EventCodeI = StringIndexer(inputCol="EventCode", outputCol="EventCodeI")
    EventBaseCodeI = StringIndexer(inputCol="EventBaseCode", outputCol="EventBaseCodeI")
    EventRootCodeI = StringIndexer(inputCol="EventRootCode", outputCol="EventRootCodeI")

    assembler = VectorAssembler(inputCols=["IsRootEvent", "EventCodeI", "EventBaseCodeI", "EventRootCodeI",
                                           "QuadClass", "GoldsteinScale", "NumMentions", "NumSources",
                                           "NumArticles", "AvgTone"],
                                outputCol="features")

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=310)

    pipeline = Pipeline(stages=[EventCodeI, EventBaseCodeI, EventRootCodeI, assembler, featureIndexer])

    model = pipeline.fit(dataset)
    output = model.transform(dataset)

    data = output.map(lambda row: LabeledPoint(row[0], row[-1])).cache()
    print "Data:"
    print data.take(1)

    return data
def group(self):
    reTokenizer = RegexTokenizer(inputCol=self.query_colname, outputCol="words", minTokenLength=2)  # , pattern='\W'
    hashingTF = HashingTF(numFeatures=self.num_features, inputCol="words", outputCol="tf")

    if self.idf == True:
        idf = IDF(minDocFreq=self.min_doc_freq, inputCol="tf", outputCol="idf")
        kmeans = KMeans(featuresCol="idf", predictionCol="cluster_id", k=self.n)
        pipeline = Pipeline(stages=[reTokenizer, hashingTF, idf, kmeans])
    else:
        kmeans = KMeans(featuresCol="tf", predictionCol="cluster_id", k=self.n)
        pipeline = Pipeline(stages=[reTokenizer, hashingTF, kmeans])

    model = pipeline.fit(self.df)
    prediction = model.transform(self.df)
    return prediction
def getPipeline(self, df):
    # notify that the pipeline is being built
    self.success('Initializing ML Pipeline ...')

    # initialize our tokenizer; we're going to tokenize the features
    tokenizer = Tokenizer(inputCol='tag_features', outputCol='words')
    # convert the tokenized data into vectorized data
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='features')
    # initialize the logistic regression algorithm
    lr = LogisticRegression(maxIter=10, regParam=0.01)

    # create / initialize the ml pipeline
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # fit the pipeline on our training dataframe
    model = pipeline.fit(df)

    return model
def event_pipeline(dataset):
    EventCodeI = StringIndexer(inputCol="EventCode", outputCol="EventCodeI")
    EventCodeV = OneHotEncoder(dropLast=True, inputCol="EventCodeI", outputCol="EventCodeV")

    EventRootCodeI = StringIndexer(inputCol="EventRootCode", outputCol="EventRootCodeI")
    EventRootCodeV = OneHotEncoder(dropLast=True, inputCol="EventRootCodeI", outputCol="EventRootCodeV")

    EventBaseCodeI = StringIndexer(inputCol="EventBaseCode", outputCol="EventBaseCodeI")
    EventBaseCodeV = OneHotEncoder(dropLast=True, inputCol="EventBaseCodeI", outputCol="EventBaseCodeV")

    assembler = VectorAssembler(inputCols=["IsRootEvent", "EventCodeV", "EventBaseCodeV", "EventRootCodeV",
                                           "QuadClass", "GoldsteinScale", "NumMentions", "NumSources",
                                           "NumArticles", "AvgTone"],
                                outputCol="features")

    pipeline = Pipeline(stages=[EventCodeI, EventCodeV, EventRootCodeI, EventRootCodeV,
                                EventBaseCodeI, EventBaseCodeV, assembler])

    model = pipeline.fit(dataset)
    output = model.transform(dataset)

    data = output.map(lambda row: LabeledPoint(row[0], row[-1])).toDF().cache()

    return data
def test_pipeline_persistence(self):
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        pl = Pipeline(stages=[tf, pca])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self.assertEqual(loaded_pipeline.uid, pl.uid)
        self.assertEqual(len(loaded_pipeline.getStages()), 2)

        [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
        self.assertIsInstance(loaded_tf, HashingTF)
        self.assertEqual(loaded_tf.uid, tf.uid)
        param = loaded_tf.getParam("numFeatures")
        self.assertEqual(loaded_tf.getOrDefault(param), tf.getOrDefault(param))

        self.assertIsInstance(loaded_pca, PCA)
        self.assertEqual(loaded_pca.uid, pca.uid)
        self.assertEqual(loaded_pca.getK(), pca.getK())

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        [model_tf, model_pca] = model.stages
        [loaded_model_tf, loaded_model_pca] = loaded_model.stages
        self.assertEqual(model_tf.uid, loaded_model_tf.uid)
        self.assertEqual(model_tf.getOrDefault(param), loaded_model_tf.getOrDefault(param))
        self.assertEqual(model_pca.uid, loaded_model_pca.uid)
        self.assertEqual(model_pca.pc, loaded_model_pca.pc)
        self.assertEqual(model_pca.explainedVariance, loaded_model_pca.explainedVariance)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def build_decision_tree(sqlContext, features, interested):
    print '-----------------------------------------'
    data = sqlContext.createDataFrame(
        [Row(label=interested[i], features=Vectors.dense(features[i])) for i in xrange(len(features))])
    data.printSchema()
    data.show(5)
    print 'created data frame'

    # Index the label column & add metadata.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
    print 'created label indexer'

    # Mark the features with < 4 distinct values as categorical
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

    # Split the data into training and test sets
    (trainingData, testData) = data.randomSplit([0.8, 0.2])

    # Train a DecisionTree model
    dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    # dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    # dt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

    # Chain the indexers together with DecisionTree
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

    # Train the model
    model = pipeline.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) & compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    precision = evaluator.evaluate(predictions)

    treeModel = model.stages[2]
    return (1 - precision, model)
def test_featurizer_in_pipeline(self):
    """
    Tests that featurizer fits into an MLlib Pipeline.
    Does not test how good the featurization is for generalization.
    """
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName=self.name)
    lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
    pipeline = Pipeline(stages=[featurizer, lr])

    # add arbitrary labels to run logistic regression
    # TODO: it's weird that the test fails on some combinations of labels. check why.
    label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
    train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["filePath"]))

    lrModel = pipeline.fit(train_df)

    # see if we at least get the training examples right.
    # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
    pred_df_collected = lrModel.transform(train_df).collect()
    for row in pred_df_collected:
        self.assertEqual(int(row.prediction), row.label)
def test_pipeline(self):
    dataset = MockDataset()
    estimator0 = MockEstimator()
    transformer1 = MockTransformer()
    estimator2 = MockEstimator()
    transformer3 = MockTransformer()
    pipeline = Pipeline(stages=[estimator0, transformer1, estimator2, transformer3])
    pipeline_model = pipeline.fit(dataset, {estimator0.fake: 0, transformer1.fake: 1})
    model0, transformer1, model2, transformer3 = pipeline_model.stages
    self.assertEqual(0, model0.dataset_index)
    self.assertEqual(0, model0.getFake())
    self.assertEqual(1, transformer1.dataset_index)
    self.assertEqual(1, transformer1.getFake())
    self.assertEqual(2, dataset.index)
    self.assertIsNone(model2.dataset_index, "The last model shouldn't be called in fit.")
    self.assertIsNone(transformer3.dataset_index, "The last transformer shouldn't be called in fit.")
    dataset = pipeline_model.transform(dataset)
    self.assertEqual(2, model0.dataset_index)
    self.assertEqual(3, transformer1.dataset_index)
    self.assertEqual(4, model2.dataset_index)
    self.assertEqual(5, transformer3.dataset_index)
    self.assertEqual(6, dataset.index)
def main():
    '''
    Takes one input argument: location of the directory for training and test data files.
    :return: prints the area under the ROC curve to the console.
    '''
    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, numFeatures) \
        .addGrid(lr.regParam, regParam) \
        .build()

    cv = CrossValidator() \
        .setEstimator(pipeline) \
        .setEvaluator(BinaryClassificationEvaluator()) \
        .setEstimatorParamMaps(paramGrid) \
        .setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()

    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print evaluator.evaluate(prediction)
    print evaluator.evaluate(prediction_cv)
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
def build_ngrams_wocs(inputCol=["Text", "Sentiment"], n=3):
    tokenizer = [Tokenizer(inputCol="Text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=5460, inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5)
           for i in range(1, n + 1)]

    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
        outputCol="features"
    )]
    label_stringIdx = [StringIndexer(inputCol="Sentiment", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler + label_stringIdx + lr)


# Example usage (assumes the individual stage lists and a DataFrame `df` are in scope)
pipeline = Pipeline(stages=[tokenizer, ngrams, cv, idf, assembler, label_stringIdx])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
# We freeze layers from input to pool4/3x3_s2 inclusive.
model.freeze_up_to(["pool4/3x3_s2"])

# ### Add a few new layers
inputNode = Input(name="input", shape=(3, 224, 224))
inception = model.to_keras()(inputNode)
flatten = Flatten()(inception)
logits = Dense(2)(flatten)
lrModel = Model(inputNode, logits)

classifier = NNClassifier(lrModel, CrossEntropyCriterion(), transformer) \
    .setLearningRate(0.003) \
    .setBatchSize(64) \
    .setMaxEpoch(1) \
    .setFeaturesCol("image") \
    .setCachingSample(False)

pipeline = Pipeline(stages=[classifier])

# # Train the model
# The transfer learning can finish in a few minutes.
catdogModel = pipeline.fit(trainingDF)

predictionDF = catdogModel.transform(validationDF).cache()
predictionDF.select("name", "label", "prediction").sort("label", ascending=False).show(10)
predictionDF.select("name", "label", "prediction").show(10)

correct = predictionDF.filter("label=prediction").count()
overall = predictionDF.count()
accuracy = correct * 1.0 / overall

print("Test Error = %g " % (1.0 - accuracy))
assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol='features')

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol='CANCELLED', maxIter=100)

pipeline = Pipeline(stages=[op_carrier_indexer, op_carrier_encoder,
                            origin_indexer, origin_encoder,
                            dest_indexer, dest_encoder,
                            crs_dep_hour_indexer, crs_dep_hour_encoder,
                            assembler, lr])

(train, test) = flight_df.randomSplit([0.7, 0.3])

lrModel = pipeline.fit(train)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictionslr = lrModel.transform(test)
evaluator = BinaryClassificationEvaluator(labelCol="CANCELLED", metricName="areaUnderROC")
evaluator.evaluate(predictionslr)
# OneHot encode type
onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy'])

# Create 'features' vector: 'weight_kg', 'cyl', 'type_dummy'
assembler = VectorAssembler(inputCols=['weight_kg', 'cyl', 'type_dummy'], outputCol='features')

# Split the data into training and testing sets
kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23)

# Fit a Linear Regression model to the training data
regression = LinearRegression(labelCol='consumption')

# Combine steps into a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# object to evaluate performance
evaluator = RegressionEvaluator(labelCol='consumption')

# build grid of parameter values (now empty)
params = ParamGridBuilder().build()

# create cross-validation object
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=params,
                    evaluator=evaluator, numFolds=10, seed=13)

# run fit on training data
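# A minimal sketch of the fit and a quick check on the held-out split, using the
# `kars_train` / `kars_test` names defined above; the evaluation step itself is an
# assumption and not part of the original snippet.
cv_model = cv.fit(kars_train)
cv_predictions = cv_model.transform(kars_test)
print("RMSE on test data: {:.3f}".format(evaluator.evaluate(cv_predictions)))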
# fulldata = fulldata.select(['product_uid', 'id', 'tf_idf_plus', 'tf_idfs_plus', 'relevance'])

# COMPUTE COSINE
# create NEW features & train and evaluate regression model

# Step 1: create features
fulldata = fulldata.withColumnRenamed('relevance', 'label').select(['label', 'features'])

# Simple evaluation: train and test split
(train, test) = fulldata.rdd.randomSplit([0.8, 0.2])

# Initialize regression model
# lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(
    sqlContext.createDataFrame(train))
rf = RandomForestRegressor(featuresCol="indexedFeatures")
pipeline = Pipeline(stages=[featureIndexer, rf])

# Fit the model
# lrModel = lr.fit(sqlContext.createDataFrame(train))
lrModel = pipeline.fit(sqlContext.createDataFrame(train))

# Apply model to test data
result = lrModel.transform(sqlContext.createDataFrame(test))

# Compute mean squared error metric
MSE = result.rdd.map(lambda r: (r['label'] - r['prediction'])**2).mean()
print("Mean Squared Error = " + str(MSE))
# prepare labeled sets
cols_now = ['prod_price',
            'prod_feat_1',
            'prod_feat_2',
            'cust_age',
            'prod_feat_3_reduced_catVec',
            'cust_region_catVec',
            'prod_type_catVec',
            'cust_sex_catVec',
            'cust_title_catVec']
assembler_features = VectorAssembler(inputCols=cols_now, outputCol='features')
labelIndexer = StringIndexer(inputCol='binary_response', outputCol="label")
tmp += [assembler_features, labelIndexer]

# prepare a pipeline
pipeline = Pipeline(stages=tmp)

allData = pipeline.fit(df).transform(df)
allData.cache()
trainingData, testData = allData.randomSplit([0.8, 0.2], seed=0)  # need to ensure the same split each time
print("Distribution of Pos and Neg in trainingData is: ", trainingData.groupBy("label").count().take(3))

# prediction and evaluation data
from pyspark.mllib.evaluation import BinaryClassificationMetrics as metric
results = transformed.select(['probability', 'label'])
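# `transformed` in the last line above is not produced by this snippet; a minimal sketch of
# how it might be created, assuming a RandomForestClassifier is the intended model (the
# classifier choice and its parameters here are assumptions, not part of the original source).
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)
transformed = rf.fit(trainingData).transform(testData)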
label_stringIdx = StringIndexer(inputCol='CRASH_FLAG', outputCol='label')
stages += [label_stringIdx]

assemblerInputs = [c + "classVec" for c in Categoric_features] + numeric_features
assembler = VectorAssembler() \
    .setInputCols(assemblerInputs) \
    .setOutputCol("vec_features")
stages += [assembler]

scaler = StandardScaler() \
    .setInputCol("vec_features") \
    .setOutputCol("features")
stages += [scaler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(new_df)
testdf = pipelineModel.transform(new_df)
pipelineModel.write().overwrite().save(PipelineLoc)

# split the rows into 70% training and 30% testing sets
splits = testdf.randomSplit([0.7, 0.3], 2018)
train_df = splits[0]
test_df = splits[1]

# use Binomial Logistic regression to predict "CRASH_FLAG"
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
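# A minimal sketch of how training and a quick evaluation might proceed on the splits above;
# the evaluation step (and the BinaryClassificationEvaluator import) is an assumption, not
# part of the original snippet.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lrModel = lr.fit(train_df)
lr_predictions = lrModel.transform(test_df)
lr_evaluator = BinaryClassificationEvaluator(labelCol="label")
print("Area under ROC: %g" % lr_evaluator.evaluate(lr_predictions))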
def graph(t1, t2, time_min, time_sec, fb, fi, ft, fd, t1tk, t2tk, t1ik, t2ik, first_dragon, first_rift_herald):
    copyfile('lib_full.txt', 'lib3.txt')

    for k in range(0, 10):
        for j in range(0, 10):
            line = [
                100 * time_min + time_sec, 9, fb, ft, fi, fd, first_dragon, first_rift_herald,
                t1[0], t1[1], t1[2], t1[3], t1[4], k, t1ik,
                t2[0], t2[1], t2[2], t2[3], t2[4], j, t2ik
            ]
            test_line = [2]
            for i in range(len(line)):
                new_item = "%s:%s" % (i + 1, line[i])
                test_line.append(new_item)
            print(test_line)
            with open('lib3.txt', 'a') as f:
                for item in test_line:
                    f.write("%s " % item)
                f.write("\n")
                f.close()

    data = spark.read.format("libsvm").option("numFeatures", "22").load("lib3.txt")

    (trainingData, testData) = split_by_row_index(data)

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=32).fit(data)
    gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt])

    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)
    predictions.show(100, False)

    result_list = predictions.collect()

    # try:
    #     os.remove("lib3.txt")
    # except:
    #     print "file not existed"

    x1 = []
    y1 = []
    x2 = []
    y2 = []
    result1 = []
    result2 = []
    # 13, 20
    for i in range(0, 10):
        for j in range(0, 10):
            if result_list[len(result_list) - 1 - 99 + i * 10 + j]['prediction'] == 0:
                result1.append('Team 1 Win')
                x1.append(i)
                y1.append(j)
            else:
                result2.append('Team 2 Win')
                x2.append(i)
                y2.append(j)

    plt = dcc.Graph(
        id='life-exp-vs-gdp',
        figure={
            'data': [
                go.Scatter(
                    x=x1,
                    y=y1,
                    text=result1,
                    mode='markers',
                    opacity=0.7,
                    marker={
                        'size': 15,
                        'line': {'width': 0.5, 'color': 'white'}
                    },
                ),
                go.Scatter(
                    x=x2,
                    y=y2,
                    text=result2,
                    mode='markers',
                    opacity=0.7,
                    marker={
                        'size': 15,
                        'line': {'width': 0.5, 'color': 'blue'}
                    },
                )
            ],
            'layout': go.Layout(
                xaxis={'type': 'log', 'title': 'Team 1 tower kill'},
                yaxis={'title': 'Team 2 tower kill'},
                margin={'l': 40, 'b': 40, 't': 10, 'r': 10},
                legend={'x': 0, 'y': 1},
                hovermode='closest')
        })
    return plt
def build_indep_vars(df, independent_vars, categorical_vars=None, keep_intermediate=False, summarizer=True):
    """
    Data verification

    df               : DataFrame
    independent_vars : List of column names
    categorical_vars : None or list of column names, e.g. ['col1', 'col2']
    """
    assert (
        type(df) is pyspark.sql.dataframe.DataFrame
    ), 'pyspark_glm: A pySpark dataframe is required as the first argument.'
    assert (
        type(independent_vars) is list
    ), 'pyspark_glm: List of independent variable column names must be the third argument.'
    for iv in independent_vars:
        assert (
            type(iv) is str
        ), 'pyspark_glm: Independent variables must be column name strings.'
        assert (
            iv in df.columns
        ), 'pyspark_glm: Independent variable name is not a dataframe column.'
    if categorical_vars:
        for cv in categorical_vars:
            assert (
                type(cv) is str
            ), 'pyspark_glm: Categorical variables must be column name strings.'
            assert (
                cv in df.columns
            ), 'pyspark_glm: Categorical variable name is not a dataframe column.'
            assert (
                cv in independent_vars
            ), 'pyspark_glm: Categorical variables must be independent variables.'

    """
    Code
    """
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.ml.regression import GeneralizedLinearRegression

    if categorical_vars:
        string_indexer = [
            StringIndexer(inputCol=x, outputCol='{}_index'.format(x))
            for x in categorical_vars
        ]
        encoder = [
            OneHotEncoder(dropLast=True, inputCol='{}_index'.format(x), outputCol='{}_vector'.format(x))
            for x in categorical_vars
        ]
        independent_vars = [
            '{}_vector'.format(x) if x in categorical_vars else x
            for x in independent_vars
        ]
    else:
        string_indexer, encoder = [], []

    assembler = VectorAssembler(inputCols=independent_vars, outputCol='indep_vars')
    pipeline = Pipeline(stages=string_indexer + encoder + [assembler])
    model = pipeline.fit(df)
    df = model.transform(df)

    # for building the crosswalk between indices and column names
    if summarizer:
        param_crosswalk = {}
        i = 0
        for x in independent_vars:
            if '_vector' in x[-7:]:
                xrs = x.rstrip('_vector')
                dst = df[[xrs, '{}_index'.format(xrs)]].distinct().collect()
                for row in dst:
                    param_crosswalk[int(row['{}_index'.format(xrs)] + i)] = row[xrs]
                maxind = max(param_crosswalk.keys())
                del param_crosswalk[maxind]  # for droplast
                i += len(dst)
            elif '_index' in x[:-6]:
                pass
            else:
                param_crosswalk[i] = x
                i += 1
        """
        {0: 'carat',
         1: u'SI1',
         2: u'VS2',
         3: u'SI2',
         4: u'VS1',
         5: u'VVS2',
         6: u'VVS1',
         7: u'IF'}
        """
        make_summary = Summarizer(param_crosswalk)

    if not keep_intermediate:
        fcols = [
            c for c in df.columns
            if '_index' not in c[-6:] and '_vector' not in c[-7:]
        ]
        df = df[fcols]

    if summarizer:
        return df, make_summary
    else:
        return df
).toDF()

df_new = sc.parallelize(
    [
        Row(p=u'p1', owner=u'u1', f1=0.1, f2=0.3, f3=0.5),
        Row(p=u'p2', owner=u'u1', f1=0.3, f2=0.5, f3=0.5),
        Row(p=u'p3', owner=u'u1', f1=0.6, f2=0.6, f3=0.9),
        Row(p=u'p4', owner=u'u1', f1=0.8, f2=0.1, f3=0.6),
        Row(p=u'p5', owner=u'u1', f1=0.0, f2=0.2, f3=0.2),
        Row(p=u'p1', owner=u'u2', f1=0.0, f2=0.4, f3=0.1),
        Row(p=u'p2', owner=u'u2', f1=0.3, f2=0.7, f3=0.4),
        Row(p=u'p3', owner=u'u2', f1=0.4, f2=0.6, f3=0.6),
        Row(p=u'p4', owner=u'u2', f1=0.6, f2=0.1, f3=0.7),
        Row(p=u'p5', owner=u'u2', f1=0.0, f2=0.0, f3=0.8),
    ]
).toDF()

owner_training = df_training.where(col('owner') == 'u1')
owner_new = df_new.where(col('owner') == 'u1')

label_indexer = StringIndexer(inputCol="status", outputCol="indexedStatus")
assembler = VectorAssembler(inputCols=['f1', 'f2', 'f3'], outputCol='features')
rf = RandomForestClassifier(labelCol="indexedStatus", featuresCol="features")

pipeline = Pipeline(stages=[label_indexer, assembler, rf])
model = pipeline.fit(owner_training)

predictions = model.transform(owner_new)
predictions.show()
    .getOrCreate()

# Prepare training documents from a list of (id, text, label) tuples, where label ham=0/spam=1
training = spark.createDataFrame([
    (0, "Meetup Spark user group Dublin", 0.0),
    (1, "Quick Loans availuble!", 1.0),
    (2, "New: The 20 pounds-per-day diet. Must try.", 1.0),
    (3, "hadoop mapreduce", 0.0),
    (4, "GET YOUR UUNIVERSITY DEGREE IN DATA ANALYSTICS. IN JUST 1 DAY", 1.0)
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame(
    [(5, "I am not a spam, I promise!!!"),
     (6, "Spark can work on top of hadoop or standalone"),
     (7, "New release available for Spark on DataSets")],
    ["id", "text"])

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    rid, text, prob, prediction = row
print('Building Pipeline.\n')
regexTokenizer = RegexTokenizer(inputCol='review', outputCol='tokenized', pattern="\\W")
stopwordsRemover = StopWordsRemover(inputCol='tokenized', outputCol='removed_sw').setStopWords(sw)
countVectorizer = CountVectorizer(inputCol="removed_sw", outputCol="features", vocabSize=10000, minDF=5)
lr = LogisticRegression(featuresCol='features', labelCol='label')

pipeline = Pipeline(
    stages=[regexTokenizer, stopwordsRemover, countVectorizer, lr])

# Split data into training and testing partitions
print('Splitting data.\n')
train_data, test_data = data.randomSplit([0.9, 0.1])

# Fit and transform training data
print('Fitting training data to pipeline.\n')
pipelineFit = pipeline.fit(train_data)
transformed_train_data = pipelineFit.transform(train_data)

# Predict on test data
print('Predicting on testing data.\n')
test_predictions = pipelineFit.transform(test_data)

# Calculate Metrics
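# A minimal sketch of how the metrics might be calculated from `test_predictions`,
# assuming accuracy on the 'label' / 'prediction' columns; the evaluator choice and
# its import are assumptions, not part of the original script.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

accuracy_evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                       predictionCol='prediction',
                                                       metricName='accuracy')
print('Test accuracy: {:.3f}'.format(accuracy_evaluator.evaluate(test_predictions)))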
# Split the data into train/test data and streaming data
train_test_data, streaming_data = df_indexed.randomSplit([0.95, 0.05])

# SAVING STREAMING DATA
streaming_data.write.save("OutputGStore\\" + unique_key + "-streaming-data.csv",
                          format="csv", header="true")
del streaming_data

## STEP 2: Prepare, train and validate the data
print("STEP 2: Train and validate the model")

feature_cols = train_test_data.columns
feature_cols.remove('Installs indexed')

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="error")
pipeline = Pipeline(stages=[assembler])
outputModel = pipeline.fit(train_test_data)
output = outputModel.transform(train_test_data)
final_data = output.select("features", "Installs indexed")

train_data, test_data = final_data.randomSplit([0.7, 0.3])

# CLASSIFICATION CODE
# Random forest classifier
rf = RandomForestClassifier(labelCol="Installs indexed", featuresCol="features",
                            numTrees=32, maxBins=120)
model = rf.fit(train_data)
predictions = model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(
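# The snippet is truncated at the evaluator call above; a minimal sketch of how the
# evaluation might be completed (the evaluator arguments and the accuracy metric are
# assumptions, not taken from the original source).
evaluator = MulticlassClassificationEvaluator(labelCol="Installs indexed",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test accuracy = %g" % accuracy)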
    .getOrCreate()

# Prepare training documents from a list of (id, text, label) tuples.
training = sparkSession.createDataFrame([(0, "a b c d e spark", 1.0),
                                         (1, "b d", 0.0),
                                         (2, "spark f g h", 1.0),
                                         (3, "hadoop mapreduce", 0.0)],
                                        ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and logistic regression.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
logistic_regression = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, logistic_regression])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# Prepare test documents, which are unlabeled (id, text) tuples.
test = sparkSession.createDataFrame([(4, "spark i j k"),
                                     (5, "l m n"),
                                     (6, "spark hadoop spark"),
                                     (7, "apache hadoop")],
                                    ["id", "text"])

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    rid, text, prob, prediction = row
    print("(%d, %s) --> prob=%s, prediction=%f" %
          (rid, text, str(prob), prediction))
# Change categorical values into numeric
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in catColumns
]
encoder = OneHotEncoderEstimator(
    inputCols=[c + "_index" for c in catColumns],
    outputCols=[c + "_vector" for c in catColumns])
assembler = VectorAssembler(inputCols=encoder.getOutputCols() + numColumns,
                            outputCol="features")
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")

pipeline = Pipeline(stages=indexers + [label_stringIdx, encoder, assembler])
encoded_df = pipeline.fit(df).transform(df)

selectedCols = ['label', 'features'] + cols
dataset = encoded_df.select(selectedCols)

# Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

# fit model and train
lrModel = LogisticRegression().fit(encoded_df)
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
# COMMAND ----------

# Regularization Rates
from pyspark.ml.classification import LogisticRegression

# try a bunch of alpha values in a Linear Regression (Ridge) model
reg = 0
print("Regularization rate: {}".format(reg))

# create a bunch of child runs
# with root_run.child_run("reg-" + str(reg)) as run:

# create a new Logistic Regression model.
lr = LogisticRegression(regParam=reg)

# put together the pipeline
pipe = Pipeline(stages=[*si_xvars, *ohe_xvars, si_label, assembler, lr])

# train the model
model_pipeline = pipe.fit(trainingData)

# make prediction
predictions = model_pipeline.transform(testData)

# evaluate. note only 2 metrics are supported out of the box by Spark ML.
bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
au_roc = bce.setMetricName('areaUnderROC').evaluate(predictions)
au_prc = bce.setMetricName('areaUnderPR').evaluate(predictions)

truePositive = predictions.select("label").filter(
    "label = 1 and prediction = 1").count()
falsePositive = predictions.select("label").filter(
    "label = 0 and prediction = 1").count()
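# A minimal sketch of the remaining confusion-matrix counts and derived metrics, computed
# from the same `predictions` DataFrame; the trueNegative/falseNegative names and the
# precision/recall calculations are assumptions, not part of the original cell.
trueNegative = predictions.select("label").filter(
    "label = 0 and prediction = 0").count()
falseNegative = predictions.select("label").filter(
    "label = 1 and prediction = 0").count()

precision = truePositive / float(truePositive + falsePositive) if (truePositive + falsePositive) else 0.0
recall = truePositive / float(truePositive + falseNegative) if (truePositive + falseNegative) else 0.0
print("Precision: {:.3f}  Recall: {:.3f}".format(precision, recall))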
# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler()\
    .setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])\
    .setOutputCol("features")

# COMMAND ----------

from pyspark.ml import Pipeline

transformationPipeline = Pipeline()\
    .setStages([indexer, encoder, vectorAssembler])

# COMMAND ----------

fittedPipeline = transformationPipeline.fit(trainDataFrame)

# COMMAND ----------

transformedTraining = fittedPipeline.transform(trainDataFrame)

# COMMAND ----------

from pyspark.ml.clustering import KMeans
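# COMMAND ----------

# KMeans is imported above but the cell that uses it is not part of this snippet; a minimal
# sketch of how the transformed training data might be clustered. The choice of k=20, the
# seed, and the computeCost call (Spark 2.x API) are assumptions, not from the original cells.
kmeans = KMeans().setK(20).setSeed(1)
kmModel = kmeans.fit(transformedTraining)
print(kmModel.computeCost(transformedTraining))  # within-set sum of squared distances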
def train_model_sentences_with_person():
    sentences_with_person_collection = get_db_collection_object('SentencesWithPerson')

    with open("sentences_with_person.txt", "w", encoding='utf-8') as file_sentences_with_person:
        for sen in sentences_with_person_collection.find():
            file_sentences_with_person.write('{0}\n'.format(sen['sentence']))

    spark = SparkSession \
        .builder \
        .appName("SentenceProcessor") \
        .getOrCreate()

    input_data = spark.sparkContext.textFile('./sentences_with_person.txt')

    prepared_data = input_data.map(lambda x: (x, len(x)))
    prepared_data = prepared_data.filter(lambda x: x[1] > 0)

    prepared_df = prepared_data.toDF().selectExpr('_1 as sentence', '_2 as length')
    # prepared_df.show(truncate=False)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    words_data = tokenizer.transform(prepared_df)
    # words_data.show(truncate=False)

    # Filter the tokens, keeping only words
    filtered_words_data = words_data.rdd.map(
        lambda x: (x[0], x[1], get_only_words(x[2])))
    filtered_df = filtered_words_data.toDF().selectExpr(
        '_1 as sentence', '_2 as length', '_3 as words')
    # filtered_df.show()

    # Remove stop words (conjunctions, prepositions, pronouns, etc.)
    stop_words = stopwords.words('russian')
    remover = StopWordsRemover(inputCol='words', outputCol='filtered', stopWords=stop_words)
    filtered = remover.transform(filtered_df)

    normalize_words_data = filtered.rdd.map(
        lambda x: (x[0], x[1], x[2], normalization_sentence(x[3])))
    normalized_df = normalize_words_data.toDF().selectExpr(
        '_1 as sentence', '_2 as length', '_3 as words', '_4 as normalize_words')
    # normalized_df.show()

    vectorizer = CountVectorizer(inputCol='normalize_words', outputCol='raw_features').fit(normalized_df)
    featurized_data = vectorizer.transform(normalized_df)
    featurized_data.cache()

    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)

    # Build the Word2Vec model
    word2Vec = Word2Vec(vectorSize=300, minCount=0, inputCol='normalize_words', outputCol='result')
    doc2vec_pipeline = Pipeline(stages=[tokenizer, word2Vec])
    model = word2Vec.fit(rescaled_data)
    w2v_df = model.transform(rescaled_data)
    # w2v_df.show(truncate=False)

    # print(model.findSynonyms('бочаров', 2).show())
    # sc = spark.sparkContext

    path = './models/model_person'
    # print(sc, path)
    model.write().overwrite().save(path)
    # m = Word2Vec.load('./models/model_person/')
    # pickle.dump(model, './models/model_person/mp.model')

    spark.stop()
spark = SparkSession.builder.appName("Convolutional Neural Networks - Transfer Learning - Image Recognition").getOrCreate()

# (3) Load the Plane and Bird images into Spark DataFrames and define a literal label column
path_to_img_directory = '/data/workspaces/jillur.quddus/jupyter/notebooks/Machine-Learning-with-Apache-Spark-QuickStart-Guide/chapter07/data/image-recognition-data'
birds_df = ImageSchema.readImages(path_to_img_directory + "/birds").withColumn("label", lit(0))
planes_df = ImageSchema.readImages(path_to_img_directory + "/planes").withColumn("label", lit(1))

# (4) Create Training and Test DataFrames respectively
planes_train_df, planes_test_df = planes_df.randomSplit([0.75, 0.25], seed=12345)
birds_train_df, birds_test_df = birds_df.randomSplit([0.75, 0.25], seed=12345)
train_df = planes_train_df.unionAll(birds_train_df)
test_df = planes_test_df.unionAll(birds_test_df)

# (5) Transform the Images into Numeric Feature Vectors using Transfer Learning and the pre-trained InceptionV3 Convolutional Neural Network
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")

# (6) Train a Logistic Regression Model to classify our images
logistic_regression = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")

# (7) Execute the Featurizer and Logistic Regression estimator within a Pipeline to generate the Trained Model
pipeline = Pipeline(stages=[featurizer, logistic_regression])
model = pipeline.fit(train_df)

# (8) Apply the Trained Image Classification Model to the Test DataFrame to make predictions
test_predictions_df = model.transform(test_df)
test_predictions_df.select("image.origin", "prediction").show(truncate=False)

# (9) Compute the accuracy of our Trained Image Classification Model
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Accuracy on Test Dataset = %g" % accuracy_evaluator.evaluate(
    test_predictions_df.select("label", "prediction")))
# Read the source file and create a DataFrame
ModalDF = sqlContext.read.csv(TrainSource, header="True", inferSchema="True").selectExpr("*", ConditionExpr)

# Transform the input columns into a single vector column called "features"
vectorizer = VectorAssembler()
vectorizer.setInputCols(["Lat", "Long", "Ele", "LocalTime"])
vectorizer.setOutputCol("features")

# Declare objects for each regression
lr0 = LogisticRegression(labelCol="Condition", predictionCol="Predicted_Cond", maxIter=100, regParam=0, family="multinomial")
lr1 = LinearRegression(labelCol="Temp", predictionCol="Predicted_Temp", maxIter=100, regParam=0.1)
lr2 = LinearRegression(labelCol="Pres", predictionCol="Predicted_Pres", maxIter=100, regParam=0.1)
lr3 = LinearRegression(labelCol="Humid", predictionCol="Predicted_Humid", maxIter=100, regParam=0.1)

# Combine all the regressions in a pipeline and fit the dataset to create a model
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr1, lr2, lr3, lr0])
lrModel = lrPipeline.fit(ModalDF)

# COMMAND ----------

"""
The following code takes the test dataset and performs the following actions:
 - Gets geo information (latitude, longitude, elevation)
 - Gets monthly data & timestamps for each record
 - Predicts the temperature, pressure, humidity & condition using the pipeline model
 - Changes the format and writes it to a file
"""

# Extract and transform TestRDD
TestRDD = sc.textFile(TestSource).map(lambda line: list(get_geo_info(line))).flatMap(lambda line: list(get_datetime_info(line)))
def data_processing(df):
    '''
    :param df: a raw listings PySpark DataFrame
    :return: a preprocessed DataFrame that has been cleaned, indexed, encoded and assembled
    '''
    df.createOrReplaceTempView("data")

    processed_data = spark.sql("""
        select
            host_id, price, bathrooms, bedrooms, room_type, property_type,
            case when host_is_superhost = True then 1.0 else 0.0 end as host_is_superhost,
            accommodates, cancellation_policy, minimum_nights, maximum_nights,
            availability_30, availability_60, availability_90, availability_365,
            case when security_deposit is null then 0.0 else security_deposit end as security_deposit,
            case when number_of_reviews is null then 0.0 else number_of_reviews end as number_of_reviews,
            case when extra_people is null then 0.0 else extra_people end as extra_people,
            case when instant_bookable = True then 1.0 else 0.0 end as instant_bookable,
            case when cleaning_fee is null then 0.0 else cleaning_fee end as cleaning_fee,
            case when review_scores_rating is null then 0.0 else review_scores_rating end as review_scores_rating,
            case when review_scores_accuracy is null then 0.0 else review_scores_accuracy end as review_scores_accuracy,
            case when review_scores_cleanliness is null then 0.0 else review_scores_cleanliness end as review_scores_cleanliness,
            case when review_scores_checkin is null then 0.0 else review_scores_checkin end as review_scores_checkin,
            case when review_scores_communication is null then 0.0 else review_scores_communication end as review_scores_communication,
            case when review_scores_location is null then 0.0 else review_scores_location end as review_scores_location,
            case when review_scores_value is null then 0.0 else review_scores_value end as review_scores_value,
            case when square_feet is not null and square_feet > 100 then square_feet
                 when (square_feet is null or square_feet <= 100) and (bedrooms is null or bedrooms = 0) then 350.0
                 else 380 * bedrooms end as square_feet,
            case when bathrooms >= 2 then 1.0 else 0.0 end as n_bathrooms_more_than_two,
            case when amenity_wifi = True then 1.0 else 0.0 end as amenity_wifi,
            case when amenity_heating = True then 1.0 else 0.0 end as amenity_heating,
            case when amenity_essentials = True then 1.0 else 0.0 end as amenity_essentials,
            case when amenity_kitchen = True then 1.0 else 0.0 end as amenity_kitchen,
            case when amenity_tv = True then 1.0 else 0.0 end as amenity_tv,
            case when amenity_smoke_detector = True then 1.0 else 0.0 end as amenity_smoke_detector,
            case when amenity_washer = True then 1.0 else 0.0 end as amenity_washer,
            case when amenity_hangers = True then 1.0 else 0.0 end as amenity_hangers,
            case when amenity_laptop_friendly_workspace = True then 1.0 else 0.0 end as amenity_laptop_friendly_workspace,
            case when amenity_iron = True then 1.0 else 0.0 end as amenity_iron,
            case when amenity_shampoo = True then 1.0 else 0.0 end as amenity_shampoo,
            case when amenity_hair_dryer = True then 1.0 else 0.0 end as amenity_hair_dryer,
            case when amenity_family_kid_friendly = True then 1.0 else 0.0 end as amenity_family_kid_friendly,
            case when amenity_dryer = True then 1.0 else 0.0 end as amenity_dryer,
            case when amenity_fire_extinguisher = True then 1.0 else 0.0 end as amenity_fire_extinguisher,
            case when amenity_hot_water = True then 1.0 else 0.0 end as amenity_hot_water,
            case when amenity_internet = True then 1.0 else 0.0 end as amenity_internet,
            case when amenity_cable_tv = True then 1.0 else 0.0 end as amenity_cable_tv,
            case when amenity_carbon_monoxide_detector = True then 1.0 else 0.0 end as amenity_carbon_monoxide_detector,
            case when amenity_first_aid_kit = True then 1.0 else 0.0 end as amenity_first_aid_kit,
            case when amenity_host_greets_you = True then 1.0 else 0.0 end as amenity_host_greets_you,
            case when amenity_translation_missing_en_hosting_amenity_50 = True then 1.0 else 0.0 end as amenity_translation_missing_en_hosting_amenity_50,
            case when amenity_private_entrance = True then 1.0 else 0.0 end as amenity_private_entrance,
            case when amenity_bed_linens = True then 1.0 else 0.0 end as amenity_bed_linens,
            case when amenity_refrigerator = True then 1.0 else 0.0 end as amenity_refrigerator
        from data
        where bedrooms is not null
    """)

    processed_data = processed_data.na.drop()

    # Split the columns by type: strings are treated as categorical,
    # integer/decimal/double columns as numeric features.
    cat_cols = [f.name for f in processed_data.schema.fields if isinstance(f.dataType, StringType)]
    num_cols = [f.name for f in processed_data.schema.fields if isinstance(f.dataType, IntegerType)]
    decimal_cols = [f.name for f in processed_data.schema.fields if isinstance(f.dataType, DecimalType)]
    double_cols = [f.name for f in processed_data.schema.fields if isinstance(f.dataType, DoubleType)]
    num_features = num_cols + decimal_cols + double_cols

    dataset_imputed = processed_data.persist()

    # Index and one-hot encode every categorical column, then assemble all features into one vector.
    stages = []
    for x in cat_cols:
        cats_indexer = StringIndexer(inputCol=x, outputCol=x + 'Index')
        encoder = OneHotEncoderEstimator(inputCols=[cats_indexer.getOutputCol()],
                                         outputCols=[x + "encode"])
        stages += [cats_indexer, encoder]

    assembler_inputs = [c + "encode" for c in cat_cols] + num_features
    assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
    stages += [assembler]

    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(dataset_imputed)
    df = pipeline_model.transform(dataset_imputed)

    return df
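# A minimal usage sketch of data_processing. The names `spark` (an active SparkSession) and
# `listings_path` (a CSV of listings containing the columns referenced in the SQL above) are
# hypothetical placeholders.
raw_df = spark.read.csv(listings_path, header=True, inferSchema=True)
features_df = data_processing(raw_df)
features_df.select("features", "price").show(5, truncate=False)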
credit.printSchema()

featureCols = [
    "balance", "duration", "history", "purpose", "amount", "savings",
    "employment", "instPercent", "sexMarried", "guarantors",
    "residenceDuration", "assets", "age", "concCredit", "apartment",
    "credits", "occupation", "dependents", "hasPhone", "foreign"
]

lindexer = StringIndexer().setInputCol("creditability").setOutputCol("label")
assembler = VectorAssembler().setInputCols(featureCols).setOutputCol("features")

pipeline = Pipeline().setStages([assembler, lindexer])
credit = pipeline.fit(credit).transform(credit)

(training, test) = credit.randomSplit([0.7, 0.3], seed=1234)

classifier = RandomForestClassifier().setImpurity("gini").setMaxDepth(3) \
    .setNumTrees(20).setFeatureSubsetStrategy("auto").setSeed(1234)

model = classifier.fit(training)
predictions = model.transform(test)
predictions.show()

evaluator = BinaryClassificationEvaluator().setLabelCol("label")
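# Hedged follow-up: the evaluator above uses its defaults (metric areaUnderROC,
# rawPredictionCol "rawPrediction", which RandomForestClassifier produces), so it can
# score the test predictions directly.
auc = evaluator.evaluate(predictions)
print("Area under ROC = %g" % auc)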
labelCol="label") elif algo == "xgboost": ## Create H2OXGBoost model algoStage = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True, featuresCols=[idf.getOutputCol()], labelCol="label") ## Remove all helper columns colPruner = ColumnPruner(columns=[ idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol() ]) ## Create the pipeline by defining all the stages pipeline = Pipeline( stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner]) ## Test exporting and importing the pipeline. On Systems where HDFS & Hadoop is not available, this call store the pipeline ## to local file in the current directory. In case HDFS & Hadoop is available, this call stores the pipeline to HDFS home ## directory for the current user. Absolute paths can be used as wells. The same holds for the model import/export bellow. pipeline.write().overwrite().save("examples/build/pipeline") loaded_pipeline = Pipeline.load("examples/build/pipeline") ## Train the pipeline model data = load() model = loaded_pipeline.fit(data) model.write().overwrite().save("examples/build/model") loaded_model = PipelineModel.load("examples/build/model")
kmeans_df = sqlContext.read.format("com.databricks.spark.csv") \
    .option("header", "false").option("delimiter", " ").option("inferSchema", "true") \
    .load("/FileStore/tables/1x1xr57q1502297004187/kmeans_data.txt")

# Prepare data for training (see later the explanation about ML Pipelines)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

assembler = VectorAssembler(inputCols=["_c0", "_c1", "_c2"], outputCol="features")
# Note: the result of this standalone transform is not captured; the pipeline below
# applies the assembler again as its first stage.
assembler.transform(kmeans_df)

# Create the KMeans estimator
kmeans_estimator = KMeans().setFeaturesCol("features").setPredictionCol("prediction")

# Pipeline stages definition
pipeline = Pipeline(stages=[assembler, kmeans_estimator])

# Pipeline training
model = pipeline.fit(kmeans_df)

# Get the results:
results = model.transform(kmeans_df)

# Check results:
display(results)

# Without using Pipelines:
# Clustering
# Set maxCategories so features with > 4 distinct values are treated as continuous. featureIndexer =\ VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(output) # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train a RandomForest model. rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10) # Convert indexed labels back to original labels. labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels) # Chain indexers and forest in a Pipeline pipeline = Pipeline(stages=[assembler, labelIndexer, featureIndexer, rf, labelConverter]) # Train model. This also runs the indexers. model = pipeline.fit(trainingData) # Make predictions. predictions = model.transform(testData) # Select example rows to display. predictions.select("predictedLabel", "species", "features").show(5) # Select (prediction, true label) and compute test error evaluator = MulticlassClassificationEvaluator( labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy") accuracy = evaluator.evaluate(predictions) print("Test Error = %g" % (1.0 - accuracy))
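# Hedged follow-up: in the pipeline above, the fitted RandomForestClassificationModel is the
# second-to-last stage (just before the IndexToString converter); its featureImportances
# vector shows which assembled features drove the splits.
rfModel = model.stages[-2]
print(rfModel)                      # brief summary of the trained forest
print(rfModel.featureImportances)   # sparse vector of per-feature importances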
.csv(path_train)

train.persist()
print("Number of cases in the training set: %d" % train.count())

ignore_c = ['MachineIdentifier', 'HasDetections']
train_cols = [c for c in train.columns if c not in ignore_c]

# Convert the training data into a single feature vector so it can be fed to the estimator
print('Converting the data with VectorAssembler')
assembler_features = VectorAssembler(inputCols=train_cols, outputCol='features')

train_2 = train.limit(10000)
train_data = assembler_features.transform(train_2)
train_data = train_data.select('features', 'HasDetections')\
    .withColumnRenamed('HasDetections', 'label')

xgboost = XGBoostEstimator(featuresCol="features", labelCol="label", predictionCol="prediction")
pipeline = Pipeline().setStages([xgboost])

trainDF, testDF = train_data.randomSplit([0.8, 0.2], seed=24)
model = pipeline.fit(trainDF)

preds = model.transform(testDF)
preds.select(col("label"), col("prediction")).show()
preds.show()
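# A hedged follow-up sketch: score the hold-out split. Accuracy on the "prediction" and "label"
# columns avoids depending on the exact probability/raw-prediction columns the XGBoost wrapper emits.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
print("Test accuracy: %g" % evaluator.evaluate(preds))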
outputCol="features") # COMMAND ---------- trainingFeatureTest = featureAssembler.transform(trainingFeatureTest) display(trainingFeatureTest.select("Survived", "indexedLabel", "Embarked", "feature2", "features")) # COMMAND ---------- # Train a GBT model. logisticRegression = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol="indexedLabel", featuresCol="features") #gbtClassifier = GBTClassifier(labelCol="indexedLabel", featuresCol="features", maxIter=10) # COMMAND ---------- pipeline = Pipeline(stages=[labelIndexer, featureIndexer1, featureIndexer2, featureAssembler, logisticRegression]) # COMMAND ---------- model = pipeline.fit(training) # COMMAND ---------- treeModel = model.stages[-1] # summary only #display(treeModel) # COMMAND ---------- training_predictions = model.transform(training)
# We specify maxCategories so features with > 4 distinct values are treated as continuous. # (maxCategories is not set at the moment, however) # feature_indexer = VectorIndexer(inputCol="features", outputCol="indexed") class_indexer = StringIndexer(inputCol="C4", outputCol="label") # Read in data for sensitivity analysis test_data = sql_context.read.load('tests/resources/iris_test_data.csv', format='com.databricks.spark.csv', header='false', inferSchema='true') # Train a DecisionTree model. dt = DecisionTreeRegressor(featuresCol="features", labelCol="label") # Chain indexer and tree in a Pipeline pipeline = Pipeline(stages=[assembler, class_indexer, dt]) # Train model. This also runs the indexer. model = pipeline.fit(data) # Get our data_info frame, courtesy of PSAML cols_to_analyze = ['C0', 'C1', 'C2', 'C3'] data_info = psaml.make_data_info(sql_context, test_data, cols_to_analyze, 'C4') # Make predictions. predictions = psaml.do_continuous_input_analysis(sc, model, 5, 5, data_info) # Select example rows to display. # predictions.show() # opt param: number of records to show
outputCol="embarkedVec") # Create the vector structured data (label,features(vector)) assembler = VectorAssembler(inputCols=[ "Pclass", "sexVec", "Age", "SibSp", "Parch", "Fare", "embarkedVec" ], outputCol="features") from pyspark.ml.classification import LogisticRegression lr = LogisticRegression(labelCol="Survived", featuresCol="features", maxIter=10) # set up pipeline pipeline = Pipeline(stages=[ genderIndexer, embarkIndexer, genderEncoder, embarkEncoder, assembler, lr ]) # split the data from pyspark.ml.tuning import TrainValidationSplit train, test = df.randomSplit([0.7, 0.3], seed=41) # fit the model model = pipeline.fit(train) # make prediction predictions = model.transform(test) lrmodel = model.stages[-1] print("Coefficients: " + str(lrmodel.coefficientMatrix))
'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary' ], outputCol="features") scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False) #model preparation #create Logistic Regression object LR = LogisticRegression(labelCol='Exited', featuresCol='scaledFeatures', predictionCol='Prediction') #create pipeline pipeline = Pipeline(stages=[assembler, scaler, LR]) #train model model = pipeline.fit(ds_Churn_Modelling) #make prediction predictions = model.transform(ds_Churn_Modelling) predictions.select('Prediction', 'Exited').show() #save the model in hdfs. #Note - you need a valid hdfs location with livy permission model.write().overwrite().save('/LR_chrun_modelling_pyspark')
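# Hedged follow-up sketch: reload the persisted pipeline model and reuse it for scoring,
# using the same HDFS path the model was just saved to.
from pyspark.ml import PipelineModel
restored_model = PipelineModel.load('/LR_chrun_modelling_pyspark')
restored_model.transform(ds_Churn_Modelling).select('Prediction', 'Exited').show(5)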
def main():
    # Initialize spark and MLOps
    spark = SparkSession.builder.appName("RandomForestClassifier").getOrCreate()
    mlops.init(spark.sparkContext)

    # parse the arguments to component
    options = parse_args()

    print("PM: Configuration:")
    print("PM: Number of trees: [{}]".format(options.num_trees))
    print("PM: Maximum depth: [{}]".format(options.max_depth))
    print("PM: Output model: [{}]".format(options.output_model))
    print("PM: Temp shared path: [{}]".format(options.temp_shared_path))

    # Generate synthetic data using scikit-learn
    num_samples = 50
    num_features = 20
    num_classes = 3
    X, y = make_classification(n_samples=num_samples, n_features=num_features, n_informative=2, n_redundant=1,
                               n_classes=num_classes, n_clusters_per_class=1, random_state=42)
    X = X + np.random.uniform(0, 5) * np.random.normal(0, 1, (num_samples, num_features))

    # First name is reserved for the label; the rest are feature columns
    feature_names = [ascii_lowercase[a] for a in range(num_features + 1)]
    feature_names[0] = "label"

    # Create a spark dataframe from the synthetic data generated
    trainingData = spark.createDataFrame(
        pd.DataFrame(np.concatenate((y.reshape(-1, 1), X), axis=1), columns=feature_names))

    # Histogram of label distribution
    value, counts = np.unique(y, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols((label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output Health Statistics to MCenter
    # Report features whose distribution should be compared during inference
    mlops.set_data_distribution_stat(trainingData)

    # Fit a random forest classification model
    assembler = VectorAssembler(inputCols=feature_names[1:num_features + 1], outputCol="features")

    layers = [num_features, 5, 4, num_classes]  # note: not used by the random forest below

    classifier = RandomForestClassifier(numTrees=int(options.num_trees), maxDepth=int(options.max_depth))
    pipeline = Pipeline(stages=[assembler, classifier])
    model = pipeline.fit(trainingData)
    predictions = model.transform(trainingData)

    # Select (prediction, true label) and compute training error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    # Report accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Save the spark model
    SparkPipelineModelHelper() \
        .set_shared_context(spark_context=spark.sparkContext) \
        .set_local_path(local_path=options.output_model) \
        .set_shared_path_prefix(shared_path_prefix=options.temp_shared_path) \
        .save_sparkml_model(model)

    # Stop spark context and MLOps
    spark.sparkContext.stop()
    mlops.done()
    [RowToImageFeature(), ImageResize(256, 256), ImageCenterCrop(224, 224),
     ImageChannelNormalize(123.0, 117.0, 104.0), ImageMatToTensor(), ImageFeatureToTensor()])

preTrainedNNModel = NNModel(Model.loadModel(options.model_path), transformer) \
    .setFeaturesCol("image") \
    .setPredictionCol("embedding")

lrModel = Sequential().add(Linear(1000, 2)).add(LogSoftMax())
classifier = NNClassifier(lrModel, ClassNLLCriterion(), SeqToTensor([1000])) \
    .setLearningRate(options.learning_rate) \
    .setOptimMethod(Adam()) \
    .setBatchSize(options.batch_size) \
    .setMaxEpoch(options.nb_epoch) \
    .setFeaturesCol("embedding") \
    .setCachingSample(False)

pipeline = Pipeline(stages=[preTrainedNNModel, classifier])

catdogModel = pipeline.fit(trainingDF)
predictionDF = catdogModel.transform(validationDF).cache()
predictionDF.sample(False, 0.1).show()

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictionDF)
# expected error should be less than 10%
print("Test Error = %g " % (1.0 - accuracy))

print("finished...")
sc.stop()
train_file = sys.argv[1]
trainingData = sqlContext.read.format("libsvm").load(train_file)

test_file = sys.argv[8]
testData = sqlContext.read.format("libsvm").load(test_file)

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(trainingData)

# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])

'''
TODO:
 - use union to add the first column to testData
 - the row needs to be converted into a DataFrame
 - the first row of testData needs to be removed, using original.subtract(firstRowDF)
'''
tmp_min1 = find_min_label(trainingData.collect())
tmp_min2 = find_min_label(testData.collect())
tmp_min = min(tmp_min1, tmp_min2)

# collect the DataFrames into local lists of Rows
test = testData.collect()
test = log_sinh_transform(test, tmp_min)
# test_id, test = collect_id(test)

train = trainingData.collect()
# Module Constants
APP_NAME = "reddit-comment-karma-regression"
REDDIT_AUG = "swift://reddit3.sjc01/RC_2015-08"
REDDIT_SEPT = "swift://reddit3.sjc01/RC_2015-09"

if __name__ == "__main__":
    # Configure Spark
    sc = SparkContext(appName=APP_NAME)
    sqlContext = SQLContext(sc)

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # prepare Reddit json files as sql Dataframes for pyspark.ml
    aug_comments = sqlContext.read.json(REDDIT_AUG)
    sep_comments = sqlContext.read.json(REDDIT_SEPT)

    # transform data for log_reg model by dividing karma score by 1000
    # error: Classification labels should be in {0 to 8114} Found 2576839 invalid labels.
    training = aug_comments.select('id', 'body', (aug_comments.score / 1000.0).cast("double").alias('label'))
    test = sep_comments.select('id', 'body')
    test_actual = sep_comments.select('id', (sep_comments.score / 1000.0).alias('actual'))

    model = pipeline.fit(training)
    prediction = model.transform(test)
    # select the comment body (the test DataFrame has no "text" column) together with the prediction
    selected = prediction.select("id", "body", "prediction").join(test_actual, prediction.id == test_actual.id)
    selected.write.format('json').save("hdfs://master/usr/hadoop/karma_predictions")
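    # Hedged follow-up sketch: compare the predicted (scaled) karma against the actual September
    # scores. The mean-absolute-error style check is an assumption; the snippet itself notes that
    # treating scaled karma as a classification label is problematic.
    from pyspark.sql.functions import abs as sql_abs, avg
    joined = prediction.select("id", "prediction").join(test_actual, "id")
    joined.select(avg(sql_abs(joined.prediction - joined.actual)).alias("MAE")).show()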
# Load the data stored in LIBSVM format as a DataFrame. data = spark.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt") # Automatically identify categorical features, and index them. # We specify maxCategories so features with > 4 distinct values are treated as continuous. featureIndexer =\ VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train a DecisionTree model. dt = DecisionTreeRegressor(featuresCol="indexedFeatures") # Chain indexer and tree in a Pipeline pipeline = Pipeline(stages=[featureIndexer, dt]) # Train model. This also runs the indexer. model = pipeline.fit(trainingData) # Make predictions. predictions = model.transform(testData) # Select example rows to display. predictions.select("prediction", "label", "features").show(5) # Select (prediction, true label) and compute test error evaluator = RegressionEvaluator( labelCol="label", predictionCol="prediction", metricName="rmse") rmse = evaluator.evaluate(predictions) print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
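# Hedged follow-up: the fitted DecisionTreeRegressionModel is the second (and last) pipeline
# stage; printing it gives a short summary, and toDebugString (commented out, can be verbose)
# shows the learned splits.
treeModel = model.stages[1]
print(treeModel)
# print(treeModel.toDebugString)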