def writeLumbarReadings(time, rdd):
    try:
        # Convert RDDs of the words DStream to DataFrame and run SQL query
        connectionProperties = MySQLConnection.getDBConnectionProps('/home/erik/mysql_credentials.txt')
        sqlContext = SQLContext(rdd.context)
        if not rdd.isEmpty():
            lumbarReadings = sqlContext.jsonRDD(rdd)
            lumbarReadingsIntermediate = lumbarReadings.selectExpr(
                "readingID", "readingTime", "deviceID", "metricTypeID", "uomID",
                "actual.y AS actualYaw", "actual.p AS actualPitch", "actual.r AS actualRoll",
                "setPoints.y AS setPointYaw", "setPoints.p AS setPointPitch", "setPoints.r AS setPointRoll")

            # Must be in the same order as was used to train the model.
            # Testing with only pitch since the model has a limited dataset.
            assembler = VectorAssembler(
                inputCols=["actualPitch"],
                outputCol="features")
            lumbarReadingsIntermediate = assembler.transform(lumbarReadingsIntermediate)

            predictions = loadedModel.predict(lumbarReadingsIntermediate.map(lambda x: x.features))
            predictionsDF = lumbarReadingsIntermediate.map(lambda x: x.readingID).zip(predictions).toDF(["readingID", "positionID"])
            combinedDF = lumbarReadingsIntermediate.join(
                predictionsDF, lumbarReadingsIntermediate.readingID == predictionsDF.readingID).drop(predictionsDF.readingID)
            combinedDF = combinedDF.drop("features")

            combinedDF.show()
            combinedDF.write.jdbc("jdbc:mysql://localhost/biosensor", "SensorReadings", properties=connectionProperties)
    except:
        # Swallowing every exception hides failures; log the error in production code.
        pass
def seg_model_gb(train_data, test_data, loss_type, num_iter, maxDepth):
    removelist_train = set(['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(train_data.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")
    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled points for training the model
    data_train = (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(sc.parallelize(data_train.collect(), 5),
                                                      categoricalFeaturesInfo={}, loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_final = [v for i, v in enumerate(test_data.columns) if v not in removelist_final]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final, outputCol="features")
    transformed_final = assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
def seg_model_lr(train_data, test_data, regType, num_iter):
    removelist_train = set(['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(train_data.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")
    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled points for training the model
    data_train = (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Logistic Regression classifier
    model_train = LogisticRegressionWithLBFGS.train(sc.parallelize(data_train.collect(), 5),
                                                    regType=regType, iterations=num_iter, numClasses=5)

    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_final = [v for i, v in enumerate(test_data.columns) if v not in removelist_final]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final, outputCol="features")
    transformed_final = assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
def text_features(p_df):
    """
    Extracts features derived from the quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType())

    p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn(
        "unique_chars_q1", unique_chars("question1")
    ).withColumn("unique_chars_q2", unique_chars("question2"))

    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2", "common_words",
                   "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features"
    )
    p_df = assembler.transform(p_df)
    return p_df
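# A minimal usage sketch for text_features above (not from the original source). It assumes an active
# SparkSession named `spark` and that the imports text_features relies on (udf, length, array, size,
# IntegerType, VectorAssembler) are available; the sample questions and pre-tokenized *_words columns
# are illustrative placeholders only.
rows = [("what is ai", "what is ml", ["what", "is", "ai"], ["what", "is", "ml"])]
quora_df = spark.createDataFrame(rows, ["question1", "question2", "question1_words", "question2_words"])
text_features(quora_df).select("text_features").show(truncate=False)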
def _convertPythonXToJavaObject(self, X):
    """
    Converts the input python object X to a java-side object (either MatrixBlock or Java DataFrame).

    Parameters
    ----------
    X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
    """
    if isinstance(X, SUPPORTED_TYPES) and self.transferUsingDF:
        pdfX = convertToPandasDF(X)
        df = assemble(self.sparkSession, pdfX, pdfX.columns, self.features_col).select(self.features_col)
        return df._jdf
    elif isinstance(X, SUPPORTED_TYPES):
        return convertToMatrixBlock(self.sc, X)
    elif hasattr(X, '_jdf') and self.features_col in X.columns:
        # No need to assemble as input DF is likely coming via MLPipeline
        return X._jdf
    elif hasattr(X, '_jdf'):
        assembler = VectorAssembler(inputCols=X.columns, outputCol=self.features_col)
        df = assembler.transform(X)
        return df._jdf
    else:
        raise Exception('Unsupported input type')
def predict(self, X):
    if isinstance(X, SUPPORTED_TYPES):
        if self.transferUsingDF:
            pdfX = convertToPandasDF(X)
            df = assemble(self.sqlCtx, pdfX, pdfX.columns, 'features').select('features')
            retjDF = self.model.transform(df._jdf)
            retDF = DataFrame(retjDF, self.sqlCtx)
            retPDF = retDF.sort('ID').select('prediction').toPandas()
            if isinstance(X, np.ndarray):
                return retPDF.as_matrix().flatten()
            else:
                return retPDF
        else:
            retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
            if isinstance(X, np.ndarray):
                return retNumPy
            else:
                return retNumPy  # TODO: Convert to Pandas
    elif hasattr(X, '_jdf'):
        if 'features' in X.columns:
            # No need to assemble as input DF is likely coming via MLPipeline
            df = X
        else:
            assembler = VectorAssembler(inputCols=X.columns, outputCol='features')
            df = assembler.transform(X)
        retjDF = self.model.transform(df._jdf)
        retDF = DataFrame(retjDF, self.sqlCtx)
        # Return DF
        return retDF.sort('ID')
    else:
        raise Exception('Unsupported input type')
def scaleVecCol(self, columns, nameOutputCol):
    """
    This function groups the specified columns into a list array in one column, and then applies
    a scaling process. The scaling procedure is the Spark default scaling (see the example below).

    +---------+----------+
    |Price    |AreaLiving|
    +---------+----------+
    |1261706.9|16        |
    |1263607.9|16        |
    |1109960.0|19        |
    |978277.0 |19        |
    |885000.0 |19        |
    +---------+----------+

              |
              V

    +----------------------------------------+
    |['Price', 'AreaLiving']                 |
    +----------------------------------------+
    |[0.1673858972637624,0.5]                |
    |[0.08966137157852398,0.3611111111111111]|
    |[0.11587093205757598,0.3888888888888889]|
    |[0.1139820728616421,0.3888888888888889] |
    |[0.12260126542983639,0.4722222222222222]|
    +----------------------------------------+
    only showing top 5 rows
    """
    # Check that the columns argument is a string or list datatype:
    self.__assertTypeStrOrList(columns, "columns")
    # Check that the columns to be processed are in the dataframe:
    self.__assertColsInDF(columnsProvided=columns, columnsDF=self.__df.columns)
    # Check that the nameOutputCol argument is a string datatype:
    self.__assertTypeStr(nameOutputCol, "nameOutputCol")

    # Model to use vectorAssembler:
    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features_assembler")
    # Model for scaling the feature column:
    mmScaler = MinMaxScaler(inputCol="features_assembler", outputCol=nameOutputCol)
    # Dataframe with the features_assembler column:
    tempDF = vecAssembler.transform(self.__df)
    # Fitting the scaler model with the transformed dataframe:
    model = mmScaler.fit(tempDF)

    exprs = list(filter(lambda x: x not in columns, self.__df.columns))
    exprs.extend([nameOutputCol])

    self.__df = model.transform(tempDF).select(*exprs)
    self.__addTransformation()  # checkpoint in case

    return self
def convert_to_flat_by_sparkpy(df):
    subkeys = df.select("subkey").dropDuplicates().collect()
    subkeys = [s[0] for s in subkeys]

    assembler = VectorAssembler().setInputCols(subkeys).setOutputCol("features")
    spark_df = assembler.transform(df.groupBy("key", "parameter").pivot("subkey").agg(first(col("reference"))))
    spark_df = spark_df.withColumnRenamed("parameter", "label")
    spark_df = spark_df.select("label", "features")
    return spark_df
def sparking_your_interest():
    df = SQLContext.read.json('speeches_dataset.json')  # assumes an initialized SQLContext/SparkSession instance
    df_fillna = df.fillna("")
    print(df_fillna.count())
    print(df_fillna.printSchema())

    df_utf = call_utf_encoder(df)
    df_cleaned = call_para_cleanup(df_utf)
    print(df_cleaned)
    df_with_bigrams = call_ngrams(df_cleaned, 2)
    df_with_trigrams = call_ngrams(df_with_bigrams, 3)
    df_with_4grams = call_ngrams(df_with_trigrams, 4)
    df_with_5grams = call_ngrams(df_with_4grams, 5)  # was 4; changed to match the variable name
    df_with_6grams = call_ngrams(df_with_5grams, 6)  # was 4; changed to match the variable name
    df_with_vocab_score = call_speech_vocab(df_with_6grams)

    df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score, 100, '2grams')
    df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors, 100, '3grams')
    df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors, 100, '4grams')

    # The original listed "2gramsfeatures" three times, presumably a copy-paste slip.
    assembler = VectorAssembler(
        inputCols=["2gramsfeatures", "3gramsfeatures", "4gramsfeatures", "vocab_score"],
        outputCol="features")
    assembler_output = assembler.transform(df_with_4grams_idf_vectors)
    output = assembler_output.selectExpr('speaker', 'speech_id', 'para_cleaned_text', 'features')
    print(output.show())
    print(output.count())

    output_tordd = output.rdd
    train_rdd, test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
    train_df = train_rdd.toDF()
    test_df = test_rdd.toDF()
    print(train_df)
    print(test_df)
    print('Train DF - Count: ')
    print(train_df.count())
    print('Test DF - Count: ')
    print(test_df.count())

    print("Initializing RF Model")
    labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=1000,
                                featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
    pipeline = Pipeline(stages=[labelIndexer, rf])
    model = pipeline.fit(train_df)  # fit on the training split; fitting on `output` would leak the test rows
    print("Completed RF Model")

    predictions = model.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction",
                                                  metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    rfModel = model.stages[1]
    print(rfModel)  # summary only
    print("Predictions: ")
    print(predictions.show())
def _prepare_data_spark(self, data):
    """
    Prepare data in Spark format; the output will have the features column and other useful information.
    """
    keys = list(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION,
                                        self.TARGET_PRICE, self.TODAY_PRICE}))
    df = self._spark.createDataFrame(data)
    ass = VectorAssembler(inputCols=keys, outputCol="features")
    output = ass.transform(df)
    # output.select('features', 'ChangeDirection', 'ChangeAmount').write.save('test.parquet')
    return output
def predictPopularity(features):
    print(features)
    features = tuple(features)
    feature_label = []
    for i in range(0, len(features)):
        feature_label.append('feature' + str(i))
    data_frame = spark.createDataFrame([features], feature_label)
    assembler = VectorAssembler(inputCols=feature_label, outputCol='features')
    data_frame = assembler.transform(data_frame)
    data_frame = data_frame.select('features')
    result = rfc_model.transform(data_frame)
    return result.select('prediction').head(1)[0][0]
def test_train_data(overall_segment):
    removelist_train = set(['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(overall_segment.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")
    transformed_train = assembler_train.transform(overall_segment.fillna(0))

    # Creating input dataset in the form of labeled points for training the model
    data_train = (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))
    (trainingData, testData) = sc.parallelize(data_train.collect(), 5).randomSplit([0.7, 0.3])
    return (trainingData, testData)
def commit(self):
    self.update_domain_role_hints()
    if self.in_df is not None:
        attributes = [att for att in self.used_attrs._list]
        class_var = [var for var in self.class_attrs._list]
        metas = [meta for meta in self.meta_attrs._list]

        VA = VectorAssembler(inputCols=attributes, outputCol='features')
        self.out_df = VA.transform(self.in_df)
        if len(class_var):
            self.out_df = self.out_df.withColumn('label', self.out_df[class_var[0]].cast('double'))
        self.send("DataFrame", self.out_df)
    else:
        self.send("DataFrame", None)
def tf_idf_features_quora(p_df):
    """
    Extracts TF-IDF features from the quora dataset.
    :param p_df: A DataFrame.
    :return: A DataFrame.
    """
    tf_df = extract_tf_features(p_df, "question1_meaningful_words", "tf1")
    tf_df = extract_tf_features(tf_df, "question2_meaningful_words", "tf2")
    tf_idf_df = extract_idf_features(tf_df, "tf1", "tf-idf1")
    tf_idf_df = extract_idf_features(tf_idf_df, "tf2", "tf-idf2")
    assembler = VectorAssembler(
        inputCols=["tf-idf1", "tf-idf2"],
        outputCol="tf_idf_features"
    )
    return assembler.transform(tf_idf_df)
def convert_to_flat_by_sparkpy(df):
    subkeys = df.select("subkey").dropDuplicates().collect()
    subkeys = [s[0] for s in subkeys]

    n = len(df.select("reference").first()[0])
    # df = df.groupBy("key").agg(array(*[avg(col("reference")[i]) for i in range(n)]).alias("averages"))
    df = df.groupBy("key").agg(array(*[collect_list(col("reference")[i]) for i in range(n)]).alias("averages"))
    df.show()
    r = df.collect()

    # changedTypedf = joindf.withColumn("label", joindf["show"].cast(DoubleType()))
    assembler = VectorAssembler().setInputCols(subkeys).setOutputCol("features")
    spark_df = assembler.transform(df.groupBy("key", "parameter").pivot("subkey").agg(first(col("reference"))))
    spark_df = spark_df.withColumnRenamed("parameter", "label")
    spark_df = spark_df.select("label", "features")
    return spark_df
def predict(self, X):
    """
    Invokes the transform method on the Estimator object on the JVM if X and y are one of the
    supported data types.

    Parameters
    ----------
    X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
    """
    try:
        if self.estimator is not None and self.model is not None:
            self.estimator.copyProperties(self.model)
    except AttributeError:
        pass
    if isinstance(X, SUPPORTED_TYPES):
        if self.transferUsingDF:
            pdfX = convertToPandasDF(X)
            df = assemble(self.sparkSession, pdfX, pdfX.columns, self.features_col).select(self.features_col)
            retjDF = self.model.transform(df._jdf)
            retDF = DataFrame(retjDF, self.sparkSession)
            retPDF = retDF.sort('__INDEX').select('prediction').toPandas()
            if isinstance(X, np.ndarray):
                return self.decode(retPDF.as_matrix().flatten())
            else:
                return self.decode(retPDF)
        else:
            try:
                retNumPy = self.decode(convertToNumPyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X))))
            except Py4JError:
                traceback.print_exc()
            if isinstance(X, np.ndarray):
                return retNumPy
            else:
                return retNumPy  # TODO: Convert to Pandas
    elif hasattr(X, '_jdf'):
        if self.features_col in X.columns:
            # No need to assemble as input DF is likely coming via MLPipeline
            df = X
        else:
            assembler = VectorAssembler(inputCols=X.columns, outputCol=self.features_col)
            df = assembler.transform(X)
        retjDF = self.model.transform(df._jdf)
        retDF = DataFrame(retjDF, self.sparkSession)
        # Return DF
        return retDF.sort('__INDEX')
    else:
        raise Exception('Unsupported input type')
def transform(self, df, featureCols, targetCol):
    """Keep the K most important features of the Spark DataFrame.

    Parameters
    ----------
    df : Spark DataFrame
    featureCols: array, names of feature columns to consider in the feature selection algorithm
    targetCol: str, name of the target column, i.e., the column against which each feature is compared.

    Returns
    -------
    transformed_df : New Spark DataFrame with only the most important feature columns.
    """
    # build features assembler
    assembler = VectorAssembler(inputCols=featureCols, outputCol='features')
    assembled_df = assembler.transform(df)

    # rename target column
    assembled_df = assembled_df.withColumnRenamed(targetCol, 'target')

    # extract features and target
    feats = assembled_df.select('features').rdd
    feats = feats.map(lambda x: x['features'])
    target = assembled_df.select('target').rdd
    target = target.map(lambda x: x['target'])

    # compute per-column metric
    scores = []
    for i, feat in enumerate(featureCols):
        vector = feats.map(lambda x: x[i])
        scores.append(self.sfunc_(vector, target))
    self.scores_ = scores

    # sort scores
    idx = sorted(range(len(self.scores_)), reverse=True, key=self.scores_.__getitem__)

    # return dataframe with k-best columns
    return df.select(*[featureCols[idd] for idd in idx[:self.k_]])
def convertToLabeledDF(sparkSession, X, y=None):
    from pyspark.ml.feature import VectorAssembler
    if y is not None:
        pd1 = pd.DataFrame(X)
        pd2 = pd.DataFrame(y, columns=['label'])
        pdf = pd.concat([pd1, pd2], axis=1)
        inputColumns = ['C' + str(i) for i in pd1.columns]
        outputColumns = inputColumns + ['label']
    else:
        pdf = pd.DataFrame(X)
        inputColumns = ['C' + str(i) for i in pdf.columns]
        outputColumns = inputColumns
    assembler = VectorAssembler(inputCols=inputColumns, outputCol='features')
    out = assembler.transform(sparkSession.createDataFrame(pdf, outputColumns))
    if y is not None:
        return out.select('features', 'label')
    else:
        return out.select('features')
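# A minimal usage sketch for convertToLabeledDF above (not from the original source). It assumes an
# active SparkSession named `spark` and that pandas is imported as `pd` in the defining module; the
# toy arrays are illustrative placeholders only.
import numpy as np
X = np.array([[1.0, 2.0], [3.0, 4.0]])
y = np.array([0.0, 1.0])
labeled_df = convertToLabeledDF(spark, X, y)
labeled_df.show(truncate=False)  # one 'features' vector column plus a 'label' column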
def merge_features(ddfs, join_column, merge_column, output_column='features', drop_merged_columns=True):
    """
    Join (inner) several DataFrames by the same id and merge their columns (merge_column) into one
    column using pyspark.ml.feature.VectorAssembler.

    Example:
        ddf_merge = merge_features(ddfs=[ddf_pivot1, ddf_pivot2], join_column='customer_id', merge_column='features')

    :param ddfs:
    :param join_column: id column to join by (each ddf must have this column)
    :param merge_column: column to merge (each ddf must have this column)
    :param output_column:
    :param drop_merged_columns:
    :return:
    """
    from pyspark.ml.feature import VectorAssembler

    ddf_res = ddfs.pop(0)
    merge_column_renamed = merge_column + str(0)
    merge_columns = [merge_column_renamed]
    ddf_res = ddf_res.withColumnRenamed(merge_column, merge_column_renamed)

    for i, ddf in enumerate(ddfs):
        merge_column_renamed = merge_column + str(i + 1)
        merge_columns.append(merge_column_renamed)
        ddf_r = ddf.withColumnRenamed(merge_column, merge_column_renamed)
        ddf_res = ddf_res.join(ddf_r, on=join_column, how='inner')

    assembler = VectorAssembler(inputCols=merge_columns, outputCol=output_column)
    res = assembler.transform(ddf_res)
    if drop_merged_columns:
        res = drop_columns(res, columns=merge_columns)
    return res


# def pivot_aggregate(ddf, grpby_columns, pivot_column, aggs, pivot_filter_values=None, pivot_filter_support=None):
#     if pivot_filter_support and not pivot_filter_values:
#         frequent = ddf.freqItems([pivot_column], support=pivot_filter_support).first().asDict()[pivot_column + '_freqItems']
#         pivot_filter_values = map(str, frequent)
#
#     ddf_gr = ddf.groupBy(*grpby_columns)
#     ddf_pivot = ddf_gr.pivot(pivot_column, pivot_filter_values)
#     ddf_agg = ddf_pivot.agg(*aggs)
#     return ddf_agg
def preprocess(data):
    data = data.select('Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime',
                       'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum', 'ActualElapsedTime',
                       'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin', 'Dest', 'Distance',
                       'TaxiIn', 'TaxiOut', 'Cancelled')
    data = data.na.fill('999999')
    for t in data.dtypes:
        if t[1] == 'string' and t[0] not in ['Origin', 'Dest', 'TailNum', 'UniqueCarrier', 'FlightNum']:
            data = data.withColumn(t[0], data[t[0]].cast('integer'))  # was x[t[0]], an undefined name
    data = data.na.fill(999999)
    data = data.withColumnRenamed('Cancelled', 'label')
    data = data.withColumn('label', data.label.cast('double'))
    assembler = VectorAssembler(
        inputCols=['Year', 'Month', 'DayofMonth', 'DayOfWeek',
                   'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
                   'ActualElapsedTime', 'CRSElapsedTime', 'AirTime',
                   'ArrDelay', 'DepDelay', 'Distance', 'TaxiIn', 'TaxiOut'],
        outputCol='features')
    data = assembler.transform(data)
    data = data.select('features', 'label')
    return data
def to_numeric_df(kdf: 'ks.DataFrame') -> Tuple[pyspark.sql.DataFrame, List[str]]:
    """
    Takes a dataframe and turns it into a dataframe containing a single numerical vector of doubles.
    This dataframe has a single field called '_1'.

    TODO: index is not preserved currently
    :param kdf: the koalas dataframe.
    :return: a pair of dataframe, list of strings (the name of the columns that were converted to
             numerical types)

    >>> to_numeric_df(ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}))
    (DataFrame[_correlation_output: vector], ['A', 'B'])
    """
    # TODO, it should be more robust.
    accepted_types = {np.dtype(dt) for dt in [np.int8, np.int16, np.int32, np.int64,
                                              np.float32, np.float64, np.bool_]}
    numeric_fields = [fname for fname in kdf._metadata.data_columns if kdf[fname].dtype in accepted_types]
    numeric_df = kdf._sdf.select(*numeric_fields)
    va = VectorAssembler(inputCols=numeric_fields, outputCol=CORRELATION_OUTPUT_COLUMN)
    v = va.transform(numeric_df).select(CORRELATION_OUTPUT_COLUMN)
    return v, numeric_fields
def cluster():
    ld = load(open(DATAP + '\\temp\\olangdict.json', 'r', encoding='UTF-8'))

    spark = SparkSession.builder\
        .master("local")\
        .appName("Word Count")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()

    # The assembler below needs numeric "feat1"/"feat2" columns; the original frame only had an "id"
    # column, so placeholder feature values are added here to make the example runnable.
    df = spark.createDataFrame([["0", 0.0, 0.0], ["1", 1.0, 1.0], ["2", 2.0, 2.0],
                                ["3", 3.0, 3.0], ["4", 4.0, 4.0]], ["id", "feat1", "feat2"])
    df.show()

    vecAssembler = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features")
    new_df = vecAssembler.transform(df)

    kmeans = KMeans(k=2, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    print(transformed.show())
modelVars.remove('AgeCb')
modelVars.remove('CurrentLimit')
modelVars.remove('InitialLimit')
modelVars.remove('LimitReached')
modelVars.remove('InitialLimitEqualsCurrent')
modelVars.remove('LimitReachedInLast3Months')
modelVars.remove('LimitChangedInLast3Months')
modelVars.remove('ExcessPaymentAmmountCurrent')
modelVars.remove('NumberOfExcessPayments3M')
modelVars.remove('ExcessPaymentAmmount3M')
modelVars.remove('StartDate')

#############################################################
#### Assemble all feature columns into one vector column ####
#### called "features" and transform to RDD of lists     ####
#############################################################
allToOne = VectorAssembler(inputCols=modelVars, outputCol="features")
assembledRDD = allToOne.transform(MyTrain.select(modelVars)).select("features").rdd.map(lambda line: line[0]).persist()

#### Calculate correlation matrix and set it to write mode ####
corM = Statistics.corr(assembledRDD)
corM.setflags(write=True)
fill_diagonal(corM, 0.0)

#######################################################
#### Iterate by rows of correlation matrix         ####
#### If there is a value greater than threshold    ####
#### in the current row set the row and the column ####
#### with same index to 0.                         ####
#######################################################
nVar = corM.shape[0]
corToMod = array(corM.tolist())  # This is to ensure that a copy is made
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                            outputCol="features")

output = assembler.transform(dataset)
print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
output.select("features", "clicked").show(truncate=False)

spark.stop()
spark = SparkSession.builder.appName('Popularity').getOrCreate()
data = spark.read.csv('OnlineNewsPopularity.csv', inferSchema=True, header=True)

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[
    'timedelta', 'n_tokens_title', 'n_tokens_content', 'n_unique_tokens', 'n_non_stop_words',
    'n_non_stop_unique_tokens', 'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
    'average_token_length', 'num_keywords', 'data_channel_is_lifestyle', 'data_channel_is_entertainment',
    'data_channel_is_bus', 'data_channel_is_socmed', 'data_channel_is_tech', 'data_channel_is_world',
    'self_reference_max_shares', 'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
    'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday', 'weekday_is_saturday',
    'weekday_is_sunday', 'is_weekend', 'global_subjectivity', 'global_sentiment_polarity',
    'title_subjectivity', 'title_sentiment_polarity', 'abs_title_subjectivity',
    'abs_title_sentiment_polarity'
], outputCol='features')

new_data = assembler.transform(data)
final_data = new_data.select('features', 'shares')

from pyspark.ml.feature import QuantileDiscretizer
discretizer = QuantileDiscretizer(numBuckets=2, inputCol="shares", outputCol="result")
def main(base_path): APP_NAME = "make_predictions_streaming.py" # Process data every 10 seconds PERIOD = 10 BROKERS = 'localhost:9092' PREDICTION_TOPIC = 'flight_delay_classification_request' try: sc and ssc except NameError as e: import findspark # Add the streaming package and initialize findspark.add_packages( ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"]) findspark.init() import pyspark import pyspark.sql import pyspark.streaming from pyspark import SparkContext, SparkConf from pyspark.sql import SparkSession, Row from pyspark.streaming import StreamingContext from pyspark.streaming.kafka import KafkaUtils import pymongo_spark pymongo_spark.activate() conf = SparkConf().set("spark.default.parallelism", 1) sc = SparkContext( appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf) ssc = StreamingContext(sc, PERIOD) spark = pyspark.sql.SparkSession(sc).builder.appName( APP_NAME).getOrCreate() # # Load all models to be used in making predictions # # Load the arrival delay bucketizer from pyspark.ml.feature import Bucketizer arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format( base_path) arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path) # Load all the string field vectorizer pipelines into a dict from pyspark.ml.feature import StringIndexerModel string_indexer_models = {} for column in ["Carrier", "Origin", "Dest", "Route"]: string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format( base_path, column) string_indexer_model = StringIndexerModel.load( string_indexer_model_path) string_indexer_models[column] = string_indexer_model # Load the numeric vector assembler from pyspark.ml.feature import VectorAssembler vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format( base_path) vector_assembler = VectorAssembler.load(vector_assembler_path) # Load the classifier model from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format( base_path) rfc = RandomForestClassificationModel.load(random_forest_model_path) # # Process Prediction Requests in Streaming # stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], { "metadata.broker.list": BROKERS, "group.id": "0", }) object_stream = stream.map(lambda x: json.loads(x[1])) object_stream.pprint() row_stream = object_stream.map( lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']), Origin=x['Origin'], Distance=x['Distance'], DayOfMonth=x['DayOfMonth'], DayOfYear=x['DayOfYear'], UUID=x['UUID'], DepDelay=x['DepDelay'], DayOfWeek=x['DayOfWeek'], FlightNum=x['FlightNum'], Dest=x['Dest'], Timestamp=iso8601.parse_date(x['Timestamp']), Carrier=x['Carrier'])) row_stream.pprint() # # Create a dataframe from the RDD-based object stream # def classify_prediction_requests(rdd): from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField prediction_request_schema = StructType([ StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), 
StructField("Timestamp", TimestampType(), True), StructField("UUID", StringType(), True), ]) prediction_requests_df = spark.createDataFrame( rdd, schema=prediction_request_schema) prediction_requests_df.show() # # Add a Route variable to replace FlightNum # from pyspark.sql.functions import lit, concat prediction_requests_with_route = prediction_requests_df.withColumn( 'Route', concat(prediction_requests_df.Origin, lit('-'), prediction_requests_df.Dest)) prediction_requests_with_route.show(6) # Vectorize string fields with the corresponding pipeline for that column # Turn category fields into categoric feature vectors, then drop intermediate fields for column in ["Carrier", "Origin", "Dest", "Route"]: string_indexer_model = string_indexer_models[column] prediction_requests_with_route = string_indexer_model.transform( prediction_requests_with_route) # Vectorize numeric columns: DepDelay, Distance and index columns final_vectorized_features = vector_assembler.transform( prediction_requests_with_route) # Inspect the vectors final_vectorized_features.show() # Drop the individual index columns index_columns = [ "Carrier_index", "Origin_index", "Dest_index", "Route_index" ] for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # Make the prediction predictions = rfc.transform(final_vectorized_features) # Drop the features vector and prediction metadata to give the original fields predictions = predictions.drop("Features_vec") final_predictions = predictions.drop("indices").drop("values").drop( "rawPrediction").drop("probability") # Inspect the output final_predictions.show() # Store to Mongo if final_predictions.count() > 0: final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB( "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response" ) # Do the classification and store to Mongo row_stream.foreachRDD(classify_prediction_requests) ssc.start() ssc.awaitTermination()
def preprocess_test(test, model=None): # test = test.dropna(axis=1, how='all', inplace=False) # for c in test.columns: # if test.filter(col(c).isNotNull()).count() == 0: # test = test.drop(c) print('Length of test : ' + str(len(test.columns))) if model == 'xgb': cols = [x for x in test.columns if x not in ['datetime']] print('Test Columns : ' + str(len(test.columns))) print('Test Rows : ' + str(test.count())) test = clip(test, cols) # test = test.resample('H').mean() # test = test.rolling(window=50).mean() test = get_mean_of_cyl_values(test) test = test.fillna(0) return test elif model == 'lstm': cols = [x for x in test.columns if x not in ['datetime']] print('Test Columns : ' + str(len(test.columns))) print('Test Rows : ' + str(test.count())) test = clip(test, cols) test = get_mean_of_cyl_values(test) print('Test Columns : ' + str(len(test.columns))) print('Test Rows : ' + str(test.count())) print(test.schema) test = test.fillna(0) cols = [x for x in test.columns if x not in ['datetime']] assembler = VectorAssembler().setInputCols \ (cols).setOutputCol("features") print('assembler') transformed = assembler.transform(test) # Normalize each Vector using $L^1$ norm. normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0) l1NormData = normalizer.transform(transformed) scaler = StandardScaler(inputCol="normFeatures", outputCol="scaledFeatures", withStd=True, withMean=False) # Compute summary statistics by fitting the StandardScaler scalerModel = scaler.fit(l1NormData) # Normalize each feature to have unit standard deviation. scaledData = scalerModel.transform(l1NormData) # train = scaledData.drop(*cols) del test, transformed, l1NormData n_components_ = 50 pca = PCA(k=n_components_, inputCol="scaledFeatures", outputCol="pcaFeatures") model = pca.fit(scaledData) vds_5 = model.transform(scaledData).select(['pcaFeatures', 'datetime']) print(vds_5) def extract(row): return (row.datetime, ) + tuple(row.pcaFeatures.toArray().tolist()) vds_5 = vds_5.rdd.map(extract).toDF(["datetime"]) print(vds_5) vds_5 = vds_5.drop(*['pcaFeatures', 'datetime']) return vds_5 elif model == 'svm': # test = test.toPandas() # test = clip_data(test) cols = [x for x in test.columns if x not in ['datetime']] print('Test Columns : ' + str(len(test.columns))) print('Test Rows : ' + str(test.count())) test = clip(test, cols) print('Test Columns : ' + str(len(test.columns))) print('Test Rows : ' + str(test.count())) # test = test.toPandas() # test_max = test.resample('H').max().add_suffix('_max') # test_min = test.resample('H').min().add_suffix('_min') # test_std = test.resample('H').std().add_suffix('_std') # test = test.resample('H').mean() # # test = pd.concat([test, test_max], axis=1, sort=False) # test = pd.concat([test, test_min], axis=1, sort=False) # test = pd.concat([test, test_std], axis=1, sort=False) # del test_max, test_min, # gc.collect() # test = test.toHandy() test = get_mean_of_cyl_values(test) # vds_5 = test print('Test Columns : ' + str(len(test.columns))) print('Test Rows : ' + str(test.count())) # vds_5 = vds_5.replace(to_replace=0, value=1) # vds_5 = vds_5.pct_change(periods=1, fill_method='ffill') # window = Window.orderBy('datetime') \ # .rowsBetween(-sys.maxsize, 0) # # def ffill(column): # return last(column, ignorenulls=True).over(window) # # def bfill(column): # return last(column, ignorenulls=True).over(window) # # for column in cols: # vds_5 = vds_5.withColumn(column,ffill(col(column))) # # for column in cols: # vds_5 = vds_5.withColumn(column,bfill(col(column))) test = 
test.fillna(0) # vds_5 = vds_5.fillna(method='ffill') # vds_5 = vds_5.fillna(method='bfill') return test elif model == 'perm': # test = test.resample('H').mean() # test = test.rolling(window=20).mean() cols = [x for x in test.columns if x not in ['datetime']] print('Test Columns : ' + str(len(test.columns))) print('Test Rows : ' + str(test.count())) test = test.fillna(0) test = clip(test, cols) # window = Window.orderBy('datetime') \ # .rowsBetween(-sys.maxsize, 0) # # def ffill(column): # return last(column, ignorenulls=True).over(window) # # def bfill(column): # return last(column, ignorenulls=True).over(window) # # for column in cols: # test = test.withColumn(column,ffill(col(column))) # # for column in cols: # test = test.withColumn(column,bfill(col(column))) test = test.fillna(0) return test
    .builder\
    .appName("VectorSizeHintExample")\
    .getOrCreate()

# $example on$
dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
     (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

sizeHint = VectorSizeHint(
    inputCol="userFeatures",
    handleInvalid="skip",
    size=3)

datasetWithSize = sizeHint.transform(dataset)
print("Rows where 'userFeatures' is not the right size are filtered out")
datasetWithSize.show(truncate=False)

assembler = VectorAssembler(
    inputCols=["hour", "mobile", "userFeatures"],
    outputCol="features")

# This dataframe can be used by downstream transformers as before
output = assembler.transform(datasetWithSize)
print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
output.select("features", "clicked").show(truncate=False)
# $example off$

spark.stop()
# In[ ]:

# Load the training data into a dataframe
data = spark.read.format('json').load('train.jsonl')
data = clean_tokenize_remove_stopwords_quora(data)

# Get the tf-idf features
data = tf_idf_features_quora(data)

# Get the text features
data = text_features(data)

# Combine all the features
feature_assembler = VectorAssembler(
    inputCols=["tf_idf_features", "text_features"],
    outputCol="combined_features"
)
data = feature_assembler.transform(data)

# Normalizing each feature to have unit standard deviation
scaler = StandardScaler(inputCol="combined_features", outputCol="features",
                        withStd=True, withMean=False)
scalerModel = scaler.fit(data)

# Normalize each feature to have unit standard deviation.
data = scalerModel.transform(data)

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import os

df = sqlContext.read.json(os.environ['WORKDIR'] + "user_features.json")
df_restaurants = df.filter("category = \"Restaurants\"")

assembler = VectorAssembler(
    inputCols=["average_stars",
               "cat_avg_review_len",
               "cat_avg_stars",
               "cat_business_count",
               "cat_review_count",
               "months_yelping",
               "review_count",
               "votes_cool",
               "votes_funny",
               "votes_useful"],
    outputCol="features")

output = assembler.transform(df_restaurants)
(trainingData, testData) = output.randomSplit([0.7, 0.3])

dt = DecisionTreeRegressor(labelCol="elite", featuresCol="features")
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)

predictions = model.transform(testData)
predictions.select("prediction", "elite", "features").show(5)

evaluator = RegressionEvaluator(labelCol="elite", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
numColumns = [item[0] for item in df.dtypes if not item[1].startswith('string')]
catColVectors = [c + '_vector' for c in catColumns]

# Change categorical values into numeric
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index") for column in catColumns]
encoder = OneHotEncoderEstimator(
    inputCols=[c + "_index" for c in catColumns],
    outputCols=[c + "_vector" for c in catColumns])
assembler = VectorAssembler(inputCols=encoder.getOutputCols() + numColumns, outputCol="features")
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")

pipeline = Pipeline(stages=indexers + [label_stringIdx, encoder, assembler])
encoded_df = pipeline.fit(df).transform(df)

selectedCols = ['label', 'features'] + cols
dataset = encoded_df.select(selectedCols)

# Randomly split data into training and test sets; set a seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

# Fit model and train
def with_features(raw_df, feature_cols):
    vector_assembler = VectorAssembler().setInputCols(feature_cols).setOutputCol('features')
    pipeline = Pipeline().setStages([vector_assembler])
    df = pipeline.fit(raw_df).transform(raw_df)
    return df
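# A minimal usage sketch for with_features above (not from the original source). It assumes an active
# SparkSession named `spark` and that VectorAssembler and Pipeline are imported where with_features is
# defined; the column names x1/x2 are illustrative placeholders only.
raw_df = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["x1", "x2"])
with_features(raw_df, ["x1", "x2"]).show(truncate=False)  # adds a 'features' vector column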
out


# In[19]:

df_train.dtypes


# In[13]:

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


# In[14]:

featureassembler = VectorAssembler(inputCols=[
    "a", "area", "ci", "pi", "eccentricity", "kx", "ky", "m00", "m01", "m10",
    "minAreaPercent", "minEnclosingCircleArea", "mu02", "mu03", "mu11", "mu20",
    "mu30", "sx", "sy", "d"
], outputCol="features")


# In[20]:

output = featureassembler.transform(df_train)


# In[21]:

output.select("features").show(5)


# In[344]:

output.columns
def Logistic_regression(dataset_add, feature_colm, label_colm): dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=";") dataset.show() dataset.groupBy("y").count().show() label = '' for y in label_colm: label = y print(label) # using the rformula for indexing, encoding and vectorising # f = "" # f = label + " ~ " # # for x in features: # f = f + x + "+" # f = f[:-1] # f = (f) # extracting the schema val = dataset.schema string_features = [] integer_features = [] for x in val: if (str(x.dataType) == "StringType"): for y in feature_colm: if x.name == y: string_features.append(x.name) else: for y in feature_colm: if x.name == y: integer_features.append(x.name) print(string_features) print(integer_features) print(val) # print(label) # label = 'y' for z in val: if (z.name == label and str(z.dataType) == "StringType"): label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label).fit(dataset) dataset = label_indexer.transform(dataset) if (z.name == label and str(z.dataType) == ("IntegerType" or "FloatType" or "DoubleType")): dataset = dataset.withColumnRenamed(label, 'indexed_' + label) ########################################################################### indexed_features = [] encoded_features = [] for col in string_features: indexer = StringIndexer(inputCol=col, outputCol='indexed_' + col).fit(dataset) indexed_features.append('indexed_' + col) dataset = indexer.transform(dataset) # dataset.show() # encoder = OneHotEncoderEstimator(inputCols=['indexed_'+col], outputCols=['encoded_'+col]).fit(dataset) # encoded_features.append('encoded_'+col) # dataset = encoder.transform(dataset) # dataset.show() print(indexed_features) print(encoded_features) # combining both the features colm together final_features = integer_features + indexed_features print(final_features) # now using the vector assembler featureassembler = VectorAssembler(inputCols=final_features, outputCol="features") dataset = featureassembler.transform(dataset) dataset.show() # combining both the features colm together # output.show() # output.select("features").show() # output_features = dataset.select("features") # using the vector indexer (for categorical data kind of one hot encoding) vec_indexer = VectorIndexer(inputCol='features', outputCol='vec_indexed_features', maxCategories=15).fit(dataset) categorical_features = vec_indexer.categoryMaps print("Chose %d categorical features: %s" % (len(categorical_features), ", ".join( str(k) for k in categorical_features.keys()))) vec_indexed = vec_indexer.transform(dataset) vec_indexed.show() # preparing the finalized data finalized_data = vec_indexed.select('indexed_' + label, 'vec_indexed_features') finalized_data.show() # formula = RFormula(formula=f, # featuresCol="features", # labelCol="label") # # output = formula.fit(dataset).transform(dataset) # # output_2 = output.select("features", "label") # # output_2.show() # splitting the dataset into train and test train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40) # implementing the logistic regression # lr1 =LogisticRegression() Accuracy_list = [] # Accuracy_list.append(accuracy) FPR_list = [] # FPR_list.append(falsePositiveRate) TPR_list = [] precision_list = [] recall_list = [] y = 0.1 # x=[] for i in range(0, 3): y = round(y + 0.1, 2) lr = LogisticRegression(featuresCol='vec_indexed_features', labelCol='indexed_' + label, maxIter=5, regParam=0.1, elasticNetParam=1.0, threshold=0.3) # fit the model lrModel = lr.fit(train_data) lrModel # print the coefficients and the intercept 
for the logistic regression print("coefficients:" + str(lrModel.coefficientMatrix)) # mat = (lrModel.coefficientMatrix) # print mat print("intercept: " + str(lrModel.interceptVector)) # getting the summary of the model # f-measure calculation from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary training_summary = lrModel.summary BinaryLogisticRegressionTrainingSummary.accuracy print(" area under roc : ", training_summary.areaUnderROC) print(" roc : ", training_summary.roc) roc = training_summary.roc roc.show() print(" pr value : ", training_summary.pr) pr = training_summary.pr pr.show() print(" precision by threshold : ", training_summary.precisionByThreshold) prec_by_threshold = training_summary.precisionByThreshold prec_by_threshold.show() print(" accuracy : ", training_summary.accuracy) accuracy_d = training_summary.accuracy print(accuracy_d) fMeasure = training_summary.fMeasureByThreshold fMeasure.show() maxFMeasure = fMeasure.groupBy().max('F-Measure').select( 'max(F-Measure)').head() bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \ .select('threshold').head()['threshold'] lr.setThreshold(bestThreshold) # obtain the objective per iteration objectiveHistory = training_summary.objectiveHistory print("objectiveHistory") for objective in objectiveHistory: print(objective) # for a multiclass we can inspect a matrix on a per label basis print("false positive rate by label:") for i, rate in enumerate( training_summary.falsePositiveRateByLabel): print("label %d: %s" % (i, rate)) print("True positive rate") for i, rate in enumerate(training_summary.truePositiveRateByLabel): print("label %d : %s" % (i, rate)) # # print("True Negative rate") # for i, rate in enumerate(training_summary) print("Precision by label:") for i, prec in enumerate(training_summary.precisionByLabel): print("label %d: %s" % (i, prec)) print("Recall by label:") for i, rec in enumerate(training_summary.recallByLabel): print("label %d: %s" % (i, rec)) print("F-measure by label:") for i, f in enumerate(training_summary.fMeasureByLabel()): print("label %d: %s" % (i, f)) accuracy = training_summary.accuracy falsePositiveRate = training_summary.weightedFalsePositiveRate truePositiveRate = training_summary.weightedTruePositiveRate fMeasure = training_summary.weightedFMeasure() precision = training_summary.weightedPrecision recall = training_summary.weightedRecall print( "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s" % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall)) # Accuracy_list = [] Accuracy_list.append(accuracy) # FPR_list = [] FPR_list.append(falsePositiveRate) # TPR_list=[] TPR_list.append(truePositiveRate) precision_list.append(precision) recall_list.append(recall) print(Accuracy_list) print(FPR_list) print(TPR_list) print(precision_list) print(recall_list) import matplotlib.pyplot as plt # # plt.plot(recall_list, FPR_list) # plt.show() # # fpr = [0.0,0.0,0.0,0.0,0.003067484662576687, 0.003067484662576687, 0.006134969325153374, 0.11042944785276074, 0.1165644171779141, 0.1165644171779141, 0.23006134969325154, 0.9723926380368099, 0.9846625766871165 ] # tpr = [0.0, 0.09767441860465116, 0.10232558139534884, 0.13488372093023257 ,0.17674418604651163 ,0.3674418604651163 , 0.37209302325581395 , 0.7534883720930232, 0.8651162790697674 , 0.8697674418604651 , 0.9069767441860465, 0.9953488372093023, 1.0] # data visualization # ROC graph fpr = roc.select("FPR").toPandas() tpr = roc.select("TPR").toPandas() 
plt.plot(fpr, tpr) plt.show() # PR graph pr_recall = pr.select("recall").toPandas() pr_precision = pr.select("precision").toPandas() plt.plot(pr_precision, pr_recall) plt.show() # now applying the fit on the test data prediction_val = lrModel.transform(test_data) prediction_val.groupBy('indexed_' + label, "prediction").count().show() prediction_val.show() prediction_val.groupBy("prediction").count().show() prediction_val.groupBy("prediction", "probability").count().show()
filterer = SQLTransformer(statement="select * from __THIS__ where cancelled = 0")

# Cast `star_rating` to double for the Binarizer:
converter = SQLTransformer(statement="select *, cast(star_rating as double) as star_rating_double from __THIS__")

# Binarize `star_rating_double`:
from pyspark.ml.feature import Binarizer
binarizer = Binarizer(inputCol="star_rating_double", outputCol="five_star_rating", threshold=4.5)

# Extract the `reviewed` feature:
extractor = SQLTransformer(statement="select *, review is not null as reviewed from __THIS__")

# Assemble the features:
from pyspark.ml.feature import VectorAssembler
selected = ["reviewed"]
assembler = VectorAssembler(inputCols=selected, outputCol="features")

# Specify the decision tree classifier:
from pyspark.ml.classification import DecisionTreeClassifier
classifier = DecisionTreeClassifier(featuresCol="features", labelCol="five_star_rating")

# Specify the pipeline:
from pyspark.ml import Pipeline
stages = [filterer, converter, binarizer, extractor, assembler, classifier]
pipeline = Pipeline(stages=stages)


# ## Save and load the machine learning pipeline

# Save the `Pipeline` instance to our local directory in HDFS:
pipeline.write().overwrite().save("models/pipeline")
def preprocess_train(train, model=None, spark=None): if model == 'xgb': # train = train.dropna(axis=1, how='all', inplace=False) cols = [x for x in train.columns if x not in ['datetime']] print('Test Columns : ' + str(len(train.columns))) print('Test Rows : ' + str(train.count())) train = clip(train, cols) # train = train.resample('H').mean() train = get_mean_of_cyl_values(train) train = train.fillna(0) # train.show(n=5) return train elif model == 'lstm': # train = train.dropna(axis=1, how='all', inplace=False) cols = [x for x in train.columns if x not in ['datetime']] print('Test Columns : ' + str(len(train.columns))) print('Test Rows : ' + str(train.count())) train = clip(train, cols) train = get_mean_of_cyl_values(train) # train_max = train.resample('H').max().add_suffix('_max') # train_min = train.resample('H').min().add_suffix('_min') # train_std = train.resample('H').std().add_suffix('_std') # train = train.resample('H').mean() # # train = pd.concat([train, train_max], axis=1, sort=False) # train = pd.concat([train, train_min], axis=1, sort=False) # train = pd.concat([train, train_std], axis=1, sort=False) # del train_max, train_min, # gc.collect() # train = train.rolling(window=150).mean() cols = [x for x in train.columns if x not in ['datetime']] # function to calculate number of seconds from number of days days = lambda i: i * 86400 # # train = train.withColumn('datetime', train.datetime.cast('timestamp')) # # # create window by casting timestamp to long (number of seconds) # w = (Window.orderBy('datetime').rowsBetween(-50, 0)) # for column in cols: # train = train.withColumn(column, avg(train[column]).over(w)) print('Test Columns : ' + str(len(train.columns))) print('Test Rows : ' + str(train.count())) print(train.schema) train = train.fillna(0) # # window = Window.orderBy('datetime') \ # .rowsBetween(-sys.maxsize, 0) # # def ffill(column): # return last(column, ignorenulls=True).over(window) # # def bfill(column): # return last(column, ignorenulls=True).over(window) # # for column in cols: # train = train.withColumn(column, ffill(col(column))) # # for column in cols: # train = train.withColumn(column, bfill(col(column))) # train = train.fillna(0) # # vds_5 = train # del train # gc.collect() # # vds_5 = vds_5.replace(to_replace=0, value=1) # # vds_5 = vds_5.pct_change(periods=1, fill_method='ffill') # # # vds_5 = vds_5.fillna(method='ffill') # vds_5 = vds_5.fillna(method='bfill') cols = [x for x in train.columns if x not in ['datetime']] # vds_55 = normalize(vds_5) # vds_55 = scale(vds_55) assembler = VectorAssembler().setInputCols \ (cols).setOutputCol("features") print('assembler') transformed = assembler.transform(train) # Normalize each Vector using $L^1$ norm. normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0) l1NormData = normalizer.transform(transformed) scaler = StandardScaler(inputCol="normFeatures", outputCol="scaledFeatures", withStd=True, withMean=False) # Compute summary statistics by fitting the StandardScaler scalerModel = scaler.fit(l1NormData) # Normalize each feature to have unit standard deviation. 
scaledData = scalerModel.transform(l1NormData) # train = scaledData.drop(*cols) del train, transformed, l1NormData n_components_ = 50 # pca = FastICA(n_components=n_components_) # # dump(pca, 'pca.joblib') # # pca2_results = pca.fit_transform(scaledData) # # n_comp=pca.n_components_ # n_comp = n_components_ # print('Number of componeds : ' + str(n_comp)) # print(pca2_results) # print (len(pca2_results[:, 1])) # for i in range(0, n_comp): # vds_5['pca_' + str(i)] = 0 # # print(len(vds_5['pca_' + str(i)])) # # print(len(pca2_results[:, i])) # vds_5['pca_' + str(i)] = pca2_results[:, i] # pca_columns = [x for x in vds_5.columns if x.startswith('pca')] # vds_5 = vds_5[pca_columns] pca = PCA(k=n_components_, inputCol="scaledFeatures", outputCol="pcaFeatures") model = pca.fit(scaledData) vds_5 = model.transform(scaledData).select(['pcaFeatures', 'datetime']) print(vds_5) def extract(row): return (row.datetime, ) + tuple(row.pcaFeatures.toArray().tolist()) vds_5 = vds_5.rdd.map(extract).toDF(["datetime"]) print(vds_5) vds_5 = vds_5.drop(*['pcaFeatures', 'datetime']) return vds_5 elif model == 'svm': cols = [x for x in train.columns if x not in ['datetime']] print('Test Columns : ' + str(len(train.columns))) print('Test Rows : ' + str(train.count())) train = clip(train, cols) print('Test Columns : ' + str(len(train.columns))) print('Test Rows : ' + str(train.count())) # train_max = train.resample('H').max().add_suffix('_max') # train_min = train.resample('H').min().add_suffix('_min') # train_std = train.resample('H').std().add_suffix('_std') # train = train.resample('H').mean() # # train = pd.concat([train, train_max], axis=1, sort=False) # train = pd.concat([train, train_min], axis=1, sort=False) # train = pd.concat([train, train_std], axis=1, sort=False) # del train_max, train_min, # gc.collect() train = get_mean_of_cyl_values(train) vds_5 = train print('Test Columns : ' + str(len(train.columns))) print('Test Rows : ' + str(train.count())) # vds_5 = vds_5.replace(to_replace=0, value=1) # vds_5 = vds_5.pct_change(periods=1, fill_method='ffill') # window = Window.orderBy('datetime') \ # .rowsBetween(-sys.maxsize, 0) # # def ffill(column): # return last(column, ignorenulls=True).over(window) # # def bfill(column): # return last(column, ignorenulls=True).over(window) # # for column in cols: # vds_5 = vds_5.withColumn(column, ffill(col(column))) # # for column in cols: # vds_5 = vds_5.withColumn(column, bfill(col(column))) vds_5 = vds_5.fillna(0) # vds_5 = vds_5.fillna(method='ffill') # vds_5 = vds_5.fillna(method='bfill') return vds_5
data = spark.read.format("csv").option("header", True).option(
    "inferSchema", True
).option("delimiter", ",").load(
    "/home/charan/workspaces/big_data_programming/bigdata_progamming_m2_icp/icp7/apps/datasets/adult.data"
)

# data = data.select("*", F.when(data.X == ' <=50K', 1).when(data.X == ' >50K', 2).otherwise(0).alias('label'))
data = data.withColumnRenamed("age", "label").select("label", "education-num", "hours-per-week")
data = data.select(data.label.cast("double"), "education-num", "hours-per-week")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

# Split data into training and test data sets
training, test = data.select("label", "features").randomSplit([0.85, 0.15])

# Create Naive Bayes model and fit the model with the training dataset
nb = NaiveBayes()
model = nb.fit(training)

# Generate predictions from the test dataset
predictions = model.transform(test)

# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)
""" clean_riskdata = nostring_riskdata.na.fill({"loan":avg("loan") , "mortdue":avg("mortdue"), "value":avg("value"), "derog":avg("derog"), "delinq":0, "clage":avg("clage"), "ninq":avg("ninq"), "clno":avg("clno"), "debtinc":avg("debtinc") }) """ #Define Input-output columns, i.e. transform to MLP features vector ignore=['bad'] assembler = VectorAssembler( inputCols=[k for k in clean_riskdata.columns if k not in ignore], outputCol="predictors") Triskdata = assembler.transform(clean_riskdata) # Split the data into train and test splits = Triskdata.randomSplit([0.4, 0.6], 1234) train = splits[0] test = splits[1] ################################################################# # Preliminary analysis ################################################################# print(clean_riskdata.describe().show()) print(riskdata.stat.crosstab("bad","job").show()) print(riskdata.stat.crosstab("bad","reason").show()) ################################################################# # Multilayer Perceptron Classifier
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler()\
  .setInputCols(["Quantity", "UnitPrice"])\
  .setOutputCol("features")

sales = va.transform(spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/data/retail-data/by-day/*.csv")
  .limit(50)
  .coalesce(1)
  .where("Description IS NOT NULL"))

sales.cache()


# COMMAND ----------

from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
print(km.explainParams())
kmModel = km.fit(sales)


# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
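# Not part of the original excerpt: the header above announces the cluster centers but
# the excerpt stops there; a minimal continuation that actually lists them would be:
for center in centers:
    print(center)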
# this will convert each unique string into a numeric
#indexer = StringIndexer(inputCol="property_state", outputCol="loc_state")
#indexed = indexer.fit(lndf).transform(lndf)
# indexed.show(5)

## First try a logistic regression
# now we need to create a "label" and "features"
# input for using the sparkML library

## This runs in the Cloudera Spark Cluster
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
#
# the debt to income col has nulls
assembler = VectorAssembler(
    inputCols=["sensor1", "sensor2", "sensor3", "sensor4"],
    outputCol="features")

# note the column headers - label and features are keywords
lrdf = assembler.transform(iotdf)
lrdf.show(5)
lrdf.count()

from pyspark.ml.classification import LogisticRegression

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
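# The excerpt stops after constructing the estimator; a minimal sketch of the next step,
# assuming iotdf (and therefore lrdf) already carries a binary "label" column as the
# comment above implies:
lrModel = lr.fit(lrdf)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))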
#remove data_df from memory
data_df.unpersist()

#encode the dependent variable - category_predict
classifyIndexer = StringIndexer(inputCol="Category", outputCol="Category_Index")
classifymodel = classifyIndexer.fit(encoded)
encoded2 = classifymodel.transform(encoded)

#keep the following columns: x, y, hour, day, month, year, dayofweek, week, x_sim, y_sim
#drop the following
cleaned = encoded2.select([c for c in encoded2.columns if c not in
                           {'DayOfWeek', 'Category', 'Address', 'Dates', 'Descript',
                            'PdDistrict', 'Resolution', 'PdDistrict_Index'}])

ignore = ['Category_Index']
assembler = VectorAssembler(inputCols=[x for x in cleaned.columns if x not in ignore],
                            outputCol='features')
transformed = assembler.transform(cleaned)
data_transformed = transformed.select(col("Category_Index").alias("label"), col("features")) \
    .map(lambda row: LabeledPoint(row.label, row.features))

#********************************************************************************
# split the training set
train, test = data_transformed.randomSplit([0.7, 0.3], seed = 2)

#naivebayes classifier
#lambda = 1.0
# initialize classifier:
nb_model = mllib_class.NaiveBayes.train(train, 1.0)  #this step will take 50 seconds
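# Not part of the original snippet: a minimal sketch of how the MLlib NaiveBayes model
# could be scored on the held-out split created above (the standard pattern from the
# Spark MLlib documentation).
predictionAndLabel = test.map(lambda p: (nb_model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
print('NaiveBayes test accuracy: ' + str(accuracy))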
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.shell import spark
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import DoubleType

df = spark.read.format("csv").option("inferSchema", "true").option("header", "true").option("sep", ";")\
    .load("TestDataset.csv")

### RENAMING THE COLUMNS ###
df = df.toDF("c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8", "c9", "c10", "c11", "quality")

featureassembler = VectorAssembler(inputCols=[
    "c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8", "c9", "c10", "c11"
], outputCol="Independent Features")

output = featureassembler.transform(df)
test_data = output.select("Independent Features", "quality")

#### LOADING MODEL AND EVALUATING MODEL ####
reg = RandomForestClassificationModel.load("ModelV1")
pred = reg.transform(test_data)
pred.select('Independent Features', "quality", 'prediction').show(5)

evaluator = MulticlassClassificationEvaluator(labelCol="quality", predictionCol="prediction",
                                              metricName="accuracy")
Accuracy = evaluator.evaluate(pred)
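# Not part of the original snippet: a minimal sketch that reports the accuracy computed
# above and, since MulticlassMetrics is already imported, the weighted F1 score as well.
print("Accuracy: %g" % Accuracy)
predictionAndLabels = pred.select("prediction", pred["quality"].cast(DoubleType())).rdd.map(tuple)
metrics = MulticlassMetrics(predictionAndLabels)
print("Weighted F1: %g" % metrics.weightedFMeasure())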
def number_transformers(spark):
    return [
        VectorAssembler(),
        StandardScaler(),
    ]
# Alternative way of filtering data
#data.registerTempTable("taxi")
#df_sql = sqlContext.sql("SELECT * FROM taxi WHERE passenger_count >0 and passenger_count < 10 and trip_time_in_secs > 0 and trip_time_in_secs < 3000 and trip_distance > 0 AND trip_distance < 25 AND fare_amount > 0 AND fare_amount < 50 AND total_amount>0 AND total_amount < 100 and tip_amount > 0 and tip_amount < 20")
#df_sql.show()  # Show my result
# df_sql.registerTempTable("taxi_clean")
# sqlContext.sql("SELECT count(*) FROM taxi_clean").show()  # calculate the number of rows

############## regression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.evaluation import RegressionEvaluator

# Merge all needed columns as one column called features
assembler = VectorAssembler(inputCols=['trip_time_in_secs', 'trip_distance'], outputCol="features")

# Select features and rename total_amount as label.
regression_data = assembler.transform(data_cleaned).select(
    [col for col in data_cleaned.columns if col != "total_amount"]
    + ["features", data_cleaned["total_amount"].alias("label")])
regression_data.show()

# Setup the linear regression solver
lr = LinearRegression(maxIter=1000, regParam=0.3, elasticNetParam=0)

# Fit the model
lrModel = lr.fit(regression_data)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
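# Not part of the original snippet: a few more metrics that the linear regression
# training summary exposes, shown for completeness.
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)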
from __future__ import print_function

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorAssemblerExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    assembler = VectorAssembler(
        inputCols=["hour", "mobile", "userFeatures"],
        outputCol="features")

    output = assembler.transform(dataset)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
def assemble_features_pipeline_model(df, features, label, algorithm, set_features,
                                     set_label, prediction, keep, emit, task_id):
    """
    Prepare features and label to be processed by an ML algorithm. Features and
    labels are indexed (StringIndexer) if they are categorical. During the process,
    temporary columns are created, but they're removed as soon as the pipeline ends.

    :arg df Input data frame
    :arg features array with column names to be used as features
    :arg label name of the column with label
    :arg algorithm algorithm to be used, can be a ML or a feature extraction one
    :arg set_features name of the method used to set the features
    :arg set_label name of the method used to set the label
    :arg prediction name of the prediction column (generated)
    :arg keep list of the columns to be kept after the processing
    :arg emit emit messages function
    :arg task_id task identifier
    :returns processing pipeline model
    """
    if keep is None:
        keep = []
    final_keep = [c.name for c in df.schema]
    final_keep.extend(keep)

    clean_null_rows = 'SELECT * FROM __THIS__ WHERE {}'

    if len(features) > 1 and not isinstance(
            df.schema[str(features[0])].dataType, VectorUDT):
        emit(name='update task', message=_(
            'Features are not assembled as a vector. They will be '
            'implicitly assembled and rows with null values will be '
            'discarded. If this is undesirable, explicitly add an '
            'attribute vectorizer, handle missing data and '
            'categorical attributes in the workflow.'),
            level='warning', status='RUNNING', identifier=task_id)
        stages = []
        to_assemble = []
        for f in features:
            if not dataframe_util.is_numeric(df.schema, f):
                name = f + '__tmp__'
                to_assemble.append(name)
                stages.append(
                    StringIndexer(inputCol=f, outputCol=name, handleInvalid='keep'))
            else:
                to_assemble.append(f)

        # Remove rows with null (VectorAssembler doesn't support it)
        cond = ' AND '.join(['{} IS NOT NULL '.format(c) for c in to_assemble])
        stages.append(SQLTransformer(statement=clean_null_rows.format(cond)))

        final_features = 'features__tmp__'
        stages.append(
            VectorAssembler(inputCols=to_assemble, outputCol=final_features))
        getattr(algorithm, set_features)(final_features)

        if label is not None:
            final_label = '{}__tmp__'.format(label)
            getattr(algorithm, set_label)(final_label)
            stages.append(
                StringIndexer(inputCol=label, outputCol=final_label,
                              handleInvalid='keep'))

        stages.append(algorithm)
        pipeline = Pipeline(stages=stages)
        model = pipeline.fit(df)

        last_stages = [model]
        if label is not None:
            last_stages.append(
                IndexToString(inputCol=prediction,
                              outputCol='{}'.format(prediction),
                              labels=model.stages[-2].labels))

        # Remove temporary columns
        sql = 'SELECT {} FROM __THIS__'.format(', '.join(final_keep))
        last_stages.append(SQLTransformer(statement=sql))

        pipeline = Pipeline(stages=last_stages)
        model = pipeline.fit(df)
    else:
        if label is not None:
            final_label = '{}__tmp__'.format(label)
            getattr(algorithm, set_label)(final_label)
            stages = [
                StringIndexer(inputCol=label, outputCol=final_label,
                              handleInvalid='keep'),
                algorithm
            ]
            pipeline = Pipeline(stages=stages)
            model = pipeline.fit(df)

            last_stages = [model]
            if label is not None:
                last_stages.append(
                    IndexToString(inputCol=final_label,
                                  outputCol='{}_str'.format(prediction),
                                  labels=model.stages[-2].labels))

            # Remove temporary columns
            sql = 'SELECT {} FROM __THIS__'.format(', '.join(final_keep))
            last_stages.append(SQLTransformer(statement=sql))

            pipeline = Pipeline(stages=last_stages)
            model = pipeline.fit(df)
        else:
            getattr(algorithm, set_features)(features[0])
            model = algorithm.fit(df)
    return model
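# Not part of the original function: a hypothetical usage sketch. The classifier, the
# column names, and the emit/task_id plumbing below are illustrative only; the real
# callers of this helper are not shown here.
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(predictionCol='prediction')
model = assemble_features_pipeline_model(
    df=input_df, features=['age', 'job', 'income'], label='approved',
    algorithm=dt, set_features='setFeaturesCol', set_label='setLabelCol',
    prediction='prediction', keep=['prediction'],
    emit=lambda **kwargs: None, task_id='task-1')
scored_df = model.transform(input_df)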
    .options(header='true', inferSchema='true') \
    .load(path_train)

# Read the testing dataset.
raw_dataset_test = reader.read.format('com.databricks.spark.csv') \
    .options(header='true', inferSchema='true') \
    .load(path_test)

# First, we would like to extract the desired features from the raw dataset.
# We do this by constructing a list with all desired columns.
# This is identical for the test set.
features = raw_dataset_train.columns
features.remove('label')

# Next, we use Spark's VectorAssembler to "assemble" (create) a vector of all desired features.
# http://spark.apache.org/docs/latest/ml-features.html#vectorassembler
vector_assembler = VectorAssembler(inputCols=features, outputCol="features")

# This transformer will take all columns specified in features, and create an additional column
# "features" which will contain all the desired features aggregated into a single vector.
dataset_train = vector_assembler.transform(raw_dataset_train)
dataset_test = vector_assembler.transform(raw_dataset_test)

# Define the number of output classes.
nb_classes = 10
encoder = OneHotTransformer(nb_classes, input_col="label", output_col="label_encoded")
dataset_train = encoder.transform(dataset_train)
dataset_test = encoder.transform(dataset_test)

# Allocate a MinMaxTransformer from Distributed Keras to normalize the features.
# o_min -> original_minimum
# n_min -> new_minimum
transformer = MinMaxTransformer(n_min=0.0, n_max=1.0, \
stringIndexerStages = [
    StringIndexer(inputCol = col, outputCol = col + '_INDEX', handleInvalid = 'skip')
    for col in COLUMNS_OHE + COLUMNS_HIGH_CARD
]
pipelineStages += stringIndexerStages

OHEStage = OneHotEncoderEstimator(
    inputCols = [col + '_INDEX' for col in COLUMNS_OHE],
    outputCols = [col + '_VEC' for col in COLUMNS_OHE]
)
pipelineStages += [OHEStage]

sparseVectorCols = [col + '_VEC' for col in COLUMNS_OHE] + \
                   [col + '_INDEX' for col in COLUMNS_HIGH_CARD]
assembler = VectorAssembler(
    inputCols = sparseVectorCols,
    outputCol = 'features'
)
pipelineStages += [assembler]

normalizer = Normalizer(
    inputCol = 'features',
    outputCol = 'normFeatures'
)
pipelineStages += [normalizer]

decisionTree = DecisionTreeClassifier(
    featuresCol = 'normFeatures',
    labelCol = 'HasDetections'
)
pipelineStages += [decisionTree]
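# Not part of the original snippet: a minimal sketch of how the accumulated stages could
# be assembled and fitted. `train_df` stands in for the training DataFrame used in the
# original project (its actual name is not shown here).
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=pipelineStages)
pipelineModel = pipeline.fit(train_df)
scored_df = pipelineModel.transform(train_df)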
label_stringIdx = StringIndexer(inputCol = "income", outputCol = "label")
label_model = label_stringIdx.fit(dataset)
label_indexed = label_model.transform(dataset)
print(label_indexed.take(1))

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we will use the VectorAssembler() to combine all the feature columns into a single vector column. This will include both the numeric columns and the one-hot encoded binary vector columns in our dataset.

# COMMAND ----------

# Transform all features into a vector using VectorAssembler
assembler = VectorAssembler(
    inputCols=["age", "workclassclassVec", "fnlwgt", "educationclassVec", "education_num",
               "marital_statusclassVec", "occupationclassVec", "relationshipclassVec",
               "raceclassVec", "sexclassVec", "capital_gain", "capital_loss",
               "hours_per_week", "native_countryclassVec"],
    outputCol="features")
output = assembler.transform(label_indexed)

# Keep relevant columns
selectedcols = ["label", "features"] + cols
dataset = output.select(selectedcols)
display(dataset)

# COMMAND ----------

### Randomly split data into training and test sets. Set seed for reproducibility.
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print(trainingData.count())
print(testData.count())
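# Not part of the original notebook cell: a minimal sketch of a first model on the split
# created above; logistic regression is used here as an illustrative choice, not
# necessarily the classifier the original walkthrough trains next.
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.select("label", "prediction", "probability").show(5)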
with open(json_file_path, 'r') as j:
    contents = json.load(j)

cluster = contents['cluster']
for item in cluster:
    path_aggregated_df = item['path_aggregated_df']
    path_metrics_kmeans_sse = item['path_metrics_kmeans_sse']

clustering_df = spark.read.parquet(path_aggregated_df)

columns_clustering_features = [
    "calls_outgoing_count", "user_spendings", "sms_incoming_count",
    "user_use_gprs", "sms_outgoing_count",
    "user_no_outgoing_activity_in_days", "calls_outgoing_spendings",
    "user_lifetime"
]

print("before assemble")
# data preparation
vector_assembler = VectorAssembler(inputCols=columns_clustering_features,
                                   outputCol="initial_features")
standard_scaler = StandardScaler(inputCol="initial_features", outputCol="features",
                                 withStd=True, withMean=True)
print("after scale")

vectorized_df = vector_assembler.transform(clustering_df)
model_scaler = standard_scaler.fit(vectorized_df)
featurized_clustering_df = model_scaler.transform(vectorized_df)

featurization_pipeline = Pipeline(stages=[vector_assembler, standard_scaler])
featurization_pipeline_model = featurization_pipeline.fit(clustering_df)
model_scaler = featurization_pipeline_model.stages[-1]
featurized_clustering_df = featurization_pipeline_model.transform(clustering_df)

sse_cost = np.zeros(20)
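# Not part of the original snippet: a minimal sketch of the SSE/elbow loop that the
# sse_cost array above appears to be allocated for, assuming a Spark 2.x environment
# where KMeansModel.computeCost is available. The k range and output handling are
# illustrative only.
from pyspark.ml.clustering import KMeans

for k in range(2, 20):
    kmeans = KMeans(featuresCol="features", k=k, seed=1)
    kmeans_model = kmeans.fit(featurized_clustering_df)
    sse_cost[k] = kmeans_model.computeCost(featurized_clustering_df)
print(sse_cost)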