def mapClickCategoricalFeatures():
    indexed = None
    df = getDataFrame(CLICKS_HDPFILEPATH)
    df.persist(StorageLevel.DISK_ONLY)
    print(df.columns)
    # select columns to be mapped
    click_cols = ["C2", "C3", "C4", "C5", "C7", "C8"]
    for col in click_cols:
        if indexed is None:
            indexed = df
            print(indexed)
        outcol = col + "Index"
        indexer = StringIndexer(inputCol=col, outputCol=outcol)
        indexed = indexer.fit(indexed).transform(indexed)
        indexed.show()
    indexed.persist(StorageLevel.DISK_ONLY)
    #indexed.select('C0', 'C1', 'C2Index', 'C3Index', 'C4Index', 'C5Index', 'C6', 'C7Index', 'C8Index').write.format('com.databricks.spark.csv').save(PATH+"extraction/clicks1.csv")
    indexed.select('C0', 'C1', 'C2Index', 'C3Index', 'C4Index', 'C5Index', 'C6', 'C7Index', 'C8Index')\
        .write.format('com.databricks.spark.csv')\
        .save(HADOOPDIR + "data/click_fraud/extraction/clicks_23feb12.csv")
def train_random_forest(df):
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    # random.random() returns a float in [0, 1), so int(random.random()) was
    # always 0; use randint for a genuinely random seed
    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed",
                                seed=random.randint(0, 2**31 - 1))
    return rf, rf.fit(td)
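# A minimal usage sketch for train_random_forest above, assuming an existing
# SparkSession named `spark`; the sample rows and schema (a "features" vector
# column plus a string "label" column) are hypothetical, not from the original.
from pyspark.ml.linalg import Vectors

sample_df = spark.createDataFrame(
    [(Vectors.dense([0.0, 1.0]), "a"),
     (Vectors.dense([1.0, 0.0]), "b")],
    ["features", "label"])
rf, rf_model = train_random_forest(sample_df)
rf_model.transform(sample_df).select("prediction").show()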
def build_decisionTree(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')
    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)
    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 3, 5, 6, 8, 10]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print("classification evaluation :", evaluator.evaluate(prediction))
    return cvModel, avg_age
def build_randomForest(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')
    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()
    rdf = RandomForestClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(rdf.maxDepth, [1, 2, 3, 5, 6, 8, 10])\
        .addGrid(rdf.numTrees, [1, 5, 10, 30, 50, 100, 200]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)
    # fit the cross-validator (not the bare classifier) so the param grid is used
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show()
    print("classification evaluation :", evaluator.evaluate(prediction))
    return cvModel, avg_age
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[vector, labelIndex, clf]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction",
        metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    lrModel = model.stages[2]  # the fitted LogisticRegression stage
    print(lrModel)  # summary only
def mapPublisherCategoricalFeatures():
    indexed = None
    df = getDataFrame(PUBLISHERS_HDPFILEPATH)
    df.persist(StorageLevel.DISK_ONLY)
    print(df.columns)
    publisher_cols = ["C0", "C1", "C2", "C3"]
    for col in publisher_cols:
        if indexed is None:
            indexed = df
            print(indexed)
        outcol = col + "Index"
        # StringIndexer maps each value in the input column to a double index
        # and creates a new column in the dataframe
        indexer = StringIndexer(inputCol=col, outputCol=outcol)
        # fit and transform the columns using the indexer
        indexed = indexer.fit(indexed).transform(indexed)
        indexed.show()
    indexed.persist(StorageLevel.DISK_ONLY)
    indexed.select('C0Index', 'C1Index', 'C2Index', "C3Index")\
        .write.format('com.databricks.spark.csv')\
        .save(HADOOPDIR + "data/click_fraud/extraction/publishers_23feb12.csv")
def testClassification(data):
    # Train a RandomForest model on indexed labels.
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)
    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel", seed=13)
    trainData, testData = td.randomSplit([0.8, 0.2], 13)
    predictionDF = rf.fit(trainData).transform(testData)
    selected = predictionDF\
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.collect():
        print(row)
    scoresAndLabels = predictionDF.rdd\
        .map(lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel', metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)
def label(df, column):
    """
    Create a labeled column.
    """
    indexer = StringIndexer(inputCol=column, outputCol=column + '_label')
    df = indexer.fit(df).transform(df)
    return df
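# A minimal usage sketch for label() above, assuming an existing SparkSession
# named `spark`; the sample DataFrame and column name are hypothetical.
toy_df = spark.createDataFrame(
    [(0, "red"), (1, "blue"), (2, "red")], ["id", "color"])
labeled_df = label(toy_df, "color")
labeled_df.show()  # adds a numeric "color_label" column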
def indexStringColumns(df, cols):
    # variable newdata will be updated several times
    newdata = df
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c + "-x")
        sm = si.fit(newdata)
        newdata = sm.transform(newdata).drop(c)
        newdata = newdata.withColumnRenamed(c + "-x", c)
    return newdata
def events(df, column_name):
    i = column_name + "I"
    v = column_name + "V"
    stringIndexer = StringIndexer(inputCol=column_name, outputCol=i)
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    encoder = OneHotEncoder(inputCol=i, outputCol=v)
    encoded = encoder.transform(indexed)
    return encoded
def indexStringColumns(df, cols):
    from pyspark.ml.feature import StringIndexer
    # variable newdf will be updated several times
    newdf = df
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c + "-num")
        sm = si.fit(newdf)
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c + "-num", c)
    return newdf
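# A minimal usage sketch for indexStringColumns() above, assuming an existing
# SparkSession named `spark`; the sample DataFrame and columns are hypothetical.
demo_df = spark.createDataFrame(
    [("US", "a"), ("FR", "b"), ("US", "b")], ["country", "group"])
indexed_demo = indexStringColumns(demo_df, ["country", "group"])
indexed_demo.show()  # both columns replaced in place by their numeric indices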
def oneHotEncoding(self, df, input_col):
    stringInd = StringIndexer(inputCol=input_col, outputCol="indexed")
    model = stringInd.fit(df)
    td = model.transform(df)
    encoder = OneHotEncoder(inputCol="indexed", outputCol="features", dropLast=False)
    final_encoding = encoder.transform(td).select(df.id, 'features').cache()
    conv_udf = udf(lambda line: Vectors.dense(line).tolist())
    final_encoding = final_encoding.select(
        df.id, conv_udf(final_encoding.features).alias("num_" + input_col)).cache()
    return final_encoding
def test_string_indexer_handle_invalid(self):
    df = self.spark.createDataFrame([
        (0, "a"),
        (1, "d"),
        (2, None)], ["id", "label"])

    si1 = StringIndexer(inputCol="label", outputCol="indexed",
                        handleInvalid="keep", stringOrderType="alphabetAsc")
    model1 = si1.fit(df)
    td1 = model1.transform(df)
    actual1 = td1.select("id", "indexed").collect()
    expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
    self.assertEqual(actual1, expected1)

    si2 = si1.setHandleInvalid("skip")
    model2 = si2.fit(df)
    td2 = model2.transform(df)
    actual2 = td2.select("id", "indexed").collect()
    expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
    self.assertEqual(actual2, expected2)
Instructions:
Import the appropriate class and create an indexer object to transform the
carrier column from a string to a numeric index.
Prepare the indexer object on the flight data.
Use the prepared indexer to create the numeric index column.
Repeat the process for the org column.
"""

from pyspark.ml.feature import StringIndexer

# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the other categorical feature
flights_indexed = StringIndexer(
    inputCol='org', outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)

"""
Assembling columns

The final stage of data preparation is to consolidate all of the predictor
columns into a single column. At present our data has the following predictor
columns: mon, dom and dow
"""
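# A minimal sketch of the assembling step described above, done with
# VectorAssembler; the exact input column list (mon, dom, dow plus the two
# index columns created above) is an assumption based on the surrounding text.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=['mon', 'dom', 'dow', 'carrier_idx', 'org_idx'],
    outputCol='features')
flights_assembled = assembler.transform(flights_indexed)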
irisNew_DF.show(5)

# In[19]:

# transforming dataframe by assigning labelIndex to every class of flower using StringIndexer.
# StringIndexer encodes a string column of labels to a column of label indices and can encode multiple columns.
classlabel_indexer = StringIndexer(inputCol="label", outputCol="labelIndex")

# In[20]:

# applying the above StringIndexer transformation to the dataFrame
irisIndexer_DF = classlabel_indexer.fit(irisNew_DF).transform(irisNew_DF)

# In[21]:

# displaying the first 5 records after the StringIndexer transformation
irisIndexer_DF.show(5)

# In[22]:

# defining the logistic regression classifier model
logReg_model = LogisticRegression(labelCol="labelIndex", featuresCol="features",
                                  maxIter=100, regParam=0.001,
]
nazwy = ["Airline1_Back", 'Airline2_There', 'Airline2_Back', 'Airline1_There']
for country_from in country_list:
    for country_to in country_list:
        print("Country from: ", country_from, " Country to: ", country_to)
        try:
            df2 = df.filter(df.Country_from == country_from).filter(
                df.Country_to == country_to)
            df_temp = df2.select(df2.Scrap_time.cast("float"), 'Airline1_Back',
                                 'Airline2_There', 'Airline2_Back', 'Airline1_There',
                                 df2.Days.cast("float"), df2.Journey_time.cast("float"),
                                 df2.Full_Price.cast("float"))
            for nazwa in nazwy:
                indexer = StringIndexer(inputCol=nazwa, outputCol=nazwa + "Index")
                df_temp = indexer.fit(df_temp).transform(df_temp)
            df_temp = df_temp.select('Airline1_BackIndex', 'Airline2_ThereIndex',
                                     'Airline2_BackIndex', 'Airline1_ThereIndex',
                                     'Scrap_time', 'Days', 'Journey_time', 'Full_Price')
            transformed = transData(df_temp)
            test = transformed.rdd.map(lambda row: LabeledPoint(
                row['label'], row['features'].toArray()))
            model = RandomForest.trainRegressor(test, categoricalFeaturesInfo={},
                                                numTrees=30, featureSubsetStrategy="auto",
                                                impurity='variance', maxDepth=4, maxBins=32)
# # Register data
# spark_flights.createOrReplaceTempView("flights_temp")
# # Data should appear
# print(spark.catalog.listTables())

################
# Spark StringIndexer()
################

# Only load carrier column
carrier_df = spark_flights.select("carrier")
# carrier_df.show(5)

# Spark method of indexing string values with numerical values
# Set up StringIndexer()
carr_indexer = StringIndexer(inputCol="carrier", outputCol="carrier_index")

# Transform data
carr_indexed = carr_indexer.fit(carrier_df).transform(carrier_df)
# carr_indexed.show(7)

# Do a StringIndexer first and then add a OneHotEncoder
carrier_df_onehot = spark_flights.select("carrier")

stringIndexer = StringIndexer(inputCol="carrier", outputCol="carrier_index")
model = stringIndexer.fit(carrier_df_onehot)
indexed = model.transform(carrier_df_onehot)

encoder = OneHotEncoder(dropLast=False, inputCol="carrier_index", outputCol="carrier_vec")
encoded = encoder.transform(indexed)
encoded.show(7)
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, LongType, StringType, IntegerType
import pyspark.sql.functions as F
from pyspark import SparkContext, SparkConf
from tqdm import tqdm
from itertools import permutations
from collections import defaultdict
import time
from pyspark.ml.feature import StringIndexer

spark = SparkSession.builder.appName("tag base on spark").master("local[8]").getOrCreate()
sc = spark.sparkContext

schema = StructType([StructField('userId', IntegerType(), True),
                     StructField('movieId', IntegerType(), True),
                     StructField('rating', LongType(), True),
                     StructField('timestamp', IntegerType(), True)])
tags = spark.read.csv(r'D:\Users\hao.guo\deepctr\recsys\movielen\ml-20m\tags.csv', header=True)

index = StringIndexer(inputCol='tag', outputCol='tagid')
model = index.fit(tags)
tags = model.transform(tags)
tags = tags.withColumn('tagid', tags['tagid'].cast('int'))

tags_rdd = tags.select(['userId', 'movieId', 'tagid']).rdd
train_rdd, test_rdd = tags_rdd.randomSplit([0.7, 0.3], seed=2020)
train_rdd = train_rdd.cache()
test_rdd = test_rdd.cache()
train_rdd = train_rdd.map(lambda s: (s, 1)).
def build_recommendation_model(self):
    logging.info("getting distinct users")
    print_with_time("getting distinct users")
    users = self.df.select(["user_id"]).distinct()

    logging.info("getting distinct items")
    print_with_time("getting distinct items")
    items = self.df.select(["item_id"]).distinct()

    logging.info("mapping user_id to number")
    print_with_time("mapping user_id to number")
    user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_no")
    self.user_indexed = user_indexer.fit(users).transform(users)
    self.user_indexed = self.user_indexed.select(
        self.user_indexed.user_id.cast("string"),
        self.user_indexed.user_id_no.cast("int"))

    logging.info("mapping item_id to number")
    print_with_time("mapping item_id to number")
    item_indexer = StringIndexer(inputCol="item_id", outputCol="item_id_no")
    self.item_indexed = item_indexer.fit(items).transform(items)
    self.item_indexed = self.item_indexed.select(
        self.item_indexed.item_id.cast("string"),
        self.item_indexed.item_id_no.cast("int"))

    logging.info("joining df with user_indexed rdd")
    print_with_time("joining df with user_indexed rdd")
    self.df = self.df.join(self.user_indexed, ["user_id"], 'inner')

    logging.info("joining df with item_indexed rdd")
    print_with_time("joining df with item_indexed rdd")
    self.df = self.df.join(self.item_indexed, ["item_id"], 'inner')
    self.df = self.df.select(["item_id_no", "user_id_no", "rating"])

    ############
    logging.info("splitting dataset into training and testing")
    print_with_time("splitting dataset into training and testing")
    (training, validation, test) = self.df.randomSplit([0.6, 0.2, 0.2])

    ######
    ranks = [25, 50, 100]
    regParam = [0.1, 0.01, 0.001]
    all_params = [(rank, reg) for rank in ranks for reg in regParam]
    min_mpr = float('inf')
    best_rank = -1
    best_reg = -1
    for (iteration_no, (rank, reg)) in enumerate(all_params):
        logging.info(iteration_no)
        print_with_time(str(iteration_no))
        logging.info("rank=%s, reg=%s " % (rank, reg))
        print_with_time("rank=%s, reg=%s " % (rank, reg))

        als = ALS(rank=rank, regParam=reg, nonnegative=True, implicitPrefs=True,
                  userCol="user_id_no", itemCol="item_id_no",
                  checkpointInterval=-1, coldStartStrategy="drop", ratingCol="rating")
        self.model = als.fit(training)

        logging.info("transforming the validation set")
        print_with_time("transforming the validation set")
        predictions = self.model.transform(validation)

        logging.info("getting rmse on validation set")
        print_with_time("getting rmse on validation set")
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)
        logging.info("Root-mean-square error = " + str(rmse))
        print_with_time("Root-mean-square error = " + str(rmse))

        logging.info("getting MPR on validation set")
        print_with_time("getting MPR on validation set")
        ev = RankBasedEvaluator2("user_id_no", "rating", "prediction")
        mpr = ev.evaluate(sqlContext, predictions)
        logging.info("Mean Percentile Ranking = " + str(mpr))
        print_with_time("Mean Percentile Ranking = " + str(mpr))

        if mpr < min_mpr:
            min_mpr = mpr
            best_rank = rank
            best_reg = reg

    logging.info('The best model was trained with rank %s and reg %s' % (best_rank, best_reg))
    print_with_time('The best model was trained with rank %s and reg %s' % (best_rank, best_reg))

    ######
    logging.info("starting model training")
    print_with_time("starting model training")
    als = ALS(rank=best_rank, regParam=best_reg, nonnegative=True, implicitPrefs=True,
              userCol="user_id_no", itemCol="item_id_no",
              checkpointInterval=-1, coldStartStrategy="drop", ratingCol="rating")
    self.model = als.fit(training)

    logging.info("transforming the test set")
    print_with_time("transforming the test set")
    predictions = self.model.transform(test)

    logging.info("getting rmse on test set")
    print_with_time("getting rmse on test set")
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    logging.info("Root-mean-square error = " + str(rmse))
    print_with_time("Root-mean-square error = " + str(rmse))

    logging.info("getting MPR on test set")
    print_with_time("getting MPR on test set")
    ev = RankBasedEvaluator2("user_id_no", "rating", "prediction")
    mpr = ev.evaluate(sqlContext, predictions)
    logging.info("Mean Percentile Ranking = " + str(mpr))
    print_with_time("Mean Percentile Ranking = " + str(mpr))
# will be creating a new ML dataframe from the combined dataframe above
from pyspark.ml.feature import StringIndexer

interested_cols_ML = spark.sql(
    """SELECT DISTINCT state, date, restriction_end_date_of_april28,
              religious_restrictions, current_restrictions, m50, m50_index,
              confirmed_cases AS cases, fatalities AS fatalities,
              (confirmed_cases / current_population) AS cases_density,
              (fatalities / current_population) AS fatality_density
       FROM combined
       ORDER BY state, date""")
interested_cols_ML.createOrReplaceTempView("interested_cols_ML")
interested_cols_ML.show(3)

# COMMAND ----------

# turning string categorical variables back into integers
lblIndxr = StringIndexer().setInputCol("religious_restrictions").setOutputCol(
    "label_religious_rest")
idxRes = lblIndxr.fit(interested_cols_ML).transform(interested_cols_ML)

lblIndxr2 = StringIndexer().setInputCol("current_restrictions").setOutputCol(
    "label_curr_rest")
idxRes2 = lblIndxr2.fit(idxRes).transform(idxRes)

# tried using a for loop, ended up taking just as long
# indexers = [StringIndexer(inputCol=column, outputCol=column+"_label").fit(interested_cols_ML).transform(interested_cols_ML) for column in interested_cols_ML.columns if "_restrictions" in column]

# COMMAND ----------

cols_drop = [
    'fatality_density', 'religious_restrictions', 'current_restrictions'
]
final_cols_df = idxRes2.drop(*cols_drop)
final_cols_df.show(3)

# COMMAND ----------
from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField, DoubleType

schema = StructType([StructField('label', DoubleType(), True),
                     StructField('Vectors', VectorUDT(), True)])

features = dfTrainTok.rdd.map(partial(vectorize, dico=dict_broad.value)).toDF(schema)
print("Features created")

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)
print("labels indexed")

lr = LogisticRegression(featuresCol='Vectors', labelCol=string_indexer.getOutputCol())

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

lr_model = lr.fit(featIndexed)

dfTestTok = tokenizer.transform(dfTest)
featuresTest = dfTestTok.rdd.map(partial(vectorize, dico=dict_broad.value)).toDF(schema)
testIndexed = string_indexer_model.transform(featuresTest)
# The features column will now contain values from all the input columns identified
assemble_apply = Vec_assembler.transform(a)
# assemble_apply.show(10)

# Since we have integrated the defining features into the features column above,
# we don't require them any further, so we can remove them from our df
assembly_final = assemble_apply.drop('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
# assembly_final.show(5)

# Adding a label index for our defining class, which in this case is species.
# We get label indices 0, 1, 2 depending on frequency of occurrence,
# with 0 awarded to the most frequent species
label = StringIndexer(inputCol='species', outputCol='label')
si_dataset_fit = label.fit(assembly_final).transform(assembly_final)
# si_dataset_fit.show(5)

# We now divide our dataset into two parts, training and test, to see how accurate our model is.
# Going with the 80-20 split here: 80 for training and 20 for testing
train_data, test_data = si_dataset_fit.randomSplit([0.8, 0.2])

# Using logistic regression
rg = 0.03  # Can be changed depending on what value yields more accurate results.
lr = LogisticRegression(featuresCol='features', labelCol='label', regParam=rg)
model = lr.fit(train_data)  # Fitting the training data for logistic regression

# Seeing how the model performs by testing it on the test data
prediction = model.transform(test_data)
df.count()

# ### a. Prepare the Input Features
#
# First, you will need to prepare each of the input features. While age is a numeric feature,
# state and name are not. These need to be converted into numeric vectors before you can train
# the model. Use a StringIndexer along with the OneHotEncoderEstimator to convert the name,
# state, and sex columns into numeric vectors. Use the VectorAssembler to combine the name,
# state, and age vectors into a single features vector. Your final dataset should contain a
# column called features containing the prepared vector and a column called label containing
# the sex of the person.
#
# #### Use a StringIndexer along with the OneHotEncoderEstimator to convert the name, state, and sex columns into numeric vectors

# In[6]:

name_indexer = StringIndexer(inputCol="name", outputCol="nameInd")
name_trsf = name_indexer.fit(df).transform(df)  # transform(df.select("name"))
name_ohe = OneHotEncoder(inputCol="nameInd", outputCol="name_ohe")
name_featurevect = name_ohe.transform(name_trsf)

# In[7]:

name_featurevect

# In[8]:

state_indexer = StringIndexer(inputCol="state", outputCol="stateInd")
state_trsf = state_indexer.fit(name_featurevect).transform(
    name_featurevect)  # transform(df.select("state"))
state_ohe = OneHotEncoder(inputCol="stateInd", outputCol="state_ohe")
state_featurevect = state_ohe.transform(state_trsf)
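# A hedged sketch of the remaining steps the exercise text describes: index the
# sex column as the label and assemble the vectors with VectorAssembler. The
# variable and column names below follow the pattern above but are assumptions.
from pyspark.ml.feature import VectorAssembler

sex_indexer = StringIndexer(inputCol="sex", outputCol="label")
sex_featurevect = sex_indexer.fit(state_featurevect).transform(state_featurevect)

assembler = VectorAssembler(inputCols=["name_ohe", "state_ohe", "age"],
                            outputCol="features")
prepared_df = assembler.transform(sex_featurevect)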
    return result


conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

data = sc.textFile("/home/bigdatalab28/test.sql")
data = data.filter(lambda line: line != '')
data = data.map(lambda line: line.split("\t"))
schemaVal = data.map(lambda x: (x[3], x[7], x[9])).map(
    lambda x: Row(label_0=x[0], birth_city=x[1], id_city=x[2]))
schemaVal = sqlContext.createDataFrame(schemaVal)

(train_data, valid_data, test_data) = schemaVal.randomSplit([0.7, 0.1, 0.2], 123)

indexer = StringIndexer(inputCol="label_0", outputCol="label")
indexed = indexer.fit(train_data).transform(train_data)
indexer = StringIndexer(inputCol="birth_city", outputCol="bc")
indexed = indexer.fit(indexed).transform(indexed)
indexer = OneHotEncoder(inputCol="bc", outputCol="bc_one")
indexed = indexer.transform(indexed)
indexer = StringIndexer(inputCol="id_city", outputCol="ic")
indexed = indexer.fit(indexed).transform(indexed)
indexer = OneHotEncoder(inputCol="ic", outputCol="ic_one")
indexed = indexer.transform(indexed)

assembler = VectorAssembler(inputCols=["ic_one", "bc_one"], outputCol="features")
train = assembler.transform(indexed)

nb = NaiveBayes(smoothing=1.0)
model = nb.fit(train)

indexer = StringIndexer(inputCol="label_0", outputCol="label")
indexed = indexer.fit(train_data).transform(train_data)
# Check the buckets out
ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

#
# Extract features tools in with pyspark.ml.feature
#
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Turn category fields into categoric feature vectors, then drop intermediate fields
for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
               "Origin", "Dest", "Route"]:
    string_indexer = StringIndexer(
        inputCol=column,
        outputCol=column + "_index"
    )
    ml_bucketized_features = string_indexer.fit(ml_bucketized_features)\
        .transform(ml_bucketized_features)

# Check out the indexes
ml_bucketized_features.show(6)

# Handle continuous, numeric fields by combining them into one feature vector
numeric_columns = ["DepDelay", "Distance"]
index_columns = ["Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
                 "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"]
vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
)
final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
from pyspark.sql.functions import expr

sc = SparkContext("local", "Spark Pipeline")
sqlContext = SQLContext(sc)
df = sqlContext.read.csv("../data/titanic.csv", sep="\t", header=True, inferSchema=True)

train, test = df.randomSplit([0.7, 0.3], seed=12345)

mapping = sqlContext.createDataFrame([(0, "male"), (1, "female")], ["id", "category"])

indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
train = indexer.fit(train).transform(train)
train.show()

# Winsorize Fare to the 1st and 99th percentiles: cap values above the upper
# percentile and floor values below the lower one
percentiles = train.approxQuantile("Fare", [0.01, 0.99], 0.01)
winsorize = expr("""IF(Fare >= {}, {}, IF(Fare <= {}, {}, Fare))""".format(
    percentiles[1], percentiles[1], percentiles[0], percentiles[0]))
train = train.withColumn("Fare", winsorize)
train.show()

imputer = Imputer(inputCols=["Age", "Fare"],
                  outputCols=["out_Age", "out_Fare"]).setStrategy("median")
train = imputer.fit(train).transform(train)
train.show()
print "Creating feature vectors" t0 = time() dfTrainVec=dfTrain.map(partial(vectorize,dicoUni=dict_broad.value,dicoTri=dictTri_broad.value)).toDF(schema) dfTestVec=dfTest.map(partial(vectorize,dicoUni=dict_broad.value,dicoTri=dictTri_broad.value)).toDF(schema) tt = time() - t0 print "Dataframe created in {} second".format(round(tt,3)) # In[19]: print "Indexing labels" t0 = time() from pyspark.ml.feature import StringIndexer string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed') string_indexer_model = string_indexer.fit(dfTrainVec) dfTrainIdx = string_indexer_model.transform(dfTrainVec) dfTrainIdx.take(1) tt = time() - t0 print "Done in {} second".format(round(tt,3)) # In[20]: from pyspark.ml.classification import DecisionTreeClassifier dt = DecisionTreeClassifier(featuresCol='featureVectors', labelCol='target_indexed', maxDepth=10) # In[21]: from pyspark.ml.evaluation import MulticlassClassificationEvaluator
full_rdd = sub_acc.union(sub_noacc)
full_rdd.cache()

# create a dataframe for encoding categorical variables
df = sqlContext.createDataFrame(full_rdd)  # complete dataset
#df = sqlContext.createDataFrame(full_rdd.sample(withReplacement=False, fraction=0.25, seed=seed))  # let's start with a 1/4 of the data

# ## Convert categorical features
#
# The following cells train a Spark StringIndexer to index the zip codes in the data set.

# define categorical indexers for the data
zipIndexer = StringIndexer(inputCol='grid_zipcode', outputCol='grid_zipcodeIdx')  # ,handleInvalid='skip')
zipIdxModel = zipIndexer.fit(df)
indexed = zipIdxModel.transform(df)
indexed.cache()

#zipEncoder = OneHotEncoder(dropLast=False, inputCol="grid_zipcodeIdx", outputCol="grid_zipcodeVec")
#zipEncoded = zipEncoder.transform(td1)

# save the zip code labels for rewriting predictions to Elasticsearch index
zipCodeLables = zipIdxModel._call_java("labels")
zipKey = {i: zipCodeLables[i] for i in range(len(zipCodeLables))}

# ## Labeled Points
#
# Spark MLlib algorithms take LabeledPoints, a special object tuple of (label, [features]).
# Before training the model we will run a simple map job to convert the SparkSQL DataFrame
# rows to LabeledPoints.
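# A hedged sketch of the map job described above; the label column name
# ('label') and the choice of feature columns are assumptions, not taken
# from the original dataset.
from pyspark.mllib.regression import LabeledPoint

labeled_points = indexed.rdd.map(
    lambda row: LabeledPoint(row['label'], [row['grid_zipcodeIdx']]))
labeled_points.first()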
features = sqc.read.parquet(input_features)
features = features.filter(features['cls'] != 'None')\
    .select(['cls', 'features'])\
    .cache()
print(features)

features = sqc.createDataFrame(features.rdd.map(normalizer))
print(features)

training, valid = features.randomSplit([0.75, 0.25])

labelIndexer = StringIndexer(inputCol="cls", outputCol="label")
model = labelIndexer.fit(training)
training = model.transform(training).rdd.map(lambda row: LabeledPoint(row.label, row.features))
valid = model.transform(valid).rdd.map(lambda row: LabeledPoint(row.label, row.features))
print(training.first())

#lr = LogisticRegression()
#pipeline = Pipeline(stages=[labelIndexer, lr])

# fit
model = LogisticRegressionWithLBFGS.train(training, numClasses=10)
#model = pipeline.fit(training)

# evaluate
#evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
spark = SparkSession.builder.appName('treecode').getOrCreate()
data = spark.read.csv('College.csv', inferSchema=True, header=True)

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[
    'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad',
    'P_Undergrad', 'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD',
    'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate'
], outputCol="features")
output = assembler.transform(data)

from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="Private", outputCol="PrivateIndex")
output_fixed = indexer.fit(output).transform(output)
final_data = output_fixed.select("features", 'PrivateIndex')
train_data, test_data = final_data.randomSplit([0.7, 0.3])

from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier, RandomForestClassifier
from pyspark.ml import Pipeline

dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rfc = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features')
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')

dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

dtc_predictions = dtc_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer

df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"),
                            (4, "a"), (5, "c")], ["user_id", "category"])
indexer = StringIndexer(inputCol='category', outputCol='categoryIndex')
indexed = indexer.fit(df).transform(df)
indexed.show()

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler  # VectorAssembler lives in pyspark.ml.feature, not pyspark.ml.linalg

df = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])
df.show()

assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                            outputCol="features")
output = assembler.transform(df)
print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
output.select("features", "clicked").show()
# create a table name to use for queries
#dfpfc.createOrReplaceTempView("census07")

# run a query
#fcout = myspark.sql('select * from census07 where salary > 100000')
#fcout.show(5)

# create a dataframe with valid rows
mydf = myspark.sql('select code as txtlabel, salary, total_emp from sample_07 where total_emp > 0 and total_emp < 1000000 and salary > 0 and salary < 500000')
mydf.show(5)

# need to convert from a text field to a numeric one;
# this is a common requirement when using sparkML
from pyspark.ml.feature import StringIndexer

# this will convert each unique string into a numeric
indexer = StringIndexer(inputCol="txtlabel", outputCol="label")
indexed = indexer.fit(mydf).transform(mydf)
indexed.show(5)

# now we need to create a "label" and "features"
# input for using the sparkML library
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

assembler = VectorAssembler(
    inputCols=["total_emp", "salary"],
    outputCol="features")
output = assembler.transform(indexed)

# note the column headers - label and features are keywords
print(output.show(3))

# use the kmeans clustering - do not write it yourself :-)
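# A hedged sketch of the KMeans step the comment above points to; k=3 and the
# seed are arbitrary assumptions, not values from the original notebook.
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=3, seed=1, featuresCol="features")
kmeans_model = kmeans.fit(output)
for center in kmeans_model.clusterCenters():
    print(center)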
def main(base_path):
    # Default to "."
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except (NameError, UnboundLocalError) as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),        # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(), True),   # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(), True),   # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),         # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),     # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),      # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),      # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),        # "DepDelay":14.0
        StructField("Dest", StringType(), True),            # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),        # "Distance":368.0
        StructField("FlightDate", DateType(), True),        # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),       # "FlightNum":"6109"
        StructField("Origin", StringType(), True),          # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls), flush=True)

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat

    features_with_route = features.withColumn(
        'Route', concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits, inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/arrival_bucketizer_2.0.bin".format(MODELS_MOUNTPATH)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Extract features tools in with pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(inputCol=column, outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the pipeline model
        string_indexer_output_path = "{}/string_indexer_model_{}.bin".format(
            MODELS_MOUNTPATH, column)
        string_indexer_model.write().overwrite().save(string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfMonth", "DayOfWeek", "DayOfYear"
    ]
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns + index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/numeric_vector_assembler.bin".format(MODELS_MOUNTPATH)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier

    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 predictionCol="Prediction",
                                 maxBins=4657,
                                 maxMemoryInMB=1024)
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = "{}/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        MODELS_MOUNTPATH)
    model.write().overwrite().save(model_output_path)

    # Evaluate model using test data
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy), flush=True)

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
# SPARK SQL
dataframe = pycsv.csvToDataFrame(sqlContext, rddUSD, sep=",")
dataframe.registerTempTable("dataUSDuprv")
dff1 = sqlContext.sql("SELECT closeJPY FROM dataUSDuprv").show()
dataframe.show()

# LabeledPoint
lpUSD = vectorsUSD.map(transformationDT.transformToLabeledPoint)
lpUSD.take(5)
dfUSD = sqlContext.createDataFrame(lpUSD, ["label", "features"])
dfUSD.select("label", "features").show(10)

# String Indexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(dfUSD)
td = si_model.transform(dfUSD)
td.collect()
td.show()

# Splitting data
(trainingData, testData) = td.randomSplit([0.6, 0.4])
trainingData.count()
testData.count()
testData.collect()

# Creating decision tree model
dtClassifer = DecisionTreeClassifier(labelCol="indexed", minInstancesPerNode=1500)
dtModel = dtClassifer.fit(trainingData)
dtModel.numNodes
spark = SparkSession.builder.appName('lr_ex').getOrCreate()

base_path = '/home/edoardo/Udemy/PySpark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Linear_Regression/'
file_name = 'cruise_ship_info.csv'
data = spark.read.csv(base_path + file_name, inferSchema=True, header=True)
data.printSchema()

data.select(corr('crew', 'passengers')).show()
#print(data.columns)
#print(data.groupBy('Cruise_line').count())

# This one transforms the strings into numbers
indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruise_cat')
indexed = indexer.fit(data).transform(data)

inCols = [
    'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density',
    'Cruise_cat'
]
# Including Cruise_cat makes things worse ?!?!
#inCols = ['Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density']

assembler = VectorAssembler(inputCols=inCols, outputCol='features')
output = assembler.transform(indexed)
indexed.show()

final_data = output.select('features', 'crew')
train_data, test_data = final_data.randomSplit([0.6, 0.4])
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("IndexToStringExample") \
        .getOrCreate()

    df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"),
                                (4, "a"), (5, "c")], ["id", "category"])

    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'"
          % (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)

    print("Transformed indexed column '%s' back to original string column '%s' using "
          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
def run_similar(mysql_user, mysql_pwd, mysql_host, mysql_db, kaiguan=1):
    sc = SparkContext(appName="calculate similar matrix", master="spark://master:7077")
    sqlContext = SQLContext(sc)

    # Open a connection and fetch the data as a DataFrame
    df_movieinfo = sqlContext.read.format("jdbc")\
        .option("url", "jdbc:mysql://" + mysql_host + ":3306/" + mysql_db)\
        .option("dbtable", "movies_movieinfo")\
        .option("user", mysql_user)\
        .option("password", mysql_pwd)\
        .load()

    stringIndexer = StringIndexer(inputCol="directors", outputCol="director_Index")
    model = stringIndexer.fit(df_movieinfo)
    indexed = model.transform(df_movieinfo)
    encoder = OneHotEncoder(inputCol="director_Index", outputCol="direcVec")
    encoded = encoder.transform(indexed)
    encoded.select('direcVec').show()

    # Declare the Spark return type to match the Python function's return type:
    # seg returns a string, so the corresponding pyspark type is StringType
    segUDF = psf.UserDefinedFunction(seg, StringType())
    # Add a column with the withColumn function
    df_seg = df_movieinfo.withColumn('description_2', segUDF('description'))
    # word2vec(df_movieinfo, "description", "result")

    # 3. Tokenize with Tokenizer
    tokenizer = Tokenizer(inputCol="description_2", outputCol="words")
    t_words = tokenizer.transform(df_seg)

    if kaiguan == 0:
        hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=100)
        featurizedData = hashingTF.transform(t_words)
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        normalizer = Normalizer(inputCol="features", outputCol="norm", p=2.0)
        dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
        rescaledData = idfModel.transform(featurizedData)
        df_norm = normalizer.transform(rescaledData)
        similarity_idf = df_norm.alias("item1").join(df_norm.alias("item2"),
                                                     psf.col("item1.ID") < psf.col("item2.ID"))\
            .select(
                psf.col("item1.ID").alias("item1"),
                psf.col("item2.ID").alias("item2"),
                dot_udf("item1.norm", "item2.norm").alias("similar"))\
            .sort("item1", "item2")
        # Open a connection and write the results
        similarity_idf.write.format("jdbc")\
            .option("url", "jdbc:mysql://" + mysql_host + ":3306/" + mysql_db)\
            .option("dbtable", "xxxxxxxxx")\
            .option("user", mysql_user)\
            .option("password", mysql_pwd)\
            .mode('append').save()
    elif kaiguan == 1:
        # 4. Convert the token lists into sparse numeric vectors (term-frequency vectors)
        cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=5, minDF=2.0)
        cv_model = cv.fit(t_words)
        cv_result = cv_model.transform(t_words)
        # 5. Convert the tokenizer output into dense word2vec vectors
        word2Vec = Word2Vec(vectorSize=100, minCount=0, inputCol="words", outputCol="result")
        w2v_model = word2Vec.fit(cv_result)
        result = w2v_model.transform(cv_result)
        normalizer = Normalizer(inputCol="result", outputCol="norm", p=2.0)
        data = normalizer.transform(result)
        dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
        similarity_w2v = data.alias("item1").join(data.alias("item2"),
                                                  psf.col("item1.ID") < psf.col("item2.ID"))\
            .select(
                psf.col("item1.ID").alias("item1"),
                psf.col("item2.ID").alias("item2"),
                dot_udf("item1.norm", "item2.norm").alias("dot"))\
            .sort("item1", "item2")
        # Open a connection and write the results
        similarity_w2v.write.format("jdbc")\
            .option("url", "jdbc:mysql://" + mysql_host + ":3306/" + mysql_db)\
            .option("dbtable", "movies_moviesimilar_fromspark")\
            .option("user", mysql_user)\
            .option("password", mysql_pwd)\
            .mode('append').save()
    data = data.filter(lambda x: x.split(',')[0] != 'label').map(lambda line: line.split(','))
    if train:
        data = data.map(
            lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),
                          'class_' + str(line[0]), int(line[0])))
    else:
        # Test data gets dummy labels. We need the same structure as in Train data
        data = data.map(
            lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),
                          'class_' + str(line[0]), int(line[0])))
    return sqlcontext.createDataFrame(data, ['features', 'category', 'label'])


train_df = load_data_frame("train.csv")
test_df = load_data_frame("test.csv", shuffle=False, train=False)

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol="category", outputCol="index_category")
fitted_indexer = string_indexer.fit(train_df)
indexed_df = fitted_indexer.transform(train_df)

from distkeras.transformers import *
from pyspark.ml.feature import OneHotEncoder

#### OneHot
nb_classes = 9
encoder = OneHotTransformer(nb_classes, input_col='label', output_col="label_encoded")
dataset_train = encoder.transform(indexed_df)
dataset_test = encoder.transform(test_df)

### encoder
from pyspark.ml.feature import MinMaxScaler

transformer = MinMaxTransformer(n_min=0.0, n_max=1.0, \
                                o_min=0.0, o_max=250.0, \
                                input_col="features", \
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=X_col, outputCol=Y_col)

from pyspark.ml.feature import OneHotEncoder, StringIndexer

encoder = OneHotEncoder(inputCol="indexed", outputCol="features")

df = spark.createDataFrame([
    (0, "a"),
    (1, "b"),
    (2, "c"),
    (3, "a"),
    (4, "a"),
    (5, "c")
], ["id", "category"])

stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)
indexed.printSchema()
indexed.take(1)
indexed.take(5)

train_Y.printSchema()
train_X.printSchema()
train_X.schema.json()
train_X.columns
spark_df.columns

import json

data_schema = json.loads(train_X.schema.json())
isinstance(data_schema, dict)

StringIndexer(inputCol='x1', outputCol='indexed_x1')
from pyspark.ml.feature import IndexToString, StringIndexer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("IndexToStringExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'"
          % (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)

    print("Transformed indexed column '%s' back to original string column '%s' using "
          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
    # $example off$
# Create spark session
spark = SparkSession.builder.appName("ICP7").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# Define input path
input_path = "C:\\Users\\Lenovo\\PycharmProjects\\M2_ICP7"

# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True).option("inferSchema", True)\
    .option("delimiter", ",").load(input_path + "\\adult.csv")

# note: several source column names carry a leading space, hence the odd aliases
data = data.withColumnRenamed("age", "label").select(
    "label",
    col("education-num").alias("education-num"),
    col(" hours-per-week").alias("hours-per-week"),
    col(" education").alias("education"),
    col(" fnlwgt").alias("fnlwgt"),
    col(" sex").alias("sex"),
    col(" relationship").alias("relationship"))
data = data.select(data.label.cast("double"), "education-num", "hours-per-week",
                   "education", "sex", "fnlwgt", "relationship")
new_data = data.toDF("label", "education-num", "hours-per-week", "education",
                     "sex", "fnlwgt", "relationship")

indexer = StringIndexer(inputCol="education", outputCol="new_education")
indexed = indexer.fit(new_data).transform(new_data)
indexer1 = StringIndexer(inputCol="sex", outputCol="new_sex")
indexed1 = indexer1.fit(indexed).transform(indexed)
indexer2 = StringIndexer(inputCol="relationship", outputCol="new_rel")
indexed2 = indexer2.fit(indexed1).transform(indexed1)
indexed2 = indexed2.drop("sex", "education", "relationship")
indexed2.show()

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=indexed2.columns[1:], outputCol="features")
data = assembler.transform(indexed2)
    rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")
    model = rf.fit(train)
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel")\
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))
    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        print("Usage: gradient_boosted_trees", file=sys.stderr)
        exit(1)
    sc = SparkContext(appName="Jay")
    sqlContext = SQLContext(sc)

    # Load and parse the data file into a dataframe.
    df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

    # Map labels into an indexed column of labels in [0, numLabels)
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    [train, test] = td.randomSplit([0.7, 0.3])
    testClassification(train, test)
    testRegression(train, test)
    sc.stop()
cluster = Cluster(['192.168.246.236'])
session = cluster.connect("dev")

sc = SparkContext(conf=conf)
sql = SQLContext(sc)
spark = SparkSession(sc)
print("SparkContext => ", sc)
print("SQLContext => ", sql)

stations = sql.read.format("org.apache.spark.sql.cassandra").load(
    keyspace="dev", table="station")
clean_data = sql.read.format("org.apache.spark.sql.cassandra").load(
    keyspace="dev", table="clean_daily_measurement")

stationsIds = getStationsIds(stations)
stationCount = 1

## Make a join of clean_data with stations in order to get the province
joinedData = clean_data.join(stations, ["station_id"])

indexer = StringIndexer(inputCol="province", outputCol="stationIndex")
indexed = indexer.fit(joinedData).transform(joinedData)

doBayes(indexed)

print("--- %s seconds ---" % (time.time() - start_time))
print("END!!!")
sc.stop()
# In[18]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

print("Fitting the classifier on bigram features")
t0 = time()

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='bigramVectors', labelCol='target_indexed',
                        maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

string_indexer_model = string_indexer.fit(dfBigram)
dfTrainIndexed = string_indexer_model.transform(dfBigram).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print("Done in {} second".format(round(tt, 3)))

# In[19]:

print("Testing precision of the model")
t0 = time()

dfValidSelect = dfValid.map(partial(vectorizeBi, dico=dict_broad.value)).toDF(['bigramVectors', 'label']).cache()
dfValidIndexed = string_indexer_model.transform(dfValidSelect).cache()
df_valid_pred = lrModel.transform(dfValidIndexed).cache()
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour

    features_with_hour = features.withColumn(
        "CRSDepHourOfDay", hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay",
                              "CRSArrTime", "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count())
                   for column in features_with_hour.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(
        splits=splits, inputCol="ArrDelay", outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Extract features tools in with pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin",
"Dest", "Route", "TailNum"] for column in string_columns: string_indexer = StringIndexer( inputCol=column, outputCol=column + "_index" ) string_indexer_model = string_indexer.fit(ml_bucketized_features) ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features) # Save the pipeline model string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format( base_path, column ) string_indexer_model.write().overwrite().save(string_indexer_output_path) # Combine continuous, numeric fields with indexes of nominal ones # ...into one feature vector numeric_columns = [ "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay", "CRSArrHourOfDay"] index_columns = [column + "_index" for column in string_columns] vector_assembler = VectorAssembler( inputCols=numeric_columns + index_columns, outputCol="Features_vec" ) final_vectorized_features = vector_assembler.transform(ml_bucketized_features) # Save the numeric vector assembler vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path) vector_assembler.write().overwrite().save(vector_assembler_path) # Drop the index columns for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # # Cross validate, train and evaluate classifier: loop 5 times for 4 metrics # from collections import defaultdict scores = defaultdict(list) feature_importances = defaultdict(list) metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"] split_count = 3 for i in range(1, split_count + 1): print("\nRun {} out of {} of test/train splits in cross validation...".format( i, split_count, ) ) # Test/train split training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2]) # Instantiate and fit random forest classifier on all the data from pyspark.ml.classification import RandomForestClassifier rfc = RandomForestClassifier( featuresCol="Features_vec", labelCol="ArrDelayBucket", predictionCol="Prediction", maxBins=4896, ) model = rfc.fit(training_data) # Save the new model over the old one model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format( base_path ) model.write().overwrite().save(model_output_path) # Evaluate model using test data predictions = model.transform(test_data) # Evaluate this split's results for each metric from pyspark.ml.evaluation import MulticlassClassificationEvaluator for metric_name in metric_names: evaluator = MulticlassClassificationEvaluator( labelCol="ArrDelayBucket", predictionCol="Prediction", metricName=metric_name ) score = evaluator.evaluate(predictions) scores[metric_name].append(score) print("{} = {}".format(metric_name, score)) # # Collect feature importances # feature_names = vector_assembler.getInputCols() feature_importance_list = model.featureImportances for feature_name, feature_importance in zip(feature_names, feature_importance_list): feature_importances[feature_name].append(feature_importance) # # Evaluate average and STD of each metric and print a table # import numpy as np score_averages = defaultdict(float) # Compute the table data average_stds = [] # ha for metric_name in metric_names: metric_scores = scores[metric_name] average_accuracy = sum(metric_scores) / len(metric_scores) score_averages[metric_name] = average_accuracy std_accuracy = np.std(metric_scores) average_stds.append((metric_name, average_accuracy, std_accuracy)) # Print the table print("\nExperiment Log") print("--------------") 
print(tabulate(average_stds, headers=["Metric", "Average", "STD"])) # # Persist the score to a sccore log that exists between runs # import pickle # Load the score log or initialize an empty one try: score_log_filename = "{}/models/score_log.pickle".format(base_path) score_log = pickle.load(open(score_log_filename, "rb")) if not isinstance(score_log, list): score_log = [] except IOError: score_log = [] # Compute the existing score log entry score_log_entry = { metric_name: score_averages[metric_name] for metric_name in metric_names } # Compute and display the change in score for each metric try: last_log = score_log[-1] except (IndexError, TypeError, AttributeError): last_log = score_log_entry experiment_report = [] for metric_name in metric_names: run_delta = score_log_entry[metric_name] - last_log[metric_name] experiment_report.append((metric_name, run_delta)) print("\nExperiment Report") print("-----------------") print(tabulate(experiment_report, headers=["Metric", "Score"])) # Append the existing average scores to the log score_log.append(score_log_entry) # Persist the log for next run pickle.dump(score_log, open(score_log_filename, "wb")) # # Analyze and report feature importance changes # # Compute averages for each feature feature_importance_entry = defaultdict(float) for feature_name, value_list in feature_importances.items(): average_importance = sum(value_list) / len(value_list) feature_importance_entry[feature_name] = average_importance # Sort the feature importances in descending order and print import operator sorted_feature_importances = sorted( feature_importance_entry.items(), key=operator.itemgetter(1), reverse=True ) print("\nFeature Importances") print("-------------------") print(tabulate(sorted_feature_importances, headers=['Name', 'Importance'])) # # Compare this run's feature importances with the previous run's # # Load the feature importance log or initialize an empty one try: feature_log_filename = "{}/models/feature_log.pickle".format(base_path) feature_log = pickle.load(open(feature_log_filename, "rb")) if not isinstance(feature_log, list): feature_log = [] except IOError: feature_log = [] # Compute and display the change in score for each feature try: last_feature_log = feature_log[-1] except (IndexError, TypeError, AttributeError): last_feature_log = defaultdict(float) for feature_name, importance in feature_importance_entry.items(): last_feature_log[feature_name] = importance # Compute the deltas feature_deltas = {} for feature_name in feature_importances.keys(): run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name] feature_deltas[feature_name] = run_delta # Sort feature deltas, biggest change first import operator sorted_feature_deltas = sorted( feature_deltas.items(), key=operator.itemgetter(1), reverse=True ) # Display sorted feature deltas print("\nFeature Importance Delta Report") print("-------------------------------") print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"])) # Append the existing average deltas to the log feature_log.append(feature_importance_entry) # Persist the log for next run pickle.dump(feature_log, open(feature_log_filename, "wb"))
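# A minimal entry-point sketch (an assumption, not shown in the fragment
# above): pass the base path for data and models as the first CLI argument.
if __name__ == "__main__":
    import sys
    main(sys.argv[1])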
# Imports assumed for this fragment (OneHotEncoderEstimator is the Spark
# 2.3/2.4 API); `estimator`, `paramGrid`, `train_df` and `test_df` are
# defined earlier in the original script.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer
from pyspark.ml.tuning import CrossValidator

mc = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label")
cv = CrossValidator(estimator=estimator, estimatorParamMaps=paramGrid, evaluator=mc, numFolds=2)

# for row in train_df.rdd.collect():
#     print("row: ", row.uri)
#     load_image_from_uri(row.uri)
# cvModel = cv.fit(train_df)
# mc.evaluate(cvModel.transform(test_df))

stringIndexer = StringIndexer(inputCol="label_name", outputCol="categoryIndex")
indexed_dataset = stringIndexer.fit(train_df).transform(train_df)

# encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
encoder = OneHotEncoderEstimator(inputCols=["categoryIndex"], outputCols=["categoryVec"])
encoder_model = encoder.fit(indexed_dataset)
image_dataset = encoder_model.transform(indexed_dataset)
image_dataset.show()

transformers = estimator.fit(image_dataset)
# Fragment: the tail of a map() that builds rows of boolean restaurant
# attributes before conversion to a DataFrame
        False if r.attributes['Good For'] is None else r.attributes['Good For']['dinner'],
        False if r.attributes['Good For'] is None else r.attributes['Good For']['lunch'],
        False if r.attributes['Good For'] is None else r.attributes['Good For']['breakfast'],
        False if r.attributes['Ambience'] is None else r.attributes['Ambience']['romantic'],
        False if r.attributes['Ambience'] is None else r.attributes['Ambience']['upscale'],
        False if r.attributes['Ambience'] is None else r.attributes['Ambience']['casual'],
        False if (r.attributes['Alcohol'] is None or r.attributes['Alcohol'] == 'none') else True,
        False if r.attributes['Take-out'] is None else r.attributes['Take-out']]
    ).toDF(clustering_columns)

# Drop rows with null values
lv_clustering_data = lv_clustering_data.dropna()

# Neighborhood feature engineering
stringIndexer = StringIndexer(inputCol="neighborhood", outputCol="neigh_index")
lv_model = stringIndexer.fit(lv_clustering_data)
lv_indexed = lv_model.transform(lv_clustering_data)
encoder = OneHotEncoder(dropLast=False, inputCol="neigh_index", outputCol="neigh_vec")
lv_encoded = encoder.transform(lv_indexed)

# Initial feature set
# assembler = VectorAssembler(
#     inputCols=["stars", "price_range", "neigh_vec"],
#     outputCol="features_vec")

# Expanded feature set
feature_columns = clustering_columns[2:]
feature_columns.append("neigh_vec")
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features_vec")
def main():
    spark = SparkSession.builder.appName("PySparkTitanic").getOrCreate()

    args = getResolvedOptions(sys.argv, ['s3_input_data_location',
                                         's3_output_bucket',
                                         's3_output_bucket_prefix',
                                         's3_model_bucket',
                                         's3_model_bucket_prefix'])

    # This is needed to write RDDs to a file, which is the only way to write
    # nested DataFrames to CSV.
    spark.sparkContext._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class",
        "org.apache.hadoop.mapred.FileOutputCommitter")

    train = spark.read.csv(args['s3_input_data_location'], header=False)

    oldColumns = train.schema.names
    newColumns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'cat']
    train = reduce(lambda train, idx: train.withColumnRenamed(oldColumns[idx], newColumns[idx]),
                   xrange(len(oldColumns)), train)

    # Drop null values
    train = train.dropna()

    # Target label
    catIndexer = StringIndexer(inputCol="cat", outputCol="label")
    labelIndexModel = catIndexer.fit(train)
    train = labelIndexModel.transform(train)

    converter = IndexToString(inputCol="label", outputCol="cat")

    # Split into train and validation sets. Beware: randomSplit sorts the dataset.
    (traindf, validationdf) = train.randomSplit([0.8, 0.2])

    # Index labels, adding metadata to the label column.
    # Fit on the whole dataset to include all labels in the index.
    buyingIndexer = StringIndexer(inputCol="buying", outputCol="indexedBuying")
    maintIndexer = StringIndexer(inputCol="maint", outputCol="indexedMaint")
    doorsIndexer = StringIndexer(inputCol="doors", outputCol="indexedDoors")
    personsIndexer = StringIndexer(inputCol="persons", outputCol="indexedPersons")
    lug_bootIndexer = StringIndexer(inputCol="lug_boot", outputCol="indexedLug_boot")
    safetyIndexer = StringIndexer(inputCol="safety", outputCol="indexedSafety")

    # One-hot encode the indexed features
    buyingEncoder = OneHotEncoder(inputCol="indexedBuying", outputCol="buyingVec")
    maintEncoder = OneHotEncoder(inputCol="indexedMaint", outputCol="maintVec")
    doorsEncoder = OneHotEncoder(inputCol="indexedDoors", outputCol="doorsVec")
    personsEncoder = OneHotEncoder(inputCol="indexedPersons", outputCol="personsVec")
    lug_bootEncoder = OneHotEncoder(inputCol="indexedLug_boot", outputCol="lug_bootVec")
    safetyEncoder = OneHotEncoder(inputCol="indexedSafety", outputCol="safetyVec")

    # Create the vector-structured data (label, features(vector))
    assembler = VectorAssembler(inputCols=["buyingVec", "maintVec", "doorsVec",
                                           "personsVec", "lug_bootVec", "safetyVec"],
                                outputCol="features")

    # Chain the featurizers in a Pipeline
    pipeline = Pipeline(stages=[buyingIndexer, maintIndexer, doorsIndexer, personsIndexer,
                                lug_bootIndexer, safetyIndexer, buyingEncoder, maintEncoder,
                                doorsEncoder, personsEncoder, lug_bootEncoder, safetyEncoder,
                                assembler])

    # Train the model. This also runs the indexers.
    model = pipeline.fit(traindf)

    # Delete previous data from the output bucket
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(args['s3_output_bucket'])
    bucket.objects.filter(Prefix=args['s3_output_bucket_prefix']).delete()

    # Save the transformed training data to CSV in S3 by converting to an RDD.
    transformed_traindf = model.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.features))
    lines = transformed_train_rdd.map(toCSVLine)
    lines.saveAsTextFile('s3a://' + args['s3_output_bucket'] + '/' +
                         args['s3_output_bucket_prefix'] + '/' + 'train')

    # Similar data processing for the validation dataset.
    predictions = model.transform(validationdf)
    transformed_train_rdd = predictions.rdd.map(lambda x: (x.label, x.features))
    lines = transformed_train_rdd.map(toCSVLine)
    lines.saveAsTextFile('s3a://' + args['s3_output_bucket'] + '/' +
                         args['s3_output_bucket_prefix'] + '/' + 'validation')

    # Serialize and store the model via MLeap
    SimpleSparkSerializer().serializeToBundle(model, "jar:file:/tmp/model.zip", predictions)

    # Unzip, as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write the content back as a .tar.gz file
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    s3 = boto3.resource('s3')
    file_name = args['s3_model_bucket_prefix'] + '/' + 'model.tar.gz'
    s3.Bucket(args['s3_model_bucket']).upload_file('/tmp/model.tar.gz', file_name)

    os.remove('/tmp/model.zip')
    os.remove('/tmp/model.tar.gz')
    shutil.rmtree('/tmp/model')

    # Save the postprocessor
    SimpleSparkSerializer().serializeToBundle(converter, "jar:file:/tmp/postprocess.zip", predictions)
    with zipfile.ZipFile("/tmp/postprocess.zip") as zf:
        zf.extractall("/tmp/postprocess")

    # Write the content back as a .tar.gz file
    with tarfile.open("/tmp/postprocess.tar.gz", "w:gz") as tar:
        tar.add("/tmp/postprocess/bundle.json", arcname='bundle.json')
        tar.add("/tmp/postprocess/root", arcname='root')

    file_name = args['s3_model_bucket_prefix'] + '/' + 'postprocess.tar.gz'
    s3.Bucket(args['s3_model_bucket']).upload_file('/tmp/postprocess.tar.gz', file_name)

    os.remove('/tmp/postprocess.zip')
    os.remove('/tmp/postprocess.tar.gz')
    shutil.rmtree('/tmp/postprocess')
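# The zip -> tar.gz repackaging above is done twice (model and postprocessor).
# A hedged refactoring sketch; `repackage_bundle` is a hypothetical helper,
# not part of the original script.
import os
import shutil
import tarfile
import zipfile

def repackage_bundle(zip_path, work_dir, tar_path):
    """Extract an MLeap .zip bundle and re-pack it as the .tar.gz SageMaker expects."""
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(work_dir)
    with tarfile.open(tar_path, "w:gz") as tar:
        tar.add(os.path.join(work_dir, "bundle.json"), arcname="bundle.json")
        tar.add(os.path.join(work_dir, "root"), arcname="root")
    # Clean up the intermediate artifacts
    os.remove(zip_path)
    shutil.rmtree(work_dir)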
def analyze(sc, train_path, test_path):
    train_rdd = sc.textFile(train_path)
    test_rdd = sc.textFile(test_path)
    train_df = parseTrain(train_rdd)
    test_df = parseTest(test_rdd)

    train_df = train_df.withColumn('Mark', lit('train'))
    test_df = (test_df.withColumn('Survived', lit(0))
               .withColumn('Mark', lit('test')))
    test_df = test_df[train_df.columns]

    ## Append test data to train data
    df = train_df.unionAll(test_df)
    df = (df.withColumn('Age', df['Age'].cast('double'))
          .withColumn('SibSp', df['SibSp'].cast('double'))
          .withColumn('Parch', df['Parch'].cast('double'))
          .withColumn('Fare', df['Fare'].cast('double'))
          .withColumn('Survived', df['Survived'].cast('double')))
    df.printSchema()

    numVars = ['Survived', 'Age', 'SibSp', 'Parch', 'Fare']
    missing = {var: countNull(df, var) for var in numVars}
    age_mean = df.groupBy().mean('Age').first()[0]
    fare_mean = df.groupBy().mean('Fare').first()[0]
    df = df.na.fill({'Age': age_mean, 'Fare': fare_mean})

    ## User-defined function to extract the title from the name
    getTitle = udf(lambda name: name.split('.')[0].strip(), StringType())
    df = df.withColumn('Title', getTitle(df['Name']))
    df.select('Name', 'Title').show(3)

    catVars = ['Pclass', 'Sex', 'Embarked', 'Title']
    si = StringIndexer(inputCol='Sex', outputCol='Sex_indexed')
    df_indexed = si.fit(df).transform(df).drop('Sex').withColumnRenamed('Sex_indexed', 'Sex')

    def indexer(df, col):
        si = StringIndexer(inputCol=col, outputCol=col + '_indexed').fit(df)
        return si

    indexers = [indexer(df, col) for col in catVars]
    pipeline = Pipeline(stages=indexers)
    df_indexed = pipeline.fit(df).transform(df)
    df_indexed.select('Embarked', 'Embarked_indexed').show(10)

    catVarsIndexed = [i + '_indexed' for i in catVars]
    featuresCol = numVars + catVarsIndexed
    featuresCol.remove('Survived')
    labelCol = ['Mark', 'Survived']

    row = Row('mark', 'label', 'features')
    df_indexed = df_indexed[labelCol + featuresCol]  # 0-mark, 1-label, 2-features
    # Map the features to a DenseVector
    lf = df_indexed.rdd.map(lambda r: (row(r[0], r[1], DenseVector(r[2:])))).toDF()
    # Index the label: convert the numeric label to categorical, as required
    # by DecisionTree and RandomForest
    lf = StringIndexer(inputCol='label', outputCol='index').fit(lf).transform(lf)
    lf.show(3)

    train = lf.where(lf.mark == 'train')
    test = lf.where(lf.mark == 'test')

    # Randomly split further to get train/validate sets
    train, validate = train.randomSplit([0.7, 0.3], seed=121)
    print('Train Data Number of Rows: ' + str(train.count()))
    print('Validate Data Number of Rows: ' + str(validate.count()))
    print('Test Data Number of Rows: ' + str(test.count()))

    lr = LogisticRegression(maxIter=100, regParam=0.05, labelCol='index').fit(train)

    # Evaluate the model by area under the ROC curve (the default for binary classification)
    def testModel(model, validate=validate):
        pred = model.transform(validate)
        evaluator = BinaryClassificationEvaluator(labelCol='index')
        return evaluator.evaluate(pred)

    print('AUC ROC of Logistic Regression model is: ' + str(testModel(lr)))

    dt = DecisionTreeClassifier(maxDepth=3, labelCol='index').fit(train)
    rf = RandomForestClassifier(numTrees=100, labelCol='index').fit(train)
    models = {
        'LogisticRegression': lr,
        'DecisionTree': dt,
        'RandomForest': rf
    }
    modelPerf = {k: testModel(v) for k, v in models.items()}
    print(modelPerf)
# Create ngrams of size 2
myngram = NGram(inputCol="stopRemoved", outputCol="ngrams", n=2)
data = myngram.transform(data)
data = data.withColumn('ngrams', data.ngrams.cast(ArrayType(StringType(), True)))

# Apply CountVectorizer to convert to a vector of counts of the ngrams
myCountVectorizer = CountVectorizer(inputCol="ngrams", outputCol="countVect", minDF=1.0)
data = myCountVectorizer.fit(data).transform(data)

# Transform the label using StringIndexer
si_label = StringIndexer(inputCol="label", outputCol="label2", handleInvalid="skip")
data = si_label.fit(data).transform(data)
data = data.drop('label')  # drop() returns a new DataFrame, so reassign it
data = data.withColumn('label', data.label2)

# Divide into training and test data
trainData = data[data['Date'] < '20150101']
testData = data[data['Date'] >= '20141231']

# Define the random forest classifier model
rf = RandomForestClassifier(labelCol="label", featuresCol="countVect",
                            numTrees=3, maxDepth=4, maxBins=200)

# Perform a grid search on a set of parameter values
grid = ParamGridBuilder().addGrid(rf.numTrees, [2, 5])\
    .addGrid(rf.maxDepth, [2, 5])\
    .build()  # .build() is an assumed completion; the source fragment breaks off mid-chain
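# A hedged sketch of how the grid might be used: wire it into a
# CrossValidator with a multiclass evaluator and fit on the training split.
# The evaluator choice and fold count are assumptions, not from the fragment.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
cv = CrossValidator(estimator=rf, estimatorParamMaps=grid, evaluator=evaluator, numFolds=3)
cvModel = cv.fit(trainData)
print(evaluator.evaluate(cvModel.transform(testData)))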
pandas_df['week'] = pandas_df['Dates'].dt.weekofyear
pandas_df['x_sim'] = pandas_df['X'].str[1:8]
pandas_df['X'] = pandas_df['X'].str[1:8]
pandas_df['y_sim'] = pandas_df['Y'].str[0:6]
pandas_df['X'] = pd.to_numeric(pandas_df['X'])
pandas_df['Y'] = pd.to_numeric(pandas_df['Y'])
pandas_df['x_sim'] = pd.to_numeric(pandas_df['x_sim'])
pandas_df['y_sim'] = pd.to_numeric(pandas_df['y_sim'])

# Send back to Spark as a DataFrame
data_df = sqlContext.createDataFrame(pandas_df)

# Encode the police district as a feature
stringIndexer = StringIndexer(inputCol="PdDistrict", outputCol="PdDistrict_Index")
model = stringIndexer.fit(data_df)
indexed = model.transform(data_df)
encoder = OneHotEncoder(dropLast=False, inputCol="PdDistrict_Index", outputCol="pd")
encoded = encoder.transform(indexed)

# Remove data_df from memory
data_df.unpersist()

# Encode the dependent variable - category_predict
classifyIndexer = StringIndexer(inputCol="Category", outputCol="Category_Index")
classifymodel = classifyIndexer.fit(encoded)
encoded2 = classifymodel.transform(encoded)

# Keep the following columns: x, y, hour, day, month, year, dayofweek, week, x_sim, y_sim
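# A hedged continuation sketch: assemble the retained columns (named per the
# comment above; the exact column names are assumptions) plus the one-hot
# district vector "pd" into a single feature vector for modeling.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["X", "Y", "hour", "day", "month", "year",
               "dayofweek", "week", "x_sim", "y_sim", "pd"],
    outputCol="features")
features_df = assembler.transform(encoded2)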
# Fragment: the tail of a select() casting column _c13 to a double "weight" column
    functions.trim(df2._c13).cast("double").alias("weight")) \
    .withColumn("grade", functions.lit("high")) \
    .withColumn("gender", functions.lit("woman"))

df9 = df3.union(df4).union(df5).union(df6).union(df7).union(df8)

# Year, height, weight, grade, gender
df9.show(5, False)
df9.printSchema()

# Convert the string columns to doubles
gradeIndexer = StringIndexer(inputCol="grade", outputCol="gradecode")
genderIndexer = StringIndexer(inputCol="gender", outputCol="gendercode")
df10 = gradeIndexer.fit(df9).transform(df9)
df11 = genderIndexer.fit(df10).transform(df10)
df11.show(3, False)
df11.printSchema()

assembler = VectorAssembler(inputCols=["height", "gradecode", "gendercode"], outputCol="features")
df12 = assembler.transform(df11)
df12.show(truncate=False)

samples = df12.randomSplit([0.7, 0.3])
training = samples[0]
test = samples[1]
            WHEN (pickup_hour <= 6 OR pickup_hour >= 20) THEN "Night"
            WHEN (pickup_hour >= 7 AND pickup_hour <= 10) THEN "AMRush"
            WHEN (pickup_hour >= 11 AND pickup_hour <= 15) THEN "Afternoon"
            WHEN (pickup_hour >= 16 AND pickup_hour <= 19) THEN "PMRush"
        END as TrafficTimeBins
    FROM taxi_test
"""
taxi_df_test_with_newFeatures = sqlContext.sql(sqlStatement)

## CACHE DATA-FRAME IN MEMORY & MATERIALIZE DF IN MEMORY
taxi_df_test_with_newFeatures.cache()
taxi_df_test_with_newFeatures.count()

## INDEX AND ONE-HOT ENCODING
stringIndexer = StringIndexer(inputCol="vendor_id", outputCol="vendorIndex")
model = stringIndexer.fit(taxi_df_test_with_newFeatures)  # Input data-frame is the cleaned one from above
indexed = model.transform(taxi_df_test_with_newFeatures)
encoder = OneHotEncoder(dropLast=False, inputCol="vendorIndex", outputCol="vendorVec")
encoded1 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="rate_code", outputCol="rateIndex")
model = stringIndexer.fit(encoded1)
indexed = model.transform(encoded1)
encoder = OneHotEncoder(dropLast=False, inputCol="rateIndex", outputCol="rateVec")
encoded2 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="payment_type", outputCol="paymentIndex")
model = stringIndexer.fit(encoded2)
indexed = model.transform(encoded2)
encoder = OneHotEncoder(dropLast=False, inputCol="paymentIndex", outputCol="paymentVec")
encoded3 = encoder.transform(indexed)
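# The index-then-encode pattern above repeats verbatim per column. A hedged
# refactoring sketch that loops instead (same dropLast=False behavior and
# column names, assuming the StringIndexer/OneHotEncoder imports above):
encoded_loop = taxi_df_test_with_newFeatures
for in_col, idx_col, vec_col in [("vendor_id", "vendorIndex", "vendorVec"),
                                 ("rate_code", "rateIndex", "rateVec"),
                                 ("payment_type", "paymentIndex", "paymentVec")]:
    idx_model = StringIndexer(inputCol=in_col, outputCol=idx_col).fit(encoded_loop)
    encoded_loop = OneHotEncoder(dropLast=False, inputCol=idx_col, outputCol=vec_col) \
        .transform(idx_model.transform(encoded_loop))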
from csv import reader
from pyspark.mllib.recommendation import *
from pyspark.sql.functions import format_string

tuple_path = "s3://million-song-dataset-yizhou/TasteProfile/train_triplets.txt"
df = spark.read.load(tuple_path, format="csv", sep="\t", inferSchema="true", header=False)

# Index the string user and song IDs
from pyspark.ml.feature import StringIndexer
user_indexer = StringIndexer(inputCol="_c0", outputCol="user_index")
song_indexer = StringIndexer(inputCol="_c1", outputCol="song_index")
partial_indexed = user_indexer.fit(df).transform(df)
indexed = song_indexer.fit(partial_indexed).transform(partial_indexed)
indexed.createOrReplaceTempView("indexed")

res_df = spark.sql("select user_index, song_index, _c2 as click from indexed")
res_df = res_df.select(format_string("%.0f,%.0f,%d", res_df.user_index, res_df.song_index, res_df.click))
rdd = res_df.rdd.flatMap(list).map(lambda x: x.split(","))
model = ALS.trainImplicit(rdd, 25, seed=10)

# Generate userIndex
temp1 = spark.sql("select distinct _c0, user_index from indexed")
user_df = temp1.select(format_string("%s,%.0f", temp1._c0, temp1.user_index))
user_df.write.save("s3://million-song-dataset-yizhou/TasteProfile/userIndex", format="text")

# Generate songIndex
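# A hedged completion: the fragment cuts off after "# Generate songIndex";
# by symmetry with the userIndex block above it would presumably look like this.
temp2 = spark.sql("select distinct _c1, song_index from indexed")
song_df = temp2.select(format_string("%s,%.0f", temp2._c1, temp2.song_index))
song_df.write.save("s3://million-song-dataset-yizhou/TasteProfile/songIndex", format="text")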
print "Creating sparse vectors for all data based on this new dictionary" t0 = time() dfTrainSelect=dfTrain.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(schema) dfTestSelect=dfTest.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(schema) dfTrainSelect.take(1) dfTestSelect.take(1) tt = time() - t0 print "Done in {} second".format(round(tt,3)) # In[328]: from pyspark.ml.feature import StringIndexer string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed') string_indexer_model = string_indexer.fit(dfTrainSelect) dfTrainIndexed = string_indexer_model.transform(dfTrainSelect) # In[329]: from pyspark.ml.classification import DecisionTreeClassifier dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol='target_indexed', maxDepth=10) # In[330]: from pyspark.ml.evaluation import MulticlassClassificationEvaluator evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
df_tip = sqlContext.read.format("com.mongodb.spark.sql.DefaultSource") \
    .option("uri", "mongodb://" + ip_address + "/yelp.tip").load()
rstdata = sc.textFile('./Collaborative-Filtering/data/restaurant_ids_final.txt').map(lambda x: (x, 1)) \
    .toDF(['business_id', 'biz_ix']).select('business_id')

df_review.persist()
df_tip.persist()
rstdata.persist()

## Mapping
userMap = df_review.select('user_id').union(
    df_tip.select('user_id')).distinct()
indexer_userid = StringIndexer(inputCol="user_id", outputCol="user_ix")
userMap = indexer_userid.fit(userMap).transform(userMap)

bizMap = df_review.select('business_id').union(
    df_tip.select('business_id')).distinct()
indexer_biz = StringIndexer(inputCol="business_id", outputCol="biz_ix")
bizMap = indexer_biz.fit(bizMap).transform(bizMap)
bizMap = rstdata.join(bizMap, on='business_id', how='inner')

## Join Dataframe to the review table
df_review_als = df_review.select('user_id', 'business_id', 'stars')
df_review_als = df_review_als.join(userMap, on='user_id', how='left_outer') \
    .join(bizMap, on='business_id', how='inner')
ALS_baseline_df = df_review_als.select('user_ix', 'biz_ix', 'stars')
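# A hedged sketch of the baseline the DataFrame above seems destined for:
# fitting pyspark.ml's ALS on (user_ix, biz_ix, stars). The hyperparameters
# and coldStartStrategy are assumptions, not from the original script.
from pyspark.ml.recommendation import ALS

als = ALS(userCol="user_ix", itemCol="biz_ix", ratingCol="stars",
          coldStartStrategy="drop")
als_model = als.fit(ALS_baseline_df)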
# MAGIC %md
# MAGIC In this dataset, we have ordinal variables like education (Preschool - Doctorate), and also nominal variables like relationship (Wife, Husband, Own-child, etc.). For simplicity's sake, we will use one-hot encoding to convert all categorical variables into binary vectors. It might be possible to improve prediction accuracy here by converting each categorical column with a method suited to its type.
# MAGIC
# MAGIC Here, we will use a combination of [StringIndexer](http://spark.apache.org/docs/latest/ml-features.html#stringindexer) and [OneHotEncoder](http://spark.apache.org/docs/latest/ml-features.html#onehotencoder) to convert the categorical variables. The OneHotEncoder will return a [SparseVector](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseVector).

# COMMAND ----------

### One-Hot Encoding
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

categoricalColumns = ["workclass", "education", "marital_status", "occupation",
                      "relationship", "race", "sex", "native_country"]
for categoricalCol in categoricalColumns:
    # Category indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    model = stringIndexer.fit(dataset)
    indexed = model.transform(dataset)
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoder(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec")
    encoded = encoder.transform(indexed)
    dataset = encoded

print dataset.take(1)

# COMMAND ----------

# MAGIC %md
# MAGIC The code above indexes each categorical column using the StringIndexer, then converts the indexed categories into one-hot encoded variables. The resulting output has the binary vectors appended to the end of each row.

# COMMAND ----------
#print(df_raw6.filter(df_raw6['text'] == '').count())

# In[71]:

from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol='_c14', outputCol='OpenStatus_cat')
indexed = indexer.fit(df_raw5).transform(df_raw5)

# In[72]:

indexed.show()

# In[73]:

df_raw8 = indexed.select("text", "OpenStatus_cat")

# In[74]:
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import StringIndexer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StringIndexerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    df = sqlContext.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])
    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    indexed = indexer.fit(df).transform(df)
    indexed.show()
    # $example off$

    sc.stop()
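# A complementary sketch (not part of the original example): IndexToString
# maps the indices produced above back to the original category labels, using
# the metadata StringIndexer attached to the "categoryIndex" column. In the
# example above these lines would go before sc.stop().
from pyspark.ml.feature import IndexToString

converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converter.transform(indexed).show()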
# Fragment: the tail of a StopWordsRemover(...) constructor call
    stopWords=None if language == "english" else StopWordsRemover.loadDefaultStopWords(language))
df = remover.transform(df)

# Now the magic of windowing the text with F.explode()
win = windowing(winz)
decompose = win.get_udf()
df = df.withColumn("slides", decompose("tokens")) \
    .withColumn("exploded", F.explode("slides")) \
    .withColumn("word", get_mid("exploded")) \
    .withColumn("window", rm_mid("exploded"))
df = df.drop(*[c for c in df.columns if c not in ["word", "window"]])

indexer = StringIndexer(inputCol="word", outputCol="label")
df = indexer.fit(df).transform(df)  # .persist(StorageLevel.DISK_ONLY)  # MEMORY_AND_DISK

hashingTF = HashingTF(inputCol="window", outputCol="rawFeatures")
df = hashingTF.transform(df)
idf = IDF(inputCol="rawFeatures", outputCol="features")  # "idfFeatures"
idfModel = idf.fit(df)
df = idfModel.transform(df).drop("rawFeatures")

# pca = PCA(k=3, inputCol="idfFeatures", outputCol="features")
# model = pca.fit(df).transform(df)

train, test = df.randomSplit([0.7, 0.3], 24)
lr = LogisticRegression(regParam=0.001)
model = lr.fit(train)
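# A hedged follow-up sketch: score the held-out split. The label here is the
# indexed word, so this is a multiclass problem; the evaluator choice is an
# assumption, not from the original fragment.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
print(evaluator.evaluate(predictions))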