",").load(input_path + "\\adult.csv") data = data.withColumnRenamed("age", "label").select( "label", col("education-num").alias("education-num"), col(" hours-per-week").alias("hours-per-week"), col(" education").alias("education"), col(" fnlwgt").alias("fnlwgt"), col(" sex").alias("sex"), col(" relationship").alias("relationship")) data = data.select(data.label.cast("double"), "education-num", "hours-per-week", "education", "sex", "fnlwgt", "relationship") new_data = data.toDF("label", "education-num", "hours-per-week", "education", "sex", "fnlwgt", "relationship") indexer = StringIndexer(inputCol="education", outputCol="new_education") indexed = indexer.fit(new_data).transform(new_data) indexer1 = StringIndexer(inputCol="sex", outputCol="new_sex") indexed1 = indexer1.fit(indexed).transform(indexed) indexer2 = StringIndexer(inputCol="relationship", outputCol="new_rel") indexed2 = indexer2.fit(indexed1).transform(indexed1) indexed2 = indexed2.drop("sex", "education", "relationship") indexed2.show() # Create vector assembler for feature columns assembler = VectorAssembler(inputCols=indexed2.columns[1:], outputCol="features") data = assembler.transform(indexed2)
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("DecisionTreeClassificationExample")\
        .getOrCreate()

    # $example on$
    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(
        "data/mllib/sample_libsvm_data.txt")

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(data)
    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                      maxCategories=4).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures")

    # Chain indexers and tree in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
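    # --- Hedged sketch (not part of the original snippet): one common way to finish this
    # --- example is to fit the pipeline and score the held-out split. The evaluator
    # --- settings below are assumptions, not taken from the original code.
    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    print("Test accuracy = %g" % evaluator.evaluate(predictions))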
#sentenceData = spark.createDataFrame([(0, "I heard about Spark and I love Spark"),
#                                      (0, "I wish Java could use case classes"),
#                                      (1, "Logistic regression models are neat")]).toDF("label", "sentence")
sentenceData = spark.createDataFrame(tranform_data, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# Compute TF-IDF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(
    rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))

rfClassifier = RandomForestClassifier(numTrees=10, maxDepth=10, seed=0,
                                      labelCol="indexed")
start_time = time.time()
modelClassifier = rfClassifier.fit(trainingData)
end_time = time.time()
cost_time = end_time - start_time
print("spark rf time :", cost_time)
predictionsClassifier = modelClassifier.transform(testData)
def main(spark, trainFilePath, valFilePath, downPercentage):
    '''
    trainFilePath: path of the training set.
    valFilePath: path of the validation or test file, whichever the evaluation is based on.
    downPercentage: fraction of the training set used to train the model. Use 1 to use the full set.
    output: Prints the progress of the whole process, including MAP and precision at 500
            for each configuration. Finally, prints the dictionary containing all
            configurations and scores, then the configuration with the highest MAP score
            and the score itself.
    '''
    # Get train sample & validation
    downPercentage = float(downPercentage)
    if 'validation' in valFilePath:
        print('Using ' + str(downPercentage * 100) +
              '% of the training set and the validation set...')
    else:
        print('Using ' + str(downPercentage * 100) +
              '% of the training set and the test set...')
    train = spark.read.parquet(trainFilePath)
    train_sample = train.sample(False, downPercentage, seed=0)
    val = spark.read.parquet(valFilePath)

    # Generate indexers and fit to train
    indexerUser = StringIndexer(inputCol="user_id", outputCol="user_index",
                                handleInvalid='skip')
    indexerTrack = StringIndexer(inputCol="track_id", outputCol="track_index",
                                 handleInvalid='skip')
    print('Generate the model for transforming user_id and track_id')
    indexers = Pipeline(stages=[indexerUser, indexerTrack])
    model = indexers.fit(train_sample)

    # Apply the fitted indexers to the train sample and val
    print('Transform user_id and track_id into numerical values')
    train_sample = model.transform(train_sample)
    val_sample = model.transform(val)

    # Get the distinct user indexes present in the validation set
    valUsers = val_sample.select('user_index').distinct()

    # Initialize ALS parameters
    param = [[0.001, 0.01, 0.1], [5, 8, 10], [0.3, 0.7, 1]]
    config = list(itertools.product(*param))
    print('Hyper-parameter tuning...')
    performance = {}

    # Grid search
    for conf in config:
        print('Configuration: regParam = ' + str(conf[0]) + ', rank = ' +
              str(conf[1]) + ', alpha = ' + str(conf[2]) + '.')
        print('Generating model...')
        als = ALS(alpha=conf[2], rank=conf[1], regParam=conf[0],
                  userCol="user_index", itemCol="track_index",
                  ratingCol="count", coldStartStrategy="drop",
                  implicitPrefs=True)
        model_als = als.fit(train_sample)

        print('Getting the Prediction list...')
        # Get top 500 recommended items for val users: prediction list
        top500_val = model_als.recommendForUserSubset(valUsers, 500).cache()
        predList = top500_val.select(
            top500_val.user_index,
            top500_val.recommendations.track_index.alias('pred_list'))

        print('Getting the True list...')
        # Build the true list
        trueList = val_sample.groupBy('user_index')\
            .agg(expr('collect_list(track_index) as true_list'))

        # Join the lists and generate the RDD for the ranking metric
        trueList = trueList.alias('trueList')
        predList = predList.alias('predList')
        predTrueList = predList.join(
            trueList, predList.user_index == trueList.user_index).select(
                'predList.pred_list', 'trueList.true_list')
        predictionAndLabels = predTrueList.rdd.map(
            lambda row: (row.pred_list, row.true_list))

        print('Getting the evaluation...')
        # Build the evaluator and get MAP
        rankmetrics = RankingMetrics(predictionAndLabels)
        performance[conf] = [
            rankmetrics.meanAveragePrecision,
            rankmetrics.precisionAt(500)
        ]
        print('The MAP is: ' + str(performance[conf][0]))
        print('The Precision at 500 is: ' + str(performance[conf][1]))

    print(performance)
    best_config = list(performance.keys())[np.argmax(
        [i[0] for i in performance.values()])]
    print('The best MAP performance comes from the configuration: regParam = ' +
          str(best_config[0]) + ', rank = ' + str(best_config[1]) +
          ', alpha = ' + str(best_config[2]) + '.')
    print('The MAP is: ' + str(performance[best_config][0]) + '.')
# Obtain the receiver-operating characteristic as a DataFrame and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize the F-measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
    'max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)

# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8,
                         family="multinomial")

# Fit the model
mlrModel = mlr.fit(left_join)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label",
                             outputCol="indexedLabel").fit(left_join)
def transformation(self, df1, sqlContext):
    # Continuous columns to integer buckets
    discretizer1 = QuantileDiscretizer(handleInvalid="keep", numBuckets=15,
                                       inputCol="price_cents",
                                       outputCol="price_cents_q")
    discretizer2 = QuantileDiscretizer(handleInvalid="keep", numBuckets=5,
                                       inputCol="images_count",
                                       outputCol="images_count_q")
    discretizer3 = QuantileDiscretizer(handleInvalid="keep", numBuckets=5,
                                       inputCol="utilities_count",
                                       outputCol="utilities_q")

    # Categorical/string columns to integers
    stringIndexer1 = StringIndexer(handleInvalid='keep', inputCol='city',
                                   outputCol="city_i")
    stringIndexer2 = StringIndexer(handleInvalid='keep', inputCol='province',
                                   outputCol="province_i")
    stringIndexer3 = StringIndexer(handleInvalid='keep', inputCol='parking_types',
                                   outputCol="parking_types_i")
    stringIndexer4 = StringIndexer(handleInvalid='keep',
                                   inputCol='property_sub_type',
                                   outputCol="property_sub_type_i")
    stringIndexer5 = StringIndexer(handleInvalid='keep', inputCol='postal',
                                   outputCol="postal_i")
    #stringIndexer6 = StringIndexer(handleInvalid='keep', inputCol='address_street', outputCol="address_street_i")

    encoder = OneHotEncoderEstimator(
        inputCols=[
            "price_cents_q", "images_count_q", "utilities_q", "city_i",
            "province_i", "parking_types_i", "property_sub_type_i", "postal_i",
            "year_built", "beds", "bathrooms", "has_garage", "has_fireplace",
            "has_pool", "has_basement"
        ],
        outputCols=[
            "price_cents_o", "images_count_o", "utilities_o", "city_o",
            "province_o", "parking_types_o", "property_sub_type_o", "postal_o",
            "year_built_o", "beds_o", "bathrooms_o", "has_garage_o",
            "has_fireplace_o", "has_pool_o", "has_basement_o"
        ])

    logging.warning(" pipeline called - ")
    try:
        stages = [
            discretizer1, discretizer2, discretizer3, stringIndexer1,
            stringIndexer2, stringIndexer3, stringIndexer4, stringIndexer5,
            encoder
        ]
        pipeline = Pipeline(stages=stages)
        model = pipeline.fit(df1)
        df2 = model.transform(df1)
    except Exception as e:
        logging.exception("EXCEPTION - Pipeline Logic")
        raise e
    logging.warning(" pipeline finished - ")

    features = [
        "price_cents_o", "price_cents_o", "images_count_o", "utilities_o",
        "city_o", "city_o", "city_o", "province_o", "province_o", "province_o",
        "province_o", "province_o", "parking_types_o", "property_sub_type_o",
        "postal_o", "postal_o", "year_built_o", "beds_o", "beds_o",
        "bathrooms_o", "has_garage_o", "has_fireplace_o", "has_pool_o",
        "has_basement_o"
    ]

    logging.warning(" VectorAssembler called - ")
    try:
        assembler = VectorAssembler(inputCols=features,
                                    outputCol="scaled_features")
        df3 = assembler.transform(df2)
    except Exception as e:
        print("Error in assembler logic - " + str(e))
        raise e
    logging.warning(" VectorAssembler finished - ")
    return df3
bankLp = bankData.rdd.map(transformToLabeledPoint)
bankLp.collect()
bankDF = spark.createDataFrame(bankLp, ["label", "features"])
bankDF.select("label", "features").show(10)

# Perform PCA
from pyspark.ml.feature import PCA
bankPCA = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
pcaModel = bankPCA.fit(bankDF)
pcaResult = pcaModel.transform(bankDF).select("label", "pcaFeatures")
pcaResult.show(truncate=False)

# Indexing needed as a pre-req for Decision Trees
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(pcaResult)
td = si_model.transform(pcaResult)
td.collect()

# Split into training and testing data
(trainingData, testData) = td.randomSplit([0.7, 0.3])
trainingData.count()
testData.count()
testData.collect()

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create the model
rmClassifer = RandomForestClassifier(labelCol="indexed",
if __name__ == "__main__": spark = SparkSession \ .builder \ .getOrCreate() # Prepare data raw = spark.read.csv("hdfs://devenv/user/spark/spark_mllib_101/titanic", inferSchema=True, header=True) # Preprocessing and feature engineering data = raw.select("Survived","Pclass","Sex","Age","Fare","Embarked") \ .dropna() feature_prep = StringIndexer( inputCol="Sex", outputCol="SexIndex").fit(data).transform(data) feature_prep = OneHotEncoder(inputCol="SexIndex", outputCol="SexVec").transform(feature_prep) feature_prep = StringIndexer( inputCol="Embarked", outputCol="EmbarkIndex").fit(feature_prep).transform(feature_prep) feature_prep = OneHotEncoder(inputCol="EmbarkIndex", outputCol="EmbarkVec").transform(feature_prep) final_data = VectorAssembler( inputCols=["Survived", "Pclass", "SexVec", "Age", "Fare", "EmbarkVec"], outputCol="features").transform(feature_prep)
tw2 = tw1.filter("polarity != 2").withColumn('words', tokenize(tw1['text'])) tw3 = (tw2.select( "user", "hour", "dayofweek", "month", "words", F.when(tw2.polarity == 4, "Pos").otherwise("Neg").alias("sentiment"), pos_score(tw2["words"]).alias("pscore"), neg_score(tw2["words"]).alias("nscore"))) tw3.registerTempTable("fm") # 분류 모델 구축 # 모델링 매개변수 numFeatures = 5000 minDocFreq = 50 numTrees = 1000 # 머신 러닝 파이프라인 구축 inx1 = StringIndexer(inputCol="hour", outputCol="hour-inx") inx2 = StringIndexer(inputCol="month", outputCol="month-inx") inx3 = StringIndexer(inputCol="dayofweek", outputCol="dow-inx") inx4 = StringIndexer(inputCol="sentiment", outputCol="label") hashingTF = HashingTF(numFeatures=numFeatures, inputCol="words", outputCol="hash-tf") idf = IDF(minDocFreq=minDocFreq, inputCol="hash-tf", outputCol="hash-tfidf") va = VectorAssembler(inputCols=[ "hour-inx", "month-inx", "dow-inx", "hash-tfidf", "pscore", "nscore" ], outputCol="features") rf = RandomForestClassifier(numTrees=numTrees, maxDepth=4, maxBins=32, labelCol="label",
outputCol="words", pattern="\\W") # stop words add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the", "RT", "@"] stopwordsRemover = StopWordsRemover( inputCol="words", outputCol="filtered").setStopWords(add_stopwords) # bag of words count countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5) # convert string labels to indexes indexer = StringIndexer(inputCol="polarity", outputCol="label") # feature-selector selector = ChiSqSelector(numTopFeatures=10, featuresCol="features", outputCol="selectedFeatures", labelCol="label") # logistic regression model lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0) # build the pipeline pipeline = Pipeline(stages=[ regexTokenizer, stopwordsRemover, countVectors, indexer, selector, lr ])
# Get the number of records
print("The data contain %d records." % flights.count())

# Remove the 'flight' column
flights = flights.drop('flight')

# Convert 'mile' to 'km' and drop the 'mile' column
flights = flights.withColumn('km', round(flights.mile * 1.60934, 0)) \
                 .drop('mile')

# Remove records with missing values in any column and get the number of remaining rows
flights = flights.dropna()
print("The data contains %d records after dropping records with na values." %
      flights.count())

# Create an indexer for the carrier categorical feature
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flites = StringIndexer(inputCol="org", outputCol='org_idx') \
    .fit(flights_indexed).transform(flights_indexed)

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flites)
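# --- Hedged sketch (not from the original snippet): applying the fitted encoder and
# --- inspecting the resulting dummy column. Column names are taken from the snippet above.
flights_onehot = onehot.transform(flites)
flights_onehot.select("org", "org_idx", "org_dummy").distinct().sort("org_idx").show()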
                   'JobInvolvement', 'JobLevel', 'JobSatisfaction',
                   'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
                   'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
                   'StandardHours', 'StockOptionLevel', 'TotalWorkingYears',
                   'TrainingTimesLastYear', 'WorkLifeBalance',
                   'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
                   'YearsWithCurrManager']

# In[99]:

indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in categoricalCols
]

encoders = [
    OneHotEncoder(inputCol=indexer.getOutputCol(),
                  outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]

# In[100]:

assembler = VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders] + continuousCols,
    outputCol="features")
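# --- Hedged sketch (not from the original notebook): combining the indexers, encoders,
# --- and assembler above into a single Pipeline. The DataFrame name `df` is an assumption.
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=indexers + encoders + [assembler])
model = pipeline.fit(df)
transformed = model.transform(df)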
def MLClassifierDFPrep(df, input_columns, dependent_var, treat_outliers=True,
                       treat_neg_values=True):

    # Change the label (class variable) to string type to prep for reindexing.
    # PySpark is expecting a zero-indexed integer for the label column.
    # Just in case our data is not in that format, we treat it with the built-in StringIndexer.
    renamed = df.withColumn("label_str", df[dependent_var].cast(
        StringType()))  # Rename and change to string type
    indexer = StringIndexer(
        inputCol="label_str",
        outputCol="label")  # PySpark is expecting this naming convention
    indexed = indexer.fit(renamed).transform(renamed)
    print(indexed.groupBy("class", "label").count().show(100))

    # Convert all string type data in the input column list to numeric,
    # otherwise the algorithm will not be able to process it
    numeric_inputs = []
    string_inputs = []
    for column in input_columns:
        if str(indexed.schema[column].dataType) == 'StringType':
            indexer = StringIndexer(inputCol=column, outputCol=column + "_num")
            indexed = indexer.fit(indexed).transform(indexed)
            new_col_name = column + "_num"
            string_inputs.append(new_col_name)
        else:
            numeric_inputs.append(column)

    if treat_outliers == True:
        print("We are correcting for non-normality now!")
        # empty dictionary d
        d = {}
        # Create a dictionary of quantiles
        for col in numeric_inputs:
            d[col] = indexed.approxQuantile(
                col, [0.01, 0.99], 0.25
            )  # if you want to make it go faster, increase the last number
        # Now fill in the values
        for col in numeric_inputs:
            skew = indexed.agg(skewness(
                indexed[col])).collect()  # check for skewness
            skew = skew[0][0]
            # This function will floor, cap and then log+1 (just in case there are 0 values)
            if skew > 1:
                indexed = indexed.withColumn(
                    col,
                    log(
                        when(df[col] < d[col][0], d[col][0]).when(
                            indexed[col] > d[col][1], d[col][1]).otherwise(
                                indexed[col]) + 1).alias(col))
                print(col + " has been treated for positive (right) skewness. (skew =",
                      skew, ")")
            elif skew < -1:
                indexed = indexed.withColumn(
                    col,
                    exp(
                        when(df[col] < d[col][0], d[col][0]).when(
                            indexed[col] > d[col][1], d[col][1]).otherwise(
                                indexed[col])).alias(col))
                print(col + " has been treated for negative (left) skewness. (skew =",
                      skew, ")")

    # Produce a warning if there are negative values in the dataframe, because Naive Bayes cannot be used then.
    # Note: we only need to check the numeric input values since anything that is indexed won't have negative values.
    minimums = df.select([
        min(c).alias(c) for c in df.columns if c in numeric_inputs
    ])  # Calculate the mins for all columns in the df
    min_array = minimums.select(array(numeric_inputs).alias(
        "mins"))  # Create an array of all mins and select only the input cols
    df_minimum = min_array.select(array_min(
        min_array.mins)).collect()  # Collect the global min as a Python object
    df_minimum = df_minimum[0][0]  # Slice to get the number itself

    features_list = numeric_inputs + string_inputs
    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    output = assembler.transform(indexed).select('features', 'label')

    # final_data = output.select('features','label')  # drop everything else

    # Now check for negative values and ask the user if they want to correct that
    if df_minimum < 0:
        print(" ")
        print("WARNING: The Naive Bayes Classifier will not be able to process "
              "your dataframe as it contains negative values")
        print(" ")

    if treat_neg_values == True:
        print("You have opted to correct that by rescaling all your features "
              "to a range of 0 to 1")
        print(" ")
        print("We are rescaling your dataframe....")
        scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

        # Compute summary statistics and generate MinMaxScalerModel
        scalerModel = scaler.fit(output)

        # Rescale each feature to the range [min, max].
        scaled_data = scalerModel.transform(output)
        final_data = scaled_data.select('label', 'scaledFeatures')  # added class to the selection
        final_data = final_data.withColumnRenamed('scaledFeatures', 'features')
        print("Done!")
    else:
        print("You have opted not to correct that, therefore you will not be able "
              "to use the Naive Bayes classifier")
        print("We will return the dataframe unscaled.")
        final_data = output

    return final_data
df.printSchema()

from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=[
    'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad',
    'P_Undergrad', 'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD',
    'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate'
], outputCol='features')
output = assembler.transform(df)

from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')
outputFixed = indexer.fit(output).transform(output)
outputFixed.printSchema()

final_data = outputFixed.select('features', 'PrivateIndex')
train_data, test_data = final_data.randomSplit([0.75, 0.25])

from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml import Pipeline

dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rfc = RandomForestClassifier(numTrees=25, labelCol='PrivateIndex',
def main(path, test_path, write_location, test_write_location):
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    if is_comp:
        df = spark.read.csv(path, header=True).cache()
        npart = 10000
    else:
        # define schema
        structs = [StructField("Label", IntegerType(), True)]
        for dfeat in dense_features:
            structs.append(StructField(dfeat, DoubleType(), True))
        for cfeat in sparse_features:
            structs.append(StructField(cfeat, StringType(), True))
        schema = StructType(structs)
        df1 = spark.read.csv(path + "/day_[0-9]", sep="\t", schema=schema)
        df2 = spark.read.csv(path + "/day_1[0-9]", sep="\t", schema=schema)
        tcolumns = df1.columns
        df = df1.select(tcolumns).union(df2.select(tcolumns))
        npart = 860000
    print("Num examples: ", df.count())

    # transform the training data
    # change datatype of dense features
    for col_t in dense_features:
        df = df.withColumn(col_t, col(col_t).cast(DoubleType()))

    # fill nulls
    df = df.fillna('NULL', subset=sparse_features)
    df = df.fillna(0., subset=dense_features)

    # compute statistics
    # dense features: min-max scale into [scaled_min, scaled_max]
    scaled_max = 1
    scaled_min = 0
    dense_meta = {}
    for col_t in dense_features:
        print("dense: ", col_t)
        min_t = df.agg({col_t: "min"}).collect()[0][0]
        max_t = df.agg({col_t: "max"}).collect()[0][0]
        dense_meta[col_t] = [min_t, max_t]
        df = df.withColumn(
            col_t + "_scaled",
            (col(col_t) - min_t) / (max_t - min_t) * (scaled_max - scaled_min) + scaled_min)
        df = df.drop(col_t).withColumnRenamed(col_t + "_scaled", col_t)

    # index categoricals
    indexers = {}
    for col_t in sparse_features:
        print("cat:", col_t)
        indexer = StringIndexer(inputCol=col_t, outputCol=col_t + "_indexed")
        fitted_indexer = indexer.fit(df)
        df = fitted_indexer.transform(df)
        indexers[col_t] = fitted_indexer  # save indexer for test data
        df = df.drop(col_t).withColumnRenamed(col_t + "_indexed", col_t)
        df = df.withColumn(col_t, col(col_t).cast(IntegerType()))

    # convert label dtype
    df = df.withColumn("Label", col("Label").cast(DoubleType()))

    # save statistics/meta data locally
    all_index = {}
    for xk in indexers.keys():
        x = indexers[xk]
        index2name = dict([y for y in zip(range(len(x.labels)), x.labels)])
        name2index = {v: k for k, v in index2name.items()}
        all_index[xk] = {'index2name': index2name, 'name2index': name2index}
    cat_meta = {}
    for xk in indexers.keys():
        x = indexers[xk]
        cat_meta[xk] = len(x.labels)
    json.dump(cat_meta, open("categorical-meta.json", "w"))
    json.dump(all_index, open("categorical.json", 'w'))
    json.dump(dense_meta, open("dense-meta.json", 'w'))

    # (optional) store in s3
    # Insert uploading code here

    # save training data
    df = df.repartition(npart)
    df.write.mode("overwrite").csv(write_location, header=True)

    # read test data
    if is_comp:
        df = spark.read.csv(test_path, header=True)
    else:
        df = spark.read.csv(test_path, sep="\t", schema=schema)

    # transform the test data
    # change datatype of dense features
    for col_t in dense_features:
        df = df.withColumn(col_t, col(col_t).cast(DoubleType()))

    # fill nulls
    df = df.fillna('NULL', subset=sparse_features)
    df = df.fillna(0., subset=dense_features)

    # reuse the statistics already computed on the training data
    # (do not reinitialize dense_meta or indexers here, or the lookups below would fail)
    # dense features
    for col_t in dense_features:
        min_t = dense_meta[col_t][0]
        max_t = dense_meta[col_t][1]
        df = df.withColumn(
            col_t + "_scaled",
            (col(col_t) - min_t) / (max_t - min_t) * (scaled_max - scaled_min) + scaled_min)
        df = df.drop(col_t).withColumnRenamed(col_t + "_scaled", col_t)

    # index categoricals with the indexers fitted on the training data
    for col_t in sparse_features:
        fitted_indexer = indexers[col_t]
        df = fitted_indexer.transform(df)
        df = df.drop(col_t).withColumnRenamed(col_t + "_indexed", col_t)
        df = df.withColumn(col_t, col(col_t).cast(IntegerType()))

    # convert label dtype
    df = df.withColumn("Label", col("Label").cast(DoubleType()))

    df = df.repartition(npart)
    df.write.mode("overwrite").csv(test_write_location, header=True)
print(categorical_data)

# COMMAND ----------

# MAGIC %sql
# MAGIC
# MAGIC select distinct education_1989_revision from CDCData

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

stages = []  # stages in our Pipeline
for categoricalCol in categorical_data:
    # Category indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[categoricalCol + "classVec"])
    # Add stages. These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# COMMAND ----------

# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="method_of_disposition", outputCol="label")
stages += [label_stringIdx]

# COMMAND ----------

# Transform all features into a vector using VectorAssembler
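# --- Hedged sketch (not from the original notebook): one plausible way to complete the
# --- step announced in the comment above. `numeric_data` is a hypothetical list of
# --- numeric column names; only the stage wiring follows the snippet.
numeric_data = []  # e.g. any continuous columns of CDCData to include
assembler = VectorAssembler(
    inputCols=[c + "classVec" for c in categorical_data] + numeric_data,
    outputCol="features")
stages += [assembler]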
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# Load training data
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Auto').getOrCreate()
data = spark.read.csv(
    "/Users/sai/Documents/GitHub/CSEE5590_BIGDATA_PROGAMMING_Fall2018/ICP14/adult.csv",
    header=True,
    inferSchema="true")

from pyspark.ml.feature import StringIndexer
# Convert the target into numerical categories
labelIndexer = StringIndexer(inputCol="Salary", outputCol="label")

from pyspark.ml.feature import VectorAssembler
featureAssembler = VectorAssembler(
    inputCols=["Age", "sex", "capital-gain", "capital-loss", "hours-per-week"],
    outputCol='features')
#output = featureAssembler.transform(data)

splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    .na.fill(0)\
    .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))\
    .coalesce(5)

# COMMAND ----------

trainDataFrame = preppedDataFrame\
    .where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame\
    .where("InvoiceDate >= '2011-07-01'")

# COMMAND ----------

from pyspark.ml.feature import StringIndexer
indexer = StringIndexer()\
    .setInputCol("day_of_week")\
    .setOutputCol("day_of_week_index")

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder()\
    .setInputCol("day_of_week_index")\
    .setOutputCol("day_of_week_encoded")

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler()\
    .setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])\
inferSchema="true", header="false") # test = spark.read.load("./val/part-00002", # format="csv", sep="\t", inferSchema="true", header="false") # total = train.union(test) train, val, _ = train1.randomSplit([0.01, 0.01, 0.98]) train = train.rdd.map(lambda x: [x[10], x[8], x[6], x[1] % 1000]).toDF( ["_c10", "_c8", "_c6", "_c1"]) val = val.rdd.map(lambda x: [x[10], x[8], x[6], x[1] % 1000]).toDF( ["_c10", "_c8", "_c6", "_c1"]) total = train.union(val) # create features indexer = StringIndexer(inputCol="_c10", outputCol="c21") indexer = indexer.fit(total) train = indexer.transform(train) val = indexer.transform(val) indexer = StringIndexer(inputCol="_c8", outputCol="c23") indexer = indexer.fit(total) train = indexer.transform(train) val = indexer.transform(val) indexer = StringIndexer(inputCol="_c6", outputCol="c24") indexer = indexer.fit(total) train = indexer.transform(train) val = indexer.transform(val) # create label indexer = StringIndexer(inputCol="_c1", outputCol="label") indexer = indexer.fit(total) train = indexer.transform(train)
data["DEPARTURE_DELAY"].cast(DoubleType())).where("CANCELLED = 0") data = data.withColumnRenamed('DEPARTURE_DELAY', 'label') data = data.dropna() ########## # separo en train y test sets train_data, test_data = data.randomSplit([0.7, 0.3]) ########## # transformacion de variables para el pipeline day_of_week_indexer = StringIndexer(inputCol="DAY_OF_WEEK", outputCol="DAY_OF_WEEK_CATEGORICAL") airline_indexer = StringIndexer(inputCol="AIRLINE", outputCol="AIRLINE_CATEGORICAL") hour_departure_indexer = StringIndexer(inputCol="HOUR_DEPARTURE", outputCol="HOUR_DEPARTURE_CATEGORICAL") day_of_week_encoder = OneHotEncoder(inputCol="DAY_OF_WEEK_CATEGORICAL", outputCol="DAY_OF_WEEK_DUMMY") airline_encoder = OneHotEncoder(inputCol="AIRLINE_CATEGORICAL", outputCol="AIRLINE_DUMMY") hour_departure_encoder = OneHotEncoder(inputCol="HOUR_DEPARTURE_CATEGORICAL", outputCol="HOUR_DEPARTURE_DUMMY") assembler = VectorAssembler(inputCols = ["DAY_OF_WEEK_DUMMY", "AIRLINE_DUMMY", "HOUR_DEPARTURE_DUMMY"], outputCol = "features") ########## # modelos # modelo 1
], how="any", thresh=12) df_node3 = df_node2.randomSplit(seed=1234, weights=[0.6, 0.2, 0.2]) df_node3[2].write.format("parquet").save( path="hdfs://namenode:9000/example4/test.parquet") mmi_value_0_node4 = ["Sex", "Embarked", "Survived"] mmi_value_1_node4 = ["indexedSex", "indexedEmbarked", "indexedSurvived"] stages_node4 = [] for i in range(len(mmi_value_0_node4)): stages_node4.append( StringIndexer(inputCol=mmi_value_0_node4[i], outputCol=mmi_value_1_node4[i], handleInvalid="error", stringOrderType="frequencyDesc")) mmi_value_0_node5 = ["indexedSex", "indexedEmbarked"] mmi_value_1_node5 = ['sexVec', 'embarkedVec'] stages_node5 = [] for i in range(len(mmi_value_0_node5)): stages_node5.append( OneHotEncoder(inputCol=mmi_value_0_node5[i], outputCol=mmi_value_1_node5[i])) pipeline_stage_node6 = VectorAssembler( outputCol="features", inputCols=["Pclass", "sexVec", "Age", "SibSp", "Fare", "embarkedVec"]) stages_node7 = [stages_node4, stages_node5, pipeline_stage_node6]
print("********* TRAINING DATA ***********") print(train.limit(10).toPandas()) reg = 0.1 # Load Regularization Rate from argument if len(sys.argv) > 1: reg = float(sys.argv[1]) print("Regularization Rate is {}.".format(reg)) run_logger.log("Regularization Rate", reg) # create a new Logistic Regression model. lr = LogisticRegression(regParam=reg) # string-index and one-hot encode the education column si1 = StringIndexer(inputCol=' education', outputCol='ed') ohe1 = OneHotEncoder(inputCol='ed', outputCol='ed-encoded') # string-index and one-hot encode the matrial-status column si2 = StringIndexer(inputCol=' marital-status', outputCol='ms') ohe2 = OneHotEncoder(inputCol='ms', outputCol='ms-encoded') # string-index the label column into a column named "label" si3 = StringIndexer(inputCol=' income', outputCol='label') # assemble the encoded feature columns in to a column named "features" assembler = VectorAssembler( inputCols=['ed-encoded', 'ms-encoded', ' hours-per-week'], outputCol="features") # put together the pipeline
.config("spark.executor.cores", 4) \ .config("spark.driver.memory", "12g") \ .getOrCreate() train = spark.read.load("hdfs://10.190.2.112/data/train_set.txt", format="csv", sep="\t", inferSchema="true", header="false") val = spark.read.load("hdfs://10.190.2.112/data/val_set.txt", format="csv", sep="\t", inferSchema="true", header="false") test = spark.read.load("hdfs://10.190.2.112/data/val_set.txt", format="csv", sep="\t", inferSchema="true", header="false") # only for feature transform total = train.union(val).union(test) # create features indexer = StringIndexer(inputCol="_c12", outputCol="c22") indexer = indexer.fit(total) train = indexer.transform(train) val = indexer.transform(val) test = indexer.transform(test) # create label indexer = StringIndexer(inputCol="_c11", outputCol="label") indexer = indexer.fit(total) train = indexer.transform(train) val = indexer.transform(val) test = indexer.transform(test) # One-hot encoder encoder = OneHotEncoder(inputCol="c22", outputCol="c2") train = encoder.transform(train) val = encoder.transform(val) test = encoder.transform(test)
def transform(df, spark, sql_query=None, numerical_features=[],
              categorical_features=[], normalize=True, normalize_p=2):

    # Apply SQL query
    if sql_query != None:
        df.createOrReplaceTempView("netlytics")
        # Execute query
        result_df = spark.sql(sql_query)
        df = result_df

    # Transform strings into one-hot encodings
    schema = df.schema
    feat_to_type = {}
    for struct in schema:
        feat_to_type[struct.name] = str(struct.dataType)

    for feature in categorical_features:
        # Replace None values
        k = col(feature)
        df = df.withColumn(feature, when(k.isNull(), "__NA__").otherwise(k))

        stringIndexer = StringIndexer(inputCol=feature,
                                      outputCol=feature + "_indexed",
                                      handleInvalid="skip")
        model = stringIndexer.fit(df)
        df = model.transform(df)
        encoder = OneHotEncoder(inputCol=feature + "_indexed",
                                outputCol=feature + "_encoded")
        df = encoder.transform(df)

    # Extract features
    def extract_features(row, numerical_features, feat_to_type):
        output_features = {}
        fields = list(row.asDict().keys())
        for field in fields:
            if field in numerical_features and feat_to_type[field] != "StringType":
                output_features[field] = float(row[field])
            if field.endswith("_encoded"):
                output_list = list(row[field])
                for i, v in enumerate(output_list):
                    tmp_field = field + "_" + str(i)
                    output_features[tmp_field] = float(v)
        features = [
            v for k, v in sorted(output_features.items(),
                                 key=operator.itemgetter(0))
        ]
        old_dict = row.asDict()
        old_dict["features"] = DenseVector(features)
        new_row = Row(**old_dict)
        return new_row

    rdd = df.rdd.map(
        lambda row: extract_features(row, numerical_features, feat_to_type))
    df = spark.createDataFrame(rdd, samplingRatio=1, verifySchema=False)

    # Normalize
    if normalize:
        normalizer = Normalizer(inputCol="features", outputCol="featuresNorm",
                                p=normalize_p)
        df = normalizer.transform(df)
        df = df.drop("features")
        df = df.withColumnRenamed("featuresNorm", "features")

    # Delete intermediate columns
    schema = df.schema
    feat_to_type = {}
    for struct in schema:
        feat_to_type[struct.name] = str(struct.dataType)
    for feature in feat_to_type:
        if feat_to_type[feature] != "StringType":
            if feature.endswith("_encoded") or feature.endswith("_indexed"):
                df = df.drop(feature)

    return df
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import IndexToString, StringIndexer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("IndexToStringExample").getOrCreate()

    # $example on$
    df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"),
                                (4, "a"), (5, "c")], ["id", "category"])

    stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = stringIndexer.fit(df)
    indexed = model.transform(df)

    converter = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory")
    converted = converter.transform(indexed)

    converted.select("id", "originalCategory").show()
    # $example off$

    spark.stop()
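# --- Hedged aside (separate from the script above): IndexToString normally reads the label
# --- metadata written by StringIndexer; when that metadata is absent, the labels can be
# --- passed explicitly. The label ordering below is illustrative, not from the example.
converter_explicit = IndexToString(inputCol="categoryIndex",
                                   outputCol="originalCategory",
                                   labels=["a", "c", "b"])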
"total_day_charge", "total_eve_calls", "total_eve_charge", "total_night_calls", "total_night_charge", "total_intl_calls", "total_intl_charge","number_customer_service_calls"] #Review DataSet Balance churn_data.registerTempTable("ChurnData") sqlResult = spark.sql("SELECT churned, COUNT(churned) as Churned FROM ChurnData group by churned") sqlResult.show() #Feature Engineering from pyspark.ml.feature import StringIndexer from pyspark.ml.feature import VectorAssembler from pyspark.ml.feature import StandardScaler #String to Index label_indexer = StringIndexer(inputCol = 'churned', outputCol = 'label') plan_indexer = StringIndexer(inputCol = 'intl_plan', outputCol = 'intl_plan_indexed') input_cols=['intl_plan_indexed'] + reduced_numeric_cols #Feature Vector Assembler assembler = VectorAssembler(inputCols = input_cols, outputCol = 'features') #Standard Scaler scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",withStd=True, withMean=False) #Configure Random Forest Classifier Model from pyspark.ml import Pipeline from pyspark.ml.classification import RandomForestClassifier rfclassifier = RandomForestClassifier(labelCol = 'label', featuresCol = 'scaledFeatures')
users = spark.createDataFrame(fields)
# +---+------+---+-------------+--------+
# |age|gender| id| occupations|postcode|
# +---+------+---+-------------+--------+
# | 24| M| 1| technician| 85711|
# | 53| F| 2| other| 94043|
# | 23| M| 3| writer| 32067|
# | 24| M| 4| technician| 43537|
# | 33| F| 5| other| 15213|
# | 42| M| 6| executive| 98101|
# | 57| M| 7|administrator| 91344|
# | 36| M| 8|administrator| 05201|
# | 29| M| 9| student| 01002|
# | 53| M| 10| lawyer| 90703|
# | 39| F| 11| other| 30329|

indexer = StringIndexer(inputCol="occupations", outputCol="occupationsIndex",
                        handleInvalid='error')
indexed = indexer.fit(users).transform(users)

# transfer the dataframe to an rdd via ".rdd"
all_occupations = set(indexed.select("occupations", "occupationsIndex")
                      .rdd.map(lambda x: (x[0], x[1])).collect())

encoder = OneHotEncoder(inputCol="occupationsIndex", outputCol="occupationsVec")
encoded = encoder.transform(indexed)
encoded.select("occupations", "occupationsVec").show()
# +-------------+---------------+
# | occupations| occupationsVec|
# +-------------+---------------+
# | technician|(20,[11],[1.0])|
# | other| (20,[1],[1.0])|
# | writer| (20,[7],[1.0])|
# | technician|(20,[11],[1.0])|
# | other| (20,[1],[1.0])|
# | executive| (20,[8],[1.0])|
"content", "navigation", "View", "view", "mobile", "version", "Subscribe", "subscribe", "Now", "now", "Log", "log", "In", "in", "setting", "settings", "Site", "site", "Loading", "loading", "article", "next", "previous", "Advertisement", "ad", "advertisement", "Supported", "supported", "by", "Share", "share", "Page", "page", "Continue", "continue", "main", "story", "newsletter", "Sign", "Up", "Manage", "email", "preferences", "Not", "you", "opt", "out", "contact", "us", "anytime", "thank", "subscribing", "see", "more", "email" ] stopwordsRemover = StopWordsRemover( inputCol="words", outputCol="filtered1").setStopWords(add_stopwords) stopwordsRemover1 = StopWordsRemover( inputCol="filtered1", outputCol="filtered").setStopWords(add_stopwords_1) #Extracting features label_stringIdx = StringIndexer(inputCol="category", outputCol="label") hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000) idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms pipeline = Pipeline(stages=[ regexTokenizer, stopwordsRemover, stopwordsRemover1, hashingTF, idf, label_stringIdx ]) #training the data -- Logistic regression pipelineFit = pipeline.fit(data) dataset = pipelineFit.transform(data) dataset.show(5) (trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed=100)
def main(spark, train_data_file, test_data_file, model_file):
    time_a = time.time()
    start = time_a

    # Use validation and test user_id to filter the train data, to get the 110k mandatory users.
    # Stored here: hdfs:/user/dz584/cf_train_sample.parquet
    """
    training_data = spark.read.parquet('hdfs:/user/bm106/pub/project/cf_train.parquet')
    validation_data = spark.read.parquet('hdfs:/user/bm106/pub/project/cf_validation.parquet')
    testing_data = spark.read.parquet('hdfs:/user/bm106/pub/project/cf_test.parquet')
    validandtest_userid = validation_data.union(testing_data).select('user_id').distinct()
    validandtest_userid.createOrReplaceTempView('validandtest_userid')
    training_data.createOrReplaceTempView('training_data')
    training_data = spark.sql("SELECT * FROM training_data WHERE user_id IN (SELECT user_id FROM validandtest_userid GROUP BY user_id)")
    training_data.write.parquet("cf_train_sample.parquet")
    """

    training_data = spark.read.parquet(train_data_file)

    indexer_id = StringIndexer(inputCol="user_id",
                               outputCol="userindex").setHandleInvalid("skip")
    indexer_id_model = indexer_id.fit(training_data)
    indexer_item = StringIndexer(inputCol="track_id",
                                 outputCol="itemindex").setHandleInvalid("skip")
    indexer_item_model = indexer_item.fit(training_data)

    training_data = indexer_id_model.transform(training_data)
    training_data = indexer_item_model.transform(training_data)

    testing_data = spark.read.parquet(test_data_file)
    testing_data = indexer_id_model.transform(testing_data)
    testing_data = indexer_item_model.transform(testing_data)

    training_data = training_data.select('userindex', 'itemindex', 'count')
    testing_data = testing_data.select('userindex', 'itemindex', 'count')

    # Add log compression
    training_data.createOrReplaceTempView('training_data')
    training_data = spark.sql("SELECT *, count+1 as plus_count FROM training_data")
    training_data = training_data.withColumn("log_count", F.log("plus_count"))

    print('Finished Indexing!')
    time_b = time.time()
    print(time_b - time_a)
    time_a = time_b

    result_dict = {}
    rank_list = [600]        # [10,20,30,50]
    reg_param_list = [0.7]   # [0.1,0.5]
    alpha_list = [1]         # [1,1.5]

    for rank in rank_list:
        for reg_param in reg_param_list:
            for alpha in alpha_list:
                current_key = (rank, reg_param, alpha)
                als = ALS(maxIter=5, userCol="userindex", itemCol="itemindex",
                          ratingCol="log_count", rank=rank, regParam=reg_param,
                          alpha=alpha)
                model = als.fit(training_data)
                print('Finished Modeling with Param:', current_key)
                time_b = time.time()
                print(time_b - time_a)
                time_a = time_b

                prediction = model.recommendForAllUsers(500).select(
                    'userindex', 'recommendations.itemindex')
                print('Finished Prediction DF!')
                testing_df = testing_data.groupBy('userindex').agg(
                    expr('collect_list(itemindex) as item_list'))
                print('Finished Label DF!')
                predictionAndLabels = prediction.join(testing_df, 'userindex')
                print('Joined Prediction and Labels!')
                time_b = time.time()
                print(time_b - time_a)
                time_a = time_b

                pred_df = predictionAndLabels.select(
                    ['itemindex', 'item_list']).rdd.map(list)
                metrics = RankingMetrics(pred_df)
                print('Ranking Metrics Calculated!')
                time_b = time.time()
                print(time_b - time_a)
                time_a = time_b

                eva = metrics.meanAveragePrecision
                result_dict[current_key] = eva
                print(current_key, "parameter combination has been trained! MAP= ", eva)
                time_b = time.time()
                print(time_b - time_a)
                time_a = time_b

    best_model_param = max(result_dict, key=result_dict.get)
    als = ALS(maxIter=5, userCol="userindex", itemCol="itemindex",
              ratingCol="count", rank=best_model_param[0],
              regParam=best_model_param[1], alpha=best_model_param[2])
    als.fit(training_data).write().overwrite().save(model_file)
    print('Process Finished!')
    print(time.time() - start)
# pca: choose features
categorical = {
    'Cat1': 11,
    'Cat3': 7,
    'Cat6': 7,
    'Cat8': 4,
    'Cat9': 2,
    'Cat10': 4,
    'Cat11': 7,
    'Cat12': 7
}

from pyspark.ml.feature import StringIndexer
for col, num in categorical.items():
    name = col + '_index'
    indexer = StringIndexer(inputCol=col, outputCol=name)
    data = indexer.fit(data).transform(data)

data = data.select('Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7',
                   'Var8', 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4',
                   'Cat1_index', 'Cat3_index', 'Cat6_index', 'Cat8_index',
                   'Cat9_index', 'Cat10_index', 'Cat11_index', 'Cat12_index',
                   'Calendar_Year', 'Model_Year', 'Claim_Amount')

from pyspark.ml.feature import OneHotEncoderEstimator
category = [
    'Cat1_index', 'Cat3_index', 'Cat6_index', 'Cat8_index', 'Cat9_index',
    'Cat10_index', 'Cat11_index', 'Cat12_index'
]
new_cat = []
for col in category:
    name = col.replace('_index', '_vec')
    new_cat.append(name)
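# --- Hedged sketch (not from the original snippet): the `category`/`new_cat` lists built
# --- above are typically fed straight into the estimator-style encoder, e.g.:
encoder = OneHotEncoderEstimator(inputCols=category, outputCols=new_cat)
data = encoder.fit(data).transform(data)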