def cluster(self, df, session, repartition_num=8):
    n = df.count()

    # Index rows
    df_index = df.select((row_number().over(
        Window.partitionBy(lit(0)).orderBy(self.featureCol)) - 1).alias('id'),
                         "*")
    df_features = df_index.select('id', self.featureCol)

    # Prep for joining
    df_features = df_features.repartitionByRange(repartition_num, 'id')

    left_df = df_features.select(
        df_features['id'].alias('left_id'),
        df_features[self.featureCol].alias('left_features'))
    right_df = df_features.select(
        df_features['id'].alias('right_id'),
        df_features[self.featureCol].alias('right_features'))

    # Self-join on rows where left_id does not equal right_id
    joined_df = left_df.join(right_df,
                             left_df['left_id'] != right_df['right_id'])

    # Compute cosine similarity between vectors
    joined_df = joined_df.select(
        'left_id', 'right_id',
        cosine_similarity_udf(
            array(joined_df['left_features'],
                  joined_df['right_features'])).alias('norm'))

    # Keep only the k nearest neighbours of each row
    ranked = joined_df.select(
        'left_id', 'right_id',
        rank().over(
            Window.partitionBy('left_id').orderBy('norm')).alias('rank'))
    knn = ranked.where(ranked['rank'] <= self.k_nearest)
    knn_grouped = knn.groupBy('left_id').agg(
        f.collect_list('right_id').alias('nn'))

    # Generate the Laplacian
    laplacian = knn_grouped.select(
        knn_grouped['left_id'].alias('id'),
        toVector_udf(
            laplacian_vector_udf(knn_grouped['left_id'], knn_grouped['nn'],
                                 lit(n),
                                 lit(self.k_nearest))).alias('lap_vector'))

    # Eigendecomposition via PCA, then k-means on the eigenvectors
    pca = PCA(k=self.num_eigenvectors,
              inputCol='lap_vector',
              outputCol='features').fit(laplacian)
    eigenvectors = pca.transform(laplacian).select('id', 'features')

    model = KMeans(featuresCol='features',
                   predictionCol=self.predictionCol,
                   k=self.k).fit(eigenvectors)
    predictions = model.transform(eigenvectors).join(df_index, on='id')
    return predictions
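# The method above depends on a cosine_similarity_udf whose definition is not
# shown here. A minimal sketch of one possible implementation, assuming the two
# vectors arrive packed into a single array column (as in the array(...) call
# above) and that the returned 'norm' behaves like a distance, since the knn
# step orders it ascending; the original definition may differ.
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType


@udf(returnType=DoubleType())
def cosine_similarity_udf(pair):
    # pair[0] / pair[1] are the left and right feature vectors.
    a, b = pair[0].toArray(), pair[1].toArray()
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    if denom == 0.0:
        return 1.0  # treat zero vectors as maximally distant
    return 1.0 - float(np.dot(a, b) / denom)  # cosine distance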
def pca(dataset, inputCol, k=3):
    from pyspark.ml.feature import PCA
    # Fit PCA with the requested number of components and project the dataset.
    model = PCA(k=k, inputCol=inputCol, outputCol=inputCol + '_pca').fit(dataset)
    return model.transform(dataset), model
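# A minimal usage sketch for the helper above; the toy vectors and the
# 'features' column name are assumptions for illustration only.
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(Vectors.dense([1.0, 0.0, 7.0]),),
                            (Vectors.dense([2.0, 0.0, 3.0]),),
                            (Vectors.dense([4.0, 0.0, 0.0]),)],
                           ["features"])
reduced, pca_model = pca(df, inputCol="features", k=2)
reduced.select("features_pca").show(truncate=False)
print(pca_model.explainedVariance)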
def process_bus_data(bus_df):
    """ Method to process raw business data from Yelp."""

    def select_eligible_bus(row):
        """ Select businesses which fall into the selected categories."""
        global categories
        try:
            # Return True if the business falls into the category list, else False.
            row_cats = row.split(',')
            for cat in row_cats:
                if cat.strip() in categories:
                    return True
            return False
        except (TypeError, AttributeError):
            # Return False if the business has no defined categories.
            return False

    def unpack_bus_attributes(row):
        """ Unpacks business attributes and assigns them an index value."""
        # List to store business attributes.
        unpacked = list()

        # Unpack all attributes except PriceRange, Parking and WiFi.
        temp = [row[s] for s in bus_attributes]

        # Process PriceRange.
        try:
            priceRange = int(row["RestaurantsPriceRange2"])
        except (TypeError, ValueError):
            # If no price range is specified - default=2.
            priceRange = 2

        # Process Parking.
        try:
            parking = 1 if (row["BusinessParking"].find("True")) != -1 else -1
        except AttributeError:
            parking = 0

        # Process WiFi.
        if row["WiFi"] == 'no' or row["WiFi"] == "u'no'":
            wifi = -1
        elif row["WiFi"] is None:
            wifi = 0
        else:
            wifi = 1

        # Tokenize all Boolean attributes.
        for i in temp:
            if i == "True":
                unpacked.append(1)
            elif i == "False":
                unpacked.append(-1)
            else:
                unpacked.append(0)

        # Append the WiFi, Parking and PriceRange attributes.
        unpacked.append(wifi)
        unpacked.append(parking)
        unpacked.append(priceRange)

        # Print any arrays that are not of the desired length (=30).
        if len(unpacked) != 30:
            print(unpacked)

        return _convert_to_vector(
            csc_matrix(np.asarray(unpacked).astype(float)).T)

    def unpack_bus_categories(row):
        """ Unpacks all business categories."""
        # List to store business categories.
        unpacked = list()
        # Split the comma-separated category string.
        for cat in row.split(','):
            unpacked.append(cat.strip())
        return unpacked

    def unpack_price_range(row):
        """ Returns the price range (last element of the attribute vector)."""
        return int(row[-1])

    # Package the functions above into Spark SQL user-defined functions.
    udf_select_eligible_bus = udf(select_eligible_bus, BooleanType())
    udf_unpack_bus_attributes = udf(unpack_bus_attributes, VectorUDT())
    udf_unpack_bus_categories = udf(unpack_bus_categories, ArrayType(StringType()))
    udf_unpack_price_range = udf(unpack_price_range, IntegerType())

    # Find businesses to include.
    eligible_bus = bus_df.withColumn("include",
                                     udf_select_eligible_bus(col("categories"))) \
        .filter(col("include") == True)

    # Process the business-attributes feature.
    all_bus_attributes = set(
        bus_df.select("attributes").take(1)[0].attributes.asDict().keys())
    bus_attributes_to_exclude = {
        'AcceptsInsurance', 'AgesAllowed', 'ByAppointmentOnly', 'Caters',
        'Corkage', 'DietaryRestrictions', 'HairSpecializesIn', 'Open24Hours',
        'RestaurantsAttire', 'RestaurantsPriceRange2', 'BusinessParking', 'WiFi'
    }
    bus_attributes = list(all_bus_attributes - bus_attributes_to_exclude)
    bus_attributes.sort()
    eligible_attr = eligible_bus.withColumn(
        "unpackedAttr", udf_unpack_bus_attributes(col("attributes")))

    # Process the business-categories feature.
    eligible_cats = eligible_attr.withColumn(
        "unpackedCats", udf_unpack_bus_categories(col("categories")))
    cv = CountVectorizer(inputCol="unpackedCats", outputCol="vectorizedCats")
    vectorized_cats = cv.fit(eligible_cats).transform(eligible_cats)

    # Un-bundle the price range from all other attributes.
    unpacked_pr = vectorized_cats.withColumn(
        "priceRange", udf_unpack_price_range(col("unpackedAttr")))
    unpacked_pr.take(1)

    # Reduce the dimensions of the attributes and categories features, respectively.
    pca_attr = PCA(k=3, inputCol="unpackedAttr",
                   outputCol="pcaAttr").fit(unpacked_pr)
    temp = pca_attr.transform(unpacked_pr)
    temp.show()
    pca_cats = PCA(k=1, inputCol="vectorizedCats",
                   outputCol="pcaCats").fit(temp)
    temp2 = pca_cats.transform(temp)
    temp2.show()

    # Assemble into the final feature vector.
    va = VectorAssembler(
        inputCols=["stars", "priceRange", "pcaAttr", "pcaCats"],
        outputCol="featureVec")
    features = va.transform(temp2).select("business_id", "stars", "categories",
                                          "featureVec")
    features.take(1)

    # Unpack the feature vector into one column per feature.
    n_features = len(features.select("featureVec").take(1)[0].featureVec)
    final = features.withColumn("f", vector_to_array(col("featureVec"))) \
        .select(["business_id", "stars", "categories"] +
                [col("f")[i] for i in range(n_features)])

    return final, n_features
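# A hedged usage sketch for process_bus_data. The input path and the
# `categories` filter set below are assumptions for illustration; the function
# itself expects the Yelp business JSON dump and a module-level `categories`
# collection used by the eligibility UDF.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
categories = {"Restaurants", "Food", "Coffee & Tea"}  # assumed filter set
bus_df = spark.read.json("yelp_academic_dataset_business.json")  # assumed path
features_df, n_features = process_bus_data(bus_df)
features_df.show(5, truncate=False)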
def apply_pca(training, testing):
    # Fit PCA on the training set only, then project both splits.
    pca = PCA(k=3, inputCol="features", outputCol="pca_features").fit(training)
    training = pca.transform(training).cache()
    testing = pca.transform(testing).cache()
    return training, testing
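# A minimal, self-contained usage sketch for apply_pca; the toy vectors below
# are assumptions for illustration only.
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
train = spark.createDataFrame([(Vectors.dense([1.0, 0.0, 7.0, 0.0]),),
                               (Vectors.dense([2.0, 0.0, 3.0, 4.0]),),
                               (Vectors.dense([4.0, 0.0, 0.0, 5.0]),)],
                              ["features"])
test = spark.createDataFrame([(Vectors.dense([3.0, 0.0, 1.0, 2.0]),)],
                             ["features"])
train_pca, test_pca = apply_pca(train, test)
train_pca.select("pca_features").show(truncate=False)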
def preprocessing(df, num_pca=10):
    argo_df_og = df

    # Cast temp as DoubleType()
    argo_df_og = argo_df_og.withColumn("tempTmp", argo_df_og['temp'].cast(DoubleType())) \
        .drop("temp") \
        .withColumnRenamed("tempTmp", "temp") \
        .select("profile_id", "pres", "temp", "lat", "lon", "psal", "date") \
        .persist()

    argo_filterby = argo_df_og.groupBy("profile_id") \
        .agg(min("pres").alias("min_pres"),
             max("pres").alias("max_pres"),
             count("profile_id").alias("count_profile_id"))

    # These are the profile_ids we want to keep, to be inner-joined with the original argo_df_og.
    argo_keep_ids = argo_filterby.filter(
        "count_profile_id >= 50 and min_pres <= 25 and max_pres >= 999") \
        .select("profile_id")

    # Inner join the profile_ids to keep with the original argo_df_og to keep only the desired IDs.
    argo_df_keep = argo_keep_ids.join(argo_df_og, "profile_id", "inner").persist()

    # Final filtered df after pressure cleaning.
    argo_df = argo_df_keep.select("profile_id", "pres", "temp", "lat", "lon", "psal", "date",
                                  month("date").alias("month"),
                                  year("date").alias("year")) \
        .persist()

    # INTERPOLATION
    # Create a vector mapping corresponding temperatures with pressures.
    argo_df_listed = argo_df.select(
        'profile_id', 'lat', 'lon',
        array(argo_df['temp'], argo_df['pres']).alias('temp_pres')) \
        .groupBy('profile_id').agg(collect_list('temp_pres').alias('temp_pres_list'),
                                   fn.min(argo_df['lat']).alias('lat'),
                                   fn.min(argo_df['lon']).alias('lon'))

    # Order by pressure.
    argo_df_listed = argo_df_listed.select(
        'profile_id', 'lat', 'lon',
        insane_sort(argo_df_listed['temp_pres_list']).alias('temp_pres_list'))

    # Interpolate missing temps at the specified grid points.
    pres = argo_df_listed.select(
        'profile_id', 'lat', 'lon',
        interp_udf('temp_pres_list').alias('temp_interp'))

    # Find profiles with NaN temps.
    check_pres = pres.select(
        "profile_id", "temp_interp", 'lat', 'lon',
        null_udf("temp_interp").alias("temp_interp_hasNA"),
        lenarray_udf("temp_interp").alias("temp_interp_len199"))

    # Filter out profiles with NaN temps.
    filtered_pres = check_pres.filter("temp_interp_hasNA == False").select(
        "profile_id", "temp_interp", 'lat', 'lon')

    # Find profiles with temps < -5.
    check_pres = filtered_pres.select(
        "profile_id", "temp_interp", 'lat', 'lon',
        neg_udf("temp_interp").alias("temp_interp_hasNeg5s"))

    # Filter out profiles with temps < -5.
    argo_df_clean = check_pres.filter("temp_interp_hasNeg5s == False").select(
        "profile_id", "temp_interp", 'lat', 'lon')

    argo_df_clean = argo_df_clean.select(
        'profile_id',
        toVector_udf(argo_df_clean['temp_interp']).alias('features'),
        'lat', 'lon')

    # Reduce the interpolated temperature profiles to num_pca principal components.
    pca = PCA(k=num_pca, inputCol='features',
              outputCol='features_pca').fit(argo_df_clean)
    argo_df_clean = pca.transform(argo_df_clean)
    argo_df_clean = argo_df_clean.select(
        'profile_id',
        argo_df_clean['features_pca'].alias('features'),
        'lat', 'lon')

    return argo_df_clean
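# The preprocessing function above relies on several UDFs (insane_sort,
# interp_udf, null_udf, neg_udf, lenarray_udf, toVector_udf) defined elsewhere.
# As one hedged example, a minimal sketch of what toVector_udf could look like,
# assuming 'temp_interp' is an array of doubles; this is an illustrative
# stand-in, not necessarily the original definition.
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

toVector_udf = udf(lambda xs: Vectors.dense(xs), VectorUDT())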
test_df = labelIndexer.transform(test_df)

label_mapping = dict(enumerate(labelIndexer.labels))
reverse_mapping = {}
for key in label_mapping:
    reverse_mapping[label_mapping[key]] = key


# ## Dimensionality reduction
#
# Feature selection is not yet well supported in MLlib, so we applied dimensionality reduction with PCA instead.

# In[509]:


pca = PCA(inputCol="features", outputCol="pca", k=15).fit(train_df)
train_df = pca.transform(train_df)
test_df = pca.transform(test_df)


# ## Classification algorithms

# In[ ]:


rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="pca",
                            numTrees=5000)
# rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="features", numTrees=5000)
model = rf.fit(train_df)


# ## Evaluation & results

# In[ ]:
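# A hedged sketch of an evaluation cell, assuming the fitted RandomForest model
# above, a test_df that already carries the "indexedResult" label column, and
# the default "prediction" output column; this is not the original notebook's code.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedResult",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy: %.3f" % evaluator.evaluate(predictions))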
    except:
        rel['features'] = Vectors.dense(0, 0, 0, 0, 0, 0)
    rel['label'] = str(x[14].strip('.'))
    return rel


df = spark.sparkContext.textFile('./adult/adult.data') \
    .map(lambda line: line.split(',')) \
    .map(lambda x: Row(**f(x))).toDF()
test = spark.sparkContext.textFile('./adult/adult.test') \
    .map(lambda line: line.split(',')) \
    .map(lambda x: Row(**f(x))).toDF()

# Build the PCA model.
pca = PCA(k=3, inputCol='features', outputCol='pcaFeatures').fit(df)
result = pca.transform(df)
test_data = pca.transform(test)
result.show(truncate=False)
test_data.show(truncate=False)

# Logistic regression on top of the principal components.
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(result)
for label in labelIndexer.labels:
    print(label)
featureIndexer = VectorIndexer(inputCol='pcaFeatures',
                               outputCol='indexedFeatures').fit(result)
print(featureIndexer.numFeatures)
labelConverter = IndexToString(inputCol='prediction', outputCol='predictedLabel',
                               labels=labelIndexer.labels)
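# A hedged sketch of how the stages above could be chained; the original
# continuation is not shown in this snippet, so the LogisticRegression stage
# and the Pipeline wiring below are assumptions for illustration.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

# labelIndexer / featureIndexer are already fitted models, so they act as
# transformers inside the pipeline; only the LogisticRegression stage is fit here.
lr = LogisticRegression(labelCol='indexedLabel', featuresCol='indexedFeatures')
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, lr, labelConverter])
lrModel = pipeline.fit(result)
preds = lrModel.transform(test_data)
preds.select('predictedLabel', 'label', 'pcaFeatures').show(10, truncate=False)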
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(filteredData)

idf = IDF(inputCol="rawFeatures", outputCol="tfidffeatures")
idfModel = idf.fit(featurizedData)
tfidfData = idfModel.transform(featurizedData)

# COMMAND ----------

from pyspark.ml.feature import PCA

pca = PCA(k=5, inputCol='tfidffeatures', outputCol='features').fit(tfidfData)
data_pca = pca.transform(tfidfData)
data_pca.select("book_id", "features").show(truncate=False)

# COMMAND ----------

# DBTITLE 1,K-means
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import ClusteringEvaluator

kmeans = KMeans().setK(20).setSeed(1)
model = kmeans.fit(data_pca)
predictions = model.transform(data_pca)
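# COMMAND ----------

# A hedged sketch of cluster evaluation with the ClusteringEvaluator imported
# above, assuming the default "prediction" column produced by KMeans and the
# "features" column the model was fit on; this cell is illustrative, not part
# of the original notebook.
evaluator = ClusteringEvaluator(featuresCol='features', predictionCol='prediction')
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))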