def test_model_pca(self):
    data = self.spark.createDataFrame(
        [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
         (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
         (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)], ["features"])
    pca = PCA(k=2, inputCol="features", outputCol="pca_features")
    model = pca.fit(data)
    # the input name must match PCA.inputCol
    feature_count = data.first()[0].size
    N = data.count()
    model_onnx = convert_sparkml(
        model, 'Sparkml PCA',
        [('features', FloatTensorType([N, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().pca_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPCA")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['pca_features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def pca_model(data, k=7, inputcol='scale_features', outputcol='pca_features'):
    # Fit a PCA model on the scaled features and return the explained
    # variance together with the transformed DataFrame.
    pca = PCA(k=k, inputCol=inputcol, outputCol=outputcol)
    model = pca.fit(data)
    variance = model.explainedVariance
    pca_data = model.transform(data)
    return variance, pca_data
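# A minimal usage sketch for pca_model above, assuming `scaled_df` is a
# DataFrame that already has a vector column named 'scale_features'
# (e.g. produced by a VectorAssembler + StandardScaler); the variable
# names here are illustrative, not from the original code.
variance, pca_df = pca_model(scaled_df, k=7)
print(variance)  # DenseVector of explained-variance ratios, one per component
pca_df.select('pca_features').show(5, truncate=False)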
def PCA_setting(spark, rdd, n):
    df = spark.createDataFrame(rdd, schema=['features'])
    pca = PCA(k=n, inputCol='features', outputCol='pca_features')
    model = pca.fit(df)
    return model.transform(df).select('pca_features').collect()
def _perform_pca(self, dataset: DataFrame, k: int):
    # Since we want to plot the clusters, it is important to reduce
    # the dimensionality to at most 3 dimensions. We can use PCA with
    # up to 3 principal components for this.
    pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
    pca_model = pca.fit(dataset)
    rows = pca_model \
        .transform(dataset) \
        .select("clusterNum", "pcaFeatures") \
        .collect()

    # Now we'll plot the clusters as a scatter plot with each point's
    # color corresponding to its cluster. Cast clusterNum to a
    # categorical so it is treated as such for plotting purposes.
    axes = zip(*[row["pcaFeatures"] for row in rows])
    colors = pd.Categorical([row["clusterNum"] for row in rows])
    if k == 2:
        x, y = axes
        fig = plt.figure(figsize=(15, 15))
        sns.scatterplot(x=x, y=y, hue=colors)
    if k == 3:
        x, y, z = axes
        plot_df = pd.DataFrame({"PCA 1": x, "PCA 2": y, "PCA 3": z,
                                "cluster": colors})
        g = sns.PairGrid(plot_df, hue="cluster", palette="coolwarm")
        g = g.map(sns.scatterplot, linewidths=0.75, edgecolor="w", s=40)
        g = g.add_legend()
        g.fig.set_size_inches(15, 15)

    # The file name encodes the number of principal components and the model name.
    image_path = os.path.join("analysis", "results", "charts",
                              f"pca-{k}-{self.model_name}.png")
    plt.savefig(image_path)
def _get_pca_model(feat_train, k):
    from pyspark.ml.feature import PCA

    pca = PCA(k=k, inputCol="features", outputCol="pca_features")
    pca_model = pca.fit(feat_train)
    # Log the total explained variance of the fitted components.
    logr.log_event('Training Accuracy', f"{sum(pca_model.explainedVariance)}")
    return pca_model
def get_preprocessed_data(input_train, input_test):
    # Train data
    train = spark.read.csv(input_train, header=False, inferSchema="true")
    train_labels = get_vector(train.select('_c0'), 'train_label')
    train_features = get_vector(train.drop('_c0'), 'feature')

    # Test data
    test = spark.read.csv(input_test, header=False, inferSchema="true")
    test_labels = get_vector(test.select('_c0'), 'test_label')
    test_features = get_vector(test.drop('_c0'), 'feature')

    # Fit PCA on the training features only
    pca = PCA(k=50, inputCol="feature", outputCol="pca_feature")
    pca_model = pca.fit(train_features)

    # Apply PCA to train / test features
    train_features_pca = pca_model.transform(train_features).select("pca_feature")
    test_features_pca = pca_model.transform(test_features).select("pca_feature")

    # Rename the PCA feature columns
    train_features_pca = train_features_pca.withColumnRenamed("pca_feature",
                                                              "train_feature")
    test_features_pca = test_features_pca.withColumnRenamed("pca_feature",
                                                            "test_feature")

    # Create combined train / test data
    train_data = combine_features_labels(train_features_pca, train_labels, 'train')
    test_data = combine_features_labels(test_features_pca, test_labels, 'test')

    return train_data, test_data
def pca(self, df, k=1):
    # Eigen-decomposition of the covariance matrix, kept alongside the
    # results of pyspark.ml's PCA for comparison.
    cov = RowMatrix(df.rdd.map(lambda x: list(x))).computeCovariance().toArray()
    col = cov.shape[1]
    eigVals, eigVecs = np.linalg.eigh(cov)
    inds = np.argsort(eigVals)
    eigVecs = eigVecs.T[inds[-1:-(col + 1):-1]]
    eigVals = eigVals[inds[-1:-(col + 1):-1]]
    components = RowMatrix(
        df.rdd.map(lambda x: list(x))).computePrincipalComponents(k)

    train_data = df.rdd.map(lambda x: Row(features=Vectors.dense(x))).toDF()
    pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(train_data)
    score = model.transform(train_data)

    res = {
        "components": components.toArray(),
        "score": np.array(
            score.select("pcaFeatures").rdd.map(lambda x: list(x[0])).collect()),
        "eigVectors": eigVecs,
        "eigValues": eigVals
    }
    return res
def __build_pca(self, df, metadata_path):
    pca = PCA(k=self.k, inputCol='scaled_features', outputCol='pca_features')
    if self.__metadata:
        pca.fit(df).write().overwrite().save(metadata_path)
        return PCAModel.load(metadata_path).transform(df)
    return pca.fit(df).transform(df)
def pca(inputdir, df, alg, k):
    from pyspark.ml.feature import PCA

    pca = PCA(k=int(k), inputCol="features", outputCol="pca_features")
    model = pca.fit(df)
    outData = model.transform(df)
    pcaFeatures = outData.select("labels", "pca_features")
    output_data = writeOut(inputdir, pcaFeatures, alg, k)
    return output_data
def pca_generic(data, dimens, input_col, output_col="pcaFeatures"):
    print('PCA result with dimensions = ' + str(dimens) +
          ' and output column ' + output_col)
    pca_generic = PCA(k=dimens, inputCol=input_col, outputCol=output_col)
    pca_model_generic = pca_generic.fit(data)
    result_pca_generic = pca_model_generic.transform(data)
    result_pca_generic.show()
    print('\n')
    return result_pca_generic, pca_model_generic
def runPCA(vector_features, k=3):
    from pyspark.ml.feature import PCA

    # Keep only the feature vector column
    feature_vec = vector_features.select('features')
    # PCA's constructor is keyword-only, so k must be passed as k=k
    pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(feature_vec)
    result = model.transform(feature_vec).select("pcaFeatures")
    return result
def cluster(self, df, session, repartition_num=8):
    n = df.count()
    # Index rows
    df_index = df.select(
        (row_number().over(
            Window.partitionBy(lit(0)).orderBy(self.featureCol)) - 1).alias('id'),
        "*")
    df_features = df_index.select('id', self.featureCol)

    # Prepare for joining
    df_features = df_features.repartitionByRange(repartition_num, 'id')
    left_df = df_features.select(
        df_features['id'].alias('left_id'),
        df_features[self.featureCol].alias('left_features'))
    right_df = df_features.select(
        df_features['id'].alias('right_id'),
        df_features[self.featureCol].alias('right_features'))

    # Self-join where left_id does not equal right_id
    joined_df = left_df.join(right_df, left_df['left_id'] != right_df['right_id'])

    # Compute cosine similarity between vectors
    joined_df = joined_df.select(
        'left_id', 'right_id',
        cosine_similarity_udf(
            array(joined_df['left_features'],
                  joined_df['right_features'])).alias('norm'))
    ranked = joined_df.select(
        'left_id', 'right_id',
        rank().over(Window.partitionBy('left_id').orderBy('norm')).alias('rank'))
    knn = ranked.where(ranked['rank'] <= 5)
    knn_grouped = knn.groupBy('left_id').agg(
        f.collect_list('right_id').alias('nn'))

    # Generate the Laplacian
    laplacian = knn_grouped.select(
        knn_grouped['left_id'].alias('id'),
        toVector_udf(
            laplacian_vector_udf(knn_grouped['left_id'], knn_grouped['nn'],
                                 lit(n), lit(self.k_nearest))).alias('lap_vector'))

    # Use PCA on the Laplacian vectors, then cluster with KMeans
    pca = PCA(k=self.num_eigenvectors, inputCol='lap_vector',
              outputCol='features').fit(laplacian)
    eigenvectors = pca.transform(laplacian).select('id', 'features')

    model = KMeans(featuresCol='features', predictionCol=self.predictionCol,
                   k=self.k).fit(eigenvectors)
    predictions = model.transform(eigenvectors).join(df_index, on='id')
    return predictions
def clustering(input_df, input_col_name, n):
    """KMeans and PCA."""
    input_df = input_df.select('state', 'categories', 'stars', input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(input_df)
    kmeans = KMeans(k=n, seed=2)
    KMmodel = kmeans.fit(df)
    predicted = KMmodel.transform(df).cache()
    # Project the clustered data onto the first two principal components
    pca = PCA(k=2, inputCol='features', outputCol="pc")
    df = pca.fit(predicted).transform(predicted).cache()
    return df
async def plot_cluster(self, df, x='_3', y='_4'):
    pca = PCAml(k=2, inputCol="features", outputCol="pca")
    model3 = pca.fit(df)
    transformed2 = model3.transform(df)

    def extract(row):
        # Flatten each row into (customer, prediction, pca_1, pca_2);
        # the two unnamed PCA components become columns _3 and _4.
        return (row.customer,) + (row.prediction,) + tuple(row.pca.toArray().tolist())

    pcadf = transformed2.rdd.map(extract).toDF(["customer", "prediction"])
    pcadf.show(10, False)
    pandad = pcadf.toPandas()
    pandad.plot.scatter(x=x, y=y, c='prediction', colormap='viridis')
    plt.show()
def train(df, hiperparameter):
    '''
    Fits a PCA model to the input dataset with the given parameters.

    Input/Parameters:
        df             - input dataset, an instance of pyspark.sql.DataFrame
        hiperparameter - dict of hyperparameter configuration
                         (k, inputCol, outputCol)

    Output/Returns:
        fitted model
    '''
    pca = PCA(k=hiperparameter['k'],
              inputCol=hiperparameter['inputCol'],
              outputCol=hiperparameter['outputCol'])
    model = pca.fit(df)
    return model
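# Example call for train() above; the dict keys match the ones the function
# reads, while the column names and k value are illustrative assumptions.
hiperparameter = {'k': 3, 'inputCol': 'features', 'outputCol': 'pca_features'}
pca_model = train(df, hiperparameter)
print(pca_model.explainedVariance)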
def test_nested_pipeline_persistence(self):
    """
    Pipeline[HashingTF, Pipeline[PCA]]
    """
    temp_path = tempfile.mkdtemp()

    try:
        df = self.spark.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)],
                                        ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        p0 = Pipeline(stages=[pca])
        pl = Pipeline(stages=[tf, p0])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def execute(self):
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.feature import PCA
    from pyspark.ml import Pipeline
    from pyspark.sql.functions import udf, col
    from pyspark.sql.types import ArrayType, DoubleType

    k = int(self.k)
    assert k <= len(self.originalDF.columns), \
        "k must not exceed the number of input columns"

    # The features must be assembled into a single vector column,
    # which is split back into individual columns afterwards.
    vectorAssembler = VectorAssembler(inputCols=self.columns, outputCol="features")
    pca = PCA(k=k, inputCol="features", outputCol='pca_features')
    pipeline = Pipeline(stages=[vectorAssembler, pca])
    self.pipelineModel = pipeline.fit(self.originalDF)
    self.transformDF = self.pipelineModel.transform(self.originalDF)

    # Define a UDF that converts a vector into an array of doubles.
    def to_array(col):
        def to_array_(v):
            return v.toArray().tolist()
        return udf(to_array_, ArrayType(DoubleType()))(col)

    self.transformDF = self.transformDF.withColumn("pca_features",
                                                   to_array(col("pca_features")))
    for i in range(k):
        self.transformDF = self.transformDF.withColumn("pca_" + str(i),
                                                       col("pca_features")[i])
    self.transformDF = self.transformDF.drop("pca_features", "features")
def pca_encoder(k, features_name, output_name):
    pca_model = PCA() \
        .setK(k) \
        .setInputCol(features_name) \
        .setOutputCol(output_name)
    return [pca_model]
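# Sketch of how the stage list returned by pca_encoder could be used inside a
# Pipeline, assuming `assembled_df` already contains a 'features' vector column;
# names other than pca_encoder are illustrative.
from pyspark.ml import Pipeline

stages = pca_encoder(k=10, features_name='features', output_name='pca_features')
pipeline_model = Pipeline(stages=stages).fit(assembled_df)
reduced_df = pipeline_model.transform(assembled_df)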
def _compute_cluster_analysis(spark_df, clusters=5):
    numeric_columns = list(map(lambda col_dtype: col_dtype[0], spark_df.dtypes))
    if len(numeric_columns) == 0:
        raise ValueError("The provided spark dataframe does not contain any numeric columns. "
                         "Cannot compute cluster analysis with k-means on categorical columns. "
                         "The numeric datatypes are: {}"
                         " and the number of numeric datatypes in the dataframe is: {} ({})".format(
                             constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES,
                             len(spark_df.dtypes), spark_df.dtypes))
    if len(numeric_columns) == 1:
        raise ValueError("The provided spark dataframe contains only one numeric column. "
                         "Cluster analysis filters out non-numeric columns, uses PCA to reduce "
                         "the dataset to 2 dimensions and then applies KMeans; this is not "
                         "possible when the input data has only one numeric column. "
                         "The numeric datatypes are: {}"
                         " and the number of numeric datatypes in the dataframe is: {} ({})".format(
                             constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES,
                             len(spark_df.dtypes), spark_df.dtypes))

    vecAssembler = VectorAssembler(inputCols=numeric_columns,
                                   outputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN)
    spark_df_1 = vecAssembler.transform(spark_df)
    kmeans = KMeans(k=clusters, seed=1, maxIter=20,
                    featuresCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
                    predictionCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN)
    model = kmeans.fit(spark_df_1.select(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN))
    spark_df_2 = model.transform(spark_df_1)
    spark_df_3 = spark_df_2.select([constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
                                    constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN])
    count = spark_df_3.count()
    if count < constants.FEATURE_STORE.CLUSTERING_ANALYSIS_SAMPLE_SIZE:
        spark_df_4 = spark_df_3
    else:
        spark_df_4 = spark_df_3.sample(
            True,
            float(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_SAMPLE_SIZE) / float(count))
    pca = PCA(k=2,
              inputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
              outputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN)
    model = pca.fit(spark_df_4)
    spark_df_5 = model.transform(spark_df_4).select(
        [constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN,
         constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN])
    spark_df_6 = spark_df_5.withColumnRenamed(
        constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN,
        constants.FEATURE_STORE.CLUSTERING_ANALYSIS_FEATURES_COLUMN)
    spark_df_7 = spark_df_6.withColumnRenamed(
        constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN, "clusters")
    return json.loads(spark_df_7.toPandas().to_json())
def view(self, data, pred):
    """Use PCA to reduce dimension and visualize the data"""
    pca = PCA(k=3, inputCol="scaled", outputCol="pca-3")
    model = pca.fit(data)
    transformed = model.transform(data)
    view = (transformed.select("prediction", "pca-3")
            .withColumn("axis", self.to_array(column("pca-3")))
            .select(["prediction"] + [column("axis")[i] for i in range(3)]))
    dataframe = view.toPandas()

    fig = pyplot.figure(figsize=(20, 20))
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(
        dataframe.iloc[:, 1],
        dataframe.iloc[:, 2],
        dataframe.iloc[:, 3],
        c=dataframe.iloc[:, 0] + 2,
    )
    pyplot.show()
def feature_engi(df):
    '''
    Combines the gender, usage time, and paid usage columns into a vector,
    reduces it with PCA, and scales the resulting vectors.
    '''
    # Combine the gender, usage time, and paid usage columns into a vector
    assembler = VectorAssembler(inputCols=["sex", "time_gap", "chgrd"],
                                outputCol="NumFeatures")
    df = assembler.transform(df)

    # k is the number of output dimensions
    pca = PCA(k=2, inputCol="NumFeatures", outputCol="pca")
    model = pca.fit(df)
    df = model.transform(df)

    # Scale the vectors
    scaler = StandardScaler(inputCol="pca", outputCol="features",
                            withMean=True, withStd=False)
    scalerModel = scaler.fit(df)
    df = scalerModel.transform(df)

    return df
def rebuild_pipeline(s3_name, df):
    first_stages, df = prepare_data(df)
    objetivo, model_name, hyperparams = reverse_parse_filename(s3_name)

    data_types = get_data_types(df)
    numericals_double = [var for var in data_types["DoubleType"]]
    numericals_int = [var for var in data_types["IntegerType"]]
    features = numericals_double + numericals_int
    # + [var + "_one_hot" for var in strings_used]

    stage_assembler = VectorAssembler(inputCols=features, outputCol="assem_features")

    # Use the PCA dimension stored in the file name, falling back to k=8
    num_pca = int(hyperparams["pca"])
    if num_pca > 0:
        stage_pca = PCA(k=num_pca, inputCol="assem_features", outputCol="features")
    else:
        stage_pca = PCA(k=8, inputCol="assem_features", outputCol="features")

    return df, stage_pca, stage_assembler
def pca_opreator(k):
    '''
    Build a PCA stage for the dataset.

    Input format:  (DataFrame) Row(<feature>, <label>)
    Output format: (DataFrame) Row(<feature>, <label>, <PCA_feature>)
    '''
    # Returns an unfitted PCA estimator; it must be fit before it can transform.
    model = PCA(k=k, inputCol='feature', outputCol='PCA_feature')
    return model
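# A minimal usage sketch for pca_opreator, assuming `df` is a DataFrame with a
# vector column named 'feature' as described in the docstring; the save path
# comes from the original commented-out code and is illustrative.
pca_estimator = pca_opreator(k=10)
pca_model = pca_estimator.fit(df)        # fit the estimator first
pca_df = pca_model.transform(df)         # adds the 'PCA_feature' column
pca_model.write().overwrite().save("fault_diagnosis/models/pca.model")
pca_df.select('PCA_feature').show(truncate=False)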
def __init__(self, Movie):
    # Get all movies watched by all users (distinct union of ratings and tags)
    self.usersMovies = Movie.usersMovies
    # Join with movies to get genres
    self.usersGenres = self.usersMovies.join(Movie.movies, 'movieId').\
        select('userId', explode(split('genres', "\|").alias('genres')).alias('genre'))
    # All 20 genres
    self.genres_str = ('Crime|Romance|Thriller|Adventure|Drama|War|Documentary|Fantasy|'
                       'Mystery|Musical|Animation|Film-Noir|(no genres listed)|IMAX|'
                       'Horror|Western|Comedy|Children|Action|Sci-Fi')
    # Get all users
    self.users = Movie.usersMovies.select('userId').distinct()
    # Form a template with users x genres
    self.usersGenresTemplate = self.users.withColumn('genres', lit(self.genres_str)).\
        select('userId', explode(split('genres', "\|").alias('genres')).alias('genre'))
    # Fill in the template with the actual values, and zero where null
    self.usersGenresFilled = self.usersGenres.groupBy('userId', 'genre').agg(count('genre').alias('count')).\
        join(self.usersGenresTemplate, ['userId', 'genre'], 'right').fillna(0)
    # Sort by genre and form the genre array and counts
    self.usersFeatures = self.usersGenresFilled.groupBy('userId', 'genre').agg(sum('count').alias('count_')).\
        sort('genre', ascending=True).groupBy('userId').\
        agg(collect_list('genre').alias('genres'), collect_list('count_').alias('count')).cache()

    # Normalize the per-user genre counts into a feature vector
    userGenres = self.usersFeatures.drop('genres')
    self.datapoints = userGenres.select(
        'userId', normalizeUdf(col('count')).alias('features'))

    # Train a k-means model
    kmeans = KMeans(maxIter=10).setK(3).setSeed(1)
    self.model = kmeans.fit(self.datapoints.select('features'))
    # kmeans.save(data_path + "/kmeans")

    # PCA reduction for visualization
    pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
    self.pcaModel = pca.fit(self.datapoints.select('features'))
def PCA_transform(sc, samples_df, feature_count, threshold, k):
    # Check input
    if threshold and ((threshold > 1) or (threshold < 0)):
        print "ERROR: PCA_transform: Input threshold should be within 0 to 1"
        return (None, None, None)
    if k and k < 0:
        print "ERROR: PCA_transform: Input k should be greater than 0"
        return (None, None, None)

    df_reduced = None
    pca = None
    if threshold is not None:
        # By threshold ===============
        if feature_count > 200:
            fk = 200
            print "INFO: force k to " + str(fk) + " for PCA."
        else:
            fk = feature_count
        pca = PCA(k=fk, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(samples_df)

        # Get the explained-variance ratio array and find n_components
        var_arr = pca_model.explainedVariance
        print "RESULT: PCA ratio_vec=", var_arr
        n_components = ml_util.ml_get_n_components(var_arr, threshold)
        k = n_components

        df_pcaed_all = pca_model.transform(samples_df).select(
            "hash", "label", "pcaFeatures")
        # Keep only the first k PCA columns
        sqlCtx = SQLContext(sc)
        df_pcaed = sqlCtx.createDataFrame(
            df_pcaed_all.rdd.map(lambda p: (p["hash"], p["label"],
                                            p["pcaFeatures"].toArray()[:k]))
            .map(lambda p: Row(hash=p[0], label=p[1],
                               pcaFeatures=DenseVector(p[2]))))
        print "INFO: PCA_transform: n_components =", n_components, ", threshold=", threshold
    elif k > 0:
        # By n_components ===============
        pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(samples_df)
        df_pcaed = pca_model.transform(samples_df).select(
            "hash", "label", "pcaFeatures")
        print "INFO: PCA_transform: n_components =", k

    return (df_pcaed, k, pca_model)
def count_df(filename):
    '''
    Reads the CSV data file passed as first argument, builds a
    VectorAssembler -> StandardScaler -> StringIndexer -> PCA ->
    LogisticRegression pipeline, and prints its accuracy on a held-out split.
    Test file: tests/test_count_df.py
    '''
    spark = init_spark()
    init_df = spark.read.option("inferSchema", "true").option("header", "true") \
        .csv(filename, header=True)
    cols = init_df.columns
    cols = cols[:-1]

    vecAssembler = VectorAssembler(inputCols=cols, outputCol="features")
    standardizer = StandardScaler(withMean=True, withStd=True,
                                  inputCol='features', outputCol='std_features')
    indexer = StringIndexer(inputCol="class", outputCol="label_idx")
    pca = PCA(k=5, inputCol="std_features", outputCol="pca")
    lr_pca = LogisticRegression(featuresCol='pca', labelCol='label_idx')

    pipeline = Pipeline(stages=[vecAssembler, standardizer, indexer, pca, lr_pca])
    train, test = init_df.randomSplit([0.7, 0.3])
    model = pipeline.fit(train)

    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        prediction = model.transform(test)

    score = prediction.select(['prediction', 'label_idx'])
    metrics = BinaryClassificationMetrics(score.rdd)
    print(metrics)
    score.show(n=score.count())
    acc = score.rdd.map(lambda x: x[1] == x[0]).sum() / score.count()
    print(acc)
def get_models_params_dic():
    stage_pca = PCA(k=15, inputCol="scaled_features", outputCol="features")

    lr = LogisticRegression()
    lr_paramGrid = ParamGridBuilder() \
        .addGrid(stage_pca.k, [1]) \
        .addGrid(lr.maxIter, [1]) \
        .build()

    dt = DecisionTreeClassifier()
    dt_paramGrid = ParamGridBuilder() \
        .addGrid(stage_pca.k, [1]) \
        .addGrid(dt.maxDepth, [2]) \
        .build()

    paramGrid_dic = {"LR": lr_paramGrid, "DT": dt_paramGrid}
    model_dic = {"LR": lr, "DT": dt}
    return model_dic, paramGrid_dic
def pipeline_assembler_pca(Spark, trainDat, testDat, k=50):
    # Read data
    train_df = Spark.read.csv(trainDat, header=False, inferSchema="true")
    test_df = Spark.read.csv(testDat, header=False, inferSchema="true")

    # Assemble the feature columns into a single vector
    assembler = VectorAssembler(inputCols=train_df.columns[1:], outputCol="features")

    # PCA init
    pca = PCA(k=k, inputCol="features", outputCol="features_pcas")

    # Pipeline setup
    pipeline = Pipeline(stages=[assembler, pca])

    # Fit the model on the training data
    model = pipeline.fit(train_df)

    # Transform train data
    train_pca_result = model.transform(train_df).select(
        col(train_df.columns[0]).alias("label"), "features_pcas")
    # Transform test data
    test_pca_result = model.transform(test_df).select(
        col(test_df.columns[0]).alias("label"), "features_pcas")

    return train_pca_result, test_pca_result
# raw_df.saveAsTextFile("subtrain_preprocess.txt")
weights = [.8, .1, .1]
seed = 42
raw_train_df, raw_validation_df, raw_test_df = raw_df.randomSplit(weights, seed)


def parse_point(point):
    feats = point.split(",")[1:]
    return [(idx, value) for (idx, value) in enumerate(feats)]


parsedTrainFeat = raw_train_df.map(parse_point)
parsedValidFeat = raw_validation_df.map(parse_point)

####### PCA data ###############
pca = PCA(k=4, inputCol="features", outputCol="pcafeatures")

ctrOHEDict_train = create_one_hot_dict(parsedTrainFeat)
ctrOHEDict_valid = create_one_hot_dict(parsedValidFeat)
numCtrOHEFeats = len(ctrOHEDict_train.keys())


def pca_data(data, OHEDict):
    df = sqlContext.createDataFrame(data, ["features"])
    model_pca = pca.fit(df)
    data_pca = model_pca.transform(df).rdd.map(lambda x: parse_point(x))
    numOHEFeats = len(OHEDict.keys())
    print("PCA-transformed data:", data_pca)
    return one_hot_encoding(data_pca, OHEDict, numOHEFeats)
# Input: `titile` is assumed to hold the CSV header line, defined earlier.
rdd = sc.textFile("/user/demo/train.csv").filter(lambda x: x != titile). \
    map(lambda x: x.split(","))
D = 2 ** 24


def helper1(r):
    # Hash each raw feature into a dense numeric vector of dimension D.
    features = []
    try:
        fe = r[1:-1]
        for i in range(len(fe)):
            features.append(float(abs(hash("VAR_" + '{0:04}'.format(i) + fe[i]))) % D)
        target = float(r[-1])
        ID = float(r[0])
        return target, Vectors.dense(features)
    except:
        return (0.0, [0.0] * 1932)


new_rdd = rdd.filter(lambda i: len(i) == 1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans, ["label", "features"])

pca = PCA(k=1000, inputCol="features", outputCol="pca_features")
model_pca = pca.fit(df)
rdd_pca = model_pca.transform(df).select(["label", "pca_features"])
rdd_pca1 = rdd_pca.withColumnRenamed('pca_features', 'features')

(trainingData, testData) = rdd_pca1.randomSplit([0.7, 0.3])
lr = LogisticRegression(maxIter=100, regParam=0.01)
model = lr.fit(trainingData)
result = model.transform(testData).rdd.map(
    lambda r: str(r.label) + ',' + str(r.probability[0]))
result.saveAsTextFile("/user/demo/lr_pca_1000_001")
                        withMean=True, withStd=True)

# Scale the numeric columns of the data.
scalerModel = scaler.fit(transformed)
scaled_genres_features = scalerModel.transform(transformed).distinct()

# Select the desired columns from the data.
scaled_genres_features = (scaled_genres_features.select(
    F.col('Track_ID'),
    F.col('scaledFeatures').alias('features'),
    F.col('Genre')))
scaled_genres_features.show()

# Define the principal component analysis object. We only want the top 10 features.
pca = PCA(k=10, inputCol="features", outputCol="pca_features")

# Fit and transform the data into PCA features.
pca_model = pca.fit(scaled_genres_features)
scaled_genres_features = pca_model.transform(scaled_genres_features)

##########################

# Convert the genre column into a binary label indicating whether the song
# is "Electronic" or some other genre.
scaled_genres_features = (scaled_genres_features.withColumn(
    'label', F.when((F.col('Genre') == 'Electronic'), 1).otherwise(0)))
scaled_genres_features.show(20)
cols = spark_df.drop('labclass').columns
assembler = VectorAssembler(inputCols=cols, outputCol='features')
labelIndexer = StringIndexer(inputCol="labclass",
                             outputCol="indexedLabel").fit(spark_df)

## Standardize the columns
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=False, withMean=True)

## Principal component analysis
pca = PCA(k=3, inputCol='scaledFeatures', outputCol='pcaFeature')

(trainingData, testData) = spark_df.randomSplit([0.8, 0.2])

## Train a RandomForest model
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="pcaFeature",
                            numTrees=10)

## Retrieve original labels from indexed labels
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
os.system("export _JAVA_OPTIONS='-Xms1g -Xmx40g'") conf = (SparkConf().set("spark.driver.maxResultSize", "5g")) sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) lines = sc.textFile(inputpath).map(lambda x:x.split(" ")) lines = lines.map(lambda x:(x[0],[float(y) for y in x[1:]])) df = lines.map(lambda x: Row(labels=x[0],features=Vectors.dense(x[1]))).toDF() ####Run#### pca = PCA(k=int(k),inputCol="features", outputCol="pca_features") model = pca.fit(df) outData = model.transform(df) pcaFeatures = outData.select("labels","pca_features") ####Write Out#### output_dir = inputdir + "/pca" + str(k) + "_Features" output_data = inputdir + "/pca" + str(k) + "_Data" n_data = 0 n_features = 0 if os.path.isdir(output_dir): os.system("rm -r " + output_dir) df.rdd.repartition(1).saveAsTextFile(output_dir) outputfile = open(output_data, 'w')
("Logistic regression models are neat".split(" "), ) ], ["text"]) # Learn a mapping from words to Vectors. word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result") model = word2Vec.fit(documentDF) result = model.transform(documentDF) for row in result.collect(): text, vector = row print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector))) # COMMAND ---------- from pyspark.ml.feature import PCA pca = PCA().setInputCol("features").setK(2) pca.fit(scaleDF).transform(scaleDF).show(20, False) # COMMAND ---------- from pyspark.ml.feature import PolynomialExpansion pe = PolynomialExpansion().setInputCol("features").setDegree(2) pe.transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import ChiSqSelector, Tokenizer tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut") tokenized = tkn\
def pca(df):
    pca = PCA(k=10, inputCol="features", outputCol="pca_features")
    model = pca.fit(df)
    outData = model.transform(df)
    pcaFeatures = outData.select("labels", "pca_features")
    dfwrite(pcaFeatures, 'pcaFeatures')
Feat = sc.parallelize(Feat)

# Map the feature matrix to Spark dense vectors
from pyspark.mllib.linalg import Vectors
Feat = Feat.map(lambda vec: (Vectors.dense(vec),))

# Define a DataFrame with the feature matrix
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
dfFeat = sqlContext.createDataFrame(Feat, ["features"])
dfFeat.printSchema()

# PCA to project the feature matrix onto the first principal components
from pyspark.ml.feature import PCA
numComponents = 3
pca = PCA(k=numComponents, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(dfFeat)
dfComp = model.transform(dfFeat).select("pcaFeatures")

# Get the first three components as lists to be plotted
compX = dfComp.map(lambda vec: vec[0][0]).take(maxWordsVis)
compY = dfComp.map(lambda vec: vec[0][1]).take(maxWordsVis)
compZ = dfComp.map(lambda vec: vec[0][2]).take(maxWordsVis)

# Finish the Spark session
sc.stop()

# Plot
fs = 20  # fontsize
w = words[0:maxWordsVis]
import matplotlib.pyplot as plt
train_df = labelIndexer.transform(train_df)
test_df = labelIndexer.transform(test_df)

# StringIndexerModel.labels is a property, not a method
label_mapping = dict(enumerate(labelIndexer.labels))
reverse_mapping = {}
for key in label_mapping:
    reverse_mapping[label_mapping[key]] = key


# ## Dimensionality reduction
#
# Feature selection is not really supported yet in MLlib, so we simply apply
# dimensionality reduction using PCA.

# In[509]:

pca = PCA(inputCol="features", outputCol="pca", k=15).fit(train_df)
train_df = pca.transform(train_df)
test_df = pca.transform(test_df)


# ## Classification algorithms

# In[ ]:

rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="pca",
                            numTrees=5000)
# rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="features", numTrees=5000)
model = rf.fit(train_df)


# ## Evaluation & results
def learn_pca_embedding(raw_data_frame):
    pca_computer = PCA(k=NBITS, inputCol='features', outputCol='pca')
    pca_model = pca_computer.fit(raw_data_frame)
    return pca_model
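# Usage sketch for learn_pca_embedding, assuming NBITS is defined at module
# level and `raw_df` is a DataFrame with a vector column named 'features';
# `raw_df` and `embedded_df` are illustrative names.
pca_model = learn_pca_embedding(raw_df)
embedded_df = pca_model.transform(raw_df).select('pca')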