Example #1
    def __build_pca(self, df, metadata_path):
        pca = PCA(k=self.k,
                  inputCol='scaled_features',
                  outputCol='pca_features')
        if self.__metadata:
            pca.fit(df).write().overwrite().save(metadata_path)
            return PCAModel.load(metadata_path).transform(df)
        return pca.fit(df).transform(df)
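A minimal, self-contained sketch of the same save-and-reload pattern outside the class; the Spark session, toy data, path, and column names below are placeholders, not part of the original code:

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import PCA, PCAModel

spark = SparkSession.builder.appName("pca-save-reload").getOrCreate()
scaled_df = spark.createDataFrame(
    [(Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
     (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),),
     (Vectors.dense([1.0, 2.0, 0.0, 3.0, 1.0]),)],
    ["scaled_features"])

pca = PCA(k=2, inputCol="scaled_features", outputCol="pca_features")
pca.fit(scaled_df).write().overwrite().save("/tmp/pca_model")  # persist the fitted PCAModel
reloaded = PCAModel.load("/tmp/pca_model")                     # reload it later (or in another job)
reloaded.transform(scaled_df).show(truncate=False)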
Example #2
def PCA_transform(sc, samples_df, feature_count, threshold, k):
    # check input
    if threshold and ((threshold > 1) or (threshold < 0)):
        print("ERROR: PCA_transform: Input threshold should be within 0 to 1")
        return (None, None, None)
    if k and k < 0:
        print("ERROR: PCA_transform: Input k should be greater than 0")
        return (None, None, None)
    #print "df.shape=",df.shape

    #print "in ml_sklearn_PCA_transform()"
    df_pcaed = None
    pca_model = None
    if threshold is not None:  # by threshold ===============
        if feature_count > 200:
            fk = 200
            print("INFO: force k to " + str(fk) + " for PCA.")
        else:
            fk = feature_count

        pca = PCA(k=fk, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(samples_df)
        sum_ratio = 0
        # get ratio array and find n_components
        var_arr = pca_model.explainedVariance
        print "RESULT: PCA ratio_vec=", var_arr

        n_components = ml_util.ml_get_n_components(var_arr, threshold)
        '''
        for n_components,val in enumerate(var_arr):
            sum_ratio=sum_ratio+val
            if sum_ratio >= threshold:
                break
        '''
        k = n_components
        #print sum_ratio, n_components

        df_pcaed_all = pca_model.transform(samples_df).select(
            "hash", "label", "pcaFeatures")
        # get k column only
        sqlCtx = SQLContext(sc)
        df_pcaed = sqlCtx.createDataFrame(
            df_pcaed_all.rdd.map(lambda p: (p["hash"], p["label"], p[
                "pcaFeatures"].toArray()[:k])).map(lambda p: Row(
                    hash=p[0], label=p[1], pcaFeatures=DenseVector(p[2]))))
        print "INFO: PCA_transform: n_components =", n_components, ", threshold=", threshold
    elif k > 0:  # by n_components  ===============
        pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(samples_df)
        df_pcaed = pca_model.transform(samples_df).select(
            "hash", "label", "pcaFeatures")
        print "INFO: PCA_transform: n_components =", k

    return (df_pcaed, k, pca_model)
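The helper ml_util.ml_get_n_components is not shown above. A hedged equivalent, assuming it returns the smallest number of components whose cumulative explained-variance ratio reaches the threshold:

def get_n_components(explained_variance, threshold):
    """Return the smallest k whose cumulative variance ratio >= threshold."""
    cumulative = 0.0
    for i, ratio in enumerate(explained_variance):
        cumulative += ratio
        if cumulative >= threshold:
            return i + 1
    return len(explained_variance)  # threshold never reached: keep all components

# e.g. get_n_components([0.6, 0.25, 0.1, 0.05], 0.9) -> 3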
Example #3
    def _perform_pca(self, dataset: DataFrame, k: int):
        # Since we want to plot the clusters, it is important to
        # downsize the dimensions to at most 3 dimensions.
        # We can use PCA with 2 or 3 principal components for this.
        pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(dataset)
        rows = pca_model \
                    .transform(dataset) \
                    .select("clusterNum", "pcaFeatures") \
                    .collect()

        # Now we'll plot the clusters as a 3D scatter plot with
        # each point's color corresponding to its cluster.
        # Cast clusterNum to string so it is treated as categorical
        # data for plotting purposes.
        axes = zip(*[row["pcaFeatures"] for row in rows])
        colors  = pd.Categorical([row["clusterNum"] for row in rows])

        if k == 2:
            x, y = axes
            fig = plt.figure(figsize=(15, 15))
            sns.scatterplot(x=x, y=y, hue=colors)
        if k == 3:
            x, y, z = axes
            plot_df = pd.DataFrame({"PCA 1": x, "PCA 2": y, "PCA 3": z, "cluster": colors})
            g = sns.PairGrid(plot_df, hue="cluster", palette="coolwarm")
            g = g.map(sns.scatterplot, linewidths=0.75, edgecolor="w", s=40)
            g = g.add_legend()
            g.fig.set_size_inches(15, 15)

        # Name the output file after the number of principal components and the model
        image_path = os.path.join("analysis", "results",
                                  "charts", f"pca-{k}-{self.model_name}.png")
        plt.savefig(image_path)
Example #4
def get_preprocessed_data(input_train, input_test):
    # Train Data
    train = spark.read.csv(input_train, header=False, inferSchema="true")
    train_labels = get_vector(train.select('_c0'), 'train_label')
    train_features = get_vector(train.drop('_c0'), 'feature')

    # Test Data
    test = spark.read.csv(input_test, header=False, inferSchema="true")
    test_labels = get_vector(test.select('_c0'), 'test_label')
    test_features = get_vector(test.drop('_c0'), 'feature')

    # Compute PCA
    pca = PCA(k=50, inputCol="feature", outputCol="pca_feature")
    pca_model = pca.fit(train_features)

    # Apply PCA to train / test features
    train_features_pca = pca_model.transform(train_features).select(
        "pca_feature")
    test_features_pca = pca_model.transform(test_features).select(
        "pca_feature")

    # Rename pca feature column values
    train_features_pca = train_features_pca.withColumnRenamed(
        "pca_feature", "train_feature")
    test_features_pca = test_features_pca.withColumnRenamed(
        "pca_feature", "test_feature")

    # Create combined train / test data
    train_data = combine_features_labels(train_features_pca, train_labels,
                                         'train')
    test_data = combine_features_labels(test_features_pca, test_labels, 'test')

    return train_data, test_data
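The helpers get_vector and combine_features_labels are defined elsewhere. A hedged sketch of get_vector, assuming it simply assembles every column of the given DataFrame into one vector column with the requested name:

from pyspark.ml.feature import VectorAssembler

def get_vector(df, output_col):
    # assemble all columns of df into a single vector column named output_col
    assembler = VectorAssembler(inputCols=df.columns, outputCol=output_col)
    return assembler.transform(df).select(output_col)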
Example #5
    def test_model_polynomial_expansion(self):
        data = self.spark.createDataFrame(
            [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
             (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
             (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )], ["features"])
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        model = pca.fit(data)

        # the input name should match the inputCol of the fitted PCA model
        feature_count = data.first()[0].size
        N = data.count()
        model_onnx = convert_sparkml(
            model, 'Sparkml PCA',
            [('features', FloatTensorType([N, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().pca_features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlPCA")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['pca_features'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #6
def pca_model(data, k=7, inputcol='scale_features', outputcol='pca_features'):
    # fit PCA and return both the explained variance and the transformed data
    pca = PCA(k=k, inputCol=inputcol, outputCol=outputcol)
    model = pca.fit(data)
    variance = model.explainedVariance
    pca_data = model.transform(data)
    return variance, pca_data
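A quick usage sketch for the helper above; scaled_df is a placeholder for a DataFrame that already has a 'scale_features' vector column:

variance, pca_data = pca_model(scaled_df, k=7)
print("explained variance per component:", variance)
print("total variance retained:", float(sum(variance)))  # DenseVector is iterable
pca_data.select("pca_features").show(5, truncate=False)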
Example #7
    def pca(self, df, k=1):
        cov = RowMatrix(
            df.rdd.map(lambda x: list(x))).computeCovariance().toArray()
        col = cov.shape[1]
        eigVals, eigVecs = np.linalg.eigh(cov)
        inds = np.argsort(eigVals)
        eigVecs = eigVecs.T[inds[-1:-(col + 1):-1]]
        eigVals = eigVals[inds[-1:-(col + 1):-1]]
        components = RowMatrix(
            df.rdd.map(lambda x: list(x))).computePrincipalComponents(k)

        train_data = df.rdd.map(
            lambda x: Row(features=Vectors.dense(x))).toDF()

        pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
        model = pca.fit(train_data)
        score = model.transform(train_data)

        res = {
            "components": components.toArray(),
            "score": np.array(
                score.select("pcaFeatures").rdd.map(
                    lambda x: list(x[0])).collect()),
            "eigVectors": eigVecs,
            "eigValues": eigVals
        }

        return res
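The method above returns both Spark's principal components and the NumPy eigendecomposition. A hedged cross-check, assuming access to the returned res dict and the same k: the two component matrices should agree up to the sign of each column.

import numpy as np

spark_pc = res["components"]          # (n_features, k) from computePrincipalComponents
numpy_pc = res["eigVectors"][:k].T    # first k eigenvectors of the covariance, as columns
# each column may be flipped in sign, so compare absolute values
print(np.allclose(np.abs(spark_pc), np.abs(numpy_pc), atol=1e-6))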
Example #8
def _get_pca_model(feat_train, k):
    from pyspark.ml.feature import PCA
    pca = PCA(k=k, inputCol="features", outputCol="pca_features")
    pca_model = pca.fit(feat_train)
    # Explained Variance
    logr.log_event('Training Accuracy', f"{sum(pca_model.explainedVariance)}")
    return pca_model
Example #9
def PCA_setting(spark, rdd, n):
    df = spark.createDataFrame(rdd,schema=['features'])
    pca = PCA(k=n,inputCol='features',outputCol='pca_features')

    model = pca.fit(df)

    return model.transform(df).select('pca_features').collect()
Example #10
def pca(inputdir, df, alg, k):
    from pyspark.ml.feature import PCA
    pca = PCA(k=int(k), inputCol="features", outputCol="pca_features")
    model = pca.fit(df)
    outData = model.transform(df)
    pcaFeatures = outData.select("labels", "pca_features")
    output_data = writeOut(inputdir, pcaFeatures, alg, k)
    return output_data
Example #11
def pca_generic(data, dimens, input_col, output_col="pcaFeatures"):
    print('PCA result with dimensions = ' + str(dimens) +
          ' and output column ' + output_col)
    pca_generic = PCA(k=dimens, inputCol=input_col, outputCol=output_col)
    pca_model_generic = pca_generic.fit(data)
    result_pca_generic = pca_model_generic.transform(data)
    result_pca_generic.show()
    print('\n')
    return result_pca_generic, pca_model_generic
Example #12
def runPCA(vector_features, k=3):
    from pyspark.ml.feature import PCA

    #convert df to feature_vec
    feature_vec = vector_features.select('features')
    pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(feature_vec)

    result = model.transform(feature_vec).select("pcaFeatures")
    return result
Example #13
def clustering(input_df, input_col_name, n):
    """ KMeans and PCA """
    input_df = input_df.select('state','categories','stars',input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(input_df)
    kmeans = KMeans(k=n, seed=2)
    KMmodel = kmeans.fit(df)
    predicted = KMmodel.transform(df).cache()
    pca = PCA(k=2, inputCol='features', outputCol="pc")
    df = pca.fit(df).transform(df).cache()
    return df
Example #14
    async def plot_cluster(self, df, x='_3', y='_4'):
        pca = PCAml(k=2, inputCol="features", outputCol="pca")
        model3 = pca.fit(df)
        transformed2 = model3.transform(df)

        def extract(row):
            return (row.customer, ) + (row.prediction, ) + tuple(
                row.pca.toArray().tolist())

        pcadf = transformed2.rdd.map(extract).toDF(["customer", "prediction"])
        pcadf.show(10, False)
        pandad = pcadf.toPandas()
        pandad.plot.scatter(x=x, y=y, c='prediction', colormap='viridis')
        plt.show()
Example #15
def train(df, hiperparameter):
    '''
    Fits a PCA model to the input dataset with the given hyperparameters.
    Input/Parameters:
                dataframe/dataset - input dataset, which is an instance of pyspark.sql.DataFrame
                config (hyperparameter configuration)

    Output/Returns:
                fitted model(s)
    '''
    pca = PCA(k=hiperparameter['k'],
              inputCol=hiperparameter['inputCol'],
              outputCol=hiperparameter['outputCol'])
    model = pca.fit(df)
    return model
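A hedged usage sketch showing the expected shape of the hiperparameter dict, inferred from the keys read in train() above; the values and the df variable are placeholders:

hiperparameter = {
    'k': 3,
    'inputCol': 'features',
    'outputCol': 'pca_features',
}
pca_model = train(df, hiperparameter)  # df: a DataFrame with a 'features' vector column
reduced = pca_model.transform(df)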
Example #16
def _compute_cluster_analysis(spark_df, clusters=5):
    numeric_columns = list(map(lambda col_dtype: col_dtype[0],
                               filter(lambda col_dtype: col_dtype[1] in constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES,
                                      spark_df.dtypes)))
    if (len(numeric_columns) == 0):
        raise ValueError("The provided spark dataframe does not contain any numeric columns. "
                         "Cannot compute cluster analysis with k-means on categorical columns. "
                         "The numeric datatypes are: {}" \
                         " and the number of numeric datatypes in the dataframe is: {} ({})".format(
            constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES, len(spark_df.dtypes), spark_df.dtypes))
    if (len(numeric_columns) == 1):
        raise ValueError("The provided spark dataframe contains only one numeric column. "
                         "Cluster analysis will filter out the non-numeric columns and then "
                         "use PCA to reduce the dataset to 2 dimensions and "
                         "then apply KMeans; this is not possible when the input data has only one numeric column. "
                         "The numeric datatypes are: {}"
                         " and the number of numeric datatypes in the dataframe is: {} ({})".format(
            constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES, len(spark_df.dtypes), spark_df.dtypes))
    vecAssembler = VectorAssembler(inputCols=numeric_columns,
                                   outputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN)
    spark_df_1 = vecAssembler.transform(spark_df)
    kmeans = KMeans(k=clusters, seed=1, maxIter=20,
                    featuresCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
                    predictionCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN)
    model = kmeans.fit(spark_df_1.select(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN))
    spark_df_2 = model.transform(spark_df_1)
    spark_df_3 = spark_df_2.select([constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
                                    constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN])
    count = spark_df_3.count()
    if count < constants.FEATURE_STORE.CLUSTERING_ANALYSIS_SAMPLE_SIZE:
        spark_df_4 = spark_df_3
    else:
        spark_df_4 = spark_df_3.sample(True,
                                       float(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_SAMPLE_SIZE) / float(count))

    pca = PCA(k=2,
              inputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
              outputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN)
    model = pca.fit(spark_df_4)
    spark_df_5 = model.transform(spark_df_4).select([constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN,
                                                     constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN])
    spark_df_6 = spark_df_5.withColumnRenamed(
        constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN,
        constants.FEATURE_STORE.CLUSTERING_ANALYSIS_FEATURES_COLUMN)
    spark_df_7 = spark_df_6.withColumnRenamed(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN, "clusters")
    return json.loads(spark_df_7.toPandas().to_json())
Example #17
    def view(self, data, pred):
        """Use PCA to reduce dimension and visualize the data"""
        pca = PCA(k=3, inputCol="scaled", outputCol="pca-3")
        model = pca.fit(data)
        transformed = model.transform(data)
        view = (transformed.select("prediction", "pca-3").withColumn(
            "axis", self.to_array(
                column("pca-3"))).select(["prediction"] +
                                         [column("axis")[i]
                                          for i in range(3)]))

        dataframe = view.toPandas()
        fig = pyplot.figure(figsize=(20, 20))
        ax = fig.add_subplot(111, projection="3d")
        ax.scatter(
            dataframe.iloc[:, 1],
            dataframe.iloc[:, 2],
            dataframe.iloc[:, 3],
            c=dataframe.iloc[:, 0] + 2,
        )
        pyplot.show()
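self.to_array is defined elsewhere. In Spark 3.0+ it can simply be pyspark.ml.functions.vector_to_array; on older versions a UDF does the same job. A hedged sketch of both options:

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, DoubleType

try:
    from pyspark.ml.functions import vector_to_array as to_array  # Spark >= 3.0
except ImportError:
    # fallback: convert an ML vector column to a plain array column via a UDF
    to_array = F.udf(lambda v: v.toArray().tolist(), ArrayType(DoubleType()))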
Example #18
def feature_engi(df):
    '''
    Combines the gender, usage time, and paid usage columns into a vector,
    applies PCA, and scales the resulting vectors.
    '''
    #Combine the gender, usage time, and paid usage columns into a vector
    assembler = VectorAssembler(inputCols=["sex", "time_gap", "chgrd"],
                                outputCol="NumFeatures")
    df = assembler.transform(df)
    pca = PCA(k=2, inputCol="NumFeatures",
              outputCol="pca")  # k is the number of dims
    model = pca.fit(df)
    df = model.transform(df)
    #Scale the Vectors
    scaler = StandardScaler(inputCol="pca",
                            outputCol="features",
                            withMean=True,
                            withStd=False)
    scalerModel = scaler.fit(df)
    df = scalerModel.transform(df)
    return df
Example #19
    def __init__(self, Movie):
        # Get all movies watched by all users, distinct of (ratings union of tags)
        self.usersMovies = Movie.usersMovies
        # Join with movies to get genres
        self.usersGenres = self.usersMovies.join(Movie.movies, 'movieId').\
            select('userId', explode(split('genres', "\|").alias('genres')).alias('genre'))

        # All the 20 genres
        self.genres_str = 'Crime|Romance|Thriller|Adventure|Drama|War|Documentary|Fantasy|Mystery|Musical|Animation|Film-Noir|(no genres listed)|IMAX|Horror|Western|Comedy|Children|Action|Sci-Fi'
        # Get all users
        self.users = Movie.usersMovies.select('userId').distinct()
        # Form a template with users X genres
        self.usersGenresTemplate = self.users.withColumn('genres', lit(self.genres_str)).\
            select('userId', explode(split('genres', "\|").alias('genres')).alias('genre'))

        # Fill in the template with the actual values, and zero where null
        self.usersGenresFilled = self.usersGenres.groupBy('userId', 'genre').agg(count('genre').alias('count')).\
            join(self.usersGenresTemplate, ['userId', 'genre'], 'right').fillna(0)

        # Sort by Genre and form genre array and counts
        self.usersFeatures = self.usersGenresFilled.groupBy('userId', 'genre').agg(sum('count').alias('count_')).\
            sort('genre', ascending=True).groupBy('userId').\
                agg(collect_list('genre').alias('genres'), collect_list('count_').alias('count')).cache()

        #
        userGenres = self.usersFeatures.drop('genres')
        self.datapoints = userGenres.select(
            'userId',
            normalizeUdf(col('count')).alias('features'))
        # Trains a k-means model.
        kmeans = KMeans(maxIter=10).setK(3).setSeed(1)
        self.model = kmeans.fit(self.datapoints.select('features'))
        #        kmeans.save(data_path + "/kmeans")

        # PCA reduction for visual
        pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
        self.pcaModel = pca.fit(self.datapoints.select('features'))
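normalizeUdf is defined elsewhere. A hedged sketch, assuming it L1-normalizes the per-genre counts into a dense ML vector:

from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT

@udf(returnType=VectorUDT())
def normalizeUdf(counts):
    total = float(sum(counts))
    if total == 0.0:
        return Vectors.dense([0.0] * len(counts))
    return Vectors.dense([c / total for c in counts])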
Example #20
word2Vec = Word2Vec(vectorSize=3,
                    minCount=0,
                    inputCol="text",
                    outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

# COMMAND ----------

from pyspark.ml.feature import PCA

pca = PCA().setInputCol("features").setK(2)
pca.fit(scaleDF).transform(scaleDF).show(20, False)

# COMMAND ----------

from pyspark.ml.feature import PolynomialExpansion

pe = PolynomialExpansion().setInputCol("features").setDegree(2).setOutputCol(
    "polyFeatures")
pe.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
Example #21
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 25 20:48:04 2017

@author: vishal
"""

from __future__ import print_function

from pyspark.sql import SparkSession

session = SparkSession.builder.appName('PCA').getOrCreate()

from pyspark.ml.linalg import Vectors

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]

data_frame = session.createDataFrame(data, ['features'])
#data_frame.show()

from pyspark.ml.feature import PCA

pca = PCA(inputCol='features', outputCol="pca_feature", k=3)
model = pca.fit(data_frame)
pca_df = model.transform(data_frame)

pca_df.show()

session.stop()
print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
binarizedDataFrame.show()

# COMMAND ----------

### PCA is a statistical procedure used to reduce a vector's dimensionality. This example reduces a 5-dimensional feature vector to a 3-dimensional PCA feature
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]
df = spark.createDataFrame(data, ["features"])

pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)

result = model.transform(df).select("pcaFeatures")
result.show(truncate=False)

# COMMAND ----------

### Polynomial expansion expands features into a polynomial feature space. This example expands the given features into a degree-3 polynomial space
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                            (Vectors.dense([0.0, 0.0]), ),
                            (Vectors.dense([3.0, -1.0]), )], ["features"])

polyExpansion = PolynomialExpansion(degree=3,
Example #23
plt.show()

# ### Principal Components Analysis
# Performs an orthogonal transformation to convert a set of possibly correlated variables into a set of values of linearly uncorrelated variables called <b>principal components</b>
# * the pcaTransformer will extract the principal components from the features
# * the number of components is set by the value of <b>k</b>

# In[23]:

from pyspark.ml.feature import PCA

pca = PCA(k=8, inputCol='features', outputCol='pcaFeatures')

# In[24]:

pcaTransformer = pca.fit(vectorDF)

# #### View the principal components in the transformed space

# In[25]:

pcaFeatureData = pcaTransformer.transform(vectorDF).select('pcaFeatures')

pcaFeatureData.toPandas().head()

# #### The principal components are stored as a DenseVector

# In[26]:

pcaFeatureData.toPandas()['pcaFeatures'][0]
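Besides the transformed features, the fitted PCAModel above also exposes the loading matrix and the per-component explained variance; a short inspection sketch:

pcaTransformer.explainedVariance       # DenseVector of variance ratios, length k
pcaTransformer.pc.toArray().shape      # loading matrix, shape (n_features, k)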
Example #24
def learn_pca_embedding(raw_data_frame):
    pca_computer = PCA(k=NBITS, inputCol='features', outputCol='pca')
    pca_model = pca_computer.fit(raw_data_frame)
    return pca_model
], ["text"])
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text",
  outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))


# COMMAND ----------

from pyspark.ml.feature import PCA
pca = PCA().setInputCol("features").setK(2)
pca.fit(scaleDF).transform(scaleDF).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import PolynomialExpansion
pe = PolynomialExpansion().setInputCol("features").setDegree(2)
pe.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
  .transform(sales.select("Description", "CustomerId"))\
Example #26
import os
import sys

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from redisai import save_sparkml
from redisai import onnx_utils
from redisai import DType

executable = sys.executable
os.environ["SPARK_HOME"] = pyspark.__path__[0]
os.environ["PYSPARK_PYTHON"] = executable
os.environ["PYSPARK_DRIVER_PYTHON"] = executable
spark = SparkSession.builder.appName("redisai_trial").getOrCreate()

data = spark.createDataFrame([(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
                              (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
                              (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )],
                             ["features"])
pca = PCA(k=2, inputCol="features", outputCol="pca_features")
model = pca.fit(data)

feature_count = data.first()[0].size
N = data.count()

featurestype = onnx_utils.get_tensortype(node_name='features',
                                         dtype=DType.float32,
                                         shape=(N, feature_count))
save_sparkml(model,
             'spark.onnx',
             initial_types=[featurestype],
             spark_session=spark)
Example #27
import sys

from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA, VectorAssembler

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PCAExample")\
        .getOrCreate()

    if (len(sys.argv) != 3):
        print("bin/spark-submit pca-pyspark.py <data_set.csv> <param_K>")
        sys.exit(1)

    input = spark.read.load(sys.argv[1],
                            format="csv",
                            inferSchema="true",
                            header="false")
    K = int(sys.argv[2])

    assembler = VectorAssembler(inputCols=input.columns, outputCol="features")

    dataset = assembler.transform(input)
    dataset.show()

    pca = PCA(k=K, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(dataset)

    print("Principal Components: ", model.pc, sep='\n')
    print("Explained Variance: ", model.explainedVariance, sep='\n')

    spark.stop()
Example #28
#need to first register dataframe as a SQL temporary view in order to use spark sql
from pyspark.sql.functions import desc

df_with_distance.createOrReplaceTempView("df")

result_df = df_with_distance.sort(desc('distance')).limit(num_outliar)
cols = list(set(result_df.columns)-{'scaledFeatures'}-{'features'})
result_df = result_df.select(cols)

#write to dumbo local:
result_df.toPandas().to_csv(filename+"_numeric_data_result.csv", sep=',')

#write to hdfs
#result_df.write.csv("numeric_data_result.csv", sep=',')

######################
#pca:
from pyspark.ml.feature import PCA
pca = PCA(k=3, inputCol="scaledFeatures", outputCol="pcaFeatures")
#pca_model = pca.fit(final_data)
pca_model = pca.fit(df_with_distance)
#result_pca = pca_model.transform(final_data).select('pcaFeatures','prediction')
result_pca = pca_model.transform(df_with_distance).select('pcaFeatures','prediction','distance')

#will download to dumbo local
result_pca.toPandas().to_csv(filename+"_pca_result.csv") 



Example #29
count1 = test1_df.filter(" prediction!=Occupancy").count()
total1 = test1_df.count()

count2 = test2_df.filter(" prediction!=Occupancy").count()
total2 = test2_df.count()

total = total1 + total2
tc = count1 + count2
ans = float(tc) / float(total)
print(ans)

#### Convert to PCA ####

pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")

pcamodel = pca.fit(train_df)

# Features of Data Set 1

pca_ds1_features = test1_df.select("features", "prediction")

# Features of Data Set 2

pca_ds2_features = test2_df.select("features", "prediction")

# Transform Data

pca_ds1_features = pcamodel.transform(pca_ds1_features)

pca_ds2_features = pcamodel.transform(pca_ds2_features)
Example #30
    Vectors.dense([inStr[1],inStr[2],inStr[3], \
        inStr[4],inStr[5],inStr[6],inStr[7], \
        inStr[8],inStr[9],inStr[10]
        ]))
    return lp


bankLp = bankVectors.map(transformToLabeledPoint)
bankLp.collect()
bankDF = sqlContext.createDataFrame(bankLp, ["label", "features"])
bankDF.select("label", "features").show(10)

#Perform PCA
from pyspark.ml.feature import PCA
bankPCA = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
pcaModel = bankPCA.fit(bankDF)
pcaResult = pcaModel.transform(bankDF).select("label", "pcaFeatures")
pcaResult.show(truncate=False)

#Indexing needed as pre-req for Decision Trees
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(pcaResult)
td = si_model.transform(pcaResult)
td.collect()

#Split into training and testing data
(trainingData, testData) = td.randomSplit([0.7, 0.3])
trainingData.count()
testData.count()
testData.collect()
Example #31
def pca(df):
    pca = PCA(k=10, inputCol="features", outputCol="pca_features")
    model = pca.fit(df)
    # outData = model.transform(df)
    pcaFeatures = model.transform(df).select("labels", "pca_features")
    dfwrite(pcaFeatures, 'pcaFeatures')
Example #32
# Scale the numeric columns of the data.
scalerModel = scaler.fit(transformed)
scaled_genres_features = scalerModel.transform(transformed).distinct()

# Select the desired columns from the data.
scaled_genres_features = (scaled_genres_features.select(
    F.col('Track_ID'),
    F.col('scaledFeatures').alias('features'), F.col('Genre')))

scaled_genres_features.show()

# Define the principal component analysis object. We only want the top 10 components.
pca = PCA(k=10, inputCol="features", outputCol="pca_features")

# Fit and transform the data into PCA features.
pca_model = pca.fit(scaled_genres_features)
scaled_genres_features = pca_model.transform(scaled_genres_features)

##########################

# Convert the genre column into a column representing if the song is "Electronic" or some other genre
# as a binary label.
scaled_genres_features = (scaled_genres_features.withColumn(
    'label',
    F.when((F.col('Genre') == 'Electronic'), 1).otherwise(0)))

scaled_genres_features.show(20)

# Show the class balance of the binary label.
(scaled_genres_features.groupBy('label').agg(F.count(F.col('label'))).select(
    F.col('label'),
Example #33
train_df = spark.read.csv(train_datafile,header=False,inferSchema="true")

# transform test_df into a dataframe with two columns, label and features, for further processing
assembler_test = VectorAssembler(inputCols = test_df.columns[1:],outputCol="features")
test_vectors_withlabel = assembler_test.transform(test_df).selectExpr("_c0 as label","features")

# transform train_df into a dataframe with two columns, label and features, for further processing
assembler_train = VectorAssembler(inputCols = train_df.columns[1:],outputCol="features")
train_vectors_withlabel = assembler_train.transform(train_df).selectExpr("_c0 as label","features")

# fit the PCA on the train vectors first
# (k=200 would keep about 90% of the MNIST data; k=10 is used here)
# After the fit we get the PCA model,
# which we can then use to transform both the test and train data.
pca = PCA(k=10, inputCol="features", outputCol="pca200")
model_200 = pca.fit(train_vectors_withlabel)
pca_train_result = model_200.transform(train_vectors_withlabel).select('label','pca200')
pca_test_result = model_200.transform(test_vectors_withlabel).select('label','pca200')


# transfer the dataframe into rdd
test_rdd = pca_test_result.rdd
train_rdd = pca_train_result.rdd


# create the broadcast, so that every singe cluster can use it
trainbc = spark.sparkContext.broadcast(train_rdd.collect())
# give the k to KNN and set the broadcast of k
k=5
kbc = spark.sparkContext.broadcast(k)
Example #34
os.system("export _JAVA_OPTIONS='-Xms1g -Xmx40g'")
conf = (SparkConf().set("spark.driver.maxResultSize", "5g"))
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
lines = sc.textFile(inputpath).map(lambda x:x.split(" "))
lines = lines.map(lambda x:(x[0],[float(y) for y in x[1:]]))
df = lines.map(lambda x: Row(labels=x[0],features=Vectors.dense(x[1]))).toDF()






####Run####
pca = PCA(k=int(k),inputCol="features", outputCol="pca_features")
model = pca.fit(df)
outData = model.transform(df)
pcaFeatures = outData.select("labels","pca_features")

####Write Out####
output_dir = inputdir + "/pca" + str(k) + "_Features"
output_data = inputdir + "/pca" + str(k) + "_Data"
n_data = 0
n_features = 0

if os.path.isdir(output_dir):
	os.system("rm -r " + output_dir)

pcaFeatures.rdd.repartition(1).saveAsTextFile(output_dir)
outputfile = open(output_data, 'w')
inputfile = open(output_dir + '/part-00000', 'r')
Example #35
# map feature matrix to spark vectors
from pyspark.mllib.linalg import Vectors
Feat = Feat.map(lambda vec: (Vectors.dense(vec),))

## Define a df with feature matrix
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
dfFeat = sqlContext.createDataFrame(Feat,["features"])
dfFeat.printSchema()

## PCA to project Feature matrix to 2 dimensions
from pyspark.ml.feature import PCA
numComponents = 3
pca = PCA(k=numComponents, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(dfFeat)
dfComp = model.transform(dfFeat).select("pcaFeatures")
# get the first two components to lists to be plotted

compX = dfComp.map(lambda vec: vec[0][0]).take(maxWordsVis)
compY = dfComp.map(lambda vec: vec[0][1]).take(maxWordsVis)
compZ = dfComp.map(lambda vec: vec[0][2]).take(maxWordsVis)

## finish Spark session
sc.stop()

## plot
fs=20 #fontsize
w = words[0:maxWordsVis]
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
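Note that dfComp.map(...) above relies on the Spark 1.x DataFrame API; since Spark 2.0 a DataFrame no longer has .map, so on newer versions the same projection would go through the underlying RDD, roughly:

compX = dfComp.rdd.map(lambda vec: vec[0][0]).take(maxWordsVis)
compY = dfComp.rdd.map(lambda vec: vec[0][1]).take(maxWordsVis)
compZ = dfComp.rdd.map(lambda vec: vec[0][2]).take(maxWordsVis)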
Example #36
#input 
rdd = sc.textFile("/user/demo/train.csv").filter(lambda x: x != titile).\
map(lambda x:x.split(","))
D = 2 ** 24 

def helper1(r):
    features=[]
    try:
        fe = r[1:-1]
        for i in range(len(fe)):
            features.append(float(abs(hash("VAR_"+'{0:04}'.format(i)+fe[i])))%D)
        target = float(r[-1])
        ID=float(r[0])
        return target, Vectors.dense(features)
    except:
        return (0.0,[0.0]*1932)
new_rdd = rdd.filter(lambda i : len(i)==1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans,["label", "features"])
pca = PCA(k=1000, inputCol="features", outputCol="pca_features")
model_pca = pca.fit(df)
rdd_pca = model_pca.transform(df).select(["label","pca_features"])
rdd_pca1 = rdd_pca.withColumnRenamed('pca_features', 'features')
(trainingData, testData) = rdd_pca1.randomSplit([0.7, 0.3])
lr = LogisticRegression(maxIter=100, regParam=0.01)
model = lr.fit(trainingData)
result = model.transform(testData).rdd.map(lambda r: str(r.label)+','+str(r.probability[0]))
result.saveAsTextFile("/user/demo/lr_pca_1000_001")

Example #37
num_train_samples = 60000

test_df = spark.read.csv(test_datafile, header=False, inferSchema="true")
train_df = spark.read.csv(train_datafile, header=False, inferSchema="true")

# Formatting the Dataframe
assembler = VectorAssembler(inputCols=test_df.columns[1:],
                            outputCol="features")
test_vectors = assembler.transform(test_df).select(test_df[0].alias('label'),
                                                   "features").repartition(16)
train_vectors = assembler.transform(train_df).select(
    train_df[0].alias('label'), "features").repartition(16)

# PCA implementing
pca = PCA(k=PCA_D, inputCol='features', outputCol='pca')
model = pca.fit(test_vectors)
train_data = model.transform(train_vectors).select('label', 'pca')
test_data = model.transform(test_vectors).select('label', 'pca')

# KNN Data Preprocessing

# ONE time collect() function
train_matrix = []
train_label = []
train_rows = train_data.rdd.collect()
for i in train_rows:
    train_matrix.append(i.pca)
    train_label.append(i.label)

train_label = sc.broadcast(np.array(train_label))
train_matrix = sc.broadcast(np.array(train_matrix))