Example #1
    def test_model_polynomial_expansion(self):
        data = self.spark.createDataFrame(
            [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
             (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
             (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )], ["features"])
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        model = pca.fit(data)

        # the input name should match the inputCol used to fit the PCA model above
        feature_count = data.first()[0].size
        N = data.count()
        model_onnx = convert_sparkml(
            model, 'Sparkml PCA',
            [('features', FloatTensorType([N, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().pca_features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlPCA")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['pca_features'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #2
def pca_model(data, k=7, inputcol='scale_features', outputcol='pca_features'):
    # fit PCA, capture the explained variance, and transform the input DataFrame
    pca = PCA(k=k, inputCol=inputcol, outputCol=outputcol)
    model = pca.fit(data)
    variance = model.explainedVariance
    pca_data = model.transform(data)
    return variance, pca_data
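A minimal usage sketch for pca_model (not part of the original project); the SparkSession variable `spark` and the sample data are assumptions:

# Hypothetical usage; assumes an active SparkSession named `spark`.
from pyspark.ml.linalg import Vectors

sample_df = spark.createDataFrame(
    [(Vectors.dense([1.0, 0.0, 3.0, 4.0, 5.0, 6.0, 7.0]),),
     (Vectors.dense([2.0, 9.0, 1.0, 4.0, 0.0, 6.0, 3.0]),),
     (Vectors.dense([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]),)],
    ["scale_features"])
variance, pca_data = pca_model(sample_df, k=2)
print(variance)                                   # explained variance per component
pca_data.select("pca_features").show(truncate=False)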
Example #3
def PCA_setting(spark, rdd, n):
    df = spark.createDataFrame(rdd,schema=['features'])
    pca = PCA(k=n,inputCol='features',outputCol='pca_features')

    model = pca.fit(df)

    return model.transform(df).select('pca_features').collect()
Example #4
    def _perform_pca(self, dataset: DataFrame, k: int):
        # Since we want to plot the clusters, it is important to
        # downsize the data to at most 3 dimensions.
        # We can use PCA with k principal components for this.
        pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(dataset)
        rows = pca_model \
                    .transform(dataset) \
                    .select("clusterNum", "pcaFeatures") \
                    .collect()

        # Now we'll plot the clusters as a 3D scatter plot with
        # each point's color corresponding to its cluster.
        # Cast clusterNum to string so it is treated as categorical
        # data for plotting purposes.
        axes = zip(*[row["pcaFeatures"] for row in rows])
        colors  = pd.Categorical([row["clusterNum"] for row in rows])

        if k == 2:
            x, y = axes
            fig = plt.figure(figsize=(15, 15))
            sns.scatterplot(x=x, y=y, hue=colors)
        if k == 3:
            x, y, z = axes
            plot_df = pd.DataFrame({"PCA 1": x, "PCA 2": y, "PCA 3": z, "cluster": colors})
            g = sns.PairGrid(plot_df, hue="cluster", palette="coolwarm")
            g = g.map(sns.scatterplot, linewidths=0.75, edgecolor="w", s=40)
            g = g.add_legend()
            g.fig.set_size_inches(15, 15)

        # Specify number of principal components and clusters in model
        image_path = os.path.join("analysis", "results",
                                  "charts", f"pca-{k}-{self.model_name}.png")
        plt.savefig(image_path)
Example #5
def _get_pca_model(feat_train, k):
    from pyspark.ml.feature import PCA
    pca = PCA(k=k, inputCol="features", outputCol="pca_features")
    pca_model = pca.fit(feat_train)
    # Explained Variance
    logr.log_event('Explained Variance', f"{sum(pca_model.explainedVariance)}")
    return pca_model
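Since _get_pca_model logs the summed explainedVariance, a related helper for choosing k could look like the sketch below; it is hypothetical, not part of the original code, and assumes max_k does not exceed the number of features:

def _choose_k(feat_train, threshold=0.95, max_k=50):
    # Hypothetical helper: fit once with a generous k, then return the smallest
    # k whose cumulative explained variance reaches `threshold`.
    from pyspark.ml.feature import PCA
    probe = PCA(k=max_k, inputCol="features", outputCol="pca_features").fit(feat_train)
    cumulative = 0.0
    for i, ratio in enumerate(probe.explainedVariance):
        cumulative += ratio
        if cumulative >= threshold:
            return i + 1
    return max_k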
Example #6
def get_preprocessed_data(input_train, input_test):
    # Train Data
    train = spark.read.csv(input_train, header=False, inferSchema="true")
    train_labels = get_vector(train.select('_c0'), 'train_label')
    train_features = get_vector(train.drop('_c0'), 'feature')

    # Test Data
    test = spark.read.csv(input_test, header=False, inferSchema="true")
    test_labels = get_vector(test.select('_c0'), 'test_label')
    test_features = get_vector(test.drop('_c0'), 'feature')

    # Compute PCA
    pca = PCA(k=50, inputCol="feature", outputCol="pca_feature")
    pca_model = pca.fit(train_features)

    # Apply PCA to train / test features
    train_features_pca = pca_model.transform(train_features).select(
        "pca_feature")
    test_features_pca = pca_model.transform(test_features).select(
        "pca_feature")

    # Rename pca feature column values
    train_features_pca = train_features_pca.withColumnRenamed(
        "pca_feature", "train_feature")
    test_features_pca = test_features_pca.withColumnRenamed(
        "pca_feature", "test_feature")

    # Create combined train / test data
    train_data = combine_features_labels(train_features_pca, train_labels,
                                         'train')
    test_data = combine_features_labels(test_features_pca, test_labels, 'test')

    return train_data, test_data
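get_vector and combine_features_labels are project helpers that are not shown here; a plausible sketch of get_vector, assuming it simply assembles every column into one vector column, is:

# Hypothetical reconstruction of the helper used above; the real project may differ.
from pyspark.ml.feature import VectorAssembler

def get_vector(df, output_col):
    assembler = VectorAssembler(inputCols=df.columns, outputCol=output_col)
    return assembler.transform(df).select(output_col)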
Example #7
    def pca(self, df, k=1):
        cov = RowMatrix(
            df.rdd.map(lambda x: list(x))).computeCovariance().toArray()
        col = cov.shape[1]
        eigVals, eigVecs = np.linalg.eigh(cov)
        inds = np.argsort(eigVals)
        eigVecs = eigVecs.T[inds[-1:-(col + 1):-1]]
        eigVals = eigVals[inds[-1:-(col + 1):-1]]
        components = RowMatrix(
            df.rdd.map(lambda x: list(x))).computePrincipalComponents(k)

        train_data = df.rdd.map(
            lambda x: Row(features=Vectors.dense(x))).toDF()

        pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
        model = pca.fit(train_data)
        score = model.transform(train_data)

        res = {
            "components":
            components.toArray(),
            "score":
            np.array(
                score.select("pcaFeatures").rdd.map(
                    lambda x: list(x[0])).collect()),
            "eigVectors":
            eigVecs,
            "eigValues":
            eigVals
        }

        return res
Example #8
    def __build_pca(self, df, metadata_path):
        pca = PCA(k=self.k,
                  inputCol='scaled_features',
                  outputCol='pca_features')
        if self.__metadata:
            pca.fit(df).write().overwrite().save(metadata_path)
            return PCAModel.load(metadata_path).transform(df)
        return pca.fit(df).transform(df)
Example #9
File: views.py  Project: eason001/imPro
def pca(inputdir, df, alg, k):
    from pyspark.ml.feature import PCA
    pca = PCA(k=int(k), inputCol="features", outputCol="pca_features")
    model = pca.fit(df)
    outData = model.transform(df)
    pcaFeatures = outData.select("labels", "pca_features")
    output_data = writeOut(inputdir, pcaFeatures, alg, k)
    return output_data
Example #10
def pca_generic(data, dimens, input_col, output_col="pcaFeatures"):
    print('PCA result with dimensions = ' + str(dimens) +
          ' and output column ' + output_col)
    pca_generic = PCA(k=dimens, inputCol=input_col, outputCol=output_col)
    pca_model_generic = pca_generic.fit(data)
    result_pca_generic = pca_model_generic.transform(data)
    result_pca_generic.show()
    print('\n')
    return result_pca_generic, pca_model_generic
Example #11
def runPCA(vector_features, k=3):
    from pyspark.ml.feature import PCA

    # keep only the feature vector column
    feature_vec = vector_features.select('features')
    pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(feature_vec)

    result = model.transform(feature_vec).select("pcaFeatures")
    return result
Example #12
    def cluster(self, df, session, repartition_num=8):
        n = df.count()
        # index rows
        df_index = df.select((row_number().over(
            Window.partitionBy(lit(0)).orderBy(self.featureCol)) -
                              1).alias('id'), "*")
        df_features = df_index.select('id', self.featureCol)

        # prep for joining
        df_features = df_features.repartitionByRange(repartition_num, 'id')

        left_df = df_features.select(
            df_features['id'].alias('left_id'),
            df_features[self.featureCol].alias('left_features'))
        right_df = df_features.select(
            df_features['id'].alias('right_id'),
            df_features[self.featureCol].alias('right_features'))

        # join on self where left_id does not equal right_id
        joined_df = left_df.join(right_df,
                                 left_df['left_id'] != right_df['right_id'])

        # compute cosine similarity between vectors
        joined_df = joined_df.select(
            'left_id', 'right_id',
            cosine_similarity_udf(
                array(joined_df['left_features'],
                      joined_df['right_features'])).alias('norm'))
        ranked = joined_df.select(
            'left_id', 'right_id',
            rank().over(
                Window.partitionBy('left_id').orderBy('norm')).alias('rank'))
        knn = ranked.where(ranked['rank'] <= 5)
        knn_grouped = knn.groupBy('left_id').agg(
            f.collect_list('right_id').alias('nn'))

        # generate laplacian
        laplacian = knn_grouped.select(
            knn_grouped['left_id'].alias('id'),
            toVector_udf(
                laplacian_vector_udf(knn_grouped['left_id'], knn_grouped['nn'],
                                     lit(n),
                                     lit(self.k_nearest))).alias('lap_vector'))

        pca = PCA(k=self.num_eigenvectors,
                  inputCol='lap_vector',
                  outputCol='features').fit(laplacian)
        eigenvectors = pca.transform(laplacian).select('id', 'features')

        model = KMeans(featuresCol='features',
                       predictionCol=self.predictionCol,
                       k=self.k).fit(eigenvectors)
        predictions = model.transform(eigenvectors).join(df_index, on='id')
        return predictions
Example #13
File: project.py  Project: sam46/Yelper
def clustering(input_df, input_col_name, n):
    """ KMeans and PCA """
    input_df = input_df.select('state','categories','stars',input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(input_df)
    kmeans = KMeans(k=n, seed=2)
    KMmodel = kmeans.fit(df)
    predicted = KMmodel.transform(df).cache()
    pca = PCA(k=2, inputCol='features', outputCol="pc")
    df = pca.fit(predicted).transform(predicted).cache()
    return df
Example #14
    async def plot_cluster(self, df, x='_3', y='_4'):
        pca = PCAml(k=2, inputCol="features", outputCol="pca")
        model3 = pca.fit(df)
        transformed2 = model3.transform(df)

        def extract(row):
            return (row.customer, ) + (row.prediction, ) + tuple(
                row.pca.toArray().tolist())

        pcadf = transformed2.rdd.map(extract).toDF(["customer", "prediction"])
        pcadf.show(10, False)
        pandad = pcadf.toPandas()
        pandad.plot.scatter(x=x, y=y, c='prediction', colormap='viridis')
        plt.show()
Example #15
def train(df, hiperparameter):
    '''
    Fits a PCA model to the input dataset with the given hyperparameters.
    Input/Parameters:
        df             - input dataset, an instance of pyspark.sql.DataFrame
        hiperparameter - hyperparameter configuration, a dict with the keys
                         'k', 'inputCol' and 'outputCol'

    Output/Returns:
        fitted PCA model
    '''
    pca = PCA(k=hiperparameter['k'],
              inputCol=hiperparameter['inputCol'],
              outputCol=hiperparameter['outputCol'])
    model = pca.fit(df)
    return model
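A hedged usage sketch for train(); the dictionary keys follow the function body, while the DataFrame `df` and its column names are assumptions:

# Hypothetical call; assumes `df` already has a vector column named "features".
hiperparameter = {"k": 3, "inputCol": "features", "outputCol": "pca_features"}
pca_model = train(df, hiperparameter)
print(pca_model.explainedVariance)
reduced_df = pca_model.transform(df)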
Example #16
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.createDataFrame([(["a", "b", "c"], ),
                                             (["c", "d", "e"], )], ["words"])
            tf = HashingTF(numFeatures=10,
                           inputCol="words",
                           outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #17
    def execute(self):
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.feature import PCA
        from pyspark.ml import Pipeline
        from pyspark.sql.functions import udf, col
        from pyspark.sql.types import ArrayType, DoubleType
        assert int(self.k) <= len(self.originalDF.columns), "k must not exceed the number of columns"

        # The feature columns must first be assembled into a single vector; it is split apart again later.
        vectorAssembler = VectorAssembler(inputCols=self.columns, outputCol="features")
        pca = PCA(k=int(self.k), inputCol="features", outputCol='pca_features')
        pipeline = Pipeline(stages=[vectorAssembler, pca])
        self.pipelineModel = pipeline.fit(self.originalDF)

        self.transformDF = self.pipelineModel.transform(self.originalDF)

        # Define a UDF that converts a vector into an array of doubles
        def to_array(col):
            def to_array_(v):
                return v.toArray().tolist()

            return udf(to_array_, ArrayType(DoubleType()))(col)

        self.transformDF = self.transformDF.withColumn("pca_features", to_array(col("pca_features")))

        for i in range(int(self.k)):
            self.transformDF = self.transformDF.withColumn("pca_" + str(i), col("pca_features")[i])
        self.transformDF = self.transformDF.drop("pca_features", "features")
Example #18
    def pca_encoder(k, features_name, output_name):
        pca_model = PCA()\
            .setK(k)\
            .setInputCol(features_name)\
            .setOutputCol(output_name)

        return [pca_model]
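Because pca_encoder returns a one-element list of stages, it can be spliced into a Pipeline; a minimal sketch follows, where the assembler columns and input_df are assumptions and pca_encoder is assumed to be callable as a static method:

# Hypothetical pipeline assembly around the stage list returned above.
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=["col_a", "col_b", "col_c"], outputCol="features")
stages = [assembler] + pca_encoder(k=2, features_name="features", output_name="pca_features")
pipeline_model = Pipeline(stages=stages).fit(input_df)
reduced = pipeline_model.transform(input_df).select("pca_features")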
Example #19
def _compute_cluster_analysis(spark_df, clusters=5):
    numeric_columns = [col_dtype[0] for col_dtype in spark_df.dtypes
                       if col_dtype[1] in constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES]
    if (len(numeric_columns) == 0):
        raise ValueError("The provided spark dataframe does not contain any numeric columns. "
                         "Cannot compute cluster analysis with k-means on categorical columns. "
                         "The numeric datatypes are: {}" \
                         " and the number of numeric datatypes in the dataframe is: {} ({})".format(
            constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES, len(spark_df.dtypes), spark_df.dtypes))
    if (len(numeric_columns) == 1):
        raise ValueError("The provided spark dataframe contains only one numeric column. "
                         "Cluster analysis filters out the non-numeric columns and then "
                         "uses PCA to reduce the dataset to 2 dimensions before applying KMeans; "
                         "this is not possible when the input data has only one numeric column. "
                         "The numeric datatypes are: {}"
                         " and the number of numeric datatypes in the dataframe is: {} ({})".format(
            constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES, len(spark_df.dtypes), spark_df.dtypes))
    vecAssembler = VectorAssembler(inputCols=numeric_columns,
                                   outputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN)
    spark_df_1 = vecAssembler.transform(spark_df)
    kmeans = KMeans(k=clusters, seed=1, maxIter=20,
                    featuresCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
                    predictionCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN)
    model = kmeans.fit(spark_df_1.select(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN))
    spark_df_2 = model.transform(spark_df_1)
    spark_df_3 = spark_df_2.select([constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
                                    constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN])
    count = spark_df_3.count()
    if count < constants.FEATURE_STORE.CLUSTERING_ANALYSIS_SAMPLE_SIZE:
        spark_df_4 = spark_df_3
    else:
        spark_df_4 = spark_df_3.sample(True,
                                       float(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_SAMPLE_SIZE) / float(count))

    pca = PCA(k=2,
              inputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
              outputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN)
    model = pca.fit(spark_df_4)
    spark_df_5 = model.transform(spark_df_4).select([constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN,
                                                     constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN])
    spark_df_6 = spark_df_5.withColumnRenamed(
        constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN,
        constants.FEATURE_STORE.CLUSTERING_ANALYSIS_FEATURES_COLUMN)
    spark_df_7 = spark_df_6.withColumnRenamed(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN, "clusters")
    return json.loads(spark_df_7.toPandas().to_json())
Example #20
    def view(self, data, pred):
        """Use PCA to reduce dimension and visualize the data"""
        pca = PCA(k=3, inputCol="scaled", outputCol="pca-3")
        model = pca.fit(data)
        transformed = model.transform(data)
        view = (transformed.select("prediction", "pca-3").withColumn(
            "axis", self.to_array(
                column("pca-3"))).select(["prediction"] +
                                         [column("axis")[i]
                                          for i in range(3)]))

        dataframe = view.toPandas()
        fig = pyplot.figure(figsize=(20, 20))
        ax = fig.add_subplot(111, projection="3d")
        ax.scatter(
            dataframe.iloc[:, 1],
            dataframe.iloc[:, 2],
            dataframe.iloc[:, 3],
            c=dataframe.iloc[:, 0] + 2,
        )
        pyplot.show()
Example #21
def feature_engi(df):
    '''
    Combines the gender, usage time, and paid usage columns into a vector,
    applies PCA, and scales the resulting vectors.
    '''
    #Combine the gender, usage time, and paid usage columns into a vector
    assembler = VectorAssembler(inputCols=["sex", "time_gap", "chgrd"],
                                outputCol="NumFeatures")
    df = assembler.transform(df)
    pca = PCA(k=2, inputCol="NumFeatures",
              outputCol="pca")  # k is the number of dims
    model = pca.fit(df)
    df = model.transform(df)
    #Scale the Vectors
    scaler = StandardScaler(inputCol="pca",
                            outputCol="features",
                            withMean=True,
                            withStd=False)
    scalerModel = scaler.fit(df)
    df = scalerModel.transform(df)
    return df
Example #22
def rebuild_pipeline(s3_name, df):

    first_stages, df = prepare_data(df)
    objetivo, model_name, hyperparams = reverse_parse_filename(s3_name)

    data_types = get_data_types(df)
    numericals_double = [var for var in data_types["DoubleType"]]
    numericals_int = [var for var in data_types["IntegerType"]]

    features = numericals_double + numericals_int
    #          + [var + "_one_hot" for var in strings_used]
    stage_assembler = VectorAssembler(inputCols=features,
                                      outputCol="assem_features")

    num_pca = int(hyperparams["pca"])
    if num_pca > 0:
        stage_pca = PCA(k=num_pca,
                        inputCol="assem_features",
                        outputCol="features")
    else:
        stage_pca = PCA(k=8, inputCol="assem_features", outputCol="features")

    return df, stage_pca, stage_assembler
Example #23
def pca_opreator(k):
    '''
        Build a PCA estimator for the dataset; the caller fits and applies it.
        input format (DataFrame):
            Row(<feature>, <label>)
        output format after fit/transform (DataFrame):
            Row(<feature>, <label>, <PCA_feature>)
    '''
    # df = SQLContext.createDateFrame(RDD)
    model = PCA(k=k, inputCol='feature', outputCol='PCA_feature')
    # pca_df = model.transform(df)
    # model.write().overwrite().save("fault_diagnosis/models/pca.model")
    # pca_df.select('pca_feature').show(truncate=False)
    return model
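Following the commented-out lines in pca_opreator, a hedged end-to-end sketch might look like this; the DataFrame `df` (with 'feature' and 'label' columns) and the save path are assumptions taken from those comments:

# Hypothetical usage mirroring the comments above.
pca_estimator = pca_opreator(k=3)
fitted = pca_estimator.fit(df)
pca_df = fitted.transform(df)                      # adds the 'PCA_feature' column
fitted.write().overwrite().save("fault_diagnosis/models/pca.model")
pca_df.select('PCA_feature').show(truncate=False)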
Example #24
    def __init__(self, Movie):
        # Get all movies watched by all users, distinct of (ratings union of tags)
        self.usersMovies = Movie.usersMovies
        # Join with movies to get genres
        self.usersGenres = self.usersMovies.join(Movie.movies, 'movieId').\
            select('userId', explode(split('genres', "\|").alias('genres')).alias('genre'))

        # All the 20 genres
        self.genres_str = 'Crime|Romance|Thriller|Adventure|Drama|War|Documentary|Fantasy|Mystery|Musical|Animation|Film-Noir|(no genres listed)|IMAX|Horror|Western|Comedy|Children|Action|Sci-Fi'
        # Get all users
        self.users = Movie.usersMovies.select('userId').distinct()
        # Form a template with users X genres
        self.usersGenresTemplate = self.users.withColumn('genres', lit(self.genres_str)).\
            select('userId', explode(split('genres', "\|").alias('genres')).alias('genre'))

        # Fill in the template with the actual values, and zero where null
        self.usersGenresFilled = self.usersGenres.groupBy('userId', 'genre').agg(count('genre').alias('count')).\
            join(self.usersGenresTemplate, ['userId', 'genre'], 'right').fillna(0)

        # Sort by Genre and form genre array and counts
        self.usersFeatures = self.usersGenresFilled.groupBy('userId', 'genre').agg(sum('count').alias('count_')).\
            sort('genre', ascending=True).groupBy('userId').\
                agg(collect_list('genre').alias('genres'), collect_list('count_').alias('count')).cache()

        #
        userGenres = self.usersFeatures.drop('genres')
        self.datapoints = userGenres.select(
            'userId',
            normalizeUdf(col('count')).alias('features'))
        # Trains a k-means model.
        kmeans = KMeans(maxIter=10).setK(3).setSeed(1)
        self.model = kmeans.fit(self.datapoints.select('features'))
        #        kmeans.save(data_path + "/kmeans")

        # PCA reduction for visual
        pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
        self.pcaModel = pca.fit(self.datapoints.select('features'))
Example #25
def PCA_transform(sc, samples_df, feature_count, threshold, k):
    # check input
    if threshold and ((threshold > 1) or (threshold < 0)):
        print("ERROR: PCA_transform: Input threshold should be within 0 to 1")
        return (None, None, None)
    if k and k < 0:
        print("ERROR: PCA_transform: Input k should be greater than 0")
        return (None, None, None)
    #print "df.shape=",df.shape

    #print "in ml_sklearn_PCA_transform()"
    df_reduced = None
    pca = None
    if not threshold is None:  # by threshold ===============
        if feature_count > 200:
            fk = 200
            print "INFO: force k to " + str(fk) + " for PCA."
        else:
            fk = feature_count

        pca = PCA(k=fk, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(samples_df)
        sum_ratio = 0
        # get ratio array and find n_components
        var_arr = pca_model.explainedVariance
        print "RESULT: PCA ratio_vec=", var_arr

        n_components = ml_util.ml_get_n_components(var_arr, threshold)
        '''
        for n_components,val in enumerate(var_arr):
            sum_ratio=sum_ratio+val
            if sum_ratio >= threshold:
                break
        '''
        k = n_components
        #print sum_ratio, n_components

        df_pcaed_all = pca_model.transform(samples_df).select(
            "hash", "label", "pcaFeatures")
        # get k column only
        sqlCtx = SQLContext(sc)
        df_pcaed = sqlCtx.createDataFrame(
            df_pcaed_all.rdd.map(lambda p: (p["hash"], p["label"], p[
                "pcaFeatures"].toArray()[:k])).map(lambda p: Row(
                    hash=p[0], label=p[1], pcaFeatures=DenseVector(p[2]))))
        print "INFO: PCA_transform: n_components =", n_components, ", threshold=", threshold
    elif k > 0:  # by n_components  ===============
        pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(samples_df)
        df_pcaed = pca_model.transform(samples_df).select(
            "hash", "label", "pcaFeatures")
        print "INFO: PCA_transform: n_components =", k

    return (df_pcaed, k, pca_model)
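ml_util.ml_get_n_components comes from the project's utility module and is not shown; judging from the commented-out loop above, a plausible stand-in is the following sketch:

# Hypothetical stand-in for ml_util.ml_get_n_components(); the real helper may differ.
def ml_get_n_components(var_arr, threshold):
    cumulative = 0.0
    for n, ratio in enumerate(var_arr, start=1):
        cumulative += ratio
        if cumulative >= threshold:
            return n
    return len(var_arr)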
Example #26
def count_df(filename):
    '''
    Write a Python script using DataFrames that prints the number of trees
    (non-header lines) in the data file passed as first argument.
    Test file: tests/test_count_df.py
    Note: The return value should be an integer
    '''
    spark = init_spark()
    init_df = spark.read.option("inferSchema", "true").option("header", "true").csv(filename,header=True)
    cols = init_df.columns
    cols = cols[:-1]


    vecAssembler = VectorAssembler(inputCols=cols, outputCol="features")
    standardizer = StandardScaler(withMean=True, withStd=True,inputCol='features',outputCol='std_features')
    indexer = StringIndexer(inputCol="class", outputCol="label_idx")
    pca = PCA(k=5, inputCol="std_features", outputCol="pca")

    lr_pca = LogisticRegression(featuresCol='pca', labelCol='label_idx')

    lr_withoutpp = LogisticRegression(featuresCol='pca', labelCol='label_idx')

    pipeline = Pipeline(stages=[vecAssembler, standardizer, indexer, pca, lr_withoutpp])

    train, test = init_df.randomSplit([0.7, 0.3])

    model = pipeline.fit(train)

    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        prediction = model.transform(test)
    
    score = prediction.select(['prediction', 'label_idx'])

    metrics = BinaryClassificationMetrics(score.rdd)
    print(metrics)

    score.show(n=score.count())

    acc = score.rdd.map(lambda x: x[1] == x[0]).sum() / score.count()
    print(acc)
Example #27
def get_models_params_dic():
    stage_pca = PCA(k=15, inputCol="scaled_features", outputCol="features")

    lr = LogisticRegression()

    lr_paramGrid = ParamGridBuilder() \
    .addGrid(stage_pca.k, [1]) \
    .addGrid(lr.maxIter, [1]) \
    .build()

    dt = DecisionTreeClassifier()

    dt_paramGrid = ParamGridBuilder() \
    .addGrid(stage_pca.k, [1]) \
    .addGrid(dt.maxDepth, [2]) \
    .build()

    paramGrid_dic = {"LR": lr_paramGrid, "DT": dt_paramGrid}
    model_dic = {"LR": lr, "DT": dt}

    return model_dic, paramGrid_dic
Example #28
def pipeline_assembler_pca(Spark, trainDat, testDat, k=50):
    # read data
    train_df = Spark.read.csv(trainDat, header=False, inferSchema="true")
    test_df = Spark.read.csv(testDat, header=False, inferSchema="true")

    # assembler them
    assembler = VectorAssembler(
        inputCols=train_df.columns[1:], outputCol="features")
    # PCA init
    pca = PCA(k=k, inputCol="features", outputCol="features_pcas")

    # pipeline set
    pipeline = Pipeline(stages=[assembler, pca])
    # fit model
    model = pipeline.fit(train_df)

    # transform train data
    train_pca_result = model.transform(train_df).select(
        col(train_df.columns[0]).alias("label"), "features_pcas")
    # transform test data
    test_pca_result = model.transform(test_df).select(
        col(test_df.columns[0]).alias("label"), "features_pcas")
    return train_pca_result, test_pca_result
Example #29
#raw_df.saveAsTextFile("subtrain_preprocess.txt")
weights = [.8, .1, .1]
seed = 42
raw_train_df, raw_validation_df, raw_test_df = raw_df.randomSplit(
    weights, seed)


def parse_point(point):
    feats = point.split(",")[1:]
    return [(idx, value) for (idx, value) in enumerate(feats)]


parsedTrainFeat = raw_train_df.map(parse_point)
parsedValidFeat = raw_validation_df.map(parse_point)
####### PCA data ###############
pca = PCA(k=4, inputCol="features", outputCol="pcafeatures")

ctrOHEDict_train = create_one_hot_dict(parsedTrainFeat)

ctrOHEDict_valid = create_one_hot_dict(parsedValidFeat)
numCtrOHEFeats = len(ctrOHEDict_train.keys())


def pca_data(data, OHEDict):
    df = sqlContext.createDataFrame(data, ["features"])
    model_pca = pca.fit(df)

    data_pca = model_pca.transform(df).map(lambda x: parse_point(x))
    numOHEFeats = len(data_pca.keys())
    print(" THIS SHIT :", data_pca)
    return one_hot_encoding(data_pca, OHEDict, numOHEFeats)
Example #30
# input
rdd = sc.textFile("/user/demo/train.csv").filter(lambda x: x != titile).\
map(lambda x:x.split(","))
D = 2 ** 24 

def helper1(r):
    features=[]
    try:
        fe = r[1:-1]
        for i in range(len(fe)):
            features.append(float(abs(hash("VAR_"+'{0:04}'.format(i)+fe[i])))%D)
        target = float(r[-1])
        ID=float(r[0])
        return target, Vectors.dense(features)
    except:
        return (0.0, Vectors.dense([0.0] * 1932))
new_rdd = rdd.filter(lambda i : len(i)==1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans,["label", "features"])
pca = PCA(k=1000, inputCol="features", outputCol="pca_features")
model_pca = pca.fit(df)
rdd_pca = model_pca.transform(df).select(["label","pca_features"])
rdd_pca1 = rdd_pca.withColumnRenamed('pca_features', 'features')
(trainingData, testData) = rdd_pca1.randomSplit([0.7, 0.3])
lr = LogisticRegression(maxIter=100, regParam=0.01)
model = lr.fit(trainingData)
result = model.transform(testData).rdd.map(lambda r: str(r.label)+','+str(r.probability[0]))
result.saveAsTextFile("/user/demo/lr_pca_1000_001")

Example #31
# NOTE: the first lines of this snippet were truncated; the StandardScaler constructor
# is reconstructed below, and its inputCol is an assumption (the outputCol is taken from
# the 'scaledFeatures' references further down).
scaler = StandardScaler(inputCol='features',
                        outputCol='scaledFeatures',
                        withMean=True,
                        withStd=True)

# Scale the numeric columns of the data.
scalerModel = scaler.fit(transformed)
scaled_genres_features = scalerModel.transform(transformed).distinct()

# Select the desired columns from the data.
scaled_genres_features = (scaled_genres_features.select(
    F.col('Track_ID'),
    F.col('scaledFeatures').alias('features'), F.col('Genre')))

scaled_genres_features.show()

# Define the principal component analysis object. We only want the top 10 features.
pca = PCA(k=10, inputCol="features", outputCol="pca_features")

# Fit and transform the data into PCA features.
pca_model = pca.fit(scaled_genres_features)
scaled_genres_features = pca_model.transform(scaled_genres_features)

##########################

# Convert the genre column into a column representing if the song is "Electronic" or some other genre
# as a binary label.
scaled_genres_features = (scaled_genres_features.withColumn(
    'label',
    F.when((F.col('Genre') == 'Electronic'), 1).otherwise(0)))

scaled_genres_features.show(20)
Example #32
cols = spark_df.drop('labclass').columns

assembler = VectorAssembler(inputCols=cols, outputCol='features')
labelIndexer = StringIndexer(inputCol="labclass",
                             outputCol="indexedLabel").fit(spark_df)

## Standardize the columns

scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=False,
                        withMean=True)

## Principal component analysis

pca = PCA(k=3, inputCol='scaledFeatures', outputCol='pcaFeature')

(trainingData, testData) = spark_df.randomSplit([0.8, 0.2])

## Training a RandomForest model

rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="pcaFeature",
                            numTrees=10)

## Retrieve original labels from indexed labels

labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=labelIndexer.labels)
Example #33
File: pca.py  Project: eason001/imPro
os.system("export _JAVA_OPTIONS='-Xms1g -Xmx40g'")
conf = (SparkConf().set("spark.driver.maxResultSize", "5g"))
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
lines = sc.textFile(inputpath).map(lambda x:x.split(" "))
lines = lines.map(lambda x:(x[0],[float(y) for y in x[1:]]))
df = lines.map(lambda x: Row(labels=x[0],features=Vectors.dense(x[1]))).toDF()

####Run####
pca = PCA(k=int(k),inputCol="features", outputCol="pca_features")
model = pca.fit(df)
outData = model.transform(df)
pcaFeatures = outData.select("labels","pca_features")

####Write Out####
output_dir = inputdir + "/pca" + str(k) + "_Features"
output_data = inputdir + "/pca" + str(k) + "_Data"
n_data = 0
n_features = 0

if os.path.isdir(output_dir):
	os.system("rm -r " + output_dir)

pcaFeatures.rdd.repartition(1).saveAsTextFile(output_dir)
outputfile = open(output_data, 'w')
    ("Logistic regression models are neat".split(" "), )
], ["text"])
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text",
  outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))


# COMMAND ----------

from pyspark.ml.feature import PCA
pca = PCA().setInputCol("features").setK(2)
pca.fit(scaleDF).transform(scaleDF).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import PolynomialExpansion
pe = PolynomialExpansion().setInputCol("features").setDegree(2)
pe.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
Example #35
File: yispark.py  Project: eason001/imBot
def pca(df):
    pca = PCA(k=10, inputCol="features", outputCol="pca_features")
    model = pca.fit(df)
    # outData = model.transform(df)
    pcaFeatures = model.transform(df).select("labels", "pca_features")
    dfwrite(pcaFeatures, 'pcaFeatures')
Example #36
Feat = sc.parallelize(Feat) 

# map feature matrix to spark vectors
from pyspark.mllib.linalg import Vectors
Feat = Feat.map(lambda vec: (Vectors.dense(vec),))

## Define a df with feature matrix
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
dfFeat = sqlContext.createDataFrame(Feat,["features"])
dfFeat.printSchema()

## PCA to project the feature matrix down to 3 components for plotting
from pyspark.ml.feature import PCA
numComponents = 3
pca = PCA(k=numComponents, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(dfFeat)
dfComp = model.transform(dfFeat).select("pcaFeatures")
# get the first two components to lists to be plotted

compX = dfComp.rdd.map(lambda vec: vec[0][0]).take(maxWordsVis)
compY = dfComp.rdd.map(lambda vec: vec[0][1]).take(maxWordsVis)
compZ = dfComp.rdd.map(lambda vec: vec[0][2]).take(maxWordsVis)

## finish Spark session
sc.stop()

## plot
fs=20 #fontsize
w = words[0:maxWordsVis]
import matplotlib.pyplot as plt
Example #37
train_df = labelIndexer.transform(train_df)
test_df = labelIndexer.transform(test_df)

label_mapping = dict(enumerate(labelIndexer.labels))
reverse_mapping = {}
for key in label_mapping:
    reverse_mapping[label_mapping[key]] = key


# ## Dimensionality reduction
# 
# Feature selection is not really supported in MLlib yet, so we just apply dimensionality reduction with PCA

# In[509]:

pca = PCA(inputCol="features", outputCol="pca", k=15).fit(train_df)

train_df = pca.transform(train_df)
test_df = pca.transform(test_df)


# ## Classification algorithms

# In[ ]:

rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="pca", numTrees=5000)
#rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="features", numTrees=5000)
model = rf.fit(train_df)


# ## Evaluation & results
Example #38
def learn_pca_embedding(raw_data_frame):
    pca_computer = PCA(k=NBITS, inputCol='features', outputCol='pca')
    pca_model = pca_computer.fit(raw_data_frame)
    return pca_model
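NBITS is a module-level constant in the original project; a hedged usage sketch with an assumed value and an assumed input DataFrame:

# Hypothetical usage; NBITS and raw_data_frame are assumptions.
NBITS = 16
pca_model = learn_pca_embedding(raw_data_frame)    # raw_data_frame needs a 'features' vector column
embedded = pca_model.transform(raw_data_frame).select('pca')
embedded.show(5, truncate=False)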