Example #1
def PCAdata(df, num):
    # df: RDD of LabeledPoint, num: number of principal components to keep.
    # (Python 2 only: the tuple-unpacking lambdas below are not valid Python 3.)
    # Index the labels so they can be re-joined with the projected features.
    Label = df.map(lambda p: p.label).zipWithIndex().map(lambda (label, index):
                                                         (index, label))
    Features = df.map(lambda p: p.features)
    # Fit PCA with `num` components and project every feature vector.
    pcaModel = PCA(num).fit(Features)
    projected = pcaModel.transform(Features)
    second = projected.zipWithIndex().map(lambda (features, index):
                                          (index, features))
    # Re-attach each label to its projected features as a LabeledPoint.
    result = Label.join(second).map(
        lambda (idx, (label, features)): LabeledPoint(label, features))
    return result
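A minimal usage sketch for PCAdata, assuming Spark 1.x with Python 2 (the tuple-unpacking lambdas above require it); the toy data below is illustrative only:

from pyspark import SparkContext
from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext("local", "PCAdata-example")
# toy RDD of LabeledPoint rows: (label, 3-dimensional features)
points = sc.parallelize([
    LabeledPoint(0.0, Vectors.dense([1.0, 2.0, 3.0])),
    LabeledPoint(1.0, Vectors.dense([4.0, 6.0, 5.0])),
    LabeledPoint(0.0, Vectors.dense([7.0, 8.0, 10.0])),
    LabeledPoint(1.0, Vectors.dense([2.0, 1.0, 0.0])),
])
reduced = PCAdata(points, 2)  # keep the first 2 principal components
print reduced.take(2)
sc.stop()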
Example #2
def run_pca(sc):
    cpu_count = multiprocessing.cpu_count()
    cluster_loss = dict()

    for n in range(0, CLUSTERS):
        filename = "cluster_" + str(n) + ".csv"
        cl_file = CLUSTER_PATH + filename
        dataset = sc.textFile(cl_file, cpu_count)
        dataset = dataset.map(
            lambda line: Vectors.dense([float(x) for x in line.split(';')]))

        model = PCA(2).fit(dataset)
        transformed = model.transform(dataset)
        transformed_csv = transformed.map(
            lambda x: ';'.join(list(map(str, x))))
        transformed_csv.coalesce(1).saveAsTextFile(PCA_PATH +
                                                   "onehot_%s" % filename)
def pca_fit(parsed_Data):
    x = parsed_Data.map(lambda p: p.features)
    pc = PCA(5).fit(x)
    transformed = pc.transform(x)
    y = parsed_Data.map(lambda p: p.label)
    a = transformed.zip(y)
    paired = a.map(lambda line: LabeledPoint(line[1], line[0]))

    rdd2 = paired.randomSplit([0.8, 0.2])
    model2 = LinearRegressionWithSGD.train(rdd2[0], iterations=100,
                                           step=0.00000001, regType=None)

    # Evaluate the model on the held-out 20% split
    valuesAndPreds = rdd2[1].map(lambda p: (p.label, model2.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2)\
              .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
def preprocess(sc, files_rdd, labeled_spectra, cut, label=True, **kwargs):
    """
    :param path: A path to the input data. It should be a directory containing the votable or FITS files.
    :param labeled_path: A path to the CSV file with spectra already labeled. These shall be resampled so that
     the have the same resolution as the unlabeled spectra. They shall undergo the same preprocessing as the rest.
    :param label: Set to False if you want to omit the label from the output.
    :return: A RDD DataFrame with the preprocessed spectra. It shall contain the labeled spectra at the beginning,
    followed by the unlabeled, whose label shall be set to -1. In case you *label* was set to False, the output
    shall not contain any label and the ordering shall be arbitrary.
    """
    logger.info("Starting preprocessing")
    # TODO support archives
    cut_low = cut['low']
    cut_high = cut['high']
    spectra = files_rdd.map(lambda x: parse_spectra_file(x[0], x[1], cut_low, cut_high)).filter(lambda x: x is not None).cache()
    low, high = spectra.union(labeled_spectra.map(lambda x: x.drop(x.columns[-1], axis=1))).aggregate((0.0, sys.float_info.max), high_low_op, high_low_comb)
    mean_step = spectra.map(lambda x: x.columns[1] - x.columns[0]).mean()
    logger.debug("low %f high %f %f", low, high, mean_step)
    resampled_header = sc.broadcast(np.arange(low, high, mean_step))
    spectra = spectra.map(lambda x: resample(x, resampled_header_broadcast=resampled_header))
    if label:
        spectra = spectra.map(lambda x: x.assign(label=pd.Series([-1], index=x.index)))

    if labeled_spectra is not None:
        spectra = labeled_spectra.map(lambda x: resample(x, resampled_header_broadcast=resampled_header, label_col="label" if label else None,
                                                         convolve=True)).union(spectra).repartition(kwargs.get("partitions", 100))

    if kwargs.get('pca') is not None:
        namesByRow = spectra.zipWithIndex().map(lambda s: (s[1], (s[0].index, s[0]['label'].iloc[0])) if label else (s[1], s[0].index))
        logger.info("Doing PCA")
        pca_params = kwargs['pca']
        k = pca_params.get("k", 10)
        pca = PCA(k)
        fitted_pca = pca.fit(spectra.map(lambda x: Vectors.dense(x[x.columns[:-1]].iloc[0].values)))
        transformed_spectra = fitted_pca.transform(
            spectra.map(lambda x: transform_pca(x))).zipWithIndex() \
            .map(lambda x: (x[1], x[0])).join(namesByRow)
        spectra = transformed_spectra.map(
            lambda x: pd.DataFrame(data=[x[1][0].tolist() +
                                         ([x[1][1][1]] if label else [])],
                                   index=x[1][1][0] if label else x[1][1],
                                   columns=list(range(k)) + ['label'] if label else list(range(k))))

    return resampled_header.value, spectra.sortBy(lambda x: x['label'].values[0], ascending=False,
                                                  numPartitions=kwargs.get("partitions", 100))
Example #5
#data = sc.textFile("iris_data.txt") #for master local or standalone model

data = sc.textFile(
    "hdfs://master:9000/root/pyspark_test/iris_data.txt")  #for hadoop yarn
parsedData = data.map(lambda line: array([x for x in line.split(',')]))

first_data = parsedData.take(1)[0]
data_row = len(first_data)  # number of columns: several input attributes plus one output attribute

params_only = parsedData.map(
    lambda x: Vectors.dense(np.float_(x[0:(data_row - 1)])))
#params_only.take(5)
#the type of params_only is pyspark.rdd.PipelinedRDD
#params_only=parsedData.map(lambda x: array(np.float_(x[0:(data_row-1)])))

model_test = PCAmllib(2).fit(params_only)
transformed = model_test.transform(params_only)
#transformed.collect()

pca_2d = transformed.collect()


# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


k = 3
clusters = KMeans.train(params_only,
                        k,
                        maxIterations=10)  # assumed; the snippet is cut off mid-call
Example #6
File: pca.py  Project: kendazheng/sparkml
# -*- coding:utf-8 -*-
""""
Program: PCA
Description: 调用spark内置的PCA算法
Author: zhenglei - [email protected]
Date: 2016-01-14 13:45:02
# Last modified: 2016-01-28 19:23:14
Python release: 2.7
"""
# 调用spark内置的pca算法对机器学习实战中的第十三章数据集进行降维处理
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors

if __name__ == '__main__':
    sc = SparkContext()
    tmpdatas = sc.textFile('pcaTestSet.txt')
    datas = tmpdatas.map(lambda line: Vectors.dense(
        array([float(line.split('\t')[0]), float(line.split('\t')[1])])))
    print datas.collect()[0]

    # Reduce the input to 1 dimension and check the reduction model's accuracy
    model = PCA(1).fit(datas)
    transforms = model.transform(datas)
    print transforms.collect()[0], array(transforms.collect()).shape

    # Compute the reduced value for the input [10.235186, 11.321997]
    print model.transform(array([10.235186, 11.321997]))
    sc.stop()
Example #7
k_param=int('%%k%%')
modelpath = ascontext.createTemporaryFolder()
mbr = ModelBuildReporter(sc)

# create a DataModelTools to handle data model and data conversions
dmt = DataModelTools()

# compute the data model from the dataframe
# data model is basically a dict which maps from column name to either {"min":x, "max":y } for numeric fields and [val1,val2, ...valN] for string fields
datamodel = dmt.computeDataModel(df.select(*predictors))
# use DataModelTools to convert from DataFrame to an RDD of DenseVector for specified predictors
lp = dmt.extractDenseVector(df,predictors,setToFlag=1.0).map(lambda x:x[1]).cache()

# build the PCA Model
from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import DenseVector  # needed for the dummy vectors below
estimator = PCA(k_param)
pcamodel = estimator.fit(lp)

# extract the model coefficients by creating dummy data with each row containing
# a predictor set to 1 and all others set to 0 (python wrapper does not seem to provide direct access in Spark 1.5)
coefficients = []
n_predictors = len(predictors)
for i in range(0,k_param):
    coefficients.append(list([0.0]*n_predictors))

for c in range(0,len(predictors)):
    vec = [0.0]*len(predictors)
    vec[c] = 1.0
    arr = pcamodel.transform(DenseVector(vec)).toArray()
    for i in range(0,k_param):
        coefficients[i][c] = arr[i]
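A small illustrative follow-up (not in the original snippet) that prints the extracted loadings per component, pairing them with the predictor names:

for i, comp in enumerate(coefficients):
    # each row pairs a component index with its loading on every predictor
    print("PC%d: %s" % (i + 1, ", ".join("%s=%.4f" % (p, w) for p, w in zip(predictors, comp))))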
Example #8
    #
    #
    # '''
    # The A.cartesian(B) will be an RDD of the form:
    # [(A ID1, A String1), (A ID2, A String2), ...]  and  [(B ID1, B String1), (B ID2, B String2), ...]
    # to:
    # [ ((A ID1, A String1), (B ID1, B String1)), ((A ID1, A String1), (B ID2, B String2)), ((A URL2, A String2), (B ID1, B String1)), ... ]¶
    # '''
    # cross_RDD = ID_tokens.cartesian(ID_tokens).cache()
    # # commonTokens:  [[id1, id2], [tokens]]
    # commonTokens = cross_RDD.map(get_common)
    # similarities_RDD = commonTokens.map(fastCosineSimilarity).cache()
    #
    # end = time.time()
    # print 'total prepare: '+ str(end - start)
    # print similarities_RDD.count()
    # c_time = time.time()
    # print 'count time: ' + str(c_time - end)
    # similarities_RDD.collect()
    # c2_time = time.time()
    # print 'count time: ' + str(c2_time - c_time)
    # print 'Successfully Calculated the similarities between all the posts'


if __name__ == '__main__':
    sc = SparkContext('local')
    tfidf_matrix = create_tfidf(sc)
    tfidf_dVector_matrix = tfidf_matrix.map(lambda row: Vectors.dense(row))
    reduc = PCA(3).fit(tfidf_dVector_matrix)
    after_pca = reduc.transform(tfidf_dVector_matrix)
Example #9
rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Store the document names for later:
documentNames = fields.map(lambda x: x[1])

# Now hash the words in each document to their term frequencies:
hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

# At this point we have an RDD of sparse vectors representing each document,
# where each value maps to the term frequency of each unique hash value.

# Let's compute the TF*IDF of each term in each document:
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# Now we have an RDD of sparse vectors, where each value is the TFxIDF
# of each unique hash value for each document.
model = PCAmllib(2).fit(tfidf)
pc = model.transform(tfidf)

#mat = RowMatrix(tfidf)
# Calculate PCA
#pc = mat.computePrincipalComponents(int(mat.numCols))

print("Principal components :")
print(pc)
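Note that pc is an RDD of dense vectors, so print(pc) only shows the RDD's repr; a small sketch for inspecting a few projected documents:

for vec in pc.take(5):
    print(vec)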
Example #10
from pyspark.mllib.feature import StandardScaler, StandardScalerModel

scaler = StandardScaler(withMean=True, withStd=True).fit(rdd_data)

sample_mean = scaler.call('mean')

# Effectively scale the dataset:
rdd_norm = scaler.transform(rdd_data)

# In[Reduction]:

# Compute PCA new dimensions:
from pyspark.mllib.feature import PCA as PCAmllib

Neof = 20
reducer = PCAmllib(Neof).fit(rdd_norm)
# print type(reducer)

# Effectively reduce the dataset:
rdd_reduced = reducer.transform(rdd_norm)
# print type(rdd_reduced)

# In[Classification with k-mean]:

### Launch KMeans to build the classification model
from pyspark.mllib.clustering import KMeans as KMeansmllib
import time
start_time = time.time()

NBCLUSTERS = 8
INITMODE = 'k-means||'  # 'k-means||' or 'random'
Example #11
result_collection["review_counts_helpful%"] = review_counts_hp

# Get average overall rating per review length (char count)
review_length_cc, = averages_per_key(reviewer_vectors, lambda x:
                                     (x[1][4], [x[1][2]]))
review_length_wc, = averages_per_key(reviewer_vectors, lambda x:
                                     (x[1][6], [x[1][2]]))

result_collection["review_length_char_count"] = review_length_cc
result_collection["review_length_word_count"] = review_length_wc

# Conduct PCA
reviewer_vectors_real = reviewer_vectors.map(
    lambda x: Vectors.dense([val for val in x[1]]))

pca_model = PCA(8).fit(reviewer_vectors_real)
transformed = pca_model.transform(reviewer_vectors_real)

current_best = None
current_best_cost = float("inf")

# Run K-Means
for k in range(2, 70, 7):
    kmeans_model = KMeans.train(transformed, k, maxIterations=100, runs=10)

    cost = kmeans_model.computeCost(transformed)

    if cost < current_best_cost:
        current_best_cost = cost
        current_best = kmeans_model
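A small follow-up (not part of the original snippet) that reports the model the loop kept; clusterCenters is a standard mllib KMeansModel property:

print("Best model has %d clusters (WSSSE cost %.2f)"
      % (len(current_best.clusterCenters), current_best_cost))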
Example #12
    # '''
    # cross_RDD = ID_tokens.cartesian(ID_tokens).cache()
    # # commonTokens:  [[id1, id2], [tokens]]
    # commonTokens = cross_RDD.map(get_common)
    # similarities_RDD = commonTokens.map(fastCosineSimilarity).cache()
    #
    # end = time.time()
    # print 'total prepare: '+ str(end - start)
    # print similarities_RDD.count()
    # c_time = time.time()
    # print 'count time: ' + str(c_time - end)
    # similarities_RDD.collect()
    # c2_time = time.time()
    # print 'count time: ' + str(c2_time - c_time)
    # print 'Successfully Calculated the similarities between all the posts'


if __name__ == '__main__':
    conf = SparkConf()
    conf.set("spark.executor.memory", "16g")
    conf.set("spark.driver.memory","16g")
    conf.set("spark.driver.maxResultSize","16g")
    sc = SparkContext(conf=conf)
    tfidf_matrix = create_tfidf(sc)
    tfidf_dVector_matrix = tfidf_matrix.map(lambda row: Vectors.dense(row))
    start2 = time.time()
    model = PCA(20).fit(tfidf_dVector_matrix)
    end2 = time.time()
    print (end2 - start2)
    after_pca = model.transform(tfidf_dVector_matrix).collect()
    # LOADING AND COMPUTING TF's TRAINING MODEL
    print('Loading TRAINING_TF_MODEL...')
    tf_training = sc.pickleFile(os.getcwd() + '/model/TF/TF_MODEL_' +
                                str(feature_dim))
    print('done!')

    print('Computing TF-IDF MODEL...')
    idf_training = IDF(minDocFreq=5).fit(tf_training)
    tfidf_training = idf_training.transform(tf_training)
    print('done!')

    # APPLYING PCA ON TRAINING DATA
    if pca_mode.value == 1:
        print('Applying PCA on training data...')
        PCA_model = PCA(low_dim).fit(tfidf_training)
        tfidf_training = PCA_model.transform(tfidf_training)
        k = low_dim

    # pcArray = model.transform(tfidf_training.first()).toArray()

    #setting checkpoint
    # ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

    # CREATING DStream FROM TRAINING'S RDD
    trainingQueue = [tfidf_training]
    trainingStream = ssc.queueStream(trainingQueue)

    # CREATING A K-MEANS MODEL WITH RANDOM CLUSTERS SPECIFYING THE NUMBER OF CLUSTERS TO FIND
    model = StreamingKMeans(k=2, decayFactor=1.0,
                            timeUnit='batches').setRandomCenters(k, 1.0, 0)
Example #14
def reduceDimensions(features_rdd):
    model = PCAmllib(2).fit(features_rdd)
    transformed_rdd = model.transform(features_rdd)
    return transformed_rdd
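A minimal usage sketch, assuming the snippet's PCAmllib alias for pyspark.mllib.feature.PCA; the toy vectors are illustrative only:

from pyspark import SparkContext
from pyspark.mllib.feature import PCA as PCAmllib
from pyspark.mllib.linalg import Vectors

sc = SparkContext("local", "reduce-dimensions-example")
features = sc.parallelize([
    Vectors.dense([1.0, 0.0, 2.0]),
    Vectors.dense([0.0, 3.0, 1.0]),
    Vectors.dense([2.0, 1.0, 0.5]),
])
print(reduceDimensions(features).collect())
sc.stop()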
Example #15
    m_source_list=[key[0], key[1], key[2]] + \
           [v[1] for v in vals] + \
           [v[2] for v in vals] + \
           [v[3] for v in vals] + \
           [v[4] for v in vals] + \
           [v[5] for v in vals]
    return Vectors.dense(m_source_list)


# COMMAND ----------

m_file_name = '/home/dyerke/Documents/DSE/capstone_project/traffic/data/01_2010'
lines = sc.textFile(m_file_name, minPartitions=4)
newrows = lines.flatMap(parseInfo).groupByKey().map(buildRow)

# COMMAND ----------

t = newrows.first()
print type(t), t

# COMMAND ----------

from pyspark.mllib.feature import PCA as PCAmllib

model = PCAmllib(2).fit(newrows)
transformed = model.transform(newrows)

# COMMAND ----------

t = transformed.first()
print type(t), t
Example #16
rdd_loaded.count()
rdd_b = rdd_loaded.flatMap(lambda x: x[2]).map(lambda x: Vectors.dense(x))
print rdd_b.count()
print rdd_b.take(1)

#
# Profiles standardisation
#
new_scalar = StandardScaler(withMean=True, withStd=True).fit(rdd_b)
print type(new_scalar)
scaler3 = new_scalar.transform(rdd_b)

#
# Profiles compression with PCA
#
model = PCAmllib(10).fit(scaler3)
print type(model)
transformed = model.transform(scaler3)
print type(transformed)
print transformed.count()
print transformed.first()

#
# Train a Profiles classification model with KMean
#
NBCLUSTERS = 8
INITMODE = 'k-means||'  # 'k-means||' or 'random'
clusters = mllibKMeans.train(transformed,
                             NBCLUSTERS,
                             maxIterations=100,
                             initializationMode=INITMODE)
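A possible follow-up (not in the original snippet) that scores the trained model using standard mllib KMeansModel methods:

# within-set sum of squared errors of the trained model
print clusters.computeCost(transformed)
# cluster assignment of each reduced profile
print clusters.predict(transformed).take(10)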
Example #17
rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Store the document names for later:
documentNames = fields.map(lambda x: x[1])

# Now hash the words in each document to their term frequencies:
hashingTF = HashingTF(100000)  # 100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

# At this point we have an RDD of sparse vectors representing each document,
# where each value maps to the term frequency of each unique hash value.

# Let's compute the TF*IDF of each term in each document:
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# Now we have an RDD of sparse vectors, where each value is the TFxIDF
# of each unique hash value for each document.
model = PCAmllib(2).fit(tfidf)
pc = model.transform(tfidf)

# mat = RowMatrix(tfidf)
# Calculate PCA
# pc = mat.computePrincipalComponents(int(mat.numCols))

print("Principal components :")
print(pc)
Example #18
    m_source_list=[key[0], key[1], key[2]] + \
           [v[1] for v in vals] + \
           [v[2] for v in vals] + \
           [v[3] for v in vals] + \
           [v[4] for v in vals] + \
           [v[5] for v in vals]
    return Vectors.dense(m_source_list)

# COMMAND ----------

m_file_name= '/home/dyerke/Documents/DSE/capstone_project/traffic/data/01_2010'
lines = sc.textFile(m_file_name, minPartitions=4)
newrows = lines.flatMap(parseInfo).groupByKey().map(buildRow)

# COMMAND ----------

t= newrows.first()
print type(t), t

# COMMAND ----------

from pyspark.mllib.feature import PCA as PCAmllib

model = PCAmllib(2).fit(newrows)
transformed = model.transform(newrows)

# COMMAND ----------

t= transformed.first()
print type(t), t
Example #19
mbr = ModelBuildReporter(sc)

# create a DataModelTools to handle data model and data conversions
dmt = DataModelTools()

# compute the data model from the dataframe
# data model is basically a dict which maps from column name to either {"min":x, "max":y } for numeric fields and [val1,val2, ...valN] for string fields
datamodel = dmt.computeDataModel(df.select(*predictors))
# use DataModelTools to convert from DataFrame to an RDD of DenseVector for specified predictors
lp = dmt.extractDenseVector(df, predictors,
                            setToFlag=1.0).map(lambda x: x[1]).cache()

# build the PCA Model
from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import DenseVector  # needed for the dummy vectors below

estimator = PCA(k_param)
pcamodel = estimator.fit(lp)

# extract the model coefficients by creating dummy data with each row containing
# a predictor set to 1 and all others set to 0 (python wrapper does not seem to provide direct access in Spark 1.5)
coefficients = []
n_predictors = len(predictors)
for i in range(0, k_param):
    coefficients.append(list([0.0] * n_predictors))

for c in range(0, len(predictors)):
    vec = [0.0] * len(predictors)
    vec[c] = 1.0
    arr = pcamodel.transform(DenseVector(vec)).toArray()
    for i in range(0, k_param):
        coefficients[i][c] = arr[i]
Example #20
File: pca.py  Project: 0xqq/sparkml-1
""""
Program: PCA
Description: 调用spark内置的PCA算法
Author: zhenglei - [email protected]
Date: 2016-01-14 13:45:02
# Last modified: 2016-01-28 19:23:14
Python release: 2.7
"""
# 调用spark内置的pca算法对机器学习实战中的第十三章数据集进行降维处理
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.feature import PCA
from pyspark.mllib.linalg import Vectors

if __name__ == '__main__':
    sc = SparkContext()
    tmpdatas = sc.textFile('pcaTestSet.txt')
    datas = tmpdatas.map(lambda line: Vectors.dense(
        array([float(line.split('\t')[0]),
               float(line.split('\t')[1])])))
    print datas.collect()[0]

    # Reduce the input to 1 dimension and check the reduction model's accuracy
    model = PCA(1).fit(datas)
    transforms = model.transform(datas)
    print transforms.collect()[0], array(transforms.collect()).shape

    # Compute the reduced value for the input [10.235186, 11.321997]
    print model.transform(array([10.235186, 11.321997]))
    sc.stop()
Example #21
def reduceDimensions(features_rdd):
	model = PCAmllib(2).fit(features_rdd)
	transformed_rdd = model.transform(features_rdd)
	return transformed_rdd
Example #22
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("linearSVC Example")\
        .getOrCreate()

    # $example on$
    # Load training data
    inputData = spark.read.format("libsvm") \
        .load("combined_data_svm.txt")

    # generate the train/test split.
    (train, test) = inputData.randomSplit([0.8, 0.2])

    pca = PCAml(k=2, inputCol="features", outputCol="pca")
    pcaModel = pca.fit(train)
    train = pcaModel.transform(train)
    test = pcaModel.transform(test)

    # fit a linear SVC on the PCA features
    # (assumes pyspark.ml.classification.LinearSVC is imported)
    lsvc = LinearSVC(featuresCol="pca", labelCol="label")
    lsvcModel = lsvc.fit(train)

    # score the model on test data.
    predictions = lsvcModel.transform(test)

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    # compute the classification error on test data.
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    # $example off$