Example #1
def get_std_scaler(labeledpoints):
    std = StandardScaler()
    train_features = labeledpoints.map(lambda lp: lp.features)

    scaler_model = std.fit(train_features)
    transformed_features = scaler_model.transform(train_features)

    transformed_label_features = \
        zip(labeledpoints.map(lambda lp: lp.label).collect(), transformed_features.collect())

    return to_labeled_points(transformed_label_features), scaler_model
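A minimal usage sketch (not part of the original snippet), assuming an active SparkContext `sc` and the module's own to_labeled_points helper that rebuilds LabeledPoint objects from (label, scaled-vector) pairs:

from pyspark.mllib.regression import LabeledPoint

# hypothetical input: a small RDD of LabeledPoint rows
raw_points = sc.parallelize([
    LabeledPoint(0.0, [1.0, 10.0]),
    LabeledPoint(1.0, [3.0, 30.0]),
    LabeledPoint(0.0, [5.0, 50.0]),
])

scaled_points, scaler_model = get_std_scaler(raw_points)
print(scaled_points)   # labeled points with unit-variance features
print(scaler_model)    # the fitted StandardScalerModel, reusable on new data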
Example #2
    def fit(self, dataset):
        """
        Computes the mean and standard deviation of a dataset, which will later be used to standardize data.

        :param dataset: pyspark.rdd.RDD or numpy.ndarray or :class:`.LabeledDataSet`

        """
        if isinstance(dataset, LabeledDataSet):
            dataset = dataset.features
        if isinstance(dataset, pyspark.rdd.RDD):
            standarizer = StdSc(self.flag_mean, self.flag_std)
            self.model = standarizer.fit(dataset)
        else:
            if type(dataset) is not np.ndarray:
                dataset = np.array(dataset)
            if self.flag_mean is True:
                self.mean = dataset.mean(axis=0)
            if self.flag_std is True:
                self.std = dataset.std(axis=0, ddof=1)
        return
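fit only stores the statistics; a hedged sketch of the matching transform step for this kind of scaler (not the project's actual code, written against the same attributes used above) could look like:

    def transform(self, dataset):
        """
        Sketch only: applies (x - mean) / std using what fit computed.
        """
        if isinstance(dataset, LabeledDataSet):
            dataset = dataset.features
        if isinstance(dataset, pyspark.rdd.RDD):
            # RDD input was fitted with MLlib's scaler, so reuse its model
            return self.model.transform(dataset)
        data = np.asarray(dataset, dtype=float)
        if self.flag_mean is True:
            data = data - self.mean
        if self.flag_std is True:
            data = data / self.std
        return data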
Example #3
File: data.py Project: leferrad/learninspy
    def fit(self, dataset):
        """
        Computes the mean and standard deviation of a dataset, which will later be used to standardize data.

        :param dataset: pyspark.rdd.RDD or numpy.ndarray or :class:`.LabeledDataSet`

        """
        if isinstance(dataset, LabeledDataSet):
            dataset = dataset.features
        if isinstance(dataset, pyspark.rdd.RDD):
            standarizer = StdSc(self.flag_mean, self.flag_std)
            self.model = standarizer.fit(dataset)
        else:
            if type(dataset) is not np.ndarray:
                dataset = np.array(dataset)
            if self.flag_mean is True:
                self.mean = dataset.mean(axis=0)
            if self.flag_std is True:
                self.std = dataset.std(axis=0, ddof=1)
        return
Example #4
    def extract_features(self, feat='tfidf', **kwargs):
        """
        Converts each subtitle into its TF/TFIDF representation.
        Normalizes if necessary.

        Parameters
        --------
        feat: 'tf' or 'tfidf'.
        kwargs: num_features, minDocFreq, or other arguments to be passed
        to the MLLib objects.

        Returns
        --------
        RDD of features with key.
        """

        # transform BOW into TF vectors
        num_features = kwargs.get('num_features', 10000)
        htf = HashingTF(num_features)
        feat_rdd = self.RDD.mapValues(htf.transform).cache()

        # transform TF vectors into IDF vectors
        if feat == 'tfidf':
            keys, tf_vecs = feat_rdd.keys(), feat_rdd.values()
            minDocFreq = kwargs.get('minDocFreq', 2)
            idf = IDF(minDocFreq=minDocFreq)
            idf_model = idf.fit(tf_vecs)
            idf_rdd = idf_model.transform(tf_vecs.map(lambda vec: vec.toArray()))
            feat_rdd = keys.zip(idf_rdd)

        if self.model_type == 'log_reg':
            normalizer = StandardScaler(withMean=True, withStd=True)
            keys, vecs = feat_rdd.keys(), feat_rdd.values()
            norm_model = normalizer.fit(vecs)
            norm_rdd = norm_model.transform(vecs.map(lambda vec: vec.toArray()))
            feat_rdd = keys.zip(norm_rdd)

        return feat_rdd
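A hedged usage sketch (the instance name and the layout of its .RDD are illustrative assumptions, not taken from the original class): given an object whose .RDD holds (key, token-list) pairs, the method would be called as:

# hypothetical instance of the class this method belongs to
feat_rdd = extractor.extract_features(feat='tfidf',
                                      num_features=2 ** 15,
                                      minDocFreq=3)
print(feat_rdd.take(1))   # e.g. [(subtitle_key, <feature vector>)]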
Example #5
weighted = ep.transform(vecrdd)

print(weighted.take(1))
print(vecrdd.take(1))

# call the colStats method of the Statistics object on vecrdd and print the
# mean, variance, and number of non-zero values
stats = Statistics.colStats(vecrdd)

print(stats.mean())
print(stats.variance())
print(stats.numNonzeros())

# instantiate a StandardScaler object and set withMean and withStd to 'True'
ss = StandardScaler(withMean=True, withStd=True)

# call the fit method of the StandardScaler object to create a StandardScalerModel
model = ss.fit(vecrdd)

# call the transform method of the StandardScalerModel to center and scale the data
# in vecrdd RDD
scaled = model.transform(vecrdd)

# call colStats method of the Statistics object and print the mean, variance,
# and number of non-zero values to confirm that vecrdd was scaled and centered
scaledStats = Statistics.colStats(scaled)

print(scaledStats.mean())
print(scaledStats.variance())
print(scaledStats.numNonzeros())
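The snippet above starts from two names it never defines; a hedged setup sketch for them, guessing from their usage that ep is an ElementwiseProduct transformer and vecrdd an RDD of dense vectors:

from pyspark.mllib.feature import ElementwiseProduct, StandardScaler
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

# a small RDD of dense vectors standing in for the real data
vecrdd = sc.parallelize([
    Vectors.dense([1.0, 10.0, 100.0]),
    Vectors.dense([2.0, 20.0, 200.0]),
    Vectors.dense([3.0, 30.0, 300.0]),
])

# element-wise scaling weights; ep.transform then multiplies each vector by them
ep = ElementwiseProduct(Vectors.dense([1.0, 0.5, 0.1]))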
Example #6
# Example 11-9. Scaling vectors in Python

from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors

vectors = [Vectors.dense([-2.0, 5.0, 1.0]), Vectors.dense([2.0, 0.0, 1.0])]
dataset = sc.parallelize(vectors)
scaler = StandardScaler(withMean=True, withStd=True)
model = scaler.fit(dataset)
result = model.transform(dataset)

# Result: {[-0.7071, 0.7071, 0.0], [0.7071, -0.7071, 0.0]}
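The numbers in that result can be checked locally; a short NumPy verification (not from the book) that MLlib uses the sample standard deviation (ddof=1) and sends constant columns to 0.0:

import numpy as np

data = np.array([[-2.0, 5.0, 1.0],
                 [2.0, 0.0, 1.0]])
mean = data.mean(axis=0)
std = data.std(axis=0, ddof=1)
std[std == 0.0] = 1.0              # constant column: skip the division
print((data - mean) / std)
# -> approximately [[-0.7071, 0.7071, 0.0], [0.7071, -0.7071, 0.0]]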
Example #7
df.show()
pdf = df.toPandas()

table = pd.pivot_table(pdf, index=['datetime'], columns=['data:temp'], aggfunc=numpy.mean)
print(table.values)
# For Testing
#df.show()
#df.describe(['data:temp', 'datetime', 'sensorName', 'data:humidity']).show()
df = df.select('data:temp', 'data:humidity', 'data:chlPPM', 'data:co2', 'data:flo', 'data:psi')
#df.show()
temp = df.map(lambda line:LabeledPoint(line[0], [line[1:]]))

# Scale the data
features = df.map(lambda row: row[1:])
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
print(features_transform.take(5))

lab = df.map(lambda row: row[0])

transformedData = lab.zip(features_transform)

transformedData = transformedData.map(lambda row: LabeledPoint(row[0], [row[1]]))

trainingData, testingData = transformedData.randomSplit([.8, .2], seed=1234)

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

linearModel = LinearRegressionWithSGD.train(trainingData, 1000, .0002)
print(linearModel.weights)
Example #8
#Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set.
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler

sc = SparkContext()

vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]

dataset = sc.parallelize(vs)

# both False: do nothing (identity transform)
standardizer = StandardScaler(False, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)

print("\n")

# subtracts the mean (centering only)
standardizer = StandardScaler(True, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)

print("\n")
Example #9
    raw_data = sc.textFile(data_file)

    labels = raw_data.map(lambda line: line.strip().split(",")[-1])

    # Prepare data for clustering input
    # the data contains non-numeric features, we want to exclude them since
    # k-means works with numeric features. These are the first three and the last
    # column in each data row
    print("Parsing dataset...")
    parsed_data = raw_data.map(parse_interaction)
    parsed_data_values = parsed_data.values().cache()

    # Standardize data
    print("Standardizing data...")
    standardizer = StandardScaler(True, True)
    standardizer_model = standardizer.fit(parsed_data_values)
    standardized_data_values = standardizer_model.transform(parsed_data_values)

    # Evaluate values of k from 10 to max_k in steps of 10
    print(
        "Calculating total in within cluster distance for different k values (10 to %(max_k)d):"
        % {"max_k": max_k})
    scores = list(map(lambda k: clustering_score(standardized_data_values, k),
                      range(10, max_k + 1, 10)))

    # Obtain min score k
    min_k = min(scores, key=lambda x: x[2])[0]
    print("Best k value is %(best_k)d" % {"best_k": min_k})

    # Use the best model to assign a cluster to each datum
    # We use here standardized data - it is more appropriate for exploratory purposes
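clustering_score is referenced above but not shown; a hedged sketch of what it presumably does (the indexing min(scores, key=lambda x: x[2]) suggests it returns a (k, model, cost) tuple):

from math import sqrt
from pyspark.mllib.clustering import KMeans

def clustering_score(data, k):
    # train a k-means model and sum the point-to-centroid distances
    model = KMeans.train(data, k, maxIterations=10, initializationMode="random")

    def error(point):
        center = model.clusterCenters[model.predict(point)]
        return sqrt(sum((point.toArray() - center) ** 2))

    cost = data.map(error).reduce(lambda a, b: a + b)
    return k, model, cost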
Example #10
def parse_interaction(line):
    line_split = line.split(",")
    clean_line_split = [line_split[0]] + line_split[4:-1]
    return (line_split[-1], array([float(x) for x in clean_line_split]))


parsed_data = kddcup_data.map(parse_interaction)
pd_values = parsed_data.values().cache()

kdd_train = pd_values.sample(False, .75, 12345)
kdd_test = pd_values.sample(False, .25, 12345)
print("Training set feature count: " + str(kdd_train.count()))
print("Test set feature count: " + str(kdd_test.count()))

standardizer = StandardScaler(True, True)
standardizer_model = standardizer.fit(kdd_train)
data_for_cluster = standardizer_model.transform(kdd_train)

initializationMode = "random"

our_k = numpy.arange(10, 31, 10)
metrics = []


def computeError(point):
    center = clusters.centers[clusters.predict(point)]
    denseCenter = DenseVector(numpy.ndarray.tolist(center))
    return sqrt(
        sum([x**2 for x in (DenseVector(point.toArray()) - denseCenter)]))
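A hedged sketch of how the snippet presumably continues (KMeans from pyspark.mllib.clustering is assumed to be imported already): train one model per value in our_k and record the summed computeError per point:

for k in our_k:
    # computeError above looks up the module-level name `clusters`
    clusters = KMeans.train(data_for_cluster, int(k),
                            maxIterations=10,
                            initializationMode=initializationMode)
    total_error = data_for_cluster.map(computeError).reduce(lambda x, y: x + y)
    metrics.append((int(k), total_error))

print("k / total error pairs: " + str(metrics))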

Example #11
    sorted_labels = OrderedDict(sorted(label_counts.items(), key=lambda t: t[1], reverse=True))
    for label, count in sorted_labels.items():
        print(label, count)

    # Prepare data for clustering input
    # the data contains non-numeric features, we want to exclude them since
    # k-means works with numeric features. These are the first three and the last
    # column in each data row
    print "Parsing dataset..."
    parsed_data = raw_data.map(parse_interaction)
    parsed_data_values = parsed_data.values().cache()

    # Standardize data
    print "Standardizing data..."
    standardizer = StandardScaler(True, True)
    standardizer_model = standardizer.fit(parsed_data_values)
    standardized_data_values = standardizer_model.transform(parsed_data_values)

    # Evaluate values of k from 10 to max_k in steps of 10
    print("Calculating total in within cluster distance for different k values (10 to %(max_k)d):" % {"max_k": max_k})
    scores = list(map(lambda k: clustering_score(standardized_data_values, k), range(10, max_k + 1, 10)))

    # Obtain min score k
    min_k = min(scores, key=lambda x: x[2])[0]
    print "Best k value is %(best_k)d" % {"best_k": min_k}

    # Use the best model to assign a cluster to each datum
    # We use here standardized data - it is more appropriate for exploratory purposes
    print "Obtaining clustering result sample for k=%(min_k)d..." % {"min_k": min_k}
    best_model = min(scores, key=lambda x: x[2])[1]
    cluster_assignments_sample = standardized_data_values.map(lambda datum: str(best_model.predict(datum))+","+",".join(map(str,datum))).sample(False,0.05)
Example #12
File: bdt.py Project: jgran/TaxiPredict
         .set("spark.driver.memory", "6g")
        )
sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)

#read in dataframe created earlier
df = utils.get_df_by_name(sqlContext, 'mydf')

#don't need all the columns
df = df.select([c for c in df.columns if c in {
    "pc", "grid_dist", "short_dist", "grid_short_ratio", "grid_short_avg",
    "timeofday", "dayofweek", "trip_time", "total_notip", "pick_grid",
    "drop_grid", "pick_traffic_index", "drop_traffic_index", "pick_avg_speed",
    "drop_avg_speed", "pick_est_time", "drop_est_time"}])

#select features and standardize
rdd = df.rdd
features = rdd.map(lambda t: (t[0], t[1], t[2], t[5], t[6], t[9], t[10], t[11], t[12], t[15], t[16]))
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)                              

#select value we want to predict
#lab = rdd.map(lambda row: row[8])#time
lab = rdd.map(lambda row: row[7])#fare
transformedData = lab.zip(features_transform)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0],[row[1]]))

#split into training and testing datasets
trainingData, testingData = transformedData.randomSplit([0.9,0.1],seed=1234)

#do the training and get predictions
model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={}, numIterations=10)
predictions = model.predict(testingData.map(lambda x: x.features))
valuesAndPreds = testingData.map(lambda lp: lp.label).zip(predictions)
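A hedged follow-up (not in the original file): a simple test-set error computed from the (label, prediction) pairs built above.

# mean squared error of the boosted-tree regressor on the held-out data
MSE = valuesAndPreds.map(lambda lp: (lp[0] - lp[1]) ** 2).mean()
print("Test Mean Squared Error = " + str(MSE))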
Example #13
#Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set.
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler

sc = SparkContext()

vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]

dataset = sc.parallelize(vs)

# both False: do nothing (identity transform)
standardizer = StandardScaler(False, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)

print("\n")

# subtracts the mean (centering only)
standardizer = StandardScaler(True, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)

print("\n")

# divides by the column standard deviation (scaling only)
standardizer = StandardScaler(False, True)
Example #14
                          outputCol="rawFeatures",
                          numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    #rescaledData.show(5)
    # for features_label in rescaledData.select("words", "features").take(3):
    #     print(features_label)
    #convert Sparse vector to Vector
    x = rescaledData.select(
        rescaledData['features']).map(lambda r: r[0].toArray())

    from pyspark.mllib.feature import StandardScaler
    scaler = StandardScaler(withMean=True, withStd=True)
    scalerModel = scaler.fit(x)
    normalizedData = scalerModel.transform(x)
    #print normalizedData.take(5)

    # Build the model (cluster the data)
    from pyspark.mllib.clustering import KMeans, KMeansModel
    clusterNum = 5
    clusters = KMeans.train(normalizedData,
                            clusterNum,
                            maxIterations=10,
                            initializationMode="random")
    #print clusters.centers

    clust = normalizedData.map(lambda x: clusters.predict(x))
    #clust.take(5)
Example #15
    def spark_cluster_analysis_main(self):

        # Create a SparkContext object to configure the client and tell Spark how to access the cluster; this example runs in local mode
        conf = SparkConf().setAppName('Spark').setMaster('local')
        sc = SparkContext(conf=conf)

        print('Module 1: read the data file into an RDD')
        # Read the P2P user data file into an RDD (Spark's basic abstraction, the resilient distributed dataset)
        rdd = sc.textFile(self.file_path)
        sql_context = SQLContext(sc)

        # Print the total number of elements in the RDD
        print('Total number of elements in the RDD: %s' % rdd.count())

        print('Module 2: data preprocessing')
        # Split each line of the RDD on commas
        rdd = rdd.map(lambda line: line.split(","))

        # Print the first five rows of the RDD
        print('First five rows of the RDD:')
        print(pd.DataFrame(rdd.take(5)))

        # Remove the header row
        print('Removing header')
        header = rdd.first()
        rdd = rdd.filter(lambda line: line != header)

        # Print the first five rows of the RDD
        print('First five rows of the RDD:')
        print(pd.DataFrame(rdd.take(5)))

        # Convert the RDD into a Spark DataFrame for the analysis that follows
        df = rdd.map(lambda line: Row(_id=line[0], recency=line[1], bid_num=line[2], avg_bid_amt=line[3])).toDF()
        
        
        # Select the clustering variables
        print('Dropping the id column')
        features = df.map(lambda i: i[1:])

        # Print the first five rows of the RDD
        print('First five rows of the RDD:')
        print(pd.DataFrame(features.take(5)))

        # Standardize the data: compute the z-score of each variable
        standardizer = StandardScaler(withMean=True, withStd=True)

        # Transform the data into dense vector form
        features_transform = standardizer.fit(features).transform(features)
        print('Standardized data in vector form:')
        print(features_transform.take(5))

        print('Module 3: cluster analysis with the pyspark.mllib machine learning library')
        print('Running cluster analysis, please wait...')
        
        # Train nine clustering models for k = 2..10 and pick the best k by the within-cluster sum of squares
        result = []
        for k in range(2, 11):
            model = KMeans.train(features_transform, k, maxIterations=30, runs=3, initializationMode="random")
            cost = model.computeCost(features_transform)
            result.append(cost)

        # Print the within-cluster sum of squares of every model to judge which fits the data best
        print('Within-cluster sum of squares of each model, used to pick the best one:')
        for k in range(2, 11):
            print('Within-Cluster Sum of Square for k=%d is %d' % (k, result[k - 2]))

        # Draw a scree plot of the cluster models to help choose the best one
        get_ipython().magic('matplotlib inline')
        plt.title('Scree-plots for Cluster Models')
        plt.plot([2, 3, 4, 5, 6, 7, 8, 9, 10], result, 'r*-')

        # Train the best model and assign each sample to a cluster
        model = KMeans.train(features_transform, 7, maxIterations=30, runs=3, initializationMode="random")
        prediction = model.predict(features_transform).collect()

        print('Module 4: output the results')
        # Attach each sample's cluster label to the original dataset
        final_result = pd.concat([df.toPandas(), pd.DataFrame(prediction)], axis=1)
        final_result.rename(columns={0: 'cluster'}, inplace=True)
        # Convert the columns to float
        final_result['recency'] = final_result['recency'].astype(float)
        final_result['bid_num'] = final_result['bid_num'].astype(float)
        final_result['avg_bid_amt'] = final_result['avg_bid_amt'].astype(float)

        # Print the mean of the three indicators for each cluster to characterize the groups
        print('First five rows of the clustering result')
        print(final_result.head())
        print('Mean of each indicator per cluster, to distinguish the groups:')
        print(final_result.groupby('cluster').mean())
        sc.stop()