Example #1
    def normalizer(self):
        """
        This function normalizes the training data.
        """

        if self._typeNorm == 'norm':
            #Normalize input features
            RDD_X = self._data.map(lambda x: x[1])
            self._scaler = StandardScaler(withMean=True, withStd=True).fit(RDD_X)
            RDD_X_norm = self._scaler.transform(RDD_X)
            RDD_Y = self._data.map(lambda x: x[0])
            RDD_Y_norm = StandardScaler(withMean=True, withStd=False).fit(RDD_Y).transform(RDD_Y)
        else:
            #Normalize input features
            RDD_X = self._data.map(lambda x: x[1])
            self._scaler = StandardScaler(withMean=True, withStd=False).fit(RDD_X)
            RDD_X_norm = self._scaler.transform(RDD_X)
            if self._typeMVA == 'PCA':
                RDD_Y = self._data.map(lambda x: x[0])
                RDD_Y_norm = StandardScaler(withMean=True, withStd=False).fit(RDD_Y).transform(RDD_Y)
            else:
                RDD_Y_norm = self._data.map(lambda x: x[0])

        # Create a new RDD of LabeledPoint data using the normalized features
        self._normdata = RDD_Y_norm.zip(RDD_X_norm)
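A minimal standalone sketch of the fit/transform/zip pattern the method above relies on (the SparkContext, toy data, and names below are assumptions for illustration, not part of the original class):

from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext(appName="StandardScalerPattern")
# toy (label, features) pairs standing in for self._data
data = sc.parallelize([(1.0, Vectors.dense([10.0, 200.0])),
                       (0.0, Vectors.dense([12.0, 180.0])),
                       (1.0, Vectors.dense([11.0, 220.0]))])
features = data.map(lambda x: x[1])
labels = data.map(lambda x: x[0])
# fit the scaler on the features, then zip the labels back with the scaled features
scaler = StandardScaler(withMean=True, withStd=True).fit(features)
normdata = labels.zip(scaler.transform(features)).map(lambda r: LabeledPoint(r[0], r[1]))
print(normdata.collect())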
Example #2
def prepare_data(sc):
    #----------------------1. Import and convert the data-------------
    print("Start importing data...")
    raw_data_with_header = sc.textFile(os.path.join(PATH, 'data/train.tsv'))
    header = raw_data_with_header.first()
    raw_data = raw_data_with_header.filter(lambda x: x!=header)

    # Remove the quotes and split each web page record into fields on \t
    lines_rdd = raw_data.\
        map(lambda x: x.replace("\"", "")).\
        map(lambda x: x.split('\t'))
    
    print("Total: {} records".format(lines_rdd.count()))
    #---------------------2. Standardize the data-----------------------
    # {category: index, }
    categories_map = lines_rdd.map(lambda fields: fields[3]).\
                        distinct().zipWithIndex().collectAsMap()
    label_rdd = lines_rdd.map(lambda r: get_label(r))
    features_rdd = lines_rdd.map(lambda r: get_features(r, categories_map, len(r)-1))


    scaler = StandardScaler(withMean=True, withStd=True).fit(features_rdd)
    stand_features = scaler.transform(features_rdd)
    #----------3. Build the RDD[LabeledPoint] data needed for training and evaluation-------
    labeledpoint_rdd = label_rdd.zip(stand_features).map(lambda r: LabeledPoint(r[0], r[1]))
    #-----------4. Randomly split the data into 3 parts and return them-------------
    (trainData, validationData, testData) = labeledpoint_rdd.randomSplit([0.8, 0.1, 0.1])
    print("Split the data into trainData: {0}, validationData: {1}, testData: {2}".format(
        trainData.count(), validationData.count(), testData.count()
    ))

    return (trainData, validationData, testData, categories_map)  # return the data
Example #3
def prepareData(sc):

    print 'import training data'

    rawDataWithHeader = sc.textFile(Path + 'train.tsv')
    print rawDataWithHeader.take(10)
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x:x != header)
    rData = rawData.map(lambda x: x.replace("\"",""))
    lines = rData.map(lambda x: x.split("\t"))
    print lines.count()

    categoriesMap = lines.map(lambda fields:fields[3]).distinct().zipWithIndex().collectAsMap()
    print categoriesMap
    labelRDD = lines.map(lambda r: extractLabel(r))
    featureRDD = lines.map(lambda r: extractFeatures(r,categoriesMap,len(r)-1))
    # print featureRDD.take(1)
    stdScaler = StandardScaler(withMean=True,withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    # print ScalerFeatureRDD.take(1)
    labelPoint = labelRDD.zip(ScalerFeatureRDD)
    labelPointRDD = labelPoint.map(lambda r: LabeledPoint(r[0],r[1]))
    # print labelPointRDD.take(1)
    (trainData, testData, validationData) = labelPointRDD.randomSplit([8, 1, 1])
    print trainData.count()
    print testData.count()
    print validationData.count()
    return (trainData, testData, validationData, categoriesMap)
Example #4
def PrepareData(sc): 
    #----------------------1. Import and convert the data-------------
    print("开始导入数据...".replace("开始导入数据...", "Start importing data..."))
    rawDataWithHeader = sc.textFile(Path+"data/train.tsv")
    header = rawDataWithHeader.first() 
    rawData = rawDataWithHeader.filter(lambda x:x !=header)    
    rData=rawData.map(lambda x: x.replace("\"", ""))    
    lines = rData.map(lambda x: x.split("\t"))
    print("Total: " + str(lines.count()) + " records")
    #----------------------2. Build the RDD[LabeledPoint] data needed for training and evaluation-------------
    print "Before standardization:",
    categoriesMap = lines.map(lambda fields: fields[3]). \
                                        distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r:  extract_label(r))
    featureRDD = lines.map(lambda r:  extract_features(r,categoriesMap,len(r) - 1))
    for i in featureRDD.first():
        print (str(i)+","),
    print ""       
    
    print "After standardization:",
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(featureRDD)
    ScalerFeatureRDD=stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print (str(i)+","),        
                
    labelpoint=labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD=labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    
    #----------------------3. Randomly split the data into 3 parts and return them-------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("Split the data into trainData:" + str(trainData.count()) +
              "   validationData:" + str(validationData.count()) +
              "   testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)  # return the data
Example #5
def PrepareData(sc):
    print("Start importing data...")
    path = Path + "train.tsv"
    print(path)
    # Use minPartitions=40 to split the data into 40 partitions, otherwise an error is raised
    rawDataWithHeader = sc.textFile(path, minPartitions=40)
    header = rawDataWithHeader.first()
    # Drop the first line (the header)
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    # Remove the quotes
    rData = rawData.map(lambda x: x.replace("\"", ""))
    # Split fields on tab characters
    lines = rData.map(lambda x: x.split("\t"))
    print("Total:", str(lines.count()))
    #----2. Build the RDD data needed for training
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extractFeatures(r, categoriesMap,
                                                     len(r) - 1))
    print(featureRDD.first())
    #----3. Randomly split into 3 parts and return
    print("After standardization ===:")
    stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    scalerFeatureRDD = stdScaler.transform(featureRDD)
    print(scalerFeatureRDD.first())
    labelPoint = labelRDD.zip(scalerFeatureRDD)
    labelpointRDD = labelPoint.map(lambda r: LabeledPoint(r[0], r[1]))
    (trainData, validationData,
     testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("Data split into trainData:", str(trainData.count()), "validationData:",
          str(validationData.count()), "testData:", str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)
Example #6
def PrepareData(sc):
    #----------------------1. Import and convert the data-------------
    print("Start importing data...")
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("Total: " + str(lines.count()) + " records")
    #----------------------2. Build the RDD[LabeledPoint] data needed for training and evaluation-------------
    print "Before standardization:",
    categoriesMap = lines.map(lambda fields: fields[3]). \
                                        distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap,
                                                      len(r) - 1))
    for i in featureRDD.first():
        print(str(i) + ","),
    print ""
    print "After standardization:",
    stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print(str(i) + ","),
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    #----------------------3. Randomly split the data into 3 parts and return them-------------
    (trainData, validationData,
     testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("Split the data into trainData:" + str(trainData.count()) + "   validationData:" +
          str(validationData.count()) + "   testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)  # return the data
Example #7
def PrepareData(sc):
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("total " + str(lines.count()))
    print("=======before standardization========")
    categoriesMap = lines.map(lambda fields: fields[3]) \
        .distinct() \
        .zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap,
                                                      len(r) - 1))
    for i in featureRDD.first():
        print(str(i) + ", ")
    print("=======after standardization========")
    stdScale = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    scaleFeatureRDD = stdScale.transform(featureRDD)
    for i in scaleFeatureRDD.first():
        print(str(i) + ",")
    labelPoint = labelRDD.zip(scaleFeatureRDD)
    labelPointRDD = labelPoint.map(lambda r: LabeledPoint(r[0], r[1]))
    (trainData, validationData,
     testData) = labelPointRDD.randomSplit([8, 1, 1])
    return (trainData, validationData, testData, categoriesMap)
Example #8
 def test_model_setters(self):
     data = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertIsNotNone(model.setWithMean(True))
     self.assertIsNotNone(model.setWithStd(True))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]),
                      DenseVector([-1.0, -1.0, -1.0]))
Example #9
 def test_model_transform(self):
     data = [
         [1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0],
         [3.0, 4.0, 5.0]
     ]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
Example #10
    def __init__(self):
        Dataset.__init__(self)

        trainDirectory = HDFS_DIRECTORY + 'rotated_checkerboard2x2_train.txt'
        train = sc.textFile(trainDirectory)
        features = train.map(lambda _: _.split(' ')[:-1])
        labels = train.map(lambda _: _.split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.trainSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(_[0], _[1]))

        testDirectory = HDFS_DIRECTORY + 'rotated_checkerboard2x2_test.txt'
        test = sc.textFile(testDirectory)
        features = test.map(lambda _: _.split(' ')[:-1])
        labels = test.map(lambda _: _.split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.testSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(_[0], _[1]))
Example #11
def getScaledData(data):
    features = data.map(lambda x: x.features)
    label = data.map(lambda x: x.label)
    scaler = StandardScaler(withMean=True, withStd=True).fit(features)
    scaled = label\
     .zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))\
     .map(lambda x: LabeledPoint(x[0], x[1]))

    return scaled
Example #12
    def norm_train(self, train_data):
        train_features = train_data.map(lambda lp: lp.features)
        self.normalizer = StandardScaler().fit(train_features)

        # TODO: This can't be efficient...
        #return train_data.map(lambda lp: lp.label).zip(self.norm(train_features)).map(lambda r: LabeledPoint(r[0], r[1]))
        labels = train_data.map(lambda lp: lp.label).collect()
        features = self.norm(train_features).collect()
        return get_df(zip(
            labels, features)).rdd.map(lambda r: LabeledPoint(r[0], r[1]))
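The TODO above flags the collect()-based zip as inefficient. A hedged alternative sketch (my assumption, reusing the same class context and pyspark.mllib imports as the original, not the author's final code): both RDDs below are produced from train_data by map-style transformations, so they keep the same partition layout, which is exactly what RDD.zip() requires, and nothing has to be pulled to the driver.

    def norm_train(self, train_data):
        train_features = train_data.map(lambda lp: lp.features)
        self.normalizer = StandardScaler().fit(train_features)
        # zip the labels with the scaled features in a distributed way instead of collecting
        scaled_features = self.normalizer.transform(train_features)
        return train_data.map(lambda lp: lp.label) \
                         .zip(scaled_features) \
                         .map(lambda r: LabeledPoint(r[0], r[1]))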
Example #13
def get_std_scaler(labeledpoints):
    std = StandardScaler()
    train_features = labeledpoints.map(lambda lp: lp.features)

    scaler_model = std.fit(train_features)
    transformed_features = scaler_model.transform(train_features)

    transformed_label_features = \
        zip(labeledpoints.map(lambda lp: lp.label).collect(), transformed_features.collect())

    return to_labeled_points(transformed_label_features), scaler_model
Example #14
    def __init__(self):
        Dataset.__init__(self)

        # prepare the data (train and test): format and scale it, then turn it into an RDD of LabeledPoints

        trainDirectory = HDFS_DIRECTORY + 'checkerboard2x2_train.txt'
        train = sc.textFile(trainDirectory)
        features = train.map(lambda _: _.split(' ')[:-1])
        labels = train.map(lambda _: _.split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.trainSet = labels.zip(scaler.transform(features))\
            .map(lambda _: LabeledPoint(_[0], _[1]))

        testDirectory = HDFS_DIRECTORY + 'checkerboard2x2_test.txt'
        test = sc.textFile(testDirectory)
        features = test.map(lambda _: _.split(' ')[:-1])
        labels = test.map(lambda _: _.split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.testSet = labels.zip(scaler.transform(features))\
            .map(lambda _: LabeledPoint(_[0], _[1]))
        ''' this block is for testing '''
Example #15
    def __init__(self):
        Dataset.__init__(self)

        trainDirectory = HDFS_DIRECTORY + 'striatum_train_mini.txt'
        train = sc.textFile(trainDirectory)
        features = train.map(lambda _: _.strip().split(' ')[:-1])
        labels = train.map(lambda _: _.strip().split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.trainSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1]))

        testDirectory = HDFS_DIRECTORY + 'striatum_test_mini.txt'
        test = sc.textFile(testDirectory)
        features = test.map(lambda _: _.split(' ')[:-1])
        labels = test.map(lambda _: _.split(' ')[-1])

        # AN ISSUE HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        # in original LAL code they scaled testset with the scaler fitted from TRAINING set, but why?

        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.testSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1]))
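The question in the comment above concerns standard practice: a scaler is usually fitted on the training set only and then reused to transform the test set, so both sets share the same mean/std and no test-set statistics leak into preprocessing. A minimal sketch of that variant (an assumed rewrite of the lines above, not the author's final code):

        train_features = train.map(lambda _: _.strip().split(' ')[:-1])
        train_labels = train.map(lambda _: _.strip().split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(train_features)
        self.trainSet = train_labels.zip(scaler.transform(train_features)) \
            .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1]))

        test_features = test.map(lambda _: _.strip().split(' ')[:-1])
        test_labels = test.map(lambda _: _.strip().split(' ')[-1])
        # reuse the scaler fitted on the TRAINING set for the test set
        self.testSet = test_labels.zip(scaler.transform(test_features)) \
            .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1]))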
Example #16
def TrainLRModel(trainData, iterations, step,
                 miniBatchFraction):  # Logistic Regression
    srcFeatures = trainData.map(lambda line: line.features)
    print srcFeatures.first()
    scaler = StandardScaler(withMean=True, withStd=True).fit(srcFeatures)
    srcLabel = trainData.map(lambda line: line.label)
    scaledFeature = scaler.transform(srcFeatures)
    print scaledFeature.first()
    scaledData = srcLabel.zip(scaledFeature)
    trainData = scaledData.map(
        lambda (label, features): LabeledPoint(label, features))
    model = LogisticRegressionWithSGD.train(data = trainData, iterations = iterations, step = step, \
                                            miniBatchFraction = miniBatchFraction)
    return model
Example #17
def training(model_directory, libsvm, scaler):
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    training_rdd = MLUtils.loadLibSVMFile(sc, libsvm)
    training_rdd.cache()
    if scaler == '1':
        label = training_rdd.map(lambda x: x.label)
        features = training_rdd.map(lambda x: x.features)

        scaler1 = StandardScaler().fit(features)
        data1 = label.zip(scaler1.transform(features))
        # convert into labeled point
        data2 = data1.map(lambda x: LabeledPoint(x[0], x[1]))
        model_logistic = LogisticRegressionWithLBFGS.train(data2)
    else:
        model_logistic = LogisticRegressionWithLBFGS.train(training_rdd)
    model_logistic.save(sc, model_directory)
Example #18
def PrepareData(sc):
    '''
    Prepare the data.
    :param sc:
    :return: (trainData, validationData, testData, categoriesMap)
    '''
    print('======================= Preparing data =======================')
    # ----------------------------- 1. Import and convert the data -----------------------------
    print('========== [PrepareData] >>>> Start importing the train.tsv data....')
    rawDataWithHeader = sc.textFile(Path + u'data/stumbleupon/train-100.tsv')
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace('\"', ''))
    lines = rData.map(lambda x: x.split('\t'))
    print('========== [PrepareData] >>>> Total: ' + str(lines.count()) + ' records')
    # ----------------------------- 2. Build the RDD[LabeledPoint] data needed for training and evaluation -----------------------------
    # categoriesMap = lines.map(lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    # labelpointRDD = lines.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, categoriesMap, -1)))
    print('========== [PrepareData] >>>> Before standardization:'),
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap,
                                                      len(r) - 1))
    for i in featureRDD.first():
        print('\t\t' + str(i) + '(' + str(type(i)) + '),'),
    print('')
    print('========== [PrepareData] >>>> After standardization:'),
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(
        featureRDD
    )  # Build the standardization scaler. The numeric feature fields use different units and differ widely in magnitude, so they cannot be compared directly and need to be standardized. Sparse data is used here instead of a dense mean-centred output, hence withMean=False
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print('\t\t' + str(i) + '(' + str(type(i)) + '),'),
    labelpoint = labelRDD.zip(
        ScalerFeatureRDD)  # use zip to combine the label with the standardized feature fields into label points
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    # ----------------------------- 3. Randomly split the data into 3 parts and return them -----------------------------
    (trainData, validationData,
     testData) = labelpointRDD.randomSplit([8, 1, 1])
    print('========== [PrepareData] >>>> Randomly split the data into three parts: trainData: ' +
          str(trainData.count()) + ' records, validationData: ' +
          str(validationData.count()) + ' records, testData: ' +
          str(testData.count()) + ' records')
    # ----------------------------- 4. Return the tuple of data -----------------------------
    return (trainData, validationData, testData, categoriesMap)
Example #19
def PrepareData(sc):
    #---------------------1. Import and convert the data---------------------
    global Path
    if sc.master[:5] == "local" or sc.master[:5] == "spark":
        Path = "file:/Users/johnnie/pythonwork/workspace/PythonProject/data/"
    else:
        Path = "hdfs://localhost:9000/user/hduser/test/data/"

    print("Start importing data...")
    rawDataWithHeader = sc.textFile(Path + "train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("Total: " + str(lines.count()) + " records")

    #---------------------2. Build the RDD[LabeledPoint] data needed for training and evaluation---------------------
    print("Before standardization:")
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap,
                                                      len(r) - 1))
    print(featureRDD.first())
    print("\n")
    print("After standardization:")
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    print(ScalerFeatureRDD.first())
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    # r[0] is the label
    # r[1] is the features
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))

    #---------------------3. Randomly split the data into 3 parts and return them---------------------
    trainData, validationData, testData = labelpointRDD.randomSplit([8, 1, 1])
    print("Split the data into trainData: " + str(trainData.count()) + " validationData: " +
          str(validationData.count()) + " testData: " + str(testData.count()))

    return trainData, validationData, testData, categoriesMap
Example #20
# print the top line of each RDD to confirm that the transformation was successful
weighted = ep.transform(vecrdd)

print weighted.take(1)
print vecrdd.take(1)

# call the colStats method of the Statistics object on vecrdd and print the
# mean, variance, and number of non-zero values
stats = Statistics.colStats(vecrdd)

print stats.mean()
print stats.variance()
print stats.numNonzeros()

# instantiate a StandardScaler object and set withMean and withStd to 'True'
ss = StandardScaler(withMean=True, withStd=True)

# call the fit method of the StandardScaler object to create a StandardScalerModel
model = ss.fit(vecrdd)

# call the transform method of the StandardScalerModel to center and scale the data
# in vecrdd RDD
scaled = model.transform(vecrdd)

# call colStats method of the Statistics object and print the mean, variance,
# and number of non-zero values to confirm that vecrdd was scaled and centered
scaledStats = Statistics.colStats(scaled)

print scaledStats.mean()
print scaledStats.variance()
print scaledStats.numNonzeros()
Example #21
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    label = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)

    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))

    # Without converting the features into dense vectors, transformation with zero mean will raise
    # exception on sparse vector.
    # data2 will be unit variance and zero mean.
    data2 = label.zip(
        scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    # $example off$

    print("data1:")
    for each in data1.collect():
        print(each)
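A small toy illustration of the sparse-vector comment above (the vector values here are my own, not from the Spark example): mean-centering subtracts the column means from every position, which destroys sparsity, so a scaler with withMean=True only accepts dense input.

from pyspark.mllib.linalg import SparseVector, Vectors

sv = SparseVector(3, {0: 1.0, 2: 5.0})   # mostly-zero input row
dv = Vectors.dense(sv.toArray())         # dense copy that a mean-centering scaler can handle
# scaler2.transform(dv) works; passing sv directly would raise the exception mentioned above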
Example #22
    sc = SparkContext(conf=conf)
    sc.setLogLevel("warn")
    user_map = load_user_map(sc)
    # Load the training data
    train_data = load_train_data(sc)
    # Attach the user info to the training data
    train_data_user_info = set_train_user_info(train_data, user_map)
    # user_id  merchant_id age_range gender label
    train_data_user_info.cache()
    stand_train_data_user_info = train_data_user_info.map(
        lambda user: user[0:4])
    stand_train_data_user_info_label = train_data_user_info.map(
        lambda user: user[4])

    # Standardize the training data
    std_scaler = StandardScaler(True, True).fit(stand_train_data_user_info)
    stand_train_data_user_info = std_scaler.transform(
        stand_train_data_user_info)

    train_data_user_info = stand_train_data_user_info_label.zip(
        stand_train_data_user_info)
    # Build the labeled point data
    train_data_user_info = build_point(train_data_user_info)
    numIterations = 100

    train_data_user_info.cache()
    # Train the model
    model = SVMWithSGD.train(train_data_user_info, numIterations)
    #model = DecisionTree.trainClassifier(train_data_user_info,numIterations,2,{})

    # Load the test data
Example #23
    parts = line.strip().split("::")
    return (int(parts[0]) - 1, int(parts[1]) - 1, float(parts[2]))


#load in input file
path = sys.argv[1]

#path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/"
data = MLUtils.loadLibSVMFile(sc, path)

labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)

#normalize:
#scaler = StandardScaler(withMean = True, withStd = True).fit(features)  #data needs to be dense (zeros included)
scaler = StandardScaler(withMean=False, withStd=True).fit(
    features)  #becomes dense if using withMean. may run out of memory locally

#convert data to dense vector to be normalized
#data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
data2 = labels.zip(
    scaler.transform(features))  #use this line if having memory issues

#hide 10% of the data for final test
data, test = data2.randomSplit([.9, .1])

#get size of chunks for 10-fold cross-validation
num_folds = 10
partitionSize = (len(data.collect()) / num_folds
                 )  #parameterize this value as num_folds (in loop as well)

#train/validate 10 times on each k
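A lighter way to get the fold size (a suggested tweak, not part of the original script) is to count the RDD instead of collecting every record to the driver:

partitionSize = data.count() / num_folds  # same value as above, without a driver-side copy of the data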
Example #24
print(model.predict(array([8.0, 0.0])))

#Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set.
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler

sc = SparkContext()

vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]

dataset = sc.parallelize(vs)

#all false, do nothing.
standardizer = StandardScaler(False, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print r

print("\n")

#deducts the mean
standardizer = StandardScaler(True, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print r

print("\n")
Example #25
#
# Ok, reload the data
#
rdd_loaded = sc.pickleFile(
    'hdfs://br156-161.ifremer.fr:8020/tmp/venthsalia_hdp/rdd.pkl')
rdd_loaded = rdd_loaded.cache()
rdd_loaded.count()
rdd_b = rdd_loaded.flatMap(lambda x: x[2]).map(lambda x: Vectors.dense(x))
print rdd_b.count()
print rdd_b.take(1)

#
# Profiles standardisation
#
new_scalar = StandardScaler(withMean=True, withStd=True).fit(rdd_b)
print type(new_scalar)
scaler3 = new_scalar.transform(rdd_b)

#
# Profiles compression with PCA
#
model = PCAmllib(10).fit(scaler3)
print type(model)
transformed = model.transform(scaler3)
print type(transformed)
print transformed.count()
print transformed.first()

#
# Train a Profiles classification model with KMean
Example #26
    print("Loading RAW data...")
    raw_data = sc.textFile(data_file)

    labels = raw_data.map(lambda line: line.strip().split(",")[-1])

    # Prepare data for clustering input
    # the data contains non-numeric features, we want to exclude them since
    # k-means works with numeric features. These are the first three and the last
    # column in each data row
    print("Parsing dataset...")
    parsed_data = raw_data.map(parse_interaction)
    parsed_data_values = parsed_data.values().cache()

    # Standardize data
    print("Standardizing data...")
    standardizer = StandardScaler(True, True)
    standardizer_model = standardizer.fit(parsed_data_values)
    standardized_data_values = standardizer_model.transform(parsed_data_values)

    # Evaluate values of k from 10 to max_k in steps of 10
    print(
        "Calculating total within-cluster distance for different k values (10 to %(max_k)d):"
        % {"max_k": max_k})
    scores = map(lambda k: clustering_score(standardized_data_values, k),
                 range(10, max_k + 1, 10))

    # Obtain min score k
    min_k = min(scores, key=lambda x: x[2])[0]
    print("Best k value is %(best_k)d" % {"best_k": min_k})

    # Use the best model to assign a cluster to each datum
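clustering_score is not shown in this snippet. A hypothetical sketch of such a helper (my assumption, inferred only from how it is called above, where element [2] of each result is the score and element [0] is k) could train a k-means model and score it with the built-in within-set sum of squared errors:

from pyspark.mllib.clustering import KMeans

def clustering_score(data, k):
    # train a k-means model with k clusters and report its WSSSE cost
    model = KMeans.train(data, k, maxIterations=10, initializationMode="random")
    return (k, model, model.computeCost(data))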
Example #27
## and source plots(Uniform, Gaussian). In case of Gaussian they look alike while 
## uncorrelated Uniform needs a rotation to get there. By removing correlation
## in the gaussian case, we have achieved independence between variables.
## If the source variables are gaussian ICA is not required and PCA is sufficient.
    
    
# Code for PCA and whitening the dataset.

from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow, BlockMatrix
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors, DenseMatrix, Matrix
from sklearn import datasets
# create the standardizer model for standardizing the dataset

X_rdd = sc.parallelize(X).map(lambda x: Vectors.dense(x))
# centre the data (mean removal only) before PCA/whitening
scaler = StandardScaler(withMean=True, withStd=False).fit(X_rdd)

X_sc = scaler.transform(X_rdd)


#create the IndexedRowMatrix from rdd
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# compute the svd factorization of the matrix. First the number of columns and second a boolean stating whether 
# to compute U or not. 
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V is of shape n * k not k * n(as in sklearn)

P_comps = svd_o.V.toArray().copy()
num_rows = X_rm.numRows()
Example #28
# Size:
shape = reader.shape(flist[0])
shape['n_samples_per_file'] = shape['n_samples']
shape['n_samples'] = shape['n_samples'] * len(flist)
print "Will load a dataset of size:\n\t", shape

rdd_data = sc.parallelize(flist).flatMap(reader('TEMP'))
first = rdd_data.first()

# In[Scaling]:

# Compute scaling parameters:
from pyspark.mllib.feature import StandardScaler, StandardScalerModel

scaler = StandardScaler(withMean=True, withStd=True).fit(rdd_data)

sample_mean = scaler.call('mean')

# Effectively scale the dataset:
rdd_norm = scaler.transform(rdd_data)

# In[Reduction]:

# Compute PCA new dimensions:
from pyspark.mllib.feature import PCA as PCAmllib

Neof = 20
reducer = PCAmllib(Neof).fit(rdd_norm)
# print type(reducer)
Example #29

def extract_label(fields):
    label = fields[-1]
    return label


from pyspark.mllib.regression import LabeledPoint

# labelPointRDD = lines.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, categoriesMap, -1)))

labelRDD = lines.map(lambda r: extract_label(r))
featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, -1))
from pyspark.mllib.feature import StandardScaler

stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
ScalerFeatureRDD = stdScaler.transform(featureRDD)
labelPoint = labelRDD.zip(ScalerFeatureRDD)
labelPointRDD = labelPoint.map(lambda r: LabeledPoint(r[0], r[1]))

trainData, validationData, testData = labelPointRDD.randomSplit([8, 1, 1])

# temporary save data into memory to speed up the later process
trainData.persist()
validationData.persist()
testData.persist()

# train model
from pyspark.mllib.tree import DecisionTree

model = DecisionTree.trainClassifier(trainData,
Example #30

if __name__ == "__main__":

    conf = SparkConf()
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="MNIST_KMEANS", conf=conf)

    data = sc.textFile('train.csv')  # ingest the comma delimited file
    header = data.first()  # extract header
    data = data.filter(lambda x: x != header)  # remove the header
    trainingData = data.map(parsePoint)  # parse file to generate an RDD
    trainingData_wo_labels = trainingData.map(lambda x: x[1])  # remove label

    # normalize vector
    scaler = StandardScaler(withMean=True,
                            withStd=True).fit(trainingData_wo_labels)
    trainingData_wo_labels = scaler.transform(trainingData_wo_labels)

    model = KMeans.train(trainingData_wo_labels,
                         10,
                         maxIterations=250,
                         initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = model.centers[model.predict(
            point)]  # get centroid for cluster
        return math.sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = trainingData_wo_labels.map(lambda point: error(point)).reduce(
        lambda x, y: x + y)
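For comparison, pyspark.mllib's KMeansModel also provides computeCost(), which returns the sum of squared distances to the nearest centre; note that error() above sums the non-squared Euclidean distances, so the two numbers differ. A short hedged sketch using the same model and RDD as above:

    cost = model.computeCost(trainingData_wo_labels)  # sum of squared distances to nearest centre
    print("computeCost WSSSE: " + str(cost))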