Example #1
def prepareData(sc):

    print 'import training data'

    rawDataWithHeader = sc.textFile(Path + 'train.tsv')
    print rawDataWithHeader.take(10)
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x:x != header)
    rData = rawData.map(lambda x: x.replace("\"",""))
    lines = rData.map(lambda x: x.split("\t"))
    print lines.count()

    categoriesMap = lines.map(lambda fields:fields[3]).distinct().zipWithIndex().collectAsMap()
    print categoriesMap
    labelRDD = lines.map(lambda r: extractLabel(r))
    featureRDD = lines.map(lambda r: extractFeatures(r,categoriesMap,len(r)-1))
    # print featureRDD.take(1)
    stdScaler = StandardScaler(withMean=True,withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    # print ScalerFeatureRDD.take(1)
    labelPoint = labelRDD.zip(ScalerFeatureRDD)
    labelPointRDD = labelPoint.map(lambda r: LabeledPoint(r[0],r[1]))
    # print labelPointRDD.take(1)
    (trainData, testData, validationData) = labelPointRDD.randomSplit([8, 1, 1])
    print trainData.count()
    print testData.count()
    print validationData.count()
    return (trainData, testData, validationData, categoriesMap)
Example #2
 def test_model_setters(self):
     data = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertIsNotNone(model.setWithMean(True))
     self.assertIsNotNone(model.setWithStd(True))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]),
                      DenseVector([-1.0, -1.0, -1.0]))
Example #3
def PrepareData(sc): 
    #----------------------1. Import and transform the data-------------
    print("Start importing data...")
    rawDataWithHeader = sc.textFile(Path+"data/train.tsv")
    header = rawDataWithHeader.first() 
    rawData = rawDataWithHeader.filter(lambda x:x !=header)    
    rData=rawData.map(lambda x: x.replace("\"", ""))    
    lines = rData.map(lambda x: x.split("\t"))
    print("共计:" + str(lines.count()) + "项")
    #----------------------2.建立训练评估所需数据 RDD[LabeledPoint]-------------
    print "标准化之前:",        
    categoriesMap = lines.map(lambda fields: fields[3]). \
                                        distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r:  extract_label(r))
    featureRDD = lines.map(lambda r:  extract_features(r,categoriesMap,len(r) - 1))
    for i in featureRDD.first():
        print (str(i)+","),
    print ""       
    
    print "标准化之后:",    
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(featureRDD)
    ScalerFeatureRDD=stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print (str(i)+","),        
                
    labelpoint=labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD=labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    
    #----------------------3. Randomly split the data into 3 parts and return them-------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("Data split into trainData:" + str(trainData.count()) + 
              "   validationData:" + str(validationData.count()) +
              "   testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)  # return the data
Example #4
def prepare_data(sc):
    #----------------------1. Import and transform the data-------------
    print("Start importing data...")
    raw_data_with_header = sc.textFile(os.path.join(PATH, 'data/train.tsv'))
    header = raw_data_with_header.first()
    raw_data = raw_data_with_header.filter(lambda x: x!=header)

    # Strip the "" quotes and split each page's fields on \t
    lines_rdd = raw_data.\
        map(lambda x: x.replace("\"", "")).\
        map(lambda x: x.split('\t'))
    
    print("共计: {}项".format(lines_rdd.count()))
    #---------------------2.数据标准化----------------------- 
    # {新闻类别: 序号, }
    categories_map = lines_rdd.map(lambda fields: fields[3]).\
                        distinct().zipWithIndex().collectAsMap()
    label_rdd = lines_rdd.map(lambda r: get_label(r))
    features_rdd = lines_rdd.map(lambda r: get_features(r, categories_map, len(r)-1))


    scaler = StandardScaler(withMean=True, withStd=True).fit(features_rdd)
    stand_features = scaler.transform(features_rdd)
    #----------3. Build the RDD[LabeledPoint] data needed for training and evaluation-------
    labeledpoint_rdd = label_rdd.zip(stand_features).map(lambda r: LabeledPoint(r[0], r[1]))
    #-----------4. Randomly split the data into 3 parts and return them-------------
    (trainData, validationData, testData) = labeledpoint_rdd.randomSplit([0.8, 0.1, 0.1])
    print("将数据分trainData: {0}, validationData: {1}, testData: {2}".format(
        trainData.count(), validationData.count(), testData.count()
    ))

    return (trainData, validationData, testData, categories_map)  # return the data
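As the "{news category: index, }" comment above indicates, categories_map maps each distinct category string to a numeric index that is later used for one-hot encoding. A minimal, hypothetical illustration of what distinct().zipWithIndex().collectAsMap() produces, using a toy RDD on the same sc (the exact index assignment depends on partitioning):

# Toy sketch with made-up categories: build a category-to-index dict the same way as above.
toy_map = sc.parallelize(["business", "sports", "business", "arts"]) \
    .distinct().zipWithIndex().collectAsMap()
# toy_map is now a dict such as {"business": 0, "sports": 1, "arts": 2}.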
Example #5
def PrepareData(sc):
    #----------------------1. Import and transform the data-------------
    print("Start importing data...")
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共計:" + str(lines.count()) + "筆")
    #----------------------2.建立訓練評估所需資料 RDD[LabeledPoint]-------------
    print "標準化之前:",
    categoriesMap = lines.map(lambda fields: fields[3]). \
                                        distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap,
                                                      len(r) - 1))
    for i in featureRDD.first():
        print(str(i) + ","),
    print ""
    print "標準化之後:",
    stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print(str(i) + ","),
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    #----------------------3. Randomly split the data into 3 parts and return them-------------
    (trainData, validationData,
     testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("將資料分trainData:" + str(trainData.count()) + "   validationData:" +
          str(validationData.count()) + "   testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)  #回傳資料
Example #6
def PrepareData(sc):
    print("开始导入数据。。。")
    path = Path + "train.tsv"
    print(path)
    # 使用minPartitions=40,将数据分成40片,不然报错
    rawDataWithHeader = sc.textFile(path, minPartitions=40)
    header = rawDataWithHeader.first()
    # Drop the first line (the header)
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    # Strip the quotation marks
    rData = rawData.map(lambda x: x.replace("\"", ""))
    # Split the fields on tab characters
    lines = rData.map(lambda x: x.split("\t"))
    print("总共有:", str(lines.count()))
    #----2。创建训练所需的RDD数据
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extractFeatures(r, categoriesMap,
                                                     len(r) - 1))
    print(featureRDD.first())
    #----3. Randomly split into 3 parts and return
    print("After data standardization===:")
    stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    scalerFeatureRDD = stdScaler.transform(featureRDD)
    print(scalerFeatureRDD.first())
    labelPoint = labelRDD.zip(scalerFeatureRDD)
    labelpointRDD = labelPoint.map(lambda r: LabeledPoint(r[0], r[1]))
    (trainData, validationData,
     testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("数据集划分为:trainData:", str(trainData.count()), "validationData:",
          str(validationData.count()), "testData:", str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)
Example #7
def PrepareData(sc):
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("total " + str(lines.count()))
    print("=======before standare========")
    categoriesMap = lines.map(lambda fields: fields[3]) \
        .distinct() \
        .zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap,
                                                      len(r) - 1))
    for i in featureRDD.first():
        print(str(i) + ", ")
    print("=======after standare========")
    stdScale = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    scaleFeatureRDD = stdScale.transform(featureRDD)
    for i in scaleFeatureRDD.first():
        print(str(i) + ",")
    labelPoint = labelRDD.zip(scaleFeatureRDD)
    labelPointRDD = labelPoint.map(lambda r: LabeledPoint(r[0], r[1]))
    (trainData, validationData,
     testData) = labelPointRDD.randomSplit([8, 1, 1])
    return (trainData, validationData, testData, categoriesMap)
Example #8
 def test_model_transform(self):
     data = [
         [1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0],
         [3.0, 4.0, 5.0]
     ]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
Example #9
 def test_model_transform(self):
     data = [
         [1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0],
         [3.0, 4.0, 5.0]
     ]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
Example #10
def getScaledData(data):
    features = data.map(lambda x: x.features)
    label = data.map(lambda x: x.label)
    scaler = StandardScaler(withMean=True, withStd=True).fit(features)
    scaled = label\
     .zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))\
     .map(lambda x: LabeledPoint(x[0], x[1]))

    return scaled
Example #11
    def __init__(self):
        Dataset.__init__(self)

        trainDirectory = HDFS_DIRECTORY + 'rotated_checkerboard2x2_train.txt'
        train = sc.textFile(trainDirectory)
        features = train.map(lambda _: _.split(' ')[:-1])
        labels = train.map(lambda _: _.split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.trainSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(_[0], _[1]))

        testDirectory = HDFS_DIRECTORY + 'rotated_checkerboard2x2_test.txt'
        test = sc.textFile(testDirectory)
        features = test.map(lambda _: _.split(' ')[:-1])
        labels = test.map(lambda _: _.split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.testSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(_[0], _[1]))
Example #12
 def test_model_setters(self):
     data = [
         [1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0],
         [3.0, 4.0, 5.0]
     ]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertIsNotNone(model.setWithMean(True))
     self.assertIsNotNone(model.setWithStd(True))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([-1.0, -1.0, -1.0]))
Example #13
    def __init__(self):
        Dataset.__init__(self)

        # preparing the Data (Train and Test) : formatting and scaling then making it an RDD of LabeledPoints

        trainDirectory = HDFS_DIRECTORY + 'checkerboard2x2_train.txt'
        train = sc.textFile(trainDirectory)
        features = train.map(lambda _: _.split(' ')[:-1])
        labels = train.map(lambda _: _.split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.trainSet = labels.zip(scaler.transform(features))\
            .map(lambda _: LabeledPoint(_[0], _[1]))

        testDirectory = HDFS_DIRECTORY + 'checkerboard2x2_test.txt'
        test = sc.textFile(testDirectory)
        features = test.map(lambda _: _.split(' ')[:-1])
        labels = test.map(lambda _: _.split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.testSet = labels.zip(scaler.transform(features))\
            .map(lambda _: LabeledPoint(_[0], _[1]))
        ''' this block is for testing '''
Example #14
    def __init__(self):
        Dataset.__init__(self)

        trainDirectory = HDFS_DIRECTORY + 'striatum_train_mini.txt'
        train = sc.textFile(trainDirectory)
        features = train.map(lambda _: _.strip().split(' ')[:-1])
        labels = train.map(lambda _: _.strip().split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.trainSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1]))

        testDirectory = HDFS_DIRECTORY + 'striatum_test_mini.txt'
        test = sc.textFile(testDirectory)
        features = test.map(lambda _: _.split(' ')[:-1])
        labels = test.map(lambda _: _.split(' ')[-1])

        # AN ISSUE HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        # in original LAL code they scaled testset with the scaler fitted from TRAINING set, but why?

        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.testSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1]))
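Regarding the "AN ISSUE HERE" comment above: the usual practice is indeed to reuse the scaler fitted on the training set for the test set, so both splits are scaled with the same statistics and no test-set information leaks into preprocessing. A minimal sketch of that variant, assuming the same train/test text files as in the snippet above:

# Hypothetical variant: fit StandardScaler on TRAINING features only and reuse it for the test set.
train_features = train.map(lambda _: _.strip().split(' ')[:-1])
train_labels = train.map(lambda _: _.strip().split(' ')[-1])
scaler = StandardScaler(withMean=True, withStd=True).fit(train_features)

trainSet = train_labels.zip(scaler.transform(train_features)) \
    .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1]))

test_features = test.map(lambda _: _.strip().split(' ')[:-1])
test_labels = test.map(lambda _: _.strip().split(' ')[-1])
# The TEST features are transformed with the scaler fitted on the TRAINING data.
testSet = test_labels.zip(scaler.transform(test_features)) \
    .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1]))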
Example #15
def TrainLRModel(trainData, iterations, step,
                 miniBatchFraction):  # Logistic Regression
    srcFeatures = trainData.map(lambda line: line.features)
    print srcFeatures.first()
    scaler = StandardScaler(withMean=True, withStd=True).fit(srcFeatures)
    srcLabel = trainData.map(lambda line: line.label)
    scaledFeature = scaler.transform(srcFeatures)
    print scaledFeature.first()
    scaledData = srcLabel.zip(scaledFeature)
    trainData = scaledData.map(
        lambda (label, features): LabeledPoint(label, features))
    model = LogisticRegressionWithSGD.train(data = trainData, iterations = iterations, step = step, \
                                            miniBatchFraction = miniBatchFraction)
    return model
Example #16
def training(model_directory, libsvm, scaler):
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    training_rdd = MLUtils.loadLibSVMFile(sc, libsvm)
    training_rdd.cache()
    if scaler == '1':
        label = training_rdd.map(lambda x: x.label)
        features = training_rdd.map(lambda x: x.features)

        scaler1 = StandardScaler().fit(features)
        data1 = label.zip(scaler1.transform(features))
        # convert into labeled point
        data2 = data1.map(lambda x: LabeledPoint(x[0], x[1]))
        model_logistic = LogisticRegressionWithLBFGS.train(data2)
    else:
        model_logistic = LogisticRegressionWithLBFGS.train(training_rdd)
    model_logistic.save(sc, model_directory)
Example #17
def PrepareData(sc):
    '''
    Prepare the data
    :param sc:
    :return: (trainData, validationData, testData, categoriesMap)
    '''
    print('======================= Preparing data =======================')
    # ----------------------------- 1. Import and transform the data -----------------------------
    print('========== [PrepareData] >>>> Start importing the train.tsv data....')
    rawDataWithHeader = sc.textFile(Path + u'data/stumbleupon/train-100.tsv')
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace('\"', ''))
    lines = rData.map(lambda x: x.split('\t'))
    print('========== [PrepareData] >>>> Total: ' + str(lines.count()) + ' records')
    # ----------------------------- 2. Build the RDD[LabeledPoint] data needed for training and evaluation -----------------------------
    # categoriesMap = lines.map(lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    # labelpointRDD = lines.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, categoriesMap, -1)))
    print('========== [PrepareData] >>>> Before standardization:'),
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap,
                                                      len(r) - 1))
    for i in featureRDD.first():
        print('\t\t' + str(i) + '(' + str(type(i)) + '),'),
    print('')
    print('========== [PrepareData] >>>> After standardization:'),
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(
        featureRDD
    )  # Build the standardization scaler: the numeric feature fields have different units and very different magnitudes, so they cannot be compared directly and need standardizing. withMean=False is used here because the data is sparse and mean-centering would force dense output.
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print('\t\t' + str(i) + '(' + str(type(i)) + '),'),
    labelpoint = labelRDD.zip(
        ScalerFeatureRDD)  # zip the labels with the standardized feature fields to build the labelpoint
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    # ----------------------------- 3. Randomly split the data into 3 parts and return them -----------------------------
    (trainData, validationData,
     testData) = labelpointRDD.randomSplit([8, 1, 1])
    print('========== [PrepareData] >>>> Randomly split the data into three parts: trainData: ' +
          str(trainData.count()) + ' records, validationData: ' +
          str(validationData.count()) + ' records, testData: ' +
          str(testData.count()) + ' records')
    # ----------------------------- 4. Return the tuple of data -----------------------------
    return (trainData, validationData, testData, categoriesMap)
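As the comment on stdScaler above explains, withMean=False is chosen because mean-centering would turn the sparse features into dense vectors. If zero-mean standardization is wanted anyway, the features can be densified first, as some of the other examples on this page do. A minimal sketch, assuming featureRDD holds MLlib vectors (hence the toArray() call) and that the densified data fits in memory:

from pyspark.mllib.linalg import Vectors

# Hypothetical alternative: densify the features so withMean=True does not fail on sparse input.
denseFeatureRDD = featureRDD.map(lambda x: Vectors.dense(x.toArray()))
stdScalerDense = StandardScaler(withMean=True, withStd=True).fit(denseFeatureRDD)
centeredScaledFeatureRDD = stdScalerDense.transform(denseFeatureRDD)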
Example #18
class StandardScalerNormalizer:
    def __init__(self):
        self.normalizer = None

    def norm_train(self, train_data):
        train_features = train_data.map(lambda lp: lp.features)
        self.normalizer = StandardScaler().fit(train_features)

        # TODO: This can't be efficient...
        #return train_data.map(lambda lp: lp.label).zip(self.norm(train_features)).map(lambda r: LabeledPoint(r[0], r[1]))
        labels = train_data.map(lambda lp: lp.label).collect()
        features = self.norm(train_features).collect()
        return get_df(zip(
            labels, features)).rdd.map(lambda r: LabeledPoint(r[0], r[1]))

    def norm(self, data):
        return self.normalizer.transform(data)

    def __str__(self):
        return 'StandardScaler'
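The TODO inside norm_train above hints at a fully distributed alternative: zip the label RDD with the normalized feature RDD instead of collecting both to the driver. A minimal sketch of that variant as a standalone helper (scaler_holder is a hypothetical object with a normalizer attribute, mirroring the class above; both zipped RDDs come from the same parent via map, so zip's same-partitioning requirement holds):

def norm_train_zipped(scaler_holder, train_data):
    # Hypothetical distributed variant of StandardScalerNormalizer.norm_train: no collect() needed.
    train_features = train_data.map(lambda lp: lp.features)
    scaler_holder.normalizer = StandardScaler().fit(train_features)
    return train_data.map(lambda lp: lp.label) \
        .zip(scaler_holder.normalizer.transform(train_features)) \
        .map(lambda r: LabeledPoint(r[0], r[1]))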
Example #19
def PrepareData(sc):
    #---------------------1. Import and transform the data---------------------
    global Path
    if sc.master[:5] == "local" or sc.master[:5] == "spark":
        Path = "file:/Users/johnnie/pythonwork/workspace/PythonProject/data/"
    else:
        Path = "hdfs://localhost:9000/user/hduser/test/data/"

    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path + "train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共计:" + str(lines.count()) + "项")

    #---------------------2. Build the RDD[LabeledPoint] data needed for training and evaluation---------------------
    print("Before standardization:")
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap,
                                                      len(r) - 1))
    print(featureRDD.first())
    print("\n")
    print("标准化之后:")
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    print(ScalerFeatureRDD.first())
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    # r[0] is the label
    # r[1] is the features
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))

    #---------------------3. Randomly split the data into 3 parts and return them---------------------
    trainData, validationData, testData = labelpointRDD.randomSplit([8, 1, 1])
    print("将数据分trainData: " + str(trainData.count()) + " validationData: " +
          str(validationData.count()) + " testData: " + str(testData.count()))

    return trainData, validationData, testData, categoriesMap
Example #20
# 27 = tempo
# 28 = time_signature

allData = trackRocks.join(songData).map(lambda (tr, (rocks, data)): (tr, (0.0 if rocks is None else rocks, data)))
allData.take(3)

# label data

# only uses one feature for now
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [data[6]]))
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [random.random() + (.5 if rocks == 1 else 0)]))

labels = allData.map(lambda (tr, (rocks, data)): rocks)
features = allData.map(lambda (tr, (rocks, data)): data)
std = StandardScaler(True, True).fit(features)
scaledFeatures = std.transform(features)

labeledData = labels.zip(scaledFeatures).map(lambda (label, data): LabeledPoint(label, data))

# uses all extracted
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [x for x in data]))

labeledData.take(3)

# make sample sizes equal
labeledRock = labeledData.filter(lambda p: p.label == 1.0)
labeledRock.count()
labeledRock.map(lambda p: p.features[0]).mean()
nrock = labeledRock.count()

labeledNotRock = labeledData.filter(lambda p: p.label != 1.0)
Example #21
#path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/"
data = MLUtils.loadLibSVMFile(sc, path)

labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)

#normalize:
#scaler = StandardScaler(withMean = True, withStd = True).fit(features)  #data needs to be dense (zeros included)
scaler = StandardScaler(withMean=False, withStd=True).fit(
    features)  #becomes dense if using withMean. may run out of memory locally

#convert data to dense vector to be normalized
#data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
data2 = labels.zip(
    scaler.transform(features))  #use this line if having memory issues

#hide 10% of the data for final test
data, test = data2.randomSplit([.9, .1])

#get size of chunks for 10-fold cross-validation
num_folds = 10
partitionSize = (len(data.collect()) / num_folds
                 )  #parameterize this value as num_folds (in loop as well)

#train/validate 10 times on each k
i = 0
j = partitionSize
data = data.collect()
cv_error_storage = []
Example #22
def main():
    appName = "BadOrGood;zl"
    
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores","3")
            .set("spark.executor.instance", "3")
            )
    sc = SparkContext(conf = conf)
    hc = HiveContext(sc)

    #fetch data
    #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    #fetchDataToFile(hc, filepath)
    
    #load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
                    # .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
                    # .repartition(10)
    
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)
    
    
    #standardizer for train and test data
    model = StandardScaler(True, True) \
            .fit( AllDataRawrdd \
                  .map( lambda _: Vectors.dense(_['feature']) ) 
            )
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform( AllDataRawrdd.map(lambda _: _['feature']) )
    AllDataRawrdd = labels \
                    .zip(featureTransformed) \
                    .map( lambda _: { 'label':_[0], 'feature':_[1] } )
    #sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map( lambda _: LabeledPoint( _['label'], _['feature'] ) ).persist()
    testDatardd = testDataRawrdd.map( lambda _: {'label': _['label'], 'feature': list(_['feature']) } ).persist()
    
    #prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)
  
    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)
  
    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)

    print lrmLBFGS.weights
    print lrmSGD.weights

    sc.stop()
Example #23
    # step 1 - create spark context
    conf = SparkConf().setAppName("KMeans-Content")\
       .set("spark.executor.memory","1g")
    sc = SparkContext(conf=conf)


    # step 2 - load in input file
    data = MLUtils.loadLibSVMFile(sc,"/Users/Ellen/Desktop/movie_features_dataset.dat")
    labels = data.map(lambda x:x.label)
    features = data.map(lambda x:x.features)

  
    # step 3 - standardize the data to unit variance (withMean=False, so no mean centering)
    scaler = StandardScaler(withMean=False,withStd=True).fit(features)

    data2 = labels.zip(scaler.transform(features))

    numFeatures = len(data2.values().take(10)[0])
    print "Type of data2: ",type(data2) #RDD
    print "Type of data2.values(): ",type(data2.values()) # pipelinedrdd
    print "Sample: ",data2.values().take(1)[0]

    # splitting up the data to training, validation and testing models.
    train,val,test = data2.randomSplit([.80,.10,.10])


    print "Training Dataset Size:",train.count()
    print "Validation Dataset size:",val.count()
    print "Test Dataset Size:",test.count()

Example #24
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    label = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)

    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))

    # data2 will be unit variance and zero mean.
    data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    # $example off$

    print("data1:")
    for each in data1.collect():
        print(each)

    print("data2:")
    for each in data2.collect():
        print(each)

    sc.stop()
Example #25
	predict = model.predict(data.map(lambda x: x.features))
	scoreAndLabel = predict.zip(data.map(lambda x:x.label))
	metrics = BinaryClassificationMetrics(scoreAndLabel)
	return metrics.areaUnderROC
	
# inspect the structure of the decision tree
model.toDebugString()

# 逻辑回归
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.feature import StandardScaler

### standardization

stdscaler = StandardScaler(withMean=True,withStd=True).fit(featureRDD)
scaledFeature = stdscaler.transform(featureRDD)
labelPoint = labelRDD.zip(scaledFeature)
labelPointRDD = labelPoint.map(lambda x:LabeledPoint(x[0],x[1]))

#model
model = LogisticRegressionWithSGD.train(labelPointRDD,num_iter,learning_rate,batch_size)
	

	
# svm
from pyspark.mllib.classification import SVMWithSGD

model = SVMWithSGD.train(trainData, num_iter, learning_rate, regParam)

#naiveBayes
from pyspark.mllib.classification import NaiveBayes
Example #26
path = sys.argv[1]

#path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/"
data = MLUtils.loadLibSVMFile(sc, path)

labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)


#normalize:
#scaler = StandardScaler(withMean = True, withStd = True).fit(features)  #data needs to be dense (zeros included)
scaler = StandardScaler(withMean = False, withStd = True).fit(features)  #becomes dense if using withMean. may run out of memory locally

#convert data to dense vector to be normalized
#data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
data2 = labels.zip(scaler.transform(features))   #use this line if having memory issues

#hide 10% of the data for final test
data, test = data2.randomSplit([.9, .1])

#get size of chunks for 10-fold cross-validation
num_folds = 10
partitionSize = (len(data.collect())/num_folds)   #parameterize this value as num_folds (in loop as well)

#train/validate 10 times on each k
i = 0
j = partitionSize
data = data.collect()
cv_error_storage = []

#10 fold is better, but I use 5 here in the interest of time
Example #27
    print "max of each column:"
    print matrixSummary.max()
    print "variance of each column:"
    print matrixSummary.variance()


#distribution_data()

labels = data.map(lambda p: p.label)
features = data.map(lambda p: p.features)

vectors = data.map(lambda p: p.features)
scaler = StandardScaler(withMean=True, withStd=True).fit(vectors)
#scaledData=data.map(lambda p:LabeledPoint(p.label, scaler.transform(p.features)))

scaled_data = labels.zip(scaler.transform(features))
scaledData = scaled_data.map(lambda (x, y): LabeledPoint(x, y))
#scaledData.cache()
#print scaledData.first().features


def predict_SVMWithSGD(numIterations, step, regParam, regType):
    """
    SVMWithSGD.train(data,iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, regType='l2',intercept=False, validateData=True,convergenceTol=0.001)
    data: the training data, an RDD of LabeledPoint
    iterations: the number of iterations, default 100
    step: the step parameter used in SGD, default 1.0
    regParam: the regularizer parameter, default 0.01
    miniBatchFraction: fraction of data to be used for each SGD iteration, default 1.0
    initialWeights: the initial weights, default None
    regType: the type of regularizer used for training our model, allowed values ('l1':for using L1 regularization; 'l2':for using L2 regularization, default; None: for no regularization)
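For reference, a minimal call matching the signature quoted in the docstring above, assuming scaledData is the LabeledPoint RDD built a few lines earlier:

from pyspark.mllib.classification import SVMWithSGD

# Hypothetical training call using the documented defaults, with L2 regularization.
svm_model = SVMWithSGD.train(scaledData, iterations=100, step=1.0,
                             regParam=0.01, regType='l2')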
Example #28
# Ok, reload the data
#
rdd_loaded = sc.pickleFile(
    'hdfs://br156-161.ifremer.fr:8020/tmp/venthsalia_hdp/rdd.pkl')
rdd_loaded = rdd_loaded.cache()
rdd_loaded.count()
rdd_b = rdd_loaded.flatMap(lambda x: x[2]).map(lambda x: Vectors.dense(x))
print rdd_b.count()
print rdd_b.take(1)

#
# Profiles standardisation
#
new_scalar = StandardScaler(withMean=True, withStd=True).fit(rdd_b)
print type(new_scalar)
scaler3 = new_scalar.transform(rdd_b)

#
# Profiles compression with PCA
#
model = PCAmllib(10).fit(scaler3)
print type(model)
transformed = model.transform(scaler3)
print type(transformed)
print transformed.count()
print transformed.first()

#
# Train a Profiles classification model with KMean
#
NBCLUSTERS = 8
Example #29
def extract_label(fields):
    label = fields[-1]
    return label


from pyspark.mllib.regression import LabeledPoint

# labelPointRDD = lines.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, categoriesMap, -1)))

labelRDD = lines.map(lambda r: extract_label(r))
featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, -1))
from pyspark.mllib.feature import StandardScaler

stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
ScalerFeatureRDD = stdScaler.transform(featureRDD)
labelPoint = labelRDD.zip(ScalerFeatureRDD)
labelPointRDD = labelPoint.map(lambda r: LabeledPoint(r[0], r[1]))

trainData, validationData, testData = labelPointRDD.randomSplit([8, 1, 1])

# temporarily cache the data in memory to speed up the later steps
trainData.persist()
validationData.persist()
testData.persist()

# train model
from pyspark.mllib.tree import DecisionTree

model = DecisionTree.trainClassifier(trainData,
                                     numClasses=2,
Example #30
def norm(features):
    scaler = StandardScaler(withMean=False, withStd=False).fit(features)
    return scaler.transform(features)
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR )
logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR )

def parsePoint(data):
	#return LabeledPoint(data[3],np.append(data[0:3],data[4:]))
	return LabeledPoint(data[0],data[1:])

# store the data from cassandra to a data frame and remove the NA value 
data=sc.cassandraTable("msd_01", "songs").select("song_hotttnesss","loudness","year","sentiment","tempo","unique_words").toDF()

data=data.filter("year>0").na.drop()
print data.count()


# Scale the features with Standard Scaler
data2=data.map(lambda x: [x.song_hotttnesss, x.loudness,x.year, x.sentiment,x.tempo,x.unique_words])#Convert each sql.row to an array
scaler = StandardScaler(withMean=True, withStd=True).fit(data2)  # fit a scaler on every column
scaledData = scaler.transform(data2)# transform our data

# Transform to a labelled vector
parsedData = scaledData.map(parsePoint)

# # Build the model
model = LinearRegressionWithSGD.train(parsedData, iterations=1000,regParam=1.0,regType="l2",intercept=True)

# Evaluate the model on training data
print ("intercept",model.intercept)
print zip(["loudness","year","sentiment","tempo","unique_words"],model.weights)

sc.stop()
Example #32
if __name__ == "__main__":

    conf = SparkConf()
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="MNIST_KMEANS", conf=conf)

    data = sc.textFile('train.csv')  # ingest the comma delimited file
    header = data.first()  # extract header
    data = data.filter(lambda x: x != header)  # remove the header
    trainingData = data.map(parsePoint)  # parse file to generate an RDD
    trainingData_wo_labels = trainingData.map(lambda x: x[1])  # remove label

    # normalize vector
    scaler = StandardScaler(withMean=True,
                            withStd=True).fit(trainingData_wo_labels)
    trainingData_wo_labels = scaler.transform(trainingData_wo_labels)

    model = KMeans.train(trainingData_wo_labels,
                         10,
                         maxIterations=250,
                         initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = model.centers[model.predict(
            point)]  # get centroid for cluster
        return math.sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = trainingData_wo_labels.map(lambda point: error(point)).reduce(
        lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))
Example #33
    sc.setLogLevel("warn")
    user_map = load_user_map(sc)
    # Load the training data
    train_data = load_train_data(sc)
    # Attach the user info to the data
    train_data_user_info = set_train_user_info(train_data, user_map)
    # user_id  merchant_id age_range gender label
    train_data_user_info.cache()
    stand_train_data_user_info = train_data_user_info.map(
        lambda user: user[0:4])
    stand_train_data_user_info_label = train_data_user_info.map(
        lambda user: user[4])

    # Standardize the training data
    std_scaler = StandardScaler(True, True).fit(stand_train_data_user_info)
    stand_train_data_user_info = std_scaler.transform(
        stand_train_data_user_info)

    train_data_user_info = stand_train_data_user_info_label.zip(
        stand_train_data_user_info)
    # Build the labeled data
    train_data_user_info = build_point(train_data_user_info)
    numIterations = 100

    train_data_user_info.cache()
    # Train the model
    model = SVMWithSGD.train(train_data_user_info, numIterations)
    #model = DecisionTree.trainClassifier(train_data_user_info,numIterations,2,{})

    # Load the test data
    test_data = load_test_data(sc)
    # Attach the user info to the data
Example #34
## in the gaussian case, we have achieved independence between variables.
## If the source variables are gaussian ICA is not required and PCA is sufficient.
    
    
# Code for PCA and whitening the dataset.

from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow, BlockMatrix
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors, DenseMatrix, Matrix
from sklearn import datasets
# create the standardizer model for standardizing the dataset

X_rdd = sc.parallelize(X).map(lambda x:Vectors.dense(x) )
scaler = StandardScaler(withMean = True, withStd = False).fit(X_rdd)

X_sc = scaler.transform(X_rdd)


#create the IndexedRowMatrix from rdd
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# compute the svd factorization of the matrix. First the number of columns and second a boolean stating whether 
# to compute U or not. 
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V is of shape n * k not k * n(as in sklearn)

P_comps = svd_o.V.toArray().copy()
num_rows = X_rm.numRows()
# U is whitened and projected onto principal components subspace.
Example #35
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    label = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)

    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))

    # Without converting the features into dense vectors, transformation with zero mean will raise
    # exception on sparse vector.
    # data2 will be unit variance and zero mean.
    data2 = label.zip(
        scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    # $example off$

    print("data1:")
    for each in data1.collect():
        print(each)

    print("data2:")
    for each in data2.collect():
        print(each)
Example #36
class MVA(object):
  
    """
    
    This class implements MVA (multivariate analysis) methods for feature extraction

    """

    _typeMVA=None
    _typeReg=None
    _typeNorm=None
    _tol=None
    _numVariables=None
    _M =None
    _R= None
    _data=None #An RDD with format ([y0, y1, ..., yM], [x0, x1, ..., xN])
    _normdata = None
    _scaler=None
    _U=None
    _regParam=None 
    _step=None
    _iterations=None
    _max_Ustep=None
    _W=None
    
    def __init__(self, typeMVA, typeReg,typeNorm, tol,numFeatures ,regParam=0.01, step=1e-3, iterations=100, max_Ustep=10):
        
        """Class initializer
        :param typeMVA:
            Type of MVA method: PCA, OPLS or CCA
        :param typeReg:
            Type of Regularization used:
        :param typeNorm:
            Type of Normalization used:
        """

        self._typeMVA=typeMVA
        self._typeReg=typeReg
        self._regParam=regParam
        self._typeNorm=typeNorm
        self._tol=tol
        self._R=numFeatures
        self._step=step
        self._iterations=iterations
        self._max_Ustep=max_Ustep
        
        if typeMVA not in ['PCA', 'OPLS', 'CCA']:
            print 'The type of MVA is not correct'


    def prepareData(self, data):

        if data.filter(lambda x: not isinstance(x,LabeledPoint)).count() == 0:
            #Case 1: All points in dataset are LabeledPoints
            #Check if number of features in X is constant
            x_len = data.map(lambda x: len(Vectors.dense(x.features.toArray()))).cache()
            self._numVariables = x_len.first()
            if len(x_len.distinct().collect())!=1:
                print 'All feature vectors should have the same length. Aborting.'
                return False
            try:
                if self._typeMVA=='PCA':
                    self._data = (data.map(lambda x: Vectors.dense(x.features.toArray()))
                                      .map(lambda x: (x, x)))
                    
                    self._M = self._numVariables
                   
                    
                else:
                    set_classes = data.map(lambda x: x.label).distinct().collect()
                    
                    self._M = len(set_classes)
                    
                        
                    
                    
                    self._data = data.map(lambda x: (Vectors.dense(label_binarize([x.label], classes=set_classes).flatten()), 
                                            Vectors.dense(x.features.toArray())))
                return True
            except:
                return False
        
        elif data.filter(lambda x: not isinstance(x,tuple)).count() ==0:
            #Case 2: All points in dataset are tuples of numpy arrays
            try:
                x_len = data.map(lambda x: len(Vectors.dense(x[1]))).cache()
                self._numVariables = x_len.first()
                if len(x_len.distinct().collect())!=1:
                    print 'All feature vectors should have the same length. Aborting.'
                    return False
                y_len = data.map(lambda x: len(Vectors.dense(x[0]))).cache()
                
                
                self._M = y_len.first()
                
                        
                    
                if len(y_len.distinct().collect())!=1:
                    print 'All label vectors should have the same length. Aborting.'
                    return False
                self._data = data.map(lambda x: (Vectors.dense(x[0]),
                                                 Vectors.dense(x[1])))
                return True
            except:
                return False
    
        elif self._typeMVA == 'PCA':
            #Case 3: If MVA is PCA, then RDD elements should be numpy arrays
            try:
                x_len = data.map(lambda x: len(Vectors.dense(x))).cache()
                self._numVariables = x_len.first()
                
               
                self._M = self._numVariables
                
                 
                
                if len(x_len.distinct().collect())!=1:
                    print 'All feature vectors should have the same length. Aborting.'
                    return False
                self._data = data.map(lambda x: (Vectors.dense(x),
                                                 Vectors.dense(x)))
                return True
            except:
                return False

        return False


    def calcCov(self,typeCov):
        """
        This function calculates the covariance matrix for the training data

        :param typeCov:
            Type of covariance matrix to be calculated, it can be Cyx or Cyy
        """

        if typeCov == 'Cyx' :
            Cyx = self._data.map(lambda x : np.dot(x[0][:,np.newaxis],x[1][:,np.newaxis].T)).mean()
            Cov=Cyx
        elif typeCov == 'Cyy':
            Cyy = self._data.map(lambda x : np.dot(x[0][:,np.newaxis],x[0][:,np.newaxis].T)).mean()
            Cov=Cyy
        else:
            print 'This type of covariance matrix cannot be calculated'
        return Cov


    def createOmega(self):
        """
        This function creates the Omega matrix for the step U and step W,
        it depends on the type of MVA method.

        """
        if self._typeMVA in ["PCA", "OPLS"] :
            Omega = np.eye(self._M)

        else :
            Cyy = self.calcCov('Cyy')
            Omega=np.linalg.inv(Cyy)

        return Omega

        
    def calcFrobeniusNorm(self,Uold,Unew):
        """
        This function calculates the Frobenius norm between two matrices

        """
        A=Uold-Unew
        return lin.norm(A,'fro')    


    def normalizer(self):
        """
        This function normalizes the training data
  
        """
  
        if self._typeNorm == 'norm':
            #Normalize input features
            RDD_X = self._data.map(lambda x: x[1])
            self._scaler = StandardScaler(withMean=True, withStd=True).fit(RDD_X)
            RDD_X_norm = self._scaler.transform(RDD_X)
            RDD_Y = self._data.map(lambda x: x[0])
            RDD_Y_norm = StandardScaler(withMean=True, withStd=False).fit(RDD_Y).transform(RDD_Y)
        else:
            #Normalize input features
            RDD_X = self._data.map(lambda x: x[1])
            self._scaler = StandardScaler(withMean=True, withStd=False).fit(RDD_X)
            RDD_X_norm = self._scaler.transform(RDD_X)
            if self._typeMVA == 'PCA':
                RDD_Y = self._data.map(lambda x: x[0])
                RDD_Y_norm = StandardScaler(withMean=True, withStd=False).fit(RDD_Y).transform(RDD_Y)
            else:
                RDD_Y_norm = self._data.map(lambda x: x[0])

        # Create a new RDD of LabeledPoint data using the normalized features
        self._normdata = RDD_Y_norm.zip(RDD_X_norm)


    def stepU(self,W,Omega, R):
        """
        This function calculates the step U
  
        :param W:
            W matrix
        :param Omega:
            Omega matrix
        :param R:
            Number of distinct classes minus one
        
        """
        U = np.empty((R,self._numVariables))

        for r in range(R):
            print 'Extracting projection vector ' + str(r) + ' out of ' + str(len(range(R)))
            Wr = W[:,r][:,np.newaxis]
            def createPseudoY(Y, W, Omega):
                """
                This function calculates Y' = W^T * Omega * Y for the step U
  
                :param Y:
                    RDD of labels or outputs
                :param W:
                    W matrix calculated in step W
                :param Omega:
                    Omega matrix 
                """  
                return np.squeeze(W.T.dot(Omega).dot(Y.T))
            PseudoY = self._normdata.map (lambda x : createPseudoY(x[0], Wr, Omega))
            Datar = self._normdata.zip(PseudoY).map(lambda x: LabeledPoint(x[1], x[0][1]))
            # Build the model
            lr = LinearRegressionWithSGD.train(Datar, iterations=self._iterations, regType=self._typeReg, regParam=self._regParam, step=self._step)
            U[r,:] = lr.weights

        return U


    def stepW(self, U, Cyx, Omega, Omega_1):
        """
        This function calculates the step W
  
        :param U:
            U matrix calculated in step U
        :param Cyx:
            The covariance matrix between the labels or outputs and the features
        :param Omega:
            Omega matrix
        :param Omega_1:
            The inverse of the omega matrix
        """
        print U.shape
        print Cyx.shape
        print Omega.shape
        A = Omega.dot(Cyx).dot(U.T)
        V, D, V2 = np.linalg.svd(A,full_matrices=False)
        W = np.dot(Omega_1,V)
        
        return W


    def computeMSE(self, U, W, trainingData):
        """
        This function computes the MSE

        :param U:
            U matrix 
        :param W:
            W matrix
        :param trainingData:
            RDD of training data
        """
        return trainingData.map(lambda x: np.mean(np.array(x.codedLabel - np.dot(W,np.dot(x.features, U.T)))**2)).mean()


    def fit(self, data):
        """
        This function fits the model. It calculates the matrix U where each
        column is a vector containing the coefficients for each extracted feature.

        :param data:
            
        """

        if self.prepareData(data):
            
            Omega= self.createOmega()
            Omega_1=np.linalg.inv(Omega)
            num_Ustep=0
            
            #Normalize data
            self.normalizer()

            #Initialize U and W variables
           
            R = int(np.minimum(self._M-1, self._R))
            U_old = np.empty((R,self._numVariables))
            Cyx=self.calcCov('Cyx')
            W = self.stepW(U_old,Cyx,Omega,Omega_1)
            U_new = self.stepU(W,Omega,R)
            
            
            while (self.calcFrobeniusNorm(U_old,U_new) > self._tol) and (num_Ustep<self._max_Ustep) :
                U_old=U_new
                W = self.stepW(U_old,Cyx,Omega,Omega_1)
                U_new = self.stepU(W,Omega,R)
                num_Ustep=num_Ustep + 1
                
                if num_Ustep==self._max_Ustep :
                    print 'You have reached the max number of U steps, change the tolerance'
                    
                print 'Frobenius norm error: ' + str(self.calcFrobeniusNorm(U_old,U_new))

            self._U=U_new
            self._W=W
    
    def predict(self, RDD_X2):
        """
        This function finds relevant features by computing X = U^T * X2. The model
        needs to be fitted first.

        :param RDD_X2:
          Training data 
        """
        if self._U is not None:
            RDD_norm = self._scaler.transform(RDD_X2)
            U = self._U
            RDD=RDD_norm.map(lambda x: x.dot(U.T))
            return RDD  
        else :
            print 'You have to fit the model first'
Example #37
        #ind_dict = {col_nbr:position_in_dict_list}
        ind_dict = {}
        dict_i = 0
        for e in Cat_Cols:
            new_coldict(dict_i, e, total_lines)  #add entries in cols to dicts
            ind_dict[e] = dict_i  #record col_nbr and dict_lst position
            dict_i += 1
        tsv_rdd = tsv_rdd.map(
            lambda x: string_freq(x))  # replace each string with its frequency (float)

    #Kmeans Cols
    a = tsv_rdd.map(lambda x: np.array(gen_lst(x)))

    #normalization
    scaler1 = StandardScaler().fit(a)
    a = scaler1.transform(a)

    #Kmeans
    clusters = KMeans.train(a,
                            k_cl,
                            maxIterations=10,
                            initializationMode="random")

    col_ind_rdd = tsv_rdd.map(lambda x: x[-1])  #col_ind col
    a = a.zip(col_ind_rdd).map(
        lambda x: add_id(x))  #add col_ind col back to KMeans_cols
    rdd_w_clusts = a.map(lambda x: np.array(addclustercols(x)))

    sel_outlier = rdd_w_clusts.map(lambda x: (x[2],(x[0],x[1])))\
                              .sortByKey(False)\
                              .take(nbr_out)
Example #38
print "Will load a dataset of size:\n\t", shape

rdd_data = sc.parallelize(flist).flatMap(reader('TEMP'))
first = rdd_data.first()

# In[Scaling]:

# Compute scaling parameters:
from pyspark.mllib.feature import StandardScaler, StandardScalerModel

scaler = StandardScaler(withMean=True, withStd=True).fit(rdd_data)

sample_mean = scaler.call('mean')

# Effectively scale the dataset:
rdd_norm = scaler.transform(rdd_data)

# In[Reduction]:

# Compute PCA new dimensions:
from pyspark.mllib.feature import PCA as PCAmllib

Neof = 20
reducer = PCAmllib(Neof).fit(rdd_norm)
# print type(reducer)

# Effectively reduce the dataset:
rdd_reduced = reducer.transform(rdd_norm)
# print type(rdd_reduced)

# In[Classification with k-mean]:
Example #39
housingData = housingVals.map(toLabeledPoint)

#Section 7.4.5
sets = housingData.randomSplit([0.8, 0.2])
housingTrain = sets[0]
housingValid = sets[1]

#Section 7.4.6
from pyspark.mllib.feature import StandardScaler
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda x: x.features))
trainLabel = housingTrain.map(lambda x: x.label)
trainFeatures = housingTrain.map(lambda x: x.features)
validLabel = housingValid.map(lambda x: x.label)
validFeatures = housingValid.map(lambda x: x.features)
trainScaled = trainLabel.zip(scaler.transform(trainFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
validScaled = validLabel.zip(scaler.transform(validFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))

#Section 7.5
from pyspark.mllib.regression import LinearRegressionWithSGD
alg = LinearRegressionWithSGD()
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)

#Section 7.5.1
validPredicts = validScaled.map(lambda x: (float(model.predict(x.features)), x.label))
validPredicts.collect()
import math
RMSE = math.sqrt(validPredicts.map(lambda p: pow(p[0]-p[1],2)).mean())
Example #40
max_time = 23 * 3600 + 59 * 60 + 59
#max_time = 16 * 60
low = 0
high = 15 * 60
modelList = []

while low < max_time: # Temp should run once
	timeseries = df.filter(lambda x: low < x.timestamp < high)	

	#if timeseries.count() > 0:
	features = timeseries.map(lambda row: row[1:])
		#print "Possible points"
		#print features.collect()

	model = StandardScaler().fit(features)
	features_t = model.transform(features)
	
	label = timeseries.map(lambda row: row[0])
	labeled_data = label.zip(features_t)

	final_data = labeled_data.map(lambda row: LabeledPoint(row[0], row[1]))
	
	model = LinearRegressionWithSGD.train(final_data, 1000, .0000001, intercept=True)
		#model = RidgeRegressionWithSGD.train(final_data, 1000, .00000001, intercept=True)
		#model = LassoWithSGD.train(final_data, 1000, .00000001, intercept=True)
	modelList.append(model)
		

		#print ""
		#print "Model1 weights " + str(model.weights)
		#print ""
Example #41
def main(argv):

	verbose = False

	dbpath = '/root/data/AdditionalFiles/'
	tagstring = 'rock'
	usealldata = False

	holdout = 0.1
	model_iterations = 100
	model_step = 1.0
	model_intercept = True

	# possible types logistic and svm
	model_type = 'logistic'

	try:
		opts, args = getopt.getopt(argv,"hvd:t:am:s:i:o:c",["help","verbose","datapath=","tagstring=","alldata","model=","step=","iterations=","holdout=","intercept"])
	except getopt.GetoptError:
		print 'rockTag.py -d <data path> -t <tag string>'
		sys.exit(2)
	for opt, arg in opts:
		if opt == '-h':
			print('rockTag.py -d <data path> -t <tag string>')
			sys.exit()
		elif opt in ("-v", "--verbose"):
			verbose = True
		elif opt in ("-d", "--datapath"):
			dbpath = arg
		elif opt in ("-t", "--tagstring"):
			tagstring = str(arg).lower()
		elif opt in ("-a", "--alldata"):
			usealldata = True
		elif opt in ("-m", "--model"):
			if str(arg).lower() in ['logistic','svm']:
				model_type = str(arg).lower()
			else:
				print('valid models are logistic and svm')
				sys.exit()
		elif opt in ("-s", "--step"):
			model_step = float(arg)
		elif opt in ("-i", "--iterations"):
			model_iterations = int(arg)
		elif opt in ("-o", "--holdout"):
			holdout = float(arg)
			if holdout <= 0 or holdout >= 1:
				print('holdout must be greater than 0 and less than 1')
		elif opt in ("-c", "--intercept"):
			model_intercept = True

	if verbose:
		print('data path: ' + dbpath)
		print('tag string: ' + tagstring)

	labels, features = getLabelsAndFeatures(dbpath, tagstring=tagstring, verbose=verbose, usealldata=usealldata)

	# scale features
	std = StandardScaler(True, True).fit(features)
	features = std.transform(features)

	# make labeled data
	labeledData = labels.zip(features).map(lambda (label, data): LabeledPoint(label, data))
	if verbose: labeledData.take(3)

	# rebalance samples
	equalSampleData = rebalanceSample(labeledData, verbose=verbose)

	# split data
	trainData, testData = randomSplit(equalSampleData, [1-holdout, holdout])
	if verbose: trainData.map(lambda p: (p.label, p.features)).take(3)

	# train model
	if model_type == 'logistic':
		model = LogisticRegressionWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)
	elif model_type == 'svm':
		model = SVMWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)

	evalString = evaluateModel(model, testData)
	print(evalString)
Example #42
indexed_train_bin = train_data.map(parseRowIndexingBinary)
indexed_test_bin = test_data.map(parseRowIndexingBinary)
oneHot_train_bin = train_data.map(parseRowOneHotBinary)
oneHot_test_bin = test_data.map(parseRowOneHotBinary)
indexed_train_reg = train_data.map(parseRowIndexingRegression)
indexed_test_reg = test_data.map(parseRowIndexingRegression)
oneHot_train_reg = train_data.map(parseRowOneHotRegression)
oneHot_test_reg = test_data.map(parseRowOneHotRegression)

## FEATURE SCALING ##

label = oneHot_train_reg.map(lambda x: x.label)
features = oneHot_train_reg.map(lambda x: x.features)
scaler = StandardScaler(withMean=False, withStd=True).fit(features)
data_temp_ = label.zip(
    scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
oneHot_train_reg_scaled = data_temp_.map(lambda x: LabeledPoint(x[0], x[1]))

label = oneHot_test_reg.map(lambda x: x.label)
features = oneHot_test_reg.map(lambda x: x.features)
scaler = StandardScaler(withMean=False, withStd=True).fit(features)
data_temp_ = label.zip(
    scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
oneHot_test_reg_scaled = data_temp_.map(lambda x: LabeledPoint(x[0], x[1]))

## CACHE-Y CACHE CACHE ##
indexed_train_bin.cache()
indexed_test_bin.cache()
oneHot_train_bin.cache()
oneHot_test_bin.cache()
indexed_train_reg.cache()