def prepareData(sc): print 'import training data' rawDataWithHeader = sc.textFile(Path + 'train.tsv') print rawDataWithHeader.take(10) header = rawDataWithHeader.first() rawData = rawDataWithHeader.filter(lambda x:x != header) rData = rawData.map(lambda x: x.replace("\"","")) lines = rData.map(lambda x: x.split("\t")) print lines.count() categoriesMap = lines.map(lambda fields:fields[3]).distinct().zipWithIndex().collectAsMap() print categoriesMap labelRDD = lines.map(lambda r: extractLabel(r)) featureRDD = lines.map(lambda r: extractFeatures(r,categoriesMap,len(r)-1)) # print featureRDD.take(1) stdScaler = StandardScaler(withMean=True,withStd=True).fit(featureRDD) ScalerFeatureRDD = stdScaler.transform(featureRDD) # print ScalerFeatureRDD.take(1) labelPoint = labelRDD.zip(ScalerFeatureRDD) labelPointRDD = labelPoint.map(lambda r: LabeledPoint(r[0],r[1])) # print labelPointRDD.take(1) (trainData, testData, validationData) = labelPointRDD.randomSplit([8, 1, 1]) print trainData.count() print testData.count() print validationData.count() return (trainData, testData, validationData, categoriesMap)
def PrepareData(sc):
    #---------------------- 1. Load and transform the data -------------
    print("Loading data...")
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("Total: " + str(lines.count()) + " records")
    #---------------------- 2. Build the RDD[LabeledPoint] needed for training and evaluation -------------
    print("Before standardization:")
    categoriesMap = lines.map(lambda fields: fields[3]). \
        distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    for i in featureRDD.first():
        print(str(i) + ",", end=" ")
    print("")
    print("After standardization:")
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print(str(i) + ",", end=" ")
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    #---------------------- 3. Randomly split the data into 3 parts and return them -------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("Data split into trainData:" + str(trainData.count()) +
          " validationData:" + str(validationData.count()) +
          " testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)  # return the data
def prepare_data(sc):
    #---------------------- 1. Load and transform the data -------------
    print("Loading data...")
    raw_data_with_header = sc.textFile(os.path.join(PATH, 'data/train.tsv'))
    header = raw_data_with_header.first()
    raw_data = raw_data_with_header.filter(lambda x: x != header)
    # Strip the "" quotes and split each page record on \t
    lines_rdd = raw_data.\
        map(lambda x: x.replace("\"", "")).\
        map(lambda x: x.split('\t'))
    print("Total: {} records".format(lines_rdd.count()))
    #--------------------- 2. Standardize the data -----------------------
    # {news category: index, }
    categories_map = lines_rdd.map(lambda fields: fields[3]).\
        distinct().zipWithIndex().collectAsMap()
    label_rdd = lines_rdd.map(lambda r: get_label(r))
    features_rdd = lines_rdd.map(lambda r: get_features(r, categories_map, len(r) - 1))
    scaler = StandardScaler(withMean=True, withStd=True).fit(features_rdd)
    stand_features = scaler.transform(features_rdd)
    #---------- 3. Build the RDD[LabeledPoint] needed for training and evaluation -------
    labeledpoint_rdd = label_rdd.zip(stand_features).map(lambda r: LabeledPoint(r[0], r[1]))
    #----------- 4. Randomly split the data into 3 parts and return them -------------
    (trainData, validationData, testData) = labeledpoint_rdd.randomSplit([0.8, 0.1, 0.1])
    print("Data split into trainData: {0}, validationData: {1}, testData: {2}".format(
        trainData.count(), validationData.count(), testData.count()
    ))
    return (trainData, validationData, testData, categories_map)  # return the data
def PrepareData(sc):
    #---------------------- 1. Load and transform the data -------------
    print("Loading data...")
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("Total: " + str(lines.count()) + " records")
    #---------------------- 2. Build the RDD[LabeledPoint] needed for training and evaluation -------------
    print("Before standardization:")
    categoriesMap = lines.map(lambda fields: fields[3]). \
        distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    for i in featureRDD.first():
        print(str(i) + ",", end=" ")
    print("")
    print("After standardization:")
    stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print(str(i) + ",", end=" ")
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    #---------------------- 3. Randomly split the data into 3 parts and return them -------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("Data split into trainData:" + str(trainData.count()) +
          " validationData:" + str(validationData.count()) +
          " testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)  # return the data
def PrepareData(sc): print("开始导入数据。。。") path = Path + "train.tsv" print(path) # 使用minPartitions=40,将数据分成40片,不然报错 rawDataWithHeader = sc.textFile(path, minPartitions=40) header = rawDataWithHeader.first() # 去掉首行,标题 rawData = rawDataWithHeader.filter(lambda x: x != header) # 去掉引号 rData = rawData.map(lambda x: x.replace("\"", "")) # 按照制表符分字段 lines = rData.map(lambda x: x.split("\t")) print("总共有:", str(lines.count())) #----2。创建训练所需的RDD数据 categoriesMap = lines.map( lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap() labelRDD = lines.map(lambda r: extract_label(r)) featureRDD = lines.map(lambda r: extractFeatures(r, categoriesMap, len(r) - 1)) print(featureRDD.first()) #----3.随机分成3部分数据返回 print("数据标准化之后===:") stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD) scalerFeatureRDD = stdScaler.transform(featureRDD) print(scalerFeatureRDD.first()) labelPoint = labelRDD.zip(scalerFeatureRDD) labelpointRDD = labelPoint.map(lambda r: LabeledPoint(r[0], r[1])) (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1]) print("数据集划分为:trainData:", str(trainData.count()), "validationData:", str(validationData.count()), "testData:", str(testData.count())) return (trainData, validationData, testData, categoriesMap)
def PrepareData(sc):
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("total " + str(lines.count()))
    print("=======before standardization========")
    categoriesMap = lines.map(lambda fields: fields[3]) \
        .distinct() \
        .zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    for i in featureRDD.first():
        print(str(i) + ", ")
    print("=======after standardization========")
    stdScale = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    scaleFeatureRDD = stdScale.transform(featureRDD)
    for i in scaleFeatureRDD.first():
        print(str(i) + ",")
    labelPoint = labelRDD.zip(scaleFeatureRDD)
    labelPointRDD = labelPoint.map(lambda r: LabeledPoint(r[0], r[1]))
    (trainData, validationData, testData) = labelPointRDD.randomSplit([8, 1, 1])
    return (trainData, validationData, testData, categoriesMap)
def test_model_transform(self): data = [ [1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0] ] model = StandardScaler().fit(self.sc.parallelize(data)) self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
def getScaledData(data): features = data.map(lambda x: x.features) label = data.map(lambda x: x.label) scaler = StandardScaler(withMean=True, withStd=True).fit(features) scaled = label\ .zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))\ .map(lambda x: LabeledPoint(x[0], x[1])) return scaled
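A minimal usage sketch for the helper above, assuming an active SparkContext named sc and the usual pyspark.mllib imports (StandardScaler, Vectors, LabeledPoint) already in scope; the tiny sample data is made up purely for illustration:

from pyspark.mllib.regression import LabeledPoint

sample = sc.parallelize([
    LabeledPoint(0.0, [1.0, 10.0]),
    LabeledPoint(1.0, [2.0, 20.0]),
    LabeledPoint(1.0, [3.0, 30.0]),
])
scaled = getScaledData(sample)
# Each feature column now has zero mean and unit variance.
print(scaled.collect())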
def __init__(self): Dataset.__init__(self) trainDirectory = HDFS_DIRECTORY + 'rotated_checkerboard2x2_train.txt' train = sc.textFile(trainDirectory) features = train.map(lambda _: _.split(' ')[:-1]) labels = train.map(lambda _: _.split(' ')[-1]) scaler = StandardScaler(withMean=True, withStd=True).fit(features) self.trainSet = labels.zip(scaler.transform(features)) \ .map(lambda _: LabeledPoint(_[0], _[1])) testDirectory = HDFS_DIRECTORY + 'rotated_checkerboard2x2_test.txt' test = sc.textFile(testDirectory) features = test.map(lambda _: _.split(' ')[:-1]) labels = test.map(lambda _: _.split(' ')[-1]) scaler = StandardScaler(withMean=True, withStd=True).fit(features) self.testSet = labels.zip(scaler.transform(features)) \ .map(lambda _: LabeledPoint(_[0], _[1]))
def test_model_setters(self): data = [ [1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0] ] model = StandardScaler().fit(self.sc.parallelize(data)) self.assertIsNotNone(model.setWithMean(True)) self.assertIsNotNone(model.setWithStd(True)) self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([-1.0, -1.0, -1.0]))
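The expected vectors in these two tests follow directly from the column statistics of the three training rows: with both flags left at their defaults the fitted model returns vectors unchanged, and with withMean/withStd enabled each column (means [2, 3, 4], corrected sample standard deviations [1, 1, 1]) standardizes [1.0, 2.0, 3.0] to [-1.0, -1.0, -1.0]. A quick NumPy check of the same arithmetic, independent of Spark:

import numpy as np

data = np.array([[1.0, 2.0, 3.0],
                 [2.0, 3.0, 4.0],
                 [3.0, 4.0, 5.0]])
mean = data.mean(axis=0)          # [2. 3. 4.]
std = data.std(axis=0, ddof=1)    # [1. 1. 1.] -- MLlib uses the corrected sample std
print((np.array([1.0, 2.0, 3.0]) - mean) / std)   # [-1. -1. -1.]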
def __init__(self): Dataset.__init__(self) # preparing the Data (Train and Test) : formatting and scaling then making it an RDD of LabeledPoints trainDirectory = HDFS_DIRECTORY + 'checkerboard2x2_train.txt' train = sc.textFile(trainDirectory) features = train.map(lambda _: _.split(' ')[:-1]) labels = train.map(lambda _: _.split(' ')[-1]) scaler = StandardScaler(withMean=True, withStd=True).fit(features) self.trainSet = labels.zip(scaler.transform(features))\ .map(lambda _: LabeledPoint(_[0], _[1])) testDirectory = HDFS_DIRECTORY + 'checkerboard2x2_test.txt' test = sc.textFile(testDirectory) features = test.map(lambda _: _.split(' ')[:-1]) labels = test.map(lambda _: _.split(' ')[-1]) scaler = StandardScaler(withMean=True, withStd=True).fit(features) self.testSet = labels.zip(scaler.transform(features))\ .map(lambda _: LabeledPoint(_[0], _[1])) ''' this block is for testing '''
def __init__(self): Dataset.__init__(self) trainDirectory = HDFS_DIRECTORY + 'striatum_train_mini.txt' train = sc.textFile(trainDirectory) features = train.map(lambda _: _.strip().split(' ')[:-1]) labels = train.map(lambda _: _.strip().split(' ')[-1]) scaler = StandardScaler(withMean=True, withStd=True).fit(features) self.trainSet = labels.zip(scaler.transform(features)) \ .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1])) testDirectory = HDFS_DIRECTORY + 'striatum_test_mini.txt' test = sc.textFile(testDirectory) features = test.map(lambda _: _.split(' ')[:-1]) labels = test.map(lambda _: _.split(' ')[-1]) # AN ISSUE HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # in original LAL code they scaled testset with the scaler fitted from TRAINING set, but why? scaler = StandardScaler(withMean=True, withStd=True).fit(features) self.testSet = labels.zip(scaler.transform(features)) \ .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1]))
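On the question raised in the comment above: the usual practice is to fit the scaler on the training features only and reuse that fitted model for the test features, so the test set is measured on the training set's scale and no test-set statistics leak into preprocessing. A minimal sketch of that pattern, assuming sc and HDFS_DIRECTORY as defined above and the same space-separated file format; the load helper name is illustrative only:

from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint

def load(path):
    rows = sc.textFile(path).map(lambda line: line.strip().split(' '))
    features = rows.map(lambda r: [float(v) for v in r[:-1]])
    labels = rows.map(lambda r: 0 if r[-1] == '-1' else 1)
    return labels, features

train_labels, train_features = load(HDFS_DIRECTORY + 'striatum_train_mini.txt')
test_labels, test_features = load(HDFS_DIRECTORY + 'striatum_test_mini.txt')

# Fit on the training features only...
scaler = StandardScaler(withMean=True, withStd=True).fit(train_features)
# ...then apply the same fitted model to both sets.
train_set = train_labels.zip(scaler.transform(train_features)) \
    .map(lambda p: LabeledPoint(p[0], p[1]))
test_set = test_labels.zip(scaler.transform(test_features)) \
    .map(lambda p: LabeledPoint(p[0], p[1]))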
def TrainLRModel(trainData, iterations, step, miniBatchFraction): # Logistic Regression srcFeatures = trainData.map(lambda line: line.features) print srcFeatures.first() scaler = StandardScaler(withMean=True, withStd=True).fit(srcFeatures) srcLabel = trainData.map(lambda line: line.label) scaledFeature = scaler.transform(srcFeatures) print scaledFeature.first() scaledData = srcLabel.zip(scaledFeature) trainData = scaledData.map( lambda (label, features): LabeledPoint(label, features)) model = LogisticRegressionWithSGD.train(data = trainData, iterations = iterations, step = step, \ miniBatchFraction = miniBatchFraction) return model
def training(model_directory, libsvm, scaler): sc = SparkContext(appName="PythonLinearRegressionWithSGDExample") training_rdd = MLUtils.loadLibSVMFile(sc, libsvm) training_rdd.cache() if scaler == '1': label = training_rdd.map(lambda x: x.label) features = training_rdd.map(lambda x: x.features) scaler1 = StandardScaler().fit(features) data1 = label.zip(scaler1.transform(features)) # convert into labeled point data2 = data1.map(lambda x: LabeledPoint(x[0], x[1])) model_logistic = LogisticRegressionWithLBFGS.train(data2) else: model_logistic = LogisticRegressionWithLBFGS.train(training_rdd) model_logistic.save(sc, model_directory)
def PrepareData(sc):
    '''
    Prepare the data
    :param sc:
    :return: (trainData, validationData, testData, categoriesMap)
    '''
    print('======================= Preparing data =======================')
    # ----------------------------- 1. Load and transform the data -----------------------------
    print('========== [PrepareData] >>>> Loading train.tsv data....')
    rawDataWithHeader = sc.textFile(Path + u'data/stumbleupon/train-100.tsv')
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace('\"', ''))
    lines = rData.map(lambda x: x.split('\t'))
    print('========== [PrepareData] >>>> Total: ' + str(lines.count()) + ' records')
    # ----------------------------- 2. Build the RDD[LabeledPoint] needed for training and evaluation -----------------------------
    # categoriesMap = lines.map(lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    # labelpointRDD = lines.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, categoriesMap, -1)))
    print('========== [PrepareData] >>>> Before standardization:'),
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    for i in featureRDD.first():
        print('\t\t' + str(i) + '(' + str(type(i)) + '),'),
    print('')
    print('========== [PrepareData] >>>> After standardization:'),
    # Build the standardization scaler: the numeric feature fields use different units and differ
    # widely in magnitude, so they cannot be compared directly and need to be standardized.
    # withMean=False is used here so the output stays sparse instead of being densified by mean subtraction.
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print('\t\t' + str(i) + '(' + str(type(i)) + '),'),
    # zip the labels with the standardized features to build the labeled points
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    # ----------------------------- 3. Randomly split the data into 3 parts and return them -----------------------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print('========== [PrepareData] >>>> Randomly split the data into three parts: trainData: ' +
          str(trainData.count()) + ' records, validationData: ' + str(validationData.count()) +
          ' records, testData: ' + str(testData.count()) + ' records')
    # ----------------------------- 4. Return a tuple of the data -----------------------------
    return (trainData, validationData, testData, categoriesMap)
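If mean-centering is wanted despite the sparsity concern noted above, the usual workaround (the same one used in the loadLibSVMFile examples further down this page) is to densify each feature vector before transforming, since transform with withMean=True raises on SparseVector input. A minimal sketch, assuming featureRDD holds mllib vectors as built above:

from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors

# fit() works on sparse vectors, but transform with withMean=True needs dense rows,
# so densify first (this costs memory on very wide feature vectors).
stdScalerMean = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
denseFeatureRDD = featureRDD.map(lambda v: Vectors.dense(v.toArray()))
scaledDenseRDD = stdScalerMean.transform(denseFeatureRDD)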
class StandardScalerNormalizer: def __init__(self): self.normalizer = None def norm_train(self, train_data): train_features = train_data.map(lambda lp: lp.features) self.normalizer = StandardScaler().fit(train_features) # TODO: This can't be efficient... #return train_data.map(lambda lp: lp.label).zip(self.norm(train_features)).map(lambda r: LabeledPoint(r[0], r[1])) labels = train_data.map(lambda lp: lp.label).collect() features = self.norm(train_features).collect() return get_df(zip( labels, features)).rdd.map(lambda r: LabeledPoint(r[0], r[1])) def norm(self, data): return self.normalizer.transform(data) def __str__(self): return 'StandardScaler'
def PrepareData(sc):
    #--------------------- 1. Load and transform the data ---------------------
    global Path
    if sc.master[:5] == "local" or sc.master[:5] == "spark":
        Path = "file:/Users/johnnie/pythonwork/workspace/PythonProject/data/"
    else:
        Path = "hdfs://localhost:9000/user/hduser/test/data/"
    print("Loading data...")
    rawDataWithHeader = sc.textFile(Path + "train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("Total: " + str(lines.count()) + " records")
    #--------------------- 2. Build the RDD[LabeledPoint] needed for training and evaluation ---------------------
    print("Before standardization:")
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    print(featureRDD.first())
    print("\n")
    print("After standardization:")
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    print(ScalerFeatureRDD.first())
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    # r[0] is the label
    # r[1] is the feature vector
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    #--------------------- 3. Randomly split the data into 3 parts and return them ---------------------
    trainData, validationData, testData = labelpointRDD.randomSplit([8, 1, 1])
    print("Data split into trainData: " + str(trainData.count()) +
          " validationData: " + str(validationData.count()) +
          " testData: " + str(testData.count()))
    return trainData, validationData, testData, categoriesMap
# 27 = tempo # 28 = time_signature allData = trackRocks.join(songData).map(lambda (tr, (rocks, data)): (tr, (0.0 if rocks is None else rocks, data))) allData.take(3) # label data # only uses one feature for now # labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [data[6]])) # labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [random.random() + (.5 if rocks == 1 else 0)])) labels = allData.map(lambda (tr, (rocks, data)): rocks) features = allData.map(lambda (tr, (rocks, data)): data) std = StandardScaler(True, True).fit(features) scaledFeatures = std.transform(features) labeledData = labels.zip(scaledFeatures).map(lambda (label, data): LabeledPoint(label, data)) # uses all extracted # labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [x for x in data])) labeledData.take(3) # make sample sizes equal labeledRock = labeledData.filter(lambda p: p.label == 1.0) labeledRock.count() labeledRock.map(lambda p: p.features[0]).mean() nrock = labeledRock.count() labeledNotRock = labeledData.filter(lambda p: p.label != 1.0)
#path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/" data = MLUtils.loadLibSVMFile(sc, path) labels = data.map(lambda x: x.label) features = data.map(lambda x: x.features) #normalize: #scaler = StandardScaler(withMean = True, withStd = True).fit(features) #data needs to be dense (zeros included) scaler = StandardScaler(withMean=False, withStd=True).fit( features) #becomes dense if using withMean. may run out of memory locally #convert data to dense vector to be normalized #data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray())))) data2 = labels.zip( scaler.transform(features)) #use this line if having memory issues #hide 10% of the data for final test data, test = data2.randomSplit([.9, .1]) #get size of chunks for 10-fold cross-validation num_folds = 10 partitionSize = (len(data.collect()) / num_folds ) #parameterize this value as num_folds (in loop as well) #train/validate 10 times on each k i = 0 j = partitionSize data = data.collect() cv_error_storage = []
def main(): appName = "BadOrGood;zl" conf = (SparkConf() .setAppName(appName) .set("spark.executor.memory", "5g") .set("spark.executor.cores","3") .set("spark.executor.instance", "3") ) sc = SparkContext(conf = conf) hc = HiveContext(sc) #fetch data #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd' #fetchDataToFile(hc, filepath) #load data # AllDataRawrdd = sc.pickleFile(filepath) \ # .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \ # .repartition(10) AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10) #standardizer for train and test data model = StandardScaler(True, True) \ .fit( AllDataRawrdd \ .map( lambda _: Vectors.dense(_['feature']) ) ) labels = AllDataRawrdd.map(lambda _: _['label']) featureTransformed = model.transform( AllDataRawrdd.map(lambda _: _['feature']) ) AllDataRawrdd = labels \ .zip(featureTransformed) \ .map( lambda _: { 'label':_[0], 'feature':_[1] } ) #sampling trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100) trainDatardd = trainDataRawrdd.map( lambda _: LabeledPoint( _['label'], _['feature'] ) ).persist() testDatardd = testDataRawrdd.map( lambda _: {'label': _['label'], 'feature': list(_['feature']) } ).persist() #prediction & test lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1") resultrdd = test(lrmLBFGS, testDatardd) lrmLBFGSFone = fone(resultrdd) lrmLBFGSac = accuracy(resultrdd) lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1") resultrdd = test(lrmSGD, testDatardd) lrmSGDFone = fone(resultrdd) lrmSGDac = accuracy(resultrdd) dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10) resultrdd = test(dt, testDatardd) dtFone = fone(resultrdd) dtac = accuracy(resultrdd) rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10) resultrdd = test(rf, testDatardd) rfFone = fone(resultrdd) rfac = accuracy(resultrdd) print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac) print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac) print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac) print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac) print lrmLBFGS.weights print lrmSGD.weights sc.stop()
# step 1 - create spark context
conf = SparkConf().setAppName("KMeans-Content")\
                  .set("spark.executor.memory","1g")
sc = SparkContext()

# step 2 - load in input file
data = MLUtils.loadLibSVMFile(sc,"/Users/Ellen/Desktop/movie_features_dataset.dat")
labels = data.map(lambda x:x.label)
features = data.map(lambda x:x.features)

# step 3 - standardize the data to unit variance (the mean is left as-is since withMean=False)
scaler = StandardScaler(withMean=False,withStd=True).fit(features)
data2 = labels.zip(scaler.transform(features))

numFeatures = len(data2.values().take(10)[0])
print "Type of data2: ",type(data2)  #RDD
print "Type of data2.values(): ",type(data2.values())  # pipelinedrdd
print "Sample: ",data2.values().take(1)[0]

# splitting up the data to training, validation and testing models.
train,val,test = data2.randomSplit([.80,.10,.10])
print "Training Dataset Size:",train.count()
print "Validation Dataset size:",val.count()
print "Test Dataset Size:",test.count()
from pyspark.mllib.util import MLUtils # $example off$ if __name__ == "__main__": sc = SparkContext(appName="StandardScalerExample") # SparkContext # $example on$ data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") label = data.map(lambda x: x.label) features = data.map(lambda x: x.features) scaler1 = StandardScaler().fit(features) scaler2 = StandardScaler(withMean=True, withStd=True).fit(features) # data1 will be unit variance. data1 = label.zip(scaler1.transform(features)) # data2 will be unit variance and zero mean. data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray())))) # $example off$ print("data1:") for each in data1.collect(): print(each) print("data2:") for each in data2.collect(): print(each) sc.stop()
    predict = model.predict(data.map(lambda x: x.features))
    scoreAndLabel = predict.zip(data.map(lambda x: x.label))
    metrics = BinaryClassificationMetrics(scoreAndLabel)
    return metrics.areaUnderROC

# Inspect the structure of the decision tree
model.toDebugString()

# Logistic regression
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.feature import StandardScaler
### Standardization
stdscaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
scaledFeature = stdscaler.transform(featureRDD)
labelPoint = labelRDD.zip(scaledFeature)
labelPointRDD = labelPoint.map(lambda x: LabeledPoint(x[0], x[1]))
# model
model = LogisticRegressionWithSGD.train(labelPointRDD, num_iter, learning_rate, batch_size)

# svm
from pyspark.mllib.classification import SVMWithSGD
model = SVMWithSGD.train(trainData, num_iter, learning_rate, regParam)

# naiveBayes
from pyspark.mllib.classification import NaiveBayes
path = sys.argv[1] #path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/" data = MLUtils.loadLibSVMFile(sc, path) labels = data.map(lambda x: x.label) features = data.map(lambda x: x.features) #normalize: #scaler = StandardScaler(withMean = True, withStd = True).fit(features) #data needs to be dense (zeros included) scaler = StandardScaler(withMean = False, withStd = True).fit(features) #becomes dense if using withMean. may run out of memory locally #convert data to dense vector to be normalized #data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray())))) data2 = labels.zip(scaler.transform(features)) #use this line if having memory issues #hide 10% of the data for final test data, test = data2.randomSplit([.9, .1]) #get size of chunks for 10-fold cross-validation num_folds = 10 partitionSize = (len(data.collect())/num_folds) #parameterize this value as num_folds (in loop as well) #train/validate 10 times on each k i = 0 j = partitionSize data = data.collect() cv_error_storage = [] #10 fold is better, but I use 5 here in the interest of time
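The fold slicing above pulls the entire dataset to the driver with collect(); an alternative sketch that keeps the folds as RDDs, assuming data2 is the (label, scaled-features) RDD built above and num_folds as defined there:

# Split the scaled data into num_folds roughly equal RDD folds.
folds = data2.randomSplit([1.0 / num_folds] * num_folds, seed=42)
for k in range(num_folds):
    validation = folds[k]
    training = sc.union([f for i, f in enumerate(folds) if i != k])
    # fit and score a model on (training, validation) here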
print "max of each column:" print matrixSummary.max() print "variance of each column:" print matrixSummary.variance() #distribution_data() labels = data.map(lambda p: p.label) features = data.map(lambda p: p.features) vectors = data.map(lambda p: p.features) scaler = StandardScaler(withMean=True, withStd=True).fit(vectors) #scaledData=data.map(lambda p:LabeledPoint(p.label, scaler.transform(p.features))) scaled_data = labels.zip(scaler.transform(features)) scaledData = scaled_data.map(lambda (x, y): LabeledPoint(x, y)) #scaledData.cache() #print scaledData.first().features def predict_SVMWithSGD(numIterations, step, regParam, regType): """ SVMWithSGD.train(data,iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, regType='l2',intercept=False, validateData=True,convergenceTol=0.001) data: the training data, an RDD of LabeledPoint iterations: the number of iterations, default 100 step: the step parameter used in SGD, default 1.0 regParam: the regularizer parameter, default 0.01 miniBatchFraction: fraction of data to be used for each SGD iteration, default 1.0 initialWeights: the initial weights, default None regType: the type of regularizer used for training our model, allowed values ('l1':for using L1 regularization; 'l2':for using L2 regularization, default; None: for no regularization)
# Ok, reload the data # rdd_loaded = sc.pickleFile( 'hdfs://br156-161.ifremer.fr:8020/tmp/venthsalia_hdp/rdd.pkl') rdd_loaded = rdd_loaded.cache() rdd_loaded.count() rdd_b = rdd_loaded.flatMap(lambda x: x[2]).map(lambda x: Vectors.dense(x)) print rdd_b.count() print rdd_b.take(1) # # Profiles standardisation # new_scalar = StandardScaler(withMean=True, withStd=True).fit(rdd_b) print type(new_scalar) scaler3 = new_scalar.transform(rdd_b) # # Profiles compression with PCA # model = PCAmllib(10).fit(scaler3) print type(model) transformed = model.transform(scaler3) print type(transformed) print transformed.count() print transformed.first() # # Train a Profiles classification model with KMean # NBCLUSTERS = 8
def extract_label(fields): label = fields[-1] return label from pyspark.mllib.regression import LabeledPoint # labelPointRDD = lines.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, categoriesMap, -1))) labelRDD = lines.map(lambda r: extract_label(r)) featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, -1)) from pyspark.mllib.feature import StandardScaler stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD) ScalerFeatureRDD = stdScaler.transform(featureRDD) labelPoint = labelRDD.zip(ScalerFeatureRDD) labelPointRDD = labelPoint.map(lambda r: LabeledPoint(r[0], r[1])) trainData, validationData, testData = labelPointRDD.randomSplit([8, 1, 1]) # temporary save data into memory to speed up the later process trainData.persist() validationData.persist() testData.persist() # train model from pyspark.mllib.tree import DecisionTree model = DecisionTree.trainClassifier(trainData, numClasses=2,
def norm(features): scaler = StandardScaler(withMean=False, withStd=False).fit(features) return scaler.transform(features)
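As written, this helper builds a scaler with both flags off, which leaves the features unchanged (PySpark warns that such a model does nothing). To actually standardize, at least one flag has to be enabled; a minimal variant under that assumption (norm_standardized is a made-up name, and withMean=True assumes dense feature vectors):

from pyspark.mllib.feature import StandardScaler

def norm_standardized(features):
    # Zero mean and unit variance per column; withMean=True needs dense vectors.
    scaler = StandardScaler(withMean=True, withStd=True).fit(features)
    return scaler.transform(features)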
logger = sc._jvm.org.apache.log4j logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR ) logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR ) def parsePoint(data): #return LabeledPoint(data[3],np.append(data[0:3],data[4:])) return LabeledPoint(data[0],data[1:]) # store the data from cassandra to a data frame and remove the NA value data=sc.cassandraTable("msd_01", "songs").select("song_hotttnesss","loudness","year","sentiment","tempo","unique_words").toDF() data=data.filter("year>0").na.drop() print data.count() # Scale the features with Standard Scaler data2=data.map(lambda x: [x.song_hotttnesss, x.loudness,x.year, x.sentiment,x.tempo,x.unique_words])#Convert each sql.row to an array scaler= StandardScaler(withMean=True, withStd=True).fit(data2) #fit a scaler on the every column scaledData = scaler.transform(data2)# transform our data # Transform to a labelled vector parsedData = scaledData.map(parsePoint) # # Build the model model = LinearRegressionWithSGD.train(parsedData, iterations=1000,regParam=1.0,regType="l2",intercept=True) # Evaluate the model on training data print ("intercept",model.intercept) print zip(["loudness","year","sentiment","tempo","unique_words"],model.weights) sc.stop()
if __name__ == "__main__": conf = SparkConf() conf.set("spark.executor.memory", "8g") sc = SparkContext(appName="MNIST_KMEANS", conf=conf) data = sc.textFile('train.csv') # ingest the comma delimited file header = data.first() # extract header data = data.filter(lambda x: x != header) # remove the header trainingData = data.map(parsePoint) # parse file to generate an RDD trainingData_wo_labels = trainingData.map(lambda x: x[1]) # remove label # normalize vector scaler = StandardScaler(withMean=True, withStd=True).fit(trainingData_wo_labels) trainingData_wo_labels = scaler.transform(trainingData_wo_labels) model = KMeans.train(trainingData_wo_labels, 10, maxIterations=250, initializationMode="random") # Evaluate clustering by computing Within Set Sum of Squared Errors def error(point): center = model.centers[model.predict( point)] # get centroid for cluster return math.sqrt(sum([x**2 for x in (point - center)])) WSSSE = trainingData_wo_labels.map(lambda point: error(point)).reduce( lambda x, y: x + y) print("Within Set Sum of Squared Error = " + str(WSSSE))
sc.setLogLevel("warn") user_map = load_user_map(sc) # 加载训练数据 train_data = load_train_data(sc) # 设置数据的用户信息数据 train_data_user_info = set_train_user_info(train_data, user_map) # user_id merchant_id age_range gender label train_data_user_info.cache() stand_train_data_user_info = train_data_user_info.map( lambda user: user[0:4]) stand_train_data_user_info_label = train_data_user_info.map( lambda user: user[4]) #训练数据标准化 std_scaler = StandardScaler(True, True).fit(stand_train_data_user_info) stand_train_data_user_info = std_scaler.transform( stand_train_data_user_info) train_data_user_info = stand_train_data_user_info_label.zip( stand_train_data_user_info) # 构建标签数据 train_data_user_info = build_point(train_data_user_info) numIterations = 100 train_data_user_info.cache() #训练模型 model = SVMWithSGD.train(train_data_user_info, numIterations) #model = DecisionTree.trainClassifier(train_data_user_info,numIterations,2,{}) # 加载测试数据 test_data = load_test_data(sc) # 设置数据的用户信息数据
## in the gaussian case, we have achieved independence between variables.
## If the source variables are gaussian, ICA is not required and PCA is sufficient.

# Code for PCA and whitening the dataset.
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow, BlockMatrix
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors, DenseMatrix, Matrix
from sklearn import datasets

# create the standardizer model for standardizing the dataset
X_rdd = sc.parallelize(X).map(lambda x: Vectors.dense(x))
scaler = StandardScaler(withMean=True, withStd=False).fit(X_rdd)

X_sc = scaler.transform(X_rdd)

# create the IndexedRowMatrix from rdd
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# compute the svd factorization of the matrix. First the number of columns and second a boolean stating whether
# to compute U or not.
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V is of shape n * k not k * n (as in sklearn)

P_comps = svd_o.V.toArray().copy()
num_rows = X_rm.numRows()

# U is whitened and projected onto principal components subspace.
from pyspark.mllib.util import MLUtils # $example off$ if __name__ == "__main__": sc = SparkContext(appName="StandardScalerExample") # SparkContext # $example on$ data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") label = data.map(lambda x: x.label) features = data.map(lambda x: x.features) scaler1 = StandardScaler().fit(features) scaler2 = StandardScaler(withMean=True, withStd=True).fit(features) # data1 will be unit variance. data1 = label.zip(scaler1.transform(features)) # Without converting the features into dense vectors, transformation with zero mean will raise # exception on sparse vector. # data2 will be unit variance and zero mean. data2 = label.zip( scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray())))) # $example off$ print("data1:") for each in data1.collect(): print(each) print("data2:") for each in data2.collect(): print(each)
class MVA(object): """ This class solves the MVA methods for feature extraction """ _typeMVA=None _typeReg=None _typeNorm=None _tol=None _numVariables=None _M =None _R= None _data=None #An RDD with format ([y0, y1, ..., yM], [x0, x1, ..., xN]) _normdata = None _scaler=None _U=None _regParam=None _step=None _iterations=None _max_Ustep=None _W=None def __init__(self, typeMVA, typeReg,typeNorm, tol,numFeatures ,regParam=0.01, step=1e-3, iterations=100, max_Ustep=10): """Class initializer :param typeMVA: Type of MVA method: PCA, OPLS or CCA :param typeReg: Type of Regularization used: :param typeNorm: Type of Normalization used: """ self._typeMVA=typeMVA self._typeReg=typeReg self._regParam=regParam self._typeNorm=typeNorm self._tol=tol self._R=numFeatures self._step=step self._iterations=iterations self._max_Ustep=max_Ustep if typeMVA not in ['PCA', 'OPLS', 'CCA']: print 'The type of MVA is not correct' def prepareData(self, data): if data.filter(lambda x: not isinstance(x,LabeledPoint)).count() == 0: #Case 1: All points in dataset are LabeledPoints #Check if number of features in X is constant x_len = data.map(lambda x: len(Vectors.dense(x.features.toArray()))).cache() self._numVariables = x_len.first() if len(x_len.distinct().collect())!=1: print 'All feature vectors should have the same length. Aborting.' return False try: if self._typeMVA=='PCA': self._data = (data.map(lambda x: Vectors.dense(x.features.toArray())) .map(lambda x: (x, x))) self._M = self._numVariables else: set_classes = data.map(lambda x: x.label).distinct().collect() self._M = len(set_classes) self._data = data.map(lambda x: (Vectors.dense(label_binarize([x.label], classes=set_classes).flatten()), Vectors.dense(x.features.toArray()))) return True except: return False elif data.filter(lambda x: not isinstance(x,tuple)).count() ==0: #Case 2: All points in dataset are tuples of numpy arrays try: x_len = data.map(lambda x: len(Vectors.dense(x[1]))).cache() self._numVariables = x_len.first() if len(x_len.distinct().collect())!=1: print 'All feature vectors should have the same length. Aborting.' return False y_len = data.map(lambda x: len(Vectors.dense(x[0]))).cache() self._M = y_len.first() if len(y_len.distinct().collect())!=1: print 'All label vectors should have the same length. Aborting.' return False self._data = data.map(lambda x: (Vectors.dense(x[0]), Vectors.dense(x[1]))) return True except: return False elif self._typeMVA == 'PCA': #Case 3: If MVA is PCA, then RDD elements should be numpy arrays try: x_len = data.map(lambda x: len(Vectors.dense(x))).cache() self._numVariables = x_len.first() self._M = self._numVariables if len(x_len.distinct().collect())!=1: print 'All feature vectors should have the same length. Aborting.' return False self._data = data.map(lambda x: (Vectors.dense(x), Vectors.dense(x))) return True except: return False return False def calcCov(self,typeCov): """ This function calculates the covariance matrix for the training data :param typeCov: Type of covariance matrix to be calculated, it can be Cyx or Cyy """ if typeCov == 'Cyx' : Cyx = self._data.map(lambda x : np.dot(x[0][:,np.newaxis],x[1][:,np.newaxis].T)).mean() Cov=Cyx elif typeCov == 'Cyy': Cyy = self._data.map(lambda x : np.dot(x[0][:,np.newaxis],x[0][:,np.newaxis].T)).mean() Cov=Cyy else: print 'This type of covariance matrix cannot be calculated' return Cov def createOmega(self): """ This function creates the Omega matrix for the step U and step W, it depends of the type of MVA method. 
""" if self._typeMVA in ["PCA", "OPLS"] : Omega = np.eye(self._M) else : Cyy = self.calcCov('Cyy') Omega=np.linalg.inv(Cyy) return Omega def calcFrobeniusNorm(self,Uold,Unew): """ This function calculate the Frobenius norm between two matrices """ A=Uold-Unew return lin.norm(A,'fro') def normalizer(self): """ This function normalize the training data """ if self._typeNorm == 'norm': #Normalize input features RDD_X = self._data.map(lambda x: x[1]) self._scaler = StandardScaler(withMean=True, withStd=True).fit(RDD_X) RDD_X_norm = self._scaler.transform(RDD_X) RDD_Y = self._data.map(lambda x: x[0]) RDD_Y_norm = StandardScaler(withMean=True, withStd=False).fit(RDD_Y).transform(RDD_Y) else: #Normalize input features RDD_X = self._data.map(lambda x: x[1]) self._scaler = StandardScaler(withMean=True, withStd=False).fit(RDD_X) RDD_X_norm = self._scaler.transform(RDD_X) if self._typeMVA == 'PCA': RDD_Y = self._data.map(lambda x: x[0]) RDD_Y_norm = StandardScaler(withMean=True, withStd=False).fit(RDD_Y).transform(RDD_Y) else: RDD_Y_norm = self._data.map(lambda x: x[0]) # Create a new RDD of LabeledPoint data using the normalized features self._normdata = RDD_Y_norm.zip(RDD_X_norm) def stepU(self,W,Omega, R): """ This function calculate the step U :param W: W matrix :param Omega: Omega matrix :param R: Number of distinct classes minus one """ U = np.empty((R,self._numVariables)) for r in range(R): print 'Extracting projection vector ' + str(r) + ' out of ' + str(len(range(R))) Wr = W[:,r][:,np.newaxis] def createPseudoY(Y, W, Omega): """ This function calculates Y' = W^TOmegaY for the step U :param Y: RDD of labels or outputs :param W: W matrix calcutated in step W :param Omega: Omega matrix """ return np.squeeze(W.T.dot(Omega).dot(Y.T)) PseudoY = self._normdata.map (lambda x : createPseudoY(x[0], Wr, Omega)) Datar = self._normdata.zip(PseudoY).map(lambda x: LabeledPoint(x[1], x[0][1])) # Build the model lr = LinearRegressionWithSGD.train(Datar, iterations=self._iterations, regType=self._typeReg, regParam=self._regParam, step=self._step) U[r,:] = lr.weights return U def stepW(self, U, Cyx, Omega, Omega_1): """ This function calculates the step W :param U: U matrix calculated in step U :param Cyx: The covariance matrix between the labels or outputs and the features :param Omega: Omega matrix :param Omega_1: The inverse of the omega matrix """ print U.shape print Cyx.shape print Omega.shape A = Omega.dot(Cyx).dot(U.T) V, D, V2 = np.linalg.svd(A,full_matrices=False) W = np.dot(Omega_1,V) return W def computeMSE(self, U, W, trainingData): """ This function compute de MSE :param U: U matrix :param W: W matrix :param trainingData: RDD of training data """ return trainingData.map(lambda x: np.mean(np.array(x.codedLabel - np.dot(W,np.dot(x.features, U.T)))**2)).mean() def fit(self, data): """ This function fits the model. It calculates de matrix U where each column is a vector containing the coefficients for each extracted feature. 
:param data: """ if self.prepareData(data): Omega= self.createOmega() Omega_1=np.linalg.inv(Omega) num_Ustep=0 #Normalize data self.normalizer() #Initialize U and W variables R = int(np.minimum(self._M-1, self._R)) U_old = np.empty((R,self._numVariables)) Cyx=self.calcCov('Cyx') W = self.stepW(U_old,Cyx,Omega,Omega_1) U_new = self.stepU(W,Omega,R) while (self.calcFrobeniusNorm(U_old,U_new) > self._tol) and (num_Ustep<self._max_Ustep) : U_old=U_new W = self.stepW(U_old,Cyx,Omega,Omega_1) U_new = self.stepU(W,Omega,R) num_Ustep=num_Ustep + 1 if num_Ustep==self._max_Ustep : print 'You have reach the max number of U step, change the tolerance' print 'Frobenius norm error: ' + str(self.calcFrobeniusNorm(U_old,U_new)) self._U=U_new self._W=W def predict(self, RDD_X2): """ This function find relevant features by combining X=U^T*X2. It is needed to fit the model first :param sc: SparkContext :param RDD_X2: Training data """ if self._U != None: RDD_norm = self._scaler.transform(RDD_X2) U = self._U RDD=RDD_norm.map(lambda x: x.dot(U.T)) return RDD else : print 'You have to fit the model first'
#ind_dict = {col_nbr:position_in_dict_list} ind_dict = {} dict_i = 0 for e in Cat_Cols: new_coldict(dict_i, e, total_lines) #add entries in cols to dicts ind_dict[e] = dict_i #record col_nbr and dict_lst position dict_i += 1 tsv_rdd = tsv_rdd.map( lambda x: string_freq(x)) #change string to frequent(float) #Kmeans Cols a = tsv_rdd.map(lambda x: np.array(gen_lst(x))) #normalization scaler1 = StandardScaler().fit(a) a = scaler1.transform(a) #Kmeans clusters = KMeans.train(a, k_cl, maxIterations=10, initializationMode="random") col_ind_rdd = tsv_rdd.map(lambda x: x[-1]) #col_ind col a = a.zip(col_ind_rdd).map( lambda x: add_id(x)) #add col_ind col back to KMeans_cols rdd_w_clusts = a.map(lambda x: np.array(addclustercols(x))) sel_outlier = rdd_w_clusts.map(lambda x: (x[2],(x[0],x[1])))\ .sortByKey(False)\ .take(nbr_out)
print "Will load a dataset of size:\n\t", shape rdd_data = sc.parallelize(flist).flatMap(reader('TEMP')) first = rdd_data.first() # In[Scaling]: # Compute scaling parameters: from pyspark.mllib.feature import StandardScaler, StandardScalerModel scaler = StandardScaler(withMean=True, withStd=True).fit(rdd_data) sample_mean = scaler.call('mean') # Effectively scale the dataset: rdd_norm = scaler.transform(rdd_data) # In[Reduction]: # Compute PCA new dimensions: from pyspark.mllib.feature import PCA as PCAmllib Neof = 20 reducer = PCAmllib(Neof).fit(rdd_norm) # print type(reducer) # Effectively reduce the dataset: rdd_reduced = reducer.transform(rdd_norm) # print type(rdd_reduced) # In[Classification with k-mean]:
housingData = housingVals.map(toLabeledPoint) #Section 7.4.5 sets = housingData.randomSplit([0.8, 0.2]) housingTrain = sets[0] housingValid = sets[1] #Section 7.4.6 from pyspark.mllib.feature import StandardScaler scaler = StandardScaler(True, True).fit(housingTrain.map(lambda x: x.features)) trainLabel = housingTrain.map(lambda x: x.label) trainFeatures = housingTrain.map(lambda x: x.features) validLabel = housingValid.map(lambda x: x.label) validFeatures = housingValid.map(lambda x: x.features) trainScaled = trainLabel.zip(scaler.transform(trainFeatures)).map(lambda x: LabeledPoint(x[0], x[1])) validScaled = validLabel.zip(scaler.transform(validFeatures)).map(lambda x: LabeledPoint(x[0], x[1])) #Section 7.5 from pyspark.mllib.regression import LinearRegressionWithSGD alg = LinearRegressionWithSGD() trainScaled.cache() validScaled.cache() model = alg.train(trainScaled, iterations=200, intercept=True) #Section 7.5.1 validPredicts = validScaled.map(lambda x: (float(model.predict(x.features)), x.label)) validPredicts.collect() import math RMSE = math.sqrt(validPredicts.map(lambda p: pow(p[0]-p[1],2)).mean())
max_time = 23 * 3600 + 59 * 60 + 59 #max_time = 16 * 60 low = 0 high = 15 * 60 modelList = [] while low < max_time: # Temp should run once timeseries = df.filter(lambda x: low < x.timestamp < high) #if timeseries.count() > 0: features = timeseries.map(lambda row: row[1:]) #print "Possible points" #print features.collect() model = StandardScaler().fit(features) features_t = model.transform(features) label = timeseries.map(lambda row: row[0]) labeled_data = label.zip(features_t) final_data = labeled_data.map(lambda row: LabeledPoint(row[0], row[1])) model = LinearRegressionWithSGD.train(final_data, 1000, .0000001, intercept=True) #model = RidgeRegressionWithSGD.train(final_data, 1000, .00000001, intercept=True) #model = LassoWithSGD.train(final_data, 1000, .00000001, intercept=True) modelList.append(model) #print "" #print "Model1 weights " + str(model.weights) #print ""
def main(argv): verbose = False dbpath = '/root/data/AdditionalFiles/' tagstring = 'rock' usealldata = False holdout = 0.1 model_iterations = 100 model_step = 1.0 model_intercept = True # possible types logistic and svm model_type = 'logistic' try: opts, args = getopt.getopt(argv,"hvd:t:am:s:i:o:c",["help","verbose","datapath=","tagstring=","alldata","model=","step=","iterations=","holdout=","intercept"]) except getopt.GetoptError: print 'rockTag.py -d <data path> -t <tag string>' sys.exit(2) for opt, arg in opts: if opt == '-h': print('rockTag.py -d <data path> -t <tag string>') sys.exit() elif opt in ("-v", "--verbose"): verbose = True elif opt in ("-d", "--datapath"): dbpath = arg elif opt in ("-t", "--tagstring"): tagstring = str(arg).lower() elif opt in ("-a", "--alldata"): usealldata = True elif opt in ("-m", "--model"): if str(arg).lower() in ['logistic','svm']: model_type = str(arg).lower else: print('valid models are logistic and svm') sys.exit() elif opt in ("-s", "--step"): model_step = float(arg) elif opt in ("-i", "--iterations"): model_iterations = int(arg) elif opt in ("-o", "--holdout"): holdout = float(arg) if holdout <= 0 | holdout >= 1: print('holdout must be greater than 0 and less than 1') elif opt in ("-c", "--intercept"): model_intercept = True if verbose: print('data path: ' + dbpath) print('tag string: ' + tagstring) labels, features = getLabelsAndFeatures(dbpath, tagstring=tagstring, verbose=verbose, usealldata=usealldata) # scale features std = StandardScaler(True, True).fit(features) features = std.transform(features) # make labeled data labeledData = labels.zip(features).map(lambda (label, data): LabeledPoint(label, data)) if verbose: labeledData.take(3) # rebalance samples equalSampleData = rebalanceSample(labeledData, verbose=verbose) # split data trainData, testData = randomSplit(equalSampleData, [1-holdout, holdout]) if verbose: trainData.map(lambda p: (p.label, p.features)).take(3) # train model if model_type == 'logistic': model = LogisticRegressionWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step) elif model_type == 'svm': model = SVMWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step) evalString = evaluateModel(model, testData) print(evalString)
indexed_train_bin = train_data.map(parseRowIndexingBinary) indexed_test_bin = test_data.map(parseRowIndexingBinary) oneHot_train_bin = train_data.map(parseRowOneHotBinary) oneHot_test_bin = test_data.map(parseRowOneHotBinary) indexed_train_reg = train_data.map(parseRowIndexingRegression) indexed_test_reg = test_data.map(parseRowIndexingRegression) oneHot_train_reg = train_data.map(parseRowOneHotRegression) oneHot_test_reg = test_data.map(parseRowOneHotRegression) ## FEATURE SCALING ## label = oneHot_train_reg.map(lambda x: x.label) features = oneHot_train_reg.map(lambda x: x.features) scaler = StandardScaler(withMean=False, withStd=True).fit(features) data_temp_ = label.zip( scaler.transform(features.map(lambda x: Vectors.dense(x.toArray())))) oneHot_train_reg_scaled = data_temp_.map(lambda x: LabeledPoint(x[0], x[1])) label = oneHot_test_reg.map(lambda x: x.label) features = oneHot_test_reg.map(lambda x: x.features) scaler = StandardScaler(withMean=False, withStd=True).fit(features) data_temp_ = label.zip( scaler.transform(features.map(lambda x: Vectors.dense(x.toArray())))) oneHot_test_reg_scaled = data_temp_.map(lambda x: LabeledPoint(x[0], x[1])) ## CACHE-Y CACHE CACHE ## indexed_train_bin.cache() indexed_test_bin.cache() oneHot_train_bin.cache() oneHot_test_bin.cache() indexed_train_reg.cache()