def prepareData(sc):
    """Load train.tsv, build standardized LabeledPoint data, split it 8/1/1.

    :param sc: SparkContext
    :return: (trainData, testData, validationData, categoriesMap)
    """
    # ---- 1. import and parse the raw TSV ----
    print 'import training data'
    rawDataWithHeader = sc.textFile(Path + 'train.tsv')
    print rawDataWithHeader.take(10)
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x:x != header)   # drop header row
    rData = rawData.map(lambda x: x.replace("\"",""))          # strip embedded quotes
    lines = rData.map(lambda x: x.split("\t"))                 # tab-separated fields
    print lines.count()
    # field 3 is the page category; map each distinct category to a numeric index
    categoriesMap = lines.map(lambda fields:fields[3]).distinct().zipWithIndex().collectAsMap()
    print categoriesMap
    labelRDD = lines.map(lambda r: extractLabel(r))
    featureRDD = lines.map(lambda r: extractFeatures(r,categoriesMap,len(r)-1))
    # print featureRDD.take(1)
    # standardize features to zero mean / unit variance
    stdScaler = StandardScaler(withMean=True,withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    # print ScalerFeatureRDD.take(1)
    labelPoint = labelRDD.zip(ScalerFeatureRDD)
    labelPointRDD = labelPoint.map(lambda r: LabeledPoint(r[0],r[1]))
    # print labelPointRDD.take(1)
    # NOTE(review): split order here is (train, test, validation) — other variants
    # of this function use (train, validation, test); confirm callers agree.
    (trainData, testData, validationData) = labelPointRDD.randomSplit([8, 1, 1])
    print trainData.count()
    print testData.count()
    print validationData.count()
    return (trainData, testData, validationData, categoriesMap)
def PrepareData(sc):
    """Load data/train.tsv, standardize features, return an 8/1/1 random split.

    :param sc: SparkContext
    :return: (trainData, validationData, testData, categoriesMap)
    """
    #----------------------1. import and convert the data-------------
    print("開始匯入資料...")
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)   # drop header row
    rData = rawData.map(lambda x: x.replace("\"", ""))          # strip quotes
    lines = rData.map(lambda x: x.split("\t"))                  # tab-separated fields
    print("共計:" + str(lines.count()) + "筆")
    #----------------------2. build RDD[LabeledPoint] for training/evaluation-------------
    print "標準化之前:",
    # field 3 holds the category name; map each distinct one to a numeric index
    categoriesMap = lines.map(lambda fields: fields[3]). \
        distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    for i in featureRDD.first():
        print(str(i) + ","),
    print ""
    print "標準化之後:",
    # zero-mean / unit-variance scaling fitted on all features
    stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print(str(i) + ","),
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    #----------------------3. randomly split into 3 parts and return-------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("將資料分trainData:" + str(trainData.count()) + " validationData:" + str(validationData.count()) + " testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)  # return the data
def PrepareData(sc):
    """Load data/train.tsv, standardize (std only, sparse-safe), split 8/1/1.

    :param sc: SparkContext
    :return: (trainData, validationData, testData, categoriesMap)
    """
    #----------------------1. import and convert the data-------------
    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path+"data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x:x !=header)   # drop header row
    rData=rawData.map(lambda x: x.replace("\"", ""))          # strip quotes
    lines = rData.map(lambda x: x.split("\t"))                # tab-separated fields
    print("共计:" + str(lines.count()) + "项")
    #----------------------2. build RDD[LabeledPoint] for training/evaluation-------------
    print "标准化之前:",
    # field 3 holds the category name; map each distinct one to a numeric index
    categoriesMap = lines.map(lambda fields: fields[3]). \
        distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r,categoriesMap,len(r) - 1))
    for i in featureRDD.first():
        print (str(i)+","),
    print ""
    print "标准化之后:",
    # withMean=False keeps vectors sparse (mean-centering would densify them)
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(featureRDD)
    ScalerFeatureRDD=stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print (str(i)+","),
    labelpoint=labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD=labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    #----------------------3. randomly split into 3 parts and return-------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("将数据分trainData:" + str(trainData.count()) + " validationData:" + str(validationData.count()) + " testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap) # return the data
def prepare_data(sc):
    """Load data/train.tsv, standardize its features and split 80/10/10.

    :param sc: SparkContext
    :return: (trainData, validationData, testData, categories_map)
    """
    # 1. import and parse the raw TSV
    print("开始导入数据...")
    raw = sc.textFile(os.path.join(PATH, 'data/train.tsv'))
    first_line = raw.first()
    body = raw.filter(lambda row: row != first_line)
    # strip quotes, then split every page record on tabs
    lines_rdd = body.map(lambda row: row.replace("\"", "").split('\t'))
    print("共计: {}项".format(lines_rdd.count()))
    # 2. standardization inputs — {category name: index}
    categories_map = (lines_rdd
                      .map(lambda fields: fields[3])
                      .distinct()
                      .zipWithIndex()
                      .collectAsMap())
    label_rdd = lines_rdd.map(get_label)
    features_rdd = lines_rdd.map(
        lambda fields: get_features(fields, categories_map, len(fields) - 1))
    stand_features = StandardScaler(withMean=True, withStd=True) \
        .fit(features_rdd).transform(features_rdd)
    # 3. pair labels with the scaled features as LabeledPoint records
    labeledpoint_rdd = label_rdd.zip(stand_features) \
        .map(lambda pair: LabeledPoint(pair[0], pair[1]))
    # 4. random 80/10/10 split
    trainData, validationData, testData = labeledpoint_rdd.randomSplit([0.8, 0.1, 0.1])
    print("将数据分trainData: {0}, validationData: {1}, testData: {2}".format(
        trainData.count(), validationData.count(), testData.count()
    ))
    return (trainData, validationData, testData, categories_map)  # return the data
def PrepareData(sc):
    """Load data/train.tsv, standardize the features and split 8/1/1.

    :param sc: SparkContext
    :return: (trainData, validationData, testData, categoriesMap)
    """
    raw = sc.textFile(Path + "data/train.tsv")
    first_row = raw.first()
    # drop the header, strip quotes, split on tabs
    lines = (raw.filter(lambda x: x != first_row)
                .map(lambda x: x.replace("\"", ""))
                .map(lambda x: x.split("\t")))
    print("total " + str(lines.count()))
    print("=======before standare========")
    # category name (field 3) -> numeric index
    categoriesMap = (lines.map(lambda fields: fields[3])
                          .distinct()
                          .zipWithIndex()
                          .collectAsMap())
    labelRDD = lines.map(extract_label)
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    for value in featureRDD.first():
        print(str(value) + ", ")
    print("=======after standare========")
    scaleFeatureRDD = StandardScaler(withMean=True, withStd=True) \
        .fit(featureRDD).transform(featureRDD)
    for value in scaleFeatureRDD.first():
        print(str(value) + ",")
    # pair each label with its scaled feature vector
    labelPointRDD = labelRDD.zip(scaleFeatureRDD) \
        .map(lambda pair: LabeledPoint(pair[0], pair[1]))
    trainData, validationData, testData = labelPointRDD.randomSplit([8, 1, 1])
    return (trainData, validationData, testData, categoriesMap)
def PrepareData(sc):
    """Load train.tsv, standardize the features and split into 3 random parts.

    :param sc: SparkContext
    :return: (trainData, validationData, testData, categoriesMap)
    """
    print("开始导入数据。。。")
    path = Path + "train.tsv"
    print(path)
    # minPartitions=40 splits the input into 40 slices (avoids an error otherwise)
    rawDataWithHeader = sc.textFile(path, minPartitions=40)
    header = rawDataWithHeader.first()
    # drop the header/title row
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    # strip quote characters
    rData = rawData.map(lambda x: x.replace("\"", ""))
    # split each record on tabs
    lines = rData.map(lambda x: x.split("\t"))
    print("总共有:", str(lines.count()))
    #----2. build the RDD data needed for training
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    # NOTE(review): label helper is snake_case (extract_label) but the feature
    # helper is camelCase (extractFeatures) — confirm both names exist in scope.
    featureRDD = lines.map(lambda r: extractFeatures(r, categoriesMap, len(r) - 1))
    print(featureRDD.first())
    #----3. standardize, then split into 3 random parts and return
    print("数据标准化之后===:")
    stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    scalerFeatureRDD = stdScaler.transform(featureRDD)
    print(scalerFeatureRDD.first())
    labelPoint = labelRDD.zip(scalerFeatureRDD)
    labelpointRDD = labelPoint.map(lambda r: LabeledPoint(r[0], r[1]))
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("数据集划分为:trainData:", str(trainData.count()), "validationData:", str(validationData.count()), "testData:", str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)
def test_model_setters(self):
    """setWithMean/setWithStd are fluent and enable full standardization."""
    rows = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
    model = StandardScaler().fit(self.sc.parallelize(rows))
    # each setter returns the model itself, never None
    self.assertIsNotNone(model.setWithMean(True))
    self.assertIsNotNone(model.setWithStd(True))
    # column means are [2,3,4] with sample std 1 -> first row becomes all -1
    self.assertEqual(model.transform([1.0, 2.0, 3.0]),
                     DenseVector([-1.0, -1.0, -1.0]))
def test_model_transform(self):
    """A default scaler (std only) leaves unit-variance data unchanged."""
    rows = [[1.0, 2.0, 3.0],
            [2.0, 3.0, 4.0],
            [3.0, 4.0, 5.0]]
    model = StandardScaler().fit(self.sc.parallelize(rows))
    # each column has sample std 1.0, so the transform is the identity
    self.assertEqual(model.transform([1.0, 2.0, 3.0]),
                     DenseVector([1.0, 2.0, 3.0]))
def test_model_transform(self):
    """A default scaler (std only) leaves unit-variance data unchanged."""
    dataset = self.sc.parallelize([
        [1.0, 2.0, 3.0],
        [2.0, 3.0, 4.0],
        [3.0, 4.0, 5.0],
    ])
    scaler_model = StandardScaler().fit(dataset)
    # per-column sample std is 1.0 -> scaling by std is a no-op
    expected = DenseVector([1.0, 2.0, 3.0])
    self.assertEqual(scaler_model.transform([1.0, 2.0, 3.0]), expected)
def getScaledData(data):
    """Return *data* (an RDD of LabeledPoint) with features standardized.

    Features are mean-centered and scaled to unit variance; labels are kept.
    """
    feats = data.map(lambda lp: lp.features)
    labs = data.map(lambda lp: lp.label)
    model = StandardScaler(withMean=True, withStd=True).fit(feats)
    # withMean requires dense vectors, so densify before transforming
    dense_feats = feats.map(lambda v: Vectors.dense(v.toArray()))
    return labs.zip(model.transform(dense_feats)) \
        .map(lambda pair: LabeledPoint(pair[0], pair[1]))
def test_model_setters(self):
    """setWithMean/setWithStd are fluent and enable full standardization."""
    dataset = self.sc.parallelize([
        [1.0, 2.0, 3.0],
        [2.0, 3.0, 4.0],
        [3.0, 4.0, 5.0],
    ])
    scaler_model = StandardScaler().fit(dataset)
    # setters return the model (fluent API), never None
    self.assertIsNotNone(scaler_model.setWithMean(True))
    self.assertIsNotNone(scaler_model.setWithStd(True))
    # means [2,3,4], sample std 1 -> the first row standardizes to all -1
    self.assertEqual(scaler_model.transform([1.0, 2.0, 3.0]),
                     DenseVector([-1.0, -1.0, -1.0]))
def norm_train(self, train_data):
    """Fit a StandardScaler on *train_data* and return the scaled LabeledPoints.

    :param train_data: RDD of LabeledPoint
    """
    feats = train_data.map(lambda lp: lp.features)
    self.normalizer = StandardScaler().fit(feats)
    # TODO: This can't be efficient... (collects both sides to the driver)
    labels = train_data.map(lambda lp: lp.label).collect()
    scaled = self.norm(feats).collect()
    pairs = zip(labels, scaled)
    return get_df(pairs).rdd.map(lambda row: LabeledPoint(row[0], row[1]))
def normalizer(self):
    """Normalize the training data held in self._data.

    self._data is expected to yield (target, features) pairs; the result
    (normalized target zipped with normalized features) is stored in
    self._normdata, and the fitted feature scaler in self._scaler.
    """
    # self._typeNorm selects full standardization ('norm') vs mean-centering only
    if self._typeNorm == 'norm':
        # Normalize input features: zero mean, unit variance
        RDD_X = self._data.map(lambda x: x[1])
        self._scaler = StandardScaler(withMean=True, withStd=True).fit(RDD_X)
        RDD_X_norm = self._scaler.transform(RDD_X)
        # targets are mean-centered only (withStd=False) in this branch
        RDD_Y = self._data.map(lambda x: x[0])
        RDD_Y_norm = StandardScaler(withMean=True, withStd=False).fit(RDD_Y).transform(RDD_Y)
    else:
        # Normalize input features: mean-centering only
        RDD_X = self._data.map(lambda x: x[1])
        self._scaler = StandardScaler(withMean=True, withStd=False).fit(RDD_X)
        RDD_X_norm = self._scaler.transform(RDD_X)
        if self._typeMVA == 'PCA':
            # PCA additionally needs mean-centered targets
            RDD_Y = self._data.map(lambda x: x[0])
            RDD_Y_norm = StandardScaler(withMean=True, withStd=False).fit(RDD_Y).transform(RDD_Y)
        else:
            # leave targets untouched
            RDD_Y_norm = self._data.map(lambda x: x[0])
    # Create a new RDD of (target, features) pairs using the normalized features
    self._normdata = RDD_Y_norm.zip(RDD_X_norm)
def get_std_scaler(labeledpoints):
    """Standardize the features of *labeledpoints*.

    :param labeledpoints: RDD of LabeledPoint
    :return: (scaled labeled points, fitted scaler model)
    """
    feature_rdd = labeledpoints.map(lambda lp: lp.features)
    scaler_model = StandardScaler().fit(feature_rdd)
    scaled = scaler_model.transform(feature_rdd)
    # re-pair each label with its scaled feature vector on the driver
    paired = zip(labeledpoints.map(lambda lp: lp.label).collect(),
                 scaled.collect())
    return to_labeled_points(paired), scaler_model
def TrainLRModel(trainData, iterations, step, miniBatchFraction):
    """Train a logistic-regression (SGD) model on standardized features.

    :param trainData: RDD of LabeledPoint
    :param iterations: number of SGD iterations
    :param step: SGD step size
    :param miniBatchFraction: fraction of data used per SGD iteration
    :return: the trained LogisticRegressionWithSGD model
    """
    # Logistic Regression
    srcFeatures = trainData.map(lambda line: line.features)
    print srcFeatures.first()
    # zero-mean / unit-variance scaling fitted on the training features
    scaler = StandardScaler(withMean=True, withStd=True).fit(srcFeatures)
    srcLabel = trainData.map(lambda line: line.label)
    scaledFeature = scaler.transform(srcFeatures)
    print scaledFeature.first()
    scaledData = srcLabel.zip(scaledFeature)
    # Python 2 tuple-parameter lambda: rebuild LabeledPoints from (label, features)
    trainData = scaledData.map(
        lambda (label, features): LabeledPoint(label, features))
    model = LogisticRegressionWithSGD.train(data = trainData, iterations = iterations, step = step, \
        miniBatchFraction = miniBatchFraction)
    return model
def training(model_directory, libsvm, scaler):
    """Train a logistic-regression (LBFGS) model from a libsvm file and save it.

    :param model_directory: output path for the saved model
    :param libsvm: path of the libsvm-format training file
    :param scaler: '1' to standardize features first, anything else to skip
    """
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    training_rdd = MLUtils.loadLibSVMFile(sc, libsvm)
    training_rdd.cache()
    if scaler == '1':
        labels = training_rdd.map(lambda p: p.label)
        feats = training_rdd.map(lambda p: p.features)
        fitted = StandardScaler().fit(feats)
        # rebuild LabeledPoints from (label, scaled features) pairs
        scaled_pairs = labels.zip(fitted.transform(feats))
        train_set = scaled_pairs.map(lambda pair: LabeledPoint(pair[0], pair[1]))
        model_logistic = LogisticRegressionWithLBFGS.train(train_set)
    else:
        model_logistic = LogisticRegressionWithLBFGS.train(training_rdd)
    model_logistic.save(sc, model_directory)
def PrepareData(sc):
    """Prepare the StumbleUpon data.

    :param sc: SparkContext
    :return: (trainData, validationData, testData, categoriesMap)
    """
    print('======================= 准备数据 =======================')
    # ----------------------------- 1. import and convert the data -----------------------------
    print('========== [PrepareData] >>>> 开始导入 train.tsv 数据....')
    rawDataWithHeader = sc.textFile(Path + u'data/stumbleupon/train-100.tsv')
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)   # drop header row
    rData = rawData.map(lambda x: x.replace('\"', ''))          # strip quotes
    lines = rData.map(lambda x: x.split('\t'))                  # tab-separated fields
    print('========== [PrepareData] >>>> 共计:' + str(lines.count()) + ' 项')
    # ----------------------------- 2. build RDD[LabeledPoint] for training/evaluation -----------------------------
    # categoriesMap = lines.map(lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    # labelpointRDD = lines.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, categoriesMap, -1)))
    print('========== [PrepareData] >>>> 标准化之前:'),
    # field 3 holds the category name; map each distinct one to a numeric index
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    for i in featureRDD.first():
        print('\t\t' + str(i) + '(' + str(type(i)) + '),'),
    print('')
    print('========== [PrepareData] >>>> 标准化之后:'),
    # Numeric feature fields have very different units/scales, so they are not
    # directly comparable and must be standardized. withMean=False keeps the
    # output sparse (mean-centering would densify the vectors).
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(
        featureRDD
    )
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print('\t\t' + str(i) + '(' + str(type(i)) + '),'),
    # zip the labels with the standardized features to build labeled points
    labelpoint = labelRDD.zip(
        ScalerFeatureRDD)
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    # ----------------------------- 3. randomly split into 3 parts and return -----------------------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print('========== [PrepareData] >>>> 将数据以随机方式差分为三个部分:trainData: ' + str(trainData.count()) + ' 项, validationData: ' + str(validationData.count()) + ' 项, testData: ' + str(testData.count()) + ' 项')
    # ----------------------------- 4. return the tuple -----------------------------
    return (trainData, validationData, testData, categoriesMap)
def fit(self, dataset):
    """Compute the mean and standard deviation used to standardize data.

    :param dataset: pyspark.rdd.RDD or numpy.ndarray or :class:`.LabeledDataSet`
    """
    if isinstance(dataset, LabeledDataSet):
        dataset = dataset.features
    if isinstance(dataset, pyspark.rdd.RDD):
        # distributed path: delegate to Spark's StandardScaler
        self.model = StdSc(self.flag_mean, self.flag_std).fit(dataset)
        return
    # local path: coerce to ndarray, then compute the requested statistics
    if type(dataset) is not np.ndarray:
        dataset = np.array(dataset)
    if self.flag_mean is True:
        self.mean = dataset.mean(axis=0)
    if self.flag_std is True:
        self.std = dataset.std(axis=0, ddof=1)  # sample std (N-1)
    return
def fit(self, dataset):
    """Compute the mean and standard deviation used to standardize data.

    :param dataset: pyspark.rdd.RDD or numpy.ndarray o :class:`.LabeledDataSet`
    """
    if isinstance(dataset, LabeledDataSet):
        dataset = dataset.features
    if isinstance(dataset, pyspark.rdd.RDD):
        # RDD input: fit Spark's scaler and keep the resulting model
        standarizer = StdSc(self.flag_mean, self.flag_std)
        self.model = standarizer.fit(dataset)
    else:
        # local input: coerce to ndarray and compute statistics directly
        if type(dataset) is not np.ndarray:
            dataset = np.array(dataset)
        if self.flag_mean is True:
            self.mean = dataset.mean(axis=0)
        if self.flag_std is True:
            # ddof=1 -> sample standard deviation
            self.std = dataset.std(axis=0, ddof=1)
    return
def PrepareData(sc):
    """Load train.tsv (local or HDFS), standardize features and split 8/1/1.

    :param sc: SparkContext
    :return: (trainData, validationData, testData, categoriesMap)
    """
    #---------------------1. import and convert the data---------------------
    global Path
    # pick the data directory from the Spark master URL
    if sc.master[:5] == "local" or sc.master[:5] == "spark":
        Path = "file:/Users/johnnie/pythonwork/workspace/PythonProject/data/"
    else:
        Path = "hdfs://localhost:9000/user/hduser/test/data/"
    print("开始导入数据...")
    raw = sc.textFile(Path + "train.tsv")
    first_row = raw.first()
    # drop the header, strip quotes, split on tabs
    lines = (raw.filter(lambda x: x != first_row)
                .map(lambda x: x.replace("\"", ""))
                .map(lambda x: x.split("\t")))
    print("共计:" + str(lines.count()) + "项")
    #---------------------2. build the RDD[LabeledPoint]---------------------
    print("标准化之前:")
    categoriesMap = (lines.map(lambda fields: fields[3])
                          .distinct()
                          .zipWithIndex()
                          .collectAsMap())
    labelRDD = lines.map(extract_label)
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    print(featureRDD.first())
    print("\n")
    print("标准化之后:")
    # std-only scaling keeps the vectors sparse
    scaled = StandardScaler(withMean=False, withStd=True) \
        .fit(featureRDD).transform(featureRDD)
    print(scaled.first())
    # pair[0] is the label, pair[1] the scaled features
    labelpointRDD = labelRDD.zip(scaled) \
        .map(lambda pair: LabeledPoint(pair[0], pair[1]))
    #---------------------3. random 8/1/1 split and return---------------------
    trainData, validationData, testData = labelpointRDD.randomSplit([8, 1, 1])
    print("将数据分trainData: " + str(trainData.count()) + " validationData: " + str(validationData.count()) + " testData: " + str(testData.count()))
    return trainData, validationData, testData, categoriesMap
def extract_features(self, feat='tfidf', **kwargs):
    """
    Converts each subtitle into its TF/TFIDF representation. Normalizes
    if necessary.

    Parameters
    --------
    Feat: 'tf' or 'tfidf'.
    kwargs: num_features, minDocFreq, or other arguments to be passed
    to the MLLib objects.

    Returns
    --------
    RDD of features with key.
    """
    # bag-of-words -> term-frequency vectors via the hashing trick
    n_feat = kwargs.get('num_features', 10000)
    hasher = HashingTF(n_feat)
    feat_rdd = self.RDD.mapValues(hasher.transform).cache()
    if feat == 'tfidf':
        # reweight the TF vectors by inverse document frequency
        keys = feat_rdd.keys()
        tf_vecs = feat_rdd.values()
        idf_model = IDF(minDocFreq=kwargs.get('minDocFreq', 2)).fit(tf_vecs)
        weighted = idf_model.transform(tf_vecs.map(lambda vec: vec.toArray()))
        feat_rddd = None  # placeholder removed below
        feat_rdd = keys.zip(weighted)
    if self.model_type == 'log_reg':
        # logistic regression benefits from zero-mean / unit-variance features
        keys = feat_rdd.keys()
        vecs = feat_rdd.values()
        scale_model = StandardScaler(withMean=True, withStd=True).fit(vecs)
        scaled = scale_model.transform(vecs.map(lambda vec: vec.toArray()))
        feat_rdd = keys.zip(scaled)
    return feat_rdd
class StandardScalerNormalizer:
    """Normalizes LabeledPoint features with pyspark's StandardScaler."""

    def __init__(self):
        # fitted StandardScalerModel; set by norm_train()
        self.normalizer = None

    def norm_train(self, train_data):
        """Fit the scaler on *train_data* and return the scaled LabeledPoints."""
        feats = train_data.map(lambda lp: lp.features)
        self.normalizer = StandardScaler().fit(feats)
        # TODO: This can't be efficient... (collects both sides to the driver)
        labels = train_data.map(lambda lp: lp.label).collect()
        scaled = self.norm(feats).collect()
        return get_df(zip(labels, scaled)).rdd \
            .map(lambda row: LabeledPoint(row[0], row[1]))

    def norm(self, data):
        """Apply the fitted scaler to *data* (an RDD of feature vectors)."""
        return self.normalizer.transform(data)

    def __str__(self):
        return 'StandardScaler'
def __init__(self):
    """Load the rotated 2x2 checkerboard train/test sets as scaled LabeledPoints."""
    Dataset.__init__(self)

    def load(path):
        # last space-separated token is the label, the rest are features;
        # each file gets its own independently fitted scaler
        rows = sc.textFile(path)
        feats = rows.map(lambda row: row.split(' ')[:-1])
        labs = rows.map(lambda row: row.split(' ')[-1])
        fitted = StandardScaler(withMean=True, withStd=True).fit(feats)
        return labs.zip(fitted.transform(feats)) \
            .map(lambda pair: LabeledPoint(pair[0], pair[1]))

    self.trainSet = load(HDFS_DIRECTORY + 'rotated_checkerboard2x2_train.txt')
    self.testSet = load(HDFS_DIRECTORY + 'rotated_checkerboard2x2_test.txt')
def __init__(self):
    """Load the 2x2 checkerboard train/test sets as scaled LabeledPoints."""
    Dataset.__init__(self)

    # preparing the Data (Train and Test): formatting and scaling, then
    # turning each file into an RDD of LabeledPoints
    def load(path):
        rows = sc.textFile(path)
        feats = rows.map(lambda row: row.split(' ')[:-1])
        labs = rows.map(lambda row: row.split(' ')[-1])
        fitted = StandardScaler(withMean=True, withStd=True).fit(feats)
        return labs.zip(fitted.transform(feats)) \
            .map(lambda pair: LabeledPoint(pair[0], pair[1]))

    self.trainSet = load(HDFS_DIRECTORY + 'checkerboard2x2_train.txt')
    self.testSet = load(HDFS_DIRECTORY + 'checkerboard2x2_test.txt')
    ''' this block is for testing '''
def __init__(self):
    """Load the mini striatum train/test sets as scaled, binary-labeled points."""
    Dataset.__init__(self)

    train_rows = sc.textFile(HDFS_DIRECTORY + 'striatum_train_mini.txt')
    tr_feats = train_rows.map(lambda row: row.strip().split(' ')[:-1])
    tr_labs = train_rows.map(lambda row: row.strip().split(' ')[-1])
    tr_scaler = StandardScaler(withMean=True, withStd=True).fit(tr_feats)
    # map the {-1, 1} text labels to {0, 1}
    self.trainSet = tr_labs.zip(tr_scaler.transform(tr_feats)) \
        .map(lambda pair: LabeledPoint(0 if pair[0] == '-1' else 1, pair[1]))

    test_rows = sc.textFile(HDFS_DIRECTORY + 'striatum_test_mini.txt')
    te_feats = test_rows.map(lambda row: row.split(' ')[:-1])
    te_labs = test_rows.map(lambda row: row.split(' ')[-1])
    # AN ISSUE HERE <<< the original LAL code scaled the TEST set with the
    # scaler fitted on the TRAINING set; here a fresh scaler is fitted on the
    # test set itself — confirm which is intended.
    te_scaler = StandardScaler(withMean=True, withStd=True).fit(te_feats)
    self.testSet = te_labs.zip(te_scaler.transform(te_feats)) \
        .map(lambda pair: LabeledPoint(0 if pair[0] == '-1' else 1, pair[1]))
# 24 = mode # 27 = tempo # 28 = time_signature allData = trackRocks.join(songData).map(lambda (tr, (rocks, data)): (tr, (0.0 if rocks is None else rocks, data))) allData.take(3) # label data # only uses one feature for now # labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [data[6]])) # labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [random.random() + (.5 if rocks == 1 else 0)])) labels = allData.map(lambda (tr, (rocks, data)): rocks) features = allData.map(lambda (tr, (rocks, data)): data) std = StandardScaler(True, True).fit(features) scaledFeatures = std.transform(features) labeledData = labels.zip(scaledFeatures).map(lambda (label, data): LabeledPoint(label, data)) # uses all extracted # labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [x for x in data])) labeledData.take(3) # make sample sizes equal labeledRock = labeledData.filter(lambda p: p.label == 1.0) labeledRock.count() labeledRock.map(lambda p: p.features[0]).mean() nrock = labeledRock.count()
# NOTE(review): the two indented lines below are the tail of a parsing function
# whose `def` line lies outside this chunk ("a::b::c" triple parser) — confirm
# against the original file.
    parts = line.strip().split("::")
    return (int(parts[0]) - 1, int(parts[1]) - 1, float(parts[2]))

#load in input file
path = sys.argv[1]
#path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/"
data = MLUtils.loadLibSVMFile(sc, path)
labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)
#normalize:
#scaler = StandardScaler(withMean = True, withStd = True).fit(features)  #data needs to be dense (zeros included)
scaler = StandardScaler(withMean=False, withStd=True).fit(
    features)  #becomes dense if using withMean. may run out of memory locally
#convert data to dense vector to be normalized
#data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
data2 = labels.zip(
    scaler.transform(features))  #use this line if having memory issues
#hide 10% of the data for final test
data, test = data2.randomSplit([.9, .1])
#get size of chunks for 10-fold cross-validation
num_folds = 10
partitionSize = (len(data.collect()) / num_folds
                 )  #parameterize this value as num_folds (in loop as well)
#train/validate 10 times on each k
df = sqlContext.createDataFrame(dictList) df.show() pdf = df.toPandas table = pd.pivot_table(pdf, index=['datetime'], columns=['data:temp'], aggfunc=numpy.mean) print table.values # For Testing #df.show() #df.describe(['data:temp', 'datetime', 'sensorName', 'data:humidity']).show() df = df.select('data:temp', 'data:humidity', 'data:chlPPM', 'data:co2', 'data:flo', 'data:psi') #df.show() temp = df.map(lambda line:LabeledPoint(line[0], [line[1:]])) # Scale the data features = df.map(lambda row: row[1:]) standardizer = StandardScaler() model = standardizer.fit(features) features_transform = model.transform(features) print features_transform.take(5) lab = df.map(lambda row: row[0]) transformedData = lab.zip(features_transform) transformedData = transformedData.map(lambda row: LabeledPoint(row[0], [row[1]])) trainingData, testingData = transformedData.randomSplit([.8, .2], seed=1234) lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) linearModel = LinearRegressionWithSGD.train(trainingData, 1000, .0002)
print(model.predict(array([8.0, 0.0]))) #Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set. from pyspark.mllib.feature import Normalizer from pyspark.mllib.linalg import Vectors from pyspark import SparkContext from pyspark.mllib.feature import StandardScaler sc = SparkContext() vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])] dataset = sc.parallelize(vs) #all false, do nothing. standardizer = StandardScaler(False, False) model = standardizer.fit(dataset) result = model.transform(dataset) for r in result.collect(): print r print("\n") #deducts the mean standardizer = StandardScaler(True, False) model = standardizer.fit(dataset) result = model.transform(dataset) for r in result.collect(): print r print("\n")
def main(argv): verbose = False dbpath = '/root/data/AdditionalFiles/' tagstring = 'rock' usealldata = False holdout = 0.1 model_iterations = 100 model_step = 1.0 model_intercept = True # possible types logistic and svm model_type = 'logistic' try: opts, args = getopt.getopt(argv,"hvd:t:am:s:i:o:c",["help","verbose","datapath=","tagstring=","alldata","model=","step=","iterations=","holdout=","intercept"]) except getopt.GetoptError: print 'rockTag.py -d <data path> -t <tag string>' sys.exit(2) for opt, arg in opts: if opt == '-h': print('rockTag.py -d <data path> -t <tag string>') sys.exit() elif opt in ("-v", "--verbose"): verbose = True elif opt in ("-d", "--datapath"): dbpath = arg elif opt in ("-t", "--tagstring"): tagstring = str(arg).lower() elif opt in ("-a", "--alldata"): usealldata = True elif opt in ("-m", "--model"): if str(arg).lower() in ['logistic','svm']: model_type = str(arg).lower else: print('valid models are logistic and svm') sys.exit() elif opt in ("-s", "--step"): model_step = float(arg) elif opt in ("-i", "--iterations"): model_iterations = int(arg) elif opt in ("-o", "--holdout"): holdout = float(arg) if holdout <= 0 | holdout >= 1: print('holdout must be greater than 0 and less than 1') elif opt in ("-c", "--intercept"): model_intercept = True if verbose: print('data path: ' + dbpath) print('tag string: ' + tagstring) labels, features = getLabelsAndFeatures(dbpath, tagstring=tagstring, verbose=verbose, usealldata=usealldata) # scale features std = StandardScaler(True, True).fit(features) features = std.transform(features) # make labeled data labeledData = labels.zip(features).map(lambda (label, data): LabeledPoint(label, data)) if verbose: labeledData.take(3) # rebalance samples equalSampleData = rebalanceSample(labeledData, verbose=verbose) # split data trainData, testData = randomSplit(equalSampleData, [1-holdout, holdout]) if verbose: trainData.map(lambda p: (p.label, p.features)).take(3) # train model if model_type == 'logistic': model = 
LogisticRegressionWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step) elif model_type == 'svm': model = SVMWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step) evalString = evaluateModel(model, testData) print(evalString)
from pyspark import SparkContext # $example on$ from pyspark.mllib.feature import StandardScaler, StandardScalerModel from pyspark.mllib.linalg import Vectors from pyspark.mllib.util import MLUtils # $example off$ if __name__ == "__main__": sc = SparkContext(appName="StandardScalerExample") # SparkContext # $example on$ data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") label = data.map(lambda x: x.label) features = data.map(lambda x: x.features) scaler1 = StandardScaler().fit(features) scaler2 = StandardScaler(withMean=True, withStd=True).fit(features) # data1 will be unit variance. data1 = label.zip(scaler1.transform(features)) # Without converting the features into dense vectors, transformation with zero mean will raise # exception on sparse vector. # data2 will be unit variance and zero mean. data2 = label.zip( scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray())))) # $example off$ print("data1:") for each in data1.collect(): print(each)
# step 1 - create spark context conf = SparkConf().setAppName("KMeans-Content")\ .set("spark.executor.memory","1g") sc = SparkContext() # step 2 - load in input file data = MLUtils.loadLibSVMFile(sc,"/Users/Ellen/Desktop/movie_features_dataset.dat") labels = data.map(lambda x:x.label) features = data.map(lambda x:x.features) # step 3 - standarize the data with unit values and 0 mean scaler = StandardScaler(withMean=False,withStd=True).fit(features) data2 = labels.zip(scaler.transform(features)) numFeatures = len(data2.values().take(10)[0]) print "Type of data2: ",type(data2) #RDD print "Type of data2.values(): ",type(data2.values()) # pipelinedrdd print "Sample: ",data2.values().take(1)[0] # splitting up the data to training, validation and testing models. train,val,test = data2.randomSplit([.80,.10,.10]) print "Training Dataset Size:",train.count() print "Validation Dataset size:",val.count() print "Test Dataset Size:",test.count()
# silence Spark's log4j chatter
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

def parsePoint(data):
    """Build a LabeledPoint: first element is the label, the rest are features."""
    #return LabeledPoint(data[3],np.append(data[0:3],data[4:]))
    return LabeledPoint(data[0], data[1:])

# store the data from cassandra to a data frame and remove the NA value
data = sc.cassandraTable("msd_01", "songs").select("song_hotttnesss", "loudness", "year", "sentiment", "tempo", "unique_words").toDF()
data = data.filter("year>0").na.drop()
print data.count()

# Scale the features with Standard Scaler
data2 = data.map(lambda x: [x.song_hotttnesss, x.loudness, x.year, x.sentiment, x.tempo, x.unique_words])  # Convert each sql.row to an array
scaler = StandardScaler(withMean=True, withStd=True).fit(data2)  # fit a scaler on every column
scaledData = scaler.transform(data2)  # transform our data

# Transform to a labelled vector
parsedData = scaledData.map(parsePoint)

# Build the model
model = LinearRegressionWithSGD.train(parsedData, iterations=1000, regParam=1.0, regType="l2", intercept=True)

# Evaluate the model on training data
print ("intercept", model.intercept)
print zip(["loudness", "year", "sentiment", "tempo", "unique_words"], model.weights)
sc.stop()
## and source plots(Uniform, Gaussian). In case of Gaussian they look alike while
## uncorrelated Uniform needs a rotation to get there. By removing correlation
## in the gaussian case, we have achieved independence between variables.
## If the source variables are gaussian ICA is not required and PCA is sufficient.

# Code for PCA and whitening the dataset.
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow, BlockMatrix
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors, DenseMatrix, Matrix
from sklearn import datasets

# create the standardizer model for standardizing the dataset
X_rdd = sc.parallelize(X).map(lambda x: Vectors.dense(x))
# BUG FIX: the scaler was fitted on `iris_rdd` (not defined in this snippet)
# instead of the RDD it is about to transform; fit it on X_rdd itself.
scaler = StandardScaler(withMean=True, withStd=False).fit(X_rdd)

X_sc = scaler.transform(X_rdd)

# create the IndexedRowMatrix from rdd
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# compute the svd factorization of the matrix. First the number of columns and second a boolean stating whether
# to compute U or not.
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V is of shape n * k not k * n(as in sklearn)
P_comps = svd_o.V.toArray().copy()
num_rows = X_rm.numRows()
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    label = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)

    # scaler1: default flags (withMean=False, withStd=True) — variance only.
    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))
    # data2 will be unit variance and zero mean; centering requires dense
    # vectors, hence the explicit Vectors.dense conversion.
    data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    # $example off$

    print("data1:")
    for each in data1.collect():
        print(each)

    print("data2:")
    for each in data2.collect():
        print(each)  # FIX: loop body was truncated in the source; mirrors the data1 loop
# This should be the maximum possible time max_time = 23 * 3600 + 59 * 60 + 59 #max_time = 16 * 60 low = 0 high = 15 * 60 modelList = [] while low < max_time: # Temp should run once timeseries = df.filter(lambda x: low < x.timestamp < high) #if timeseries.count() > 0: features = timeseries.map(lambda row: row[1:]) #print "Possible points" #print features.collect() model = StandardScaler().fit(features) features_t = model.transform(features) label = timeseries.map(lambda row: row[0]) labeled_data = label.zip(features_t) final_data = labeled_data.map(lambda row: LabeledPoint(row[0], row[1])) model = LinearRegressionWithSGD.train(final_data, 1000, .0000001, intercept=True) #model = RidgeRegressionWithSGD.train(final_data, 1000, .00000001, intercept=True) #model = LassoWithSGD.train(final_data, 1000, .00000001, intercept=True) modelList.append(model) #print "" #print "Model1 weights " + str(model.weights)
sc = SparkContext(conf=conf)
sc.setLogLevel("warn")
user_map = load_user_map(sc)
# Load the training data.
train_data = load_train_data(sc)
# Attach user profile information to the training data.
# Resulting columns: user_id merchant_id age_range gender label
train_data_user_info = set_train_user_info(train_data, user_map)
train_data_user_info.cache()
# Split into feature columns (first four) and the label column (fifth).
stand_train_data_user_info = train_data_user_info.map(
    lambda user: user[0:4])
stand_train_data_user_info_label = train_data_user_info.map(
    lambda user: user[4])
# Standardize the training features (zero mean, unit variance).
std_scaler = StandardScaler(True, True).fit(stand_train_data_user_info)
stand_train_data_user_info = std_scaler.transform(
    stand_train_data_user_info)
# Re-attach labels to the standardized features.
train_data_user_info = stand_train_data_user_info_label.zip(
    stand_train_data_user_info)
# Build LabeledPoint data.
train_data_user_info = build_point(train_data_user_info)
numIterations = 100
train_data_user_info.cache()
# Train the model.
model = SVMWithSGD.train(train_data_user_info, numIterations)
#model = DecisionTree.trainClassifier(train_data_user_info,numIterations,2,{})
# Load the test data.
# 예제 11-9 파이썬에서 벡터 정량화 from pyspark.mllib.feature import StandardScaler vectors = [Vectors.dense([-2.0, 5.0, 1.0]), Vectors.dense([2.0, 0.0, 1.0])] dataset = sc.parallelize(vectors) scaler = StandardScaler(withMean=True, withStd=True) model = scaler.fit(dataset) result = model.transform(dataset) # 결과: {[-0.7071, 0.7071, 0.0], [0.7071, -0.7071, 0.0])
parts = line.strip().split("::") return (int(parts[0])-1, int(parts[1])-1, float(parts[2])) #load in input file path = sys.argv[1] #path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/" data = MLUtils.loadLibSVMFile(sc, path) labels = data.map(lambda x: x.label) features = data.map(lambda x: x.features) #normalize: #scaler = StandardScaler(withMean = True, withStd = True).fit(features) #data needs to be dense (zeros included) scaler = StandardScaler(withMean = False, withStd = True).fit(features) #becomes dense if using withMean. may run out of memory locally #convert data to dense vector to be normalized #data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray())))) data2 = labels.zip(scaler.transform(features)) #use this line if having memory issues #hide 10% of the data for final test data, test = data2.randomSplit([.9, .1]) #get size of chunks for 10-fold cross-validation num_folds = 10 partitionSize = (len(data.collect())/num_folds) #parameterize this value as num_folds (in loop as well) #train/validate 10 times on each k i = 0 j = partitionSize
# Print the top line of each RDD to confirm that the transformation was
# successful.  (FIX: Python 2 print statements converted to the Python 3
# print() function throughout; output is unchanged.)
weighted = ep.transform(vecrdd)
print(weighted.take(1))
print(vecrdd.take(1))

# Call the colStats method of the Statistics object on vecrdd and print the
# mean, variance, and number of non-zero values.
stats = Statistics.colStats(vecrdd)
print(stats.mean())
print(stats.variance())
print(stats.numNonzeros())

# Instantiate a StandardScaler object and set withMean and withStd to 'True'.
ss = StandardScaler(withMean=True, withStd=True)

# Call the fit method of the StandardScaler object to create a StandardScalerModel.
model = ss.fit(vecrdd)

# Call the transform method of the StandardScalerModel to center and scale the
# data in vecrdd RDD.
scaled = model.transform(vecrdd)

# Call colStats method of the Statistics object and print the mean, variance,
# and number of non-zero values to confirm that vecrdd was scaled and centered.
scaledStats = Statistics.colStats(scaled)
print(scaledStats.mean())
print(scaledStats.variance())
print(scaledStats.numNonzeros())
def main():
    """Train and compare four classifiers (LR-LBFGS, LR-SGD, decision tree,
    random forest) on pickled, standardized feature data, printing the F1
    score and accuracy of each."""
    appName = "BadOrGood;zl"
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            .set("spark.executor.instance", "3"))
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)

    #fetch data
    #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    #fetchDataToFile(hc, filepath)

    #load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
    #     .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
    #     .repartition(10)
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)

    # Standardizer for train and test data (zero mean, unit variance).
    model = StandardScaler(True, True) \
        .fit(AllDataRawrdd.map(lambda _: Vectors.dense(_['feature'])))
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform(AllDataRawrdd.map(lambda _: _['feature']))
    AllDataRawrdd = labels \
        .zip(featureTransformed) \
        .map(lambda _: {'label': _[0], 'feature': _[1]})

    # Sampling: 70/30 train/test split, fixed seed for reproducibility.
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map(
        lambda _: LabeledPoint(_['label'], _['feature'])).persist()
    testDatardd = testDataRawrdd.map(
        lambda _: {'label': _['label'], 'feature': list(_['feature'])}).persist()

    # Prediction & test for each classifier.
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000,
                                                 regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1,
                                             regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)

    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)

    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    # FIX: Python 2 print statements converted to print(); the first format
    # string was split across two source lines and has been rejoined.
    print("LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac))
    print("LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac))
    print("Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac))
    print("Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac))
    print(lrmLBFGS.weights)
    print(lrmSGD.weights)

    sc.stop()
print("Loading RAW data...")
raw_data = sc.textFile(data_file)
# The label is the last comma-separated field of each line.
labels = raw_data.map(lambda line: line.strip().split(",")[-1])

# Prepare data for clustering input
# the data contains non-numeric features, we want to exclude them since
# k-means works with numeric features. These are the first three and the last
# column in each data row
print("Parsing dataset...")
parsed_data = raw_data.map(parse_interaction)
parsed_data_values = parsed_data.values().cache()

# Standardize data (zero mean, unit variance).
print("Standardizing data...")
standardizer = StandardScaler(True, True)
standardizer_model = standardizer.fit(parsed_data_values)
standardized_data_values = standardizer_model.transform(parsed_data_values)

# Evaluate values of k from 10 to max_k in steps of 10.
print(
    "Calculating total in within cluster distance for different k values (10 to %(max_k)d):"
    % {"max_k": max_k})
# FIX: materialize the map() — in Python 3 it is a lazy, single-pass iterator,
# and `scores` is consumed again further below (best-model lookup).
scores = list(map(lambda k: clustering_score(standardized_data_values, k),
                  range(10, max_k + 1, 10)))

# Obtain min score k (entries end with the score at index 2).
min_k = min(scores, key=lambda x: x[2])[0]
print("Best k value is %(best_k)d" % {"best_k": min_k})

# Use the best model to assign a cluster to each datum
# Show label frequencies, most common first.
label_counts = labels.countByValue()
sorted_labels = OrderedDict(sorted(label_counts.items(), key=lambda t: t[1], reverse=True))
for label, count in sorted_labels.items():
    print(label, count)  # FIX: Python 3 print(); output identical to `print label, count`

# Prepare data for clustering input
# the data contains non-numeric features, we want to exclude them since
# k-means works with numeric features. These are the first three and the last
# column in each data row
print("Parsing dataset...")
parsed_data = raw_data.map(parse_interaction)
parsed_data_values = parsed_data.values().cache()

# Standardize data (zero mean, unit variance).
print("Standardizing data...")
standardizer = StandardScaler(True, True)
standardizer_model = standardizer.fit(parsed_data_values)
standardized_data_values = standardizer_model.transform(parsed_data_values)

# Evaluate values of k from 10 to max_k in steps of 10.
print("Calculating total in within cluster distance for different k values (10 to %(max_k)d):" % {"max_k": max_k})
# FIX: wrap map() in list() — `scores` is consumed twice below (min_k and
# best_model), which a Python 3 map iterator would not survive.
scores = list(map(lambda k: clustering_score(standardized_data_values, k),
                  range(10, max_k + 1, 10)))

# Obtain min score k
min_k = min(scores, key=lambda x: x[2])[0]
print("Best k value is %(best_k)d" % {"best_k": min_k})

# Use the best model to assign a cluster to each datum
# We use here standardized data - it is more appropriate for exploratory purposes
print("Obtaining clustering result sample for k=%(min_k)d..." % {"min_k": min_k})
best_model = min(scores, key=lambda x: x[2])[1]
#Section 7.4.4
from pyspark.mllib.regression import LabeledPoint

def toLabeledPoint(x):
    """Last element of the vector is the label; the rest are the features."""
    values = x.toArray()
    return LabeledPoint(values[-1], Vectors.dense(values[0:-1]))

housingData = housingVals.map(toLabeledPoint)

#Section 7.4.5 - 80/20 train/validation split
sets = housingData.randomSplit([0.8, 0.2])
housingTrain = sets[0]
housingValid = sets[1]

#Section 7.4.6 - standardize features using the training-set statistics
from pyspark.mllib.feature import StandardScaler
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda x: x.features))
trainLabel = housingTrain.map(lambda x: x.label)
trainFeatures = housingTrain.map(lambda x: x.features)
validLabel = housingValid.map(lambda x: x.label)
validFeatures = housingValid.map(lambda x: x.features)

def _recombine(label_rdd, feature_rdd):
    """Zip labels with scaled features back into LabeledPoints."""
    return label_rdd.zip(scaler.transform(feature_rdd)) \
                    .map(lambda pair: LabeledPoint(pair[0], pair[1]))

trainScaled = _recombine(trainLabel, trainFeatures)
validScaled = _recombine(validLabel, validFeatures)

#Section 7.5 - linear regression with SGD
from pyspark.mllib.regression import LinearRegressionWithSGD
alg = LinearRegressionWithSGD()
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)

#Section 7.5.1
# Standardizes features by removing the mean and scaling to unit variance
# using column summary statistics on the samples in the training set.
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler

sc = SparkContext()
vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]
dataset = sc.parallelize(vs)

# All False: identity transform, does nothing.
standardizer = StandardScaler(False, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)  # FIX: Python 3 print function (was a Python 2 print statement)
print("\n")

# Deducts the mean (centering only).
standardizer = StandardScaler(True, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)
print("\n")

#divides the length of vector
def norm(features):
    """Fit a StandardScaler on *features* and return the transformed RDD.

    NOTE(review): with withMean=False and withStd=False the scaler is an
    identity transform — nothing is centered or scaled, despite the
    function's name. Likely one flag was meant to be True; confirm with
    the callers before changing it.
    """
    identity_scaler = StandardScaler(withMean=False, withStd=False)
    fitted = identity_scaler.fit(features)
    return fitted.transform(features)
# Size:
shape = reader.shape(flist[0])
shape['n_samples_per_file'] = shape['n_samples']
shape['n_samples'] = shape['n_samples'] * len(flist)
# FIX: Python 3 print function; comma-separated args reproduce the old
# Python 2 `print a, b` space-separated output.
print("Will load a dataset of size:\n\t", shape)

rdd_data = sc.parallelize(flist).flatMap(reader('TEMP'))
first = rdd_data.first()

# In[Scaling]:
# Compute scaling parameters (zero mean, unit variance):
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
scaler = StandardScaler(withMean=True, withStd=True).fit(rdd_data)
# NOTE(review): .call('mean') reaches into the JVM model through a private
# mechanism; confirm it is supported by the pyspark version in use.
sample_mean = scaler.call('mean')

# Effectively scale the dataset:
rdd_norm = scaler.transform(rdd_data)

# In[Reduction]:
# Compute PCA new dimensions:
from pyspark.mllib.feature import PCA as PCAmllib
Neof = 20
reducer = PCAmllib(Neof).fit(rdd_norm)
# print type(reducer)