def test_glr_summary(self):
    """Check the training summary exposed by a fitted GeneralizedLinearRegression.

    Fits a weighted gaussian/identity model without intercept on two rows,
    verifies that every summary attribute is reachable and has the expected
    type or value, and finally checks that ``model.evaluate`` on the training
    data yields the same deviance as the training summary.
    """
    # pyspark.ml estimators consume pyspark.ml.linalg vectors on Spark >= 2.0,
    # which is the API generation this test targets (it uses ``self.spark``).
    from pyspark.ml.linalg import Vectors

    df = self.spark.createDataFrame(
        [(1.0, 2.0, Vectors.dense(1.0)),
         (0.0, 2.0, Vectors.sparse(1, [], []))],
        ["label", "weight", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity",
                                      weightCol="weight", fitIntercept=False)
    model = glr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
    self.assertIsInstance(s.predictions, DataFrame)
    self.assertEqual(s.predictionCol, "prediction")
    self.assertIsInstance(s.residuals(), DataFrame)
    self.assertIsInstance(s.residuals("pearson"), DataFrame)
    # Each of these statistics must be a list whose first entry is a float.
    for stats in (s.coefficientStandardErrors, s.tValues, s.pValues):
        self.assertIsInstance(stats, list)
        self.assertIsInstance(stats[0], float)
    self.assertEqual(s.degreesOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedomNull, 2)
    self.assertEqual(s.rank, 1)
    self.assertIsInstance(s.solver, basestring)
    self.assertIsInstance(s.aic, float)
    self.assertIsInstance(s.deviance, float)
    self.assertIsInstance(s.nullDeviance, float)
    self.assertIsInstance(s.dispersion, float)
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.deviance, s.deviance)
def test_equals(self):
    """Exercise ``Vectors._equals`` with one fixed (indices, values) pair.

    The fixed pair is compared against four candidates that all use the
    indices 0..4; only the first candidate is expected to compare equal.
    """
    sparse_indices = [1, 2, 4]
    sparse_values = [1., 3., 2.]

    def matches(candidate_values):
        # A fresh 0..4 index list is built for every comparison.
        return Vectors._equals(sparse_indices, sparse_values,
                               list(range(5)), candidate_values)

    self.assertTrue(matches([0., 1., 3., 0., 2.]))
    self.assertFalse(matches([0., 3., 1., 0., 2.]))
    self.assertFalse(matches([0., 3., 0., 2.]))
    self.assertFalse(matches([0., 1., 3., 2., 2.]))
def test_logistic_regression_summary(self):
    """Check the training summary exposed by a fitted LogisticRegression.

    Fits a weighted, intercept-free model on two rows, verifies that every
    summary attribute is reachable and has the expected type or value, and
    finally checks that ``model.evaluate`` on the training data yields the
    same area under ROC as the training summary.
    """
    # The model summary API belongs to the Spark 2.x generation of pyspark.ml,
    # whose estimators consume pyspark.ml.linalg vectors.
    from pyspark.ml.linalg import Vectors

    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame(
        [(1.0, 2.0, Vectors.dense(1.0)),
         (0.0, 2.0, Vectors.sparse(1, [], []))],
        ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertIsInstance(s.predictions, DataFrame)
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertIsInstance(objHist, list)
    self.assertIsInstance(objHist[0], float)
    self.assertGreater(s.totalIterations, 0)
    self.assertIsInstance(s.roc, DataFrame)
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertIsInstance(s.pr, DataFrame)
    self.assertIsInstance(s.fMeasureByThreshold, DataFrame)
    self.assertIsInstance(s.precisionByThreshold, DataFrame)
    self.assertIsInstance(s.recallByThreshold, DataFrame)
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def test_save_load(self):
    """Round-trip a CrossValidator and its fitted model through save/load.

    The estimator, evaluator and param maps of the reloaded CrossValidator
    must match the original, and the reloaded model must keep the uid of the
    best model. The scratch directory is removed afterwards, even when an
    assertion fails.
    """
    from shutil import rmtree

    temp_path = tempfile.mkdtemp()
    try:
        sqlContext = SQLContext(self.sc)
        dataset = sqlContext.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)

        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
        self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
    finally:
        # mkdtemp() leaves removal to the caller; best-effort cleanup so a
        # failing assertion does not leak the directory.
        rmtree(temp_path, ignore_errors=True)
def test_save_load(self):
    """Round-trip a TrainValidationSplit and its fitted model through save/load.

    The estimator, evaluator and param maps of the reloaded
    TrainValidationSplit must match the original, and the reloaded model must
    keep the uid of the best model. The scratch directory is removed
    afterwards, even when an assertion fails.
    """
    from shutil import rmtree

    temp_path = tempfile.mkdtemp()
    try:
        sqlContext = SQLContext(self.sc)
        dataset = sqlContext.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)

        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
        self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
    finally:
        # mkdtemp() leaves removal to the caller; best-effort cleanup so a
        # failing assertion does not leak the directory.
        rmtree(temp_path, ignore_errors=True)
def test_append_bias_with_sp_vector(self):
    """``MLUtils.appendBias`` on a sparse vector must return a SparseVector.

    The result is compared against a size-4 sparse vector that carries the
    original entries plus 1.0 at index 3.
    """
    source = Vectors.sparse(3, {0: 2.0, 2: 2.0})
    biased = MLUtils.appendBias(source)
    self.assertEqual(biased, Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0}))
    # Returned value must be SparseVector
    self.assertEqual(type(biased), SparseVector)
def test_nnclassifier_in_pipeline(self):
    """Fit and apply a MinMaxScaler -> NNClassifier pipeline.

    The body only runs when the SparkContext reports a version starting with
    "1"; for any other version the test does nothing.
    """
    if not self.sc.version.startswith("1"):
        return

    from pyspark.mllib.linalg import Vectors

    samples = [
        (Vectors.dense([2.0, 1.0]), 1.0),
        (Vectors.dense([1.0, 2.0]), 2.0),
        (Vectors.dense([2.0, 1.0]), 1.0),
        (Vectors.dense([1.0, 2.0]), 2.0),
    ]
    df = self.sqlContext.createDataFrame(samples, ["features", "label"])

    scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled")

    net = Sequential().add(Linear(2, 2))
    loss = ClassNLLCriterion()
    classifier = NNClassifier(net, loss, MLlibVectorToTensor([2]))
    classifier = classifier.setBatchSize(4)
    classifier = classifier.setLearningRate(0.01).setMaxEpoch(1).setFeaturesCol("scaled")

    fitted = Pipeline(stages=[scaler, classifier]).fit(df)
    res = fitted.transform(df)
    assert type(res).__name__ == 'DataFrame'
def test_persistence(self):
    """Test save/load for LDA, LocalLDAModel, DistributedLDAModel.

    Each object is saved under a fresh temporary directory, loaded back and
    compared with the original through ``self._compare``. The directory is
    removed afterwards, even when a comparison fails.
    """
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    try:
        lda_path = path + "/lda"
        dist_model_path = path + "/distLDAModel"
        local_model_path = path + "/localLDAModel"
        # Test LDA
        lda.save(lda_path)
        lda2 = LDA.load(lda_path)
        self._compare(lda, lda2)
        # Test DistributedLDAModel
        distributedModel.save(dist_model_path)
        distributedModel2 = DistributedLDAModel.load(dist_model_path)
        self._compare(distributedModel, distributedModel2)
        # Test LocalLDAModel
        localModel.save(local_model_path)
        localModel2 = LocalLDAModel.load(local_model_path)
        self._compare(localModel, localModel2)
    finally:
        # Clean up (best effort) regardless of whether the checks passed.
        try:
            rmtree(path)
        except OSError:
            pass
def test_model_transform(self):
    """ElementwiseProduct must scale dense and sparse inputs by the weight vector."""
    eprod = ElementwiseProduct(Vectors.dense([3, 2, 1]))
    cases = [
        (Vectors.dense([4, 5, 6]), DenseVector([12, 10, 6])),
        (Vectors.sparse(3, [0], [1]), SparseVector(3, [0], [3])),
    ]
    for vector, expected in cases:
        self.assertEqual(eprod.transform(vector), expected)
def test_right_number_of_results(self):
    """chiSqTest over 1001-column sparse points must return one result per column."""
    num_cols = 1001
    specs = ((0.0, 100, 2.0), (0.1, 200, 1.0))
    sparse_data = [
        LabeledPoint(label, Vectors.sparse(num_cols, [(column, value)]))
        for label, column, value in specs
    ]
    chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
    self.assertEqual(len(chi), num_cols)
    self.assertIsNotNone(chi[1000])
def test_parse_vector(self):
    """Vectors must round-trip through ``str`` and ``parse``.

    Uses ``assertEqual`` throughout: ``assertTrue(a, b)`` treats ``b`` as the
    failure message and therefore passes for any truthy ``a``, which made the
    previous checks vacuous.
    """
    a = DenseVector([3, 4, 6, 7])
    self.assertEqual(str(a), '[3.0,4.0,6.0,7.0]')
    self.assertEqual(Vectors.parse(str(a)), a)
    a = SparseVector(4, [0, 2], [3, 4])
    self.assertEqual(str(a), '(4,[0,2],[3.0,4.0])')
    self.assertEqual(Vectors.parse(str(a)), a)
    a = SparseVector(10, [0, 1], [4, 5])
    # Parsing must tolerate stray whitespace around the tokens.
    self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a)
def _get_train_data(self):
    """Return a four-row DataFrame with columns ``id``, ``features`` and ``label``.

    Ids run from 1 to 4, every row carries the same dense feature vector
    [1, 2, 3], and labels alternate 1.0 / 0.0.
    """
    labels = (1.0, 0.0, 1.0, 0.0)
    rows = [
        (row_id, Vectors.dense([1, 2, 3]), label)
        for row_id, label in enumerate(labels, 1)
    ]
    return SQLContext(self.sc).createDataFrame(rows, ['id', 'features', 'label'])
def test_output_columns(self):
    """A fitted OneVsRest model must only add a ``prediction`` column on transform."""
    rows = [
        (0.0, Vectors.dense(1.0, 0.8)),
        (1.0, Vectors.sparse(2, [], [])),
        (2.0, Vectors.dense(0.5, 0.5)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "features"])
    ovr = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=0.01))
    transformed = ovr.fit(df).transform(df)
    self.assertEqual(transformed.columns, ["label", "features", "prediction"])
def test_idf_model(self):
    """The IDF vector of a model fitted on 11-dimensional data must have length 11."""
    term_counts = [
        [1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3],
        [1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1],
        [1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0],
        [2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9],
    ]
    vectors = [Vectors.dense(counts) for counts in term_counts]
    # The data is distributed over two partitions.
    model = IDF().fit(self.sc.parallelize(vectors, 2))
    self.assertEqual(len(model.idf()), 11)
def load_data_rdd(csv_file, shuffle=True, train=True):
    """Load a CSV file under ``data_path`` as an RDD of (features, label) pairs.

    Lines whose first comma-separated field equals 'id' are dropped. For
    training data the features are columns 1..-2 and the label is the last
    column with 'Class_' removed; otherwise the features are all columns
    after the first and the label is the constant string "1".

    :param csv_file: file name, appended to the module-level ``data_path``
    :param shuffle: when true, ``shuffle_csv(csv_file)`` is called first
    :param train: selects the training or the test row layout
    :return: RDD of (dense float32 feature vector, label string) tuples
    """
    if shuffle:
        shuffle_csv(csv_file)

    def as_vector(fields):
        return Vectors.dense(np.asarray(fields).astype(np.float32))

    records = sc.textFile(data_path + csv_file)
    records = records.filter(lambda x: x.split(',')[0] != 'id')
    records = records.map(lambda line: line.split(','))

    if train:
        return records.map(
            lambda line: (as_vector(line[1:-1]), str(line[-1]).replace('Class_', '')))
    return records.map(lambda line: (as_vector(line[1:]), "1"))
def parseEntry(xx, reference_year=2015):
    """Parse one tab-separated log record into a ``Row``.

    Parameters
    ----------
    xx : str
        Tab-separated record. Fields 0-9 are read positionally (a_virtual,
        browser, referrer, a_user_key, birth year, gender, registration
        date, device, date, hour); every field from index 10 on is parsed as
        an integer metric after removing the substring ',0'.
    reference_year : int, optional
        Year the age is computed against. Defaults to 2015, the value that
        used to be hard-coded.

    Returns
    -------
    Row
        Row with the parsed fields. ``age`` is -1 when the birth year is not
        an integer, ``reg_date`` falls back to the minimum date when the
        field is 'NAN', and ``days_since_registration`` is -1 in that case.
        ``timegroup_pvs`` / ``timegroup_visit`` are sparse vectors of size
        ``maxInd`` with a single entry at the slot that ``intervalIndDict``
        assigns to (weekday, hour).
    """
    mindate = datetime.datetime(datetime.MINYEAR, 1, 1, 1, 1)
    fields = xx.split('\t')
    a_virtual = fields[0]
    browser = fields[1]
    referrer = fields[2]
    a_user_key = fields[3]
    try:
        age = reference_year - int(fields[4])
    except ValueError:
        # Birth year missing or malformed.
        age = -1
    gender = fields[5]
    if fields[6] != 'NAN':
        reg_date = datetime.datetime.strptime(fields[6], '%Y-%m-%d')
    else:
        reg_date = mindate
    device = fields[7]
    hour = int(fields[9])
    # The date field has day precision; the hour field is added on top.
    date = datetime.datetime.strptime(fields[8], '%d-%m-%Y') + datetime.timedelta(hours=hour)
    day = date.day
    weekday = date.weekday()
    if reg_date > mindate:
        days_since_registration = (date - reg_date).days
    else:
        days_since_registration = -1
    metrics = [int(x.replace(',0', '')) for x in fields[10:]]
    pageviews = metrics[2]
    pageview_nothome = metrics[3]
    pageview_betalt = metrics[4]
    slot = intervalIndDict[(weekday, hour)]
    timegroup_pvs = Vectors.sparse(maxInd, [(slot, pageviews)])
    timegroup_visit = Vectors.sparse(maxInd, [(slot, 1.)])
    return Row(browser=browser, a_user_key=a_user_key, age=age,
               day=day, hour=hour, date=date, weekday=weekday, pv=pageviews,
               pv_nh=pageview_nothome, pv_bet=pageview_betalt, referrer=referrer,
               device=device, gender=gender,
               days_since_registration=days_since_registration,
               reg_date=reg_date, timegroup_pvs=timegroup_pvs,
               timegroup_visit=timegroup_visit, a_virtual=a_virtual)
def test_copy(self):
    """``copy`` with extra params must not touch the original OneVsRest or its model.

    Copying the estimator with a new ``maxIter`` changes only the copy's
    classifier; copying the fitted model with a new ``predictionCol`` is
    reflected by the copy's getter.
    """
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    ovr1 = ovr.copy({lr.maxIter: 10})
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)

    rows = [
        (0.0, Vectors.dense(1.0, 0.8)),
        (1.0, Vectors.sparse(2, [], [])),
        (2.0, Vectors.dense(0.5, 0.5)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "features"])
    model = ovr.fit(df)
    model1 = model.copy({model.predictionCol: "indexed"})
    self.assertEqual(model1.getPredictionCol(), "indexed")
def load_data_frame(csv_file, shuffle=True, train=True):
    """Load a CSV file as a DataFrame with columns features, category, label.

    Lines whose first comma-separated field equals 'label' are dropped. For
    every remaining line the first column is used both as the integer
    ``label`` and, prefixed with 'class_', as ``category``; the remaining
    columns become a dense float32 feature vector.

    :param csv_file: file name below '/home/minglu/dist_spark/data/'
    :param shuffle: when true, ``shuffle_csv(csv_file)`` is called first
    :param train: accepted for interface compatibility. The train and test
        code paths were identical, so the flag has no effect.
    :return: DataFrame created through the module-level ``sqlcontext``
    """
    if shuffle:
        shuffle_csv(csv_file)
    # This is an RDD, which will later be transformed to a data frame
    data = sc.textFile('/home/minglu/dist_spark/data/' + csv_file)
    data = data.filter(lambda x: x.split(',')[0] != 'label').map(lambda line: line.split(','))
    # Train and test rows share one structure, so a single mapping serves both.
    data = data.map(
        lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),
                      'class_' + str(line[0]),
                      int(line[0])))
    return sqlcontext.createDataFrame(data, ['features', 'category', 'label'])
def create_rows_for_rdd(x):
    """
    Build a Row from a (meta_data, values) pair.

    :param x: indexable pair; ``x[0]`` becomes the ``meta_data`` vector and
        ``x[1]`` is an iterable whose last item is the label and whose
        remaining items are the features
    :return: Row with a float ``label`` and dense ``features`` / ``meta_data`` vectors
    """
    values = list(x[1])
    label = float(values.pop())
    return Row(label=label,
               features=Vectors.dense(values),
               meta_data=Vectors.dense(x[0]))
def remove_time_dependent_effects(self, ts):
    """
    Apply the model's inverse operations to a timeseries to recover the
    series of underlying errors.

    Parameters
    ----------
    ts:
        Time series of observations with this model's characteristics as a Numpy array

    returns the time series with removed time-dependent effects as a Numpy array
    """
    # Zero-filled destination vector of the same length, handed to the JVM model.
    zeros = np.array([0] * len(ts))
    destts = Vectors.dense(zeros)
    jts = _py2java(self._ctx, Vectors.dense(ts))
    jdest = _py2java(self._ctx, destts)
    jresult = self._jmodel.removeTimeDependentEffects(jts, jdest)
    return _java2py(self._ctx, jresult.toArray())
def add_time_dependent_effects(self, ts):
    """
    Apply this model to a timeseries.

    Parameters
    ----------
    ts:
        Time series of i.i.d. observations as a Numpy array

    returns the time series with added time-dependent effects as a Numpy array.
    """
    # Zero-filled destination vector of the same length, handed to the JVM model.
    destts = Vectors.dense([0] * len(ts))
    jts = _py2java(self._ctx, Vectors.dense(ts))
    jdest = _py2java(self._ctx, destts)
    jresult = self._jmodel.addTimeDependentEffects(jts, jdest)
    return _java2py(self._ctx, jresult.toArray())
def ztest_toPandas(self):
    """Convert a three-row vector DataFrame with ``self.converter.toPandas``.

    Checks the row count before and after the conversion and inspects the
    first entry of the ``features`` column, which must be a ``csr_matrix``.
    """
    rows = [
        (Vectors.dense([0.1, 0.2]),),
        (Vectors.sparse(2, {0:0.3, 1:0.4}),),
        (Vectors.sparse(2, {0:0.5, 1:0.6}),),
    ]
    df = self.sql.createDataFrame(rows, ["features"])
    self.assertEqual(df.count(), 3)

    converted = self.converter.toPandas(df)
    self.assertEqual(len(converted), 3)

    first = converted.features[0]
    self.assertTrue(isinstance(first, csr_matrix),
                    "Expected pd.features[0] to be csr_matrix but found: %s" % type(first))
    self.assertEqual(first.shape[0], 3)
    self.assertEqual(first.shape[1], 2)
    self.assertEqual(first[0,0], 0.1)
    self.assertEqual(first[0,1], 0.2)
def add_svec(sv1, sv2):
    """Add two sparse vectors of equal length and return a new sparse vector.

    The two index arrays are merged in a single pass; values at equal
    indices are summed. The merge relies on both index arrays being in
    ascending order (assumed, not checked here).
    """
    assert len(sv1) == len(sv2), "dimension mismatch"
    left_idx, left_val = sv1.indices, sv1.values
    right_idx, right_val = sv2.indices, sv2.values
    merged_idx = []
    merged_val = []
    p = q = 0
    while p < len(left_idx) and q < len(right_idx):
        a, b = left_idx[p], right_idx[q]
        if a < b:
            merged_idx.append(a)
            merged_val.append(left_val[p])
            p += 1
        elif b < a:
            merged_idx.append(b)
            merged_val.append(right_val[q])
            q += 1
        else:
            merged_idx.append(a)
            merged_val.append(left_val[p] + right_val[q])
            p += 1
            q += 1
    # At most one of the two inputs still has entries left; copy them over.
    merged_idx.extend(left_idx[p:])
    merged_val.extend(left_val[p:])
    merged_idx.extend(right_idx[q:])
    merged_val.extend(right_val[q:])
    return Vectors.sparse(len(sv1), merged_idx, merged_val)
def save_pca_parameters(pca_model, data_dim):
    """Write the PCA projection of the identity matrix to ``PCA_OUT_PATH``.

    Each row of a ``data_dim`` x ``data_dim`` identity matrix is pushed
    through ``pca_embed`` with ``pca_model``; the resulting 'pca' column is
    collected and saved as text with six decimals.
    """
    # since there's no good way of doing it in python, simply use an I matrix to retrieve
    unit_rows = np.eye(data_dim).tolist()
    basis_df = sqlContext.createDataFrame(
        [(Vectors.dense(row),) for row in unit_rows], ('features',))
    embedded = pca_embed(basis_df, pca_model)
    projected_rows = embedded.select('pca').rdd.map(lambda r: r[0]).collect()
    np.savetxt(PCA_OUT_PATH, np.matrix(projected_rows), fmt='%.6f')
def forecast(self, ts, nfuture):
    """
    Produce fitted 1-step-ahead forecasts for `ts` under the current model
    parameters, followed by `nfuture` periods of forecast.

    AR terms prior to the start of the series are assumed equal to the model's
    intercept term (or 0.0 if fit without an intercept term); MA terms prior to
    the start are assumed to be 0.0. If there is differencing, the first d terms
    come from the original series.

    Parameters
    ----------
    ts:
        Timeseries to use as gold-standard. Each value (i) in the returning series
        is a 1-step ahead forecast of ts(i). The difference ts(i) - estimate(i) is
        the error at time i, which is used for the moving average terms. Numpy array.
    nfuture:
        Periods in the future to forecast (beyond length of ts)

    Returns a series consisting of fitted 1-step ahead forecasts for historicals
    and then `nfuture` periods of forecasts. Note that in the future values error
    terms become zero and prior predictions are used for any AR terms.
    """
    java_series = _py2java(self._ctx, Vectors.dense(ts))
    java_forecast = self._jmodel.forecast(java_series, nfuture)
    return _java2py(self._ctx, java_forecast)
def to_vector(np_array):
    ''' Turn a one-dimensional numpy array into a dense MLlib Vector; reject other shapes '''
    if len(np_array.shape) != 1:
        raise Exception("""An MLLib Vector can only be created from a one-dimensional numpy array""")
    return Vectors.dense(np_array)
def buildLabeledPoint(s, classification):
    """Build a LabeledPoint from the paired attributes of ``s``.

    For every name in the module-level ``attributes`` the value of
    ``<name>_1`` is read first, then — in a second pass — the value of
    ``<name>_2``; the concatenation forms the dense feature vector.
    """
    features = [
        getattr(s, attr + suffix)
        for suffix in ('_1', '_2')
        for attr in attributes
    ]
    return LabeledPoint(classification, Vectors.dense(features))
def createSparseVector(histogram):
    """Convert (index, count) pairs into a size-2000 sparse vector.

    The pairs are ordered with ``getKey`` before being split into the index
    and count lists handed to ``Vectors.sparse``.
    """
    ordered = sorted(histogram, key=getKey)
    indexList = [histogramIndex for histogramIndex, _ in ordered]
    countList = [count for _, count in ordered]
    return Vectors.sparse(2000, indexList, countList)
def scoreOnePoint(self, x):
    """
    Compute the log likelihood of 'x' under the current model, the
    responsibility of every mixture component for 'x', and the
    responsibility-weighted contributions of 'x' and of its element-wise square.

    Parameters
    ----------
    x : array of shape (1, n_dim)
        Corresponds to a single data point. When ``self.isSparse`` is 1 it is
        accessed through ``toArray()``, ``size``, ``indices`` and ``values``.

    Returns
    -------
    log_likelihood_x : Log likelihood of 'x'
    prob_x : Responsibility of each cluster for the data point 'x'
    temp_wt : product of the responsibilities (as a column) and 'x' (as a row)
    temp_avg : product of the responsibilities (as a column) and the
        element-wise square of 'x' (as a row)
    """
    weighted_log_density = (self.log_multivariate_normal_density_diag_Nd(x) +
                            np.log(self.Weights))
    log_likelihood_x = logsumexp(weighted_log_density)
    prob_x = np.exp(weighted_log_density - log_likelihood_x)

    if self.isSparse == 1:
        dense_x = x.toArray()
        temp_wt = np.dot(prob_x[:, np.newaxis], dense_x[np.newaxis, :])
        squared = Vectors.sparse(x.size, x.indices, x.values**2)
        dense_sq = squared.toArray()
        temp_avg = np.dot(prob_x.T[:, np.newaxis], dense_sq[np.newaxis, :])
    else:
        resp_column = prob_x.T[:, np.newaxis]
        temp_wt = np.dot(resp_column, x[np.newaxis, :])
        temp_avg = np.dot(prob_x.T[:, np.newaxis], (x*x)[np.newaxis, :])

    return log_likelihood_x, prob_x, temp_wt, temp_avg
def load_cut_to_rdd(input_file, result_file):
    """Compute dense tf-idf document vectors and dump them to ``result_file``.

    ``input_file`` is read through Spark, parsed with ``parseKV`` and reduced
    with ``add`` into ((tid, term), tf) records. For every term an idf of
    ``log(num_doc / (df + 1))`` is derived, joined back onto the term
    frequencies, and each document is turned into a dense array of length
    ``num_term``.

    The output starts with a '<num_doc> <num_term>' header line, followed by
    one line per document holding its tab-terminated values.

    The SparkContext is stopped and the output file closed even when an
    error occurs.
    """
    sc = SparkContext(appName='PythonKMeans', master="mesos://219.224.135.91:5050")
    try:
        lines = sc.textFile(input_file)
        data = lines.map(parseKV).cache()
        # Records are ((tid, term), tf); plain indexing keeps the lambdas
        # valid on both Python 2 and Python 3.
        doc_term_tf = data.reduceByKey(add).cache()
        num_doc = doc_term_tf.map(lambda rec: rec[0][0]).distinct().count()
        terms_list = doc_term_tf.map(lambda rec: rec[0][1]).distinct().collect()
        num_term = len(terms_list)
        # Terms are distinct, so a dict gives the same positions as
        # terms_list.index() without a linear scan per record.
        term_position = {term: pos for pos, term in enumerate(terms_list)}

        term_idf = doc_term_tf.map(
            lambda rec: (rec[0][1], 1.0)
        ).reduceByKey(add).mapValues(lambda idf: math.log(float(num_doc) / (idf + 1)))
        tfidf_join = doc_term_tf.map(
            lambda rec: (rec[0][1], (rec[0][0], rec[1]))).join(term_idf)
        # Joined records are (term, ((tid, tf), idf)).
        tfidf = tfidf_join.map(
            lambda rec: (rec[1][0][0], (term_position[rec[0]], rec[1][0][1] * rec[1][1])))

        doc_vec = tfidf.groupByKey().mapValues(
            lambda feature: Vectors.sparse(num_term, feature).toArray()).cache()

        with open(result_file, 'w') as f:
            f.write('%s %s\r\n' % (num_doc, num_term))
            for (tid, feature) in doc_vec.collect():
                for num in feature:
                    f.write(str(num) + "\t")
                f.write("\n")
    finally:
        sc.stop()
    return
# Evaluate the decision-tree predictions against the "MPG" label with RMSE.
decisionTree_model_evaluator = RegressionEvaluator(
    labelCol="MPG", predictionCol="prediction", metricName="rmse")
rmse = decisionTree_model_evaluator.evaluate(
    decisionTree_model_predictions)
print(
    "Root Mean Squared Error (RMSE) for Decision Tree on test data = %g" %
    rmse)
# r2_dt is the R2 *evaluator* (not the score); the score is computed in the
# print call below. The former chained assignment also bound the misspelled
# name `ecisionTree_model_evaluator`, which has been dropped.
r2_dt = RegressionEvaluator(
    labelCol="MPG", predictionCol="prediction", metricName="r2")
print("R Squared (R2) for Decision Tree on test data = %g" %
      r2_dt.evaluate(decisionTree_model_predictions))

############################---RANDOM FOREST REGRESSION---##################################
# The RDD-based RandomForest API works on LabeledPoint rows: the last column
# of each row is used as the label, everything before it as the features.
train_rdd_rf = train_df.rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))
test_rdd_rf = test_df.rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

RandomForest_model = RandomForest.trainRegressor(
    train_rdd_rf,
    categoricalFeaturesInfo={},
    numTrees=50,
    featureSubsetStrategy="auto",
    maxDepth=10,
    maxBins=100)

# Predict on the test features and pair every true label with its prediction.
predictions = RandomForest_model.predict(
    test_rdd_rf.map(lambda x: x.features))
labelsAndPredictions = test_rdd_rf.map(lambda lp: lp.label).zip(
    predictions)
def parse(lp):
    """Build a LabeledPoint from a text representation.

    The label is the text between the first '(' and the first ')', converted
    to float; the features are the comma-separated items between the first
    '[' and the first ']'.
    """
    label_text = lp[lp.find('(') + 1:lp.find(')')]
    label = float(label_text)
    feature_text = lp[lp.find('[') + 1:lp.find(']')]
    return LabeledPoint(label, Vectors.dense(feature_text.split(',')))
import sys # $example off$ if __name__ == "__main__": sc = SparkContext(appName="StandardScalerExample") # SparkContext # $example on$ data = MLUtils.loadLibSVMFile(sc, sys.argv[1]) label = data.map(lambda x: x.label) features = data.map(lambda x: x.features) scaler1 = StandardScaler().fit(features) scaler2 = StandardScaler(withMean=True, withStd=True).fit(features) # data1 will be unit variance. data1 = label.zip(scaler1.transform(features)) # data2 will be unit variance and zero mean. data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray())))) # $example off$ print("data1:") for each in data1.collect(): print(each) print("data2:") for each in data2.collect(): print(each) sc.stop()
# Persist the processed training data both as CSV (with header) and as Parquet.
df_train.write.options(
    header="true").csv("hdfs://node1:9000/user/root/exp4/procd_train_real.csv")
df_train.write.parquet(
    "hdfs://node1:9000/user/root/exp4/procd_train_real.parquet")
# %%
# Fill in missing values.
# First strategy: fill every null value of the last 8 features with 0.
df_train_filled = df_train.fillna(0)
df_train_filled.show()
# %%
# Convert the data into a suitable format.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
# Convert to an RDD first.
df_train_rdd = df_train_filled.rdd
# Reshape into (label, features): column 2 is used as the label and every
# column from index 3 on as the features.
df_train_rdd = df_train_rdd.map(
    lambda line: LabeledPoint(line[2], Vectors.dense(line[3:])))
# %%
# Save in LibSVM file format so it is convenient to use for training later.
from pyspark.mllib.util import MLUtils
MLUtils.saveAsLibSVMFile(df_train_rdd,
                         "hdfs://node1:9000/user/root/exp4/procd_train_real")
# %%
# Don't forget to stop the session.
spark.stop()
from test_helper import Test
# The first three labels of the zero-indexed dataset are expected to be 0.
# NOTE(review): `.map` is called directly on the DataFrame returned by `select`;
# confirm the targeted Spark version still supports this (newer versions may
# require `.rdd.map`).
Test.assertEquals(irisDFZeroIndex.select('label').map(lambda r: r[0]).take(3), [0, 0, 0],
                  'incorrect value for irisDFZeroIndex')

# COMMAND ----------

# MAGIC %md
# MAGIC You'll also notice that we have four values for features and that those values are stored as a `SparseVector`. We'll reduce those down to two values (for visualization purposes) and convert them to a `DenseVector`. To do that we'll need to create a `udf` and apply it to our dataset. Here's a `udf` reference for [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.udf) and for [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.UserDefinedFunction).
# MAGIC
# MAGIC Note that you can call the `toArray` method on a `SparseVector` to obtain an array, and you can convert an array into a `DenseVector` using the `Vectors.dense` method.

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import udf
# Note that VectorUDT and MatrixUDT are found in linalg while other types are in sql.types
# VectorUDT should be the return type of the udf
from pyspark.mllib.linalg import Vectors, VectorUDT

# Take the first two values from a SparseVector and convert them to a DenseVector
firstTwoFeatures = udf(lambda sv: Vectors.dense(sv.toArray()[:2]), VectorUDT())

# Replace the 'features' column with its two-value version, keep 'label', and
# cache the result because it is both displayed and tested below.
irisTwoFeatures = irisDFZeroIndex.select(firstTwoFeatures('features').alias('features'), 'label').cache()
display(irisTwoFeatures)

# COMMAND ----------

# TEST
# The check compares the string form of the first row, so it depends on the
# exact row repr as well as on the data.
Test.assertEquals(str(irisTwoFeatures.first()), 'Row(features=DenseVector([-0.5556, 0.25]), label=0.0)',
                  'incorrect definition of firstTwoFeatures')
# See the xyz coordinates of each atom in the file t.xyz # Find the current shape of the data t.xyz.shape # Get the first 1000 frames of xyz data t_1k = t.xyz[0:1000] # Convert into spark RDD to run PCA using ML data = [] # try to find a way to optimize the vectorization from pyspark.mllib.linalg import Vectors for frame in t_1k: for atom in frame: data.append((Vectors.dense(atom),)) # Next, apply PCA with the following: from pyspark.ml.feature import PCA df = sqlContext.createDataFrame(data, ["features"]) pca = PCA(k=2, inputCol="features", outputCol="pca_features") model = pca.fit(df) model.transform(df).collect()[0].pca_features data = [(Vectors.dense([1.0, 0.0]),), (Vectors.dense([0.0, -1.0]),)] ### NEW PCA MODEL TO GET COMPONENTS AND EIGENVALUES import numpy as np
lambda x: x).distinct().collect() featur_index = {v: index for index, v in enumerate(featurs, 1)} featur_index_value = sc.broadcast(featur_index).value chi_index_map = {v: index for index, v in enumerate(chi_index, 1)} chi_index_value = sc.broadcast(chi_index_map).value rdd.map(lambda x: x.label + ' ' + get_feature_index( x.feature, featur_index_value)).saveAsTextFile('/user/zlj/tmp/cat3_libsvm') rdd.map(lambda x: x.tel + ' ' + get_feature_index( x.feature, featur_index_value)).saveAsTextFile( '/user/zlj/tmp/cat3_libsvm_tel') lp=rdd.map(lambda x:x.label+' '+get_feature_index(x.feature,featur_index_value))\ .map(lambda x:MLUtils._parse_libsvm_line(x))\ .map(lambda x:LabeledPoint(x[0],Vectors.sparse(40000, x[1], x[2]))) model = ChiSqSelector(100).fit(lp) lp.map(lambda x: (x[0], model.transform(x[1]))) model.transform(lp) sc.parallelize( sc.textFile('/user/zlj/tmp/cat3_libsvm/part-00092').take(30) [0]).saveAsTextFile('/user/zlj/tmp/test1') values = MLUtils._parse_libsvm_line(t1.take(20)[3])[1] def check(value): size = len(value)
def fit(self, data, n_components, n_iter=100, ct=1e-3):
    """
    Estimate model parameters with the expectation-maximization algorithm.

    Parameters
    ----------
    data - RDD of data points
    n_components - Number of components
    n_iter - Number of iterations. Defaults to 100
    ct - Threshold value to check the convergence criteria. Defaults to 1e-3

    Attributes
    ----------
    min_covar : Floor on the diagonal of the covariance matrix to prevent
        overfitting. Set to 1e-3.
    converged : True once converged, False otherwise.
    isSparse : 1 when the data points expose an ``indices`` attribute, else 0.
    Weights : array of shape (1, n_components)
        weights for each mixture component.
    Means : array of shape (n_components, n_dim)
        Mean parameters for each mixture component.
    Covars : array of shape (n_components, n_dim)
        Covariance parameters for each mixture component.
        Only diagonal covariance is supported.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the dataset is empty or holds fewer points than ``n_components``.
    """
    sc = data.context
    self.converged = False
    self.min_covar = 1e-3

    # observation statistics
    self.s0 = 0
    self.s1 = 0

    # To get the no of data points. Validate before touching data.first(),
    # which cannot be evaluated on an empty RDD.
    n_points = data.count()
    if n_points == 0:
        raise ValueError('Dataset cannot be empty')
    if n_points < n_components:
        raise ValueError(
            'Not possible to make (%s) components from (%s) datapoints' %
            (n_components, n_points))

    # To get the no of dimensions
    first_point = data.first()
    n_dim = first_point.size

    # Initialize Covars(diagonal covariance matrix)
    if hasattr(first_point, 'indices'):
        self.isSparse = 1

        def convert_to_kvPair(eachV):
            # One (index, (value, value^2)) pair per stored entry.
            return [(eachV.indices[i],
                     (eachV.values[i], eachV.values[i] * eachV.values[i]))
                    for i in range(eachV.indices.size)]

        def computeVariance(x):
            # x is (index, (sum of values, sum of squared values)).
            mean = x[1][0] / n_points
            sumSq = x[1][1] / n_points
            return x[0], sumSq - mean * mean

        kvPair = data.flatMap(convert_to_kvPair)
        res = kvPair.reduceByKey(np.add).map(computeVariance)
        cov = Vectors.sparse(n_dim, res.collectAsMap()).toArray() + self.min_covar
        self.Covars = np.tile(cov, (n_components, 1))
    else:
        self.isSparse = 0
        cov = [data.map(lambda m, i=i: m[i]).variance() + self.min_covar
               for i in range(n_dim)]
        self.Covars = np.tile(cov, (n_components, 1))

    # Initialize Means using MLlib KMeans
    self.Means = np.array(KMeans().train(data, n_components).clusterCenters)
    # Initialize Weights with the value 1/n_components for each component
    self.Weights = np.tile(1.0 / n_components, n_components)

    # EM algorithm
    # loop until number of iterations or convergence criteria is satisfied
    for i in range(n_iter):
        logging.info("GMM running iteration %s " % i)
        # broadcasting means,covars and weights
        self.meansBc = sc.broadcast(self.Means)
        self.covarBc = sc.broadcast(self.Covars)
        self.weightBc = sc.broadcast(self.Weights)

        # Expectation Step
        EstepOut = data.map(self.scoreOnePoint)

        # Maximization step: element-wise sum of the 4-tuples produced by the
        # E-step (index-based so the lambda is valid on Python 2 and 3).
        MstepIn = EstepOut.reduce(
            lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]))
        self.s0 = self.s1
        self.mStep(MstepIn[0], MstepIn[1], MstepIn[2], MstepIn[3])

        # Check for convergence.
        if i > 0 and abs(self.s1 - self.s0) < ct:
            self.converged = True
            logging.info("Converged at iteration %s" % i)
            break

    return self
def load_cut_to_rdd(input_file, result_file, cluster_num=CLUSTER_NUM, clu_iter=CLUSTERING_ITER,\
        ini_iter=INITIAL_ITER, rb_iter=RB_ITER, con_dist=convergeDist, filter_scale=FILTER_SCALE):
    """Cluster tf-idf document vectors by repeated bisection and write the results.

    Steps, as performed below:

    1. Read ``input_file`` through Spark, parse it with ``parseKV`` and reduce
       it into ((tid, term), tf) records.
    2. Keep only terms whose document frequency lies strictly between
       ``idf_average`` and ``idf_average * (filter_scale - 1)`` and compute
       ``log(num_doc / (df + 1))`` for them.
    3. Build one ``csr_matrix`` tf-idf vector per document.
    4. Run ``clustering`` ``ini_iter`` times on all documents and keep the
       centers with the highest objective value.
    5. Write the iteration count, document count, term count and the term
       list to ``result_file``.
    6. Repeatedly split the cluster registered under the current best
       improvement value, running ``clustering`` ``rb_iter`` times per
       sub-cluster.
    7. Write every remaining cluster to 'results/cluster_<n>'.

    Relies on module-level names that are not defined here (``K``,
    ``clustering``, ``cluster_evaluation``, ``external_evaluation``,
    ``cosine_dist``, ``cal_cluster_variance``, ``closestPoint``,
    ``vector_length``, ``now``, ``parseKV``). The code is Python 2 only
    (print statements, tuple-parameter lambdas).

    NOTE(review): the block nesting was reconstructed from whitespace-collapsed
    text; verify the loop boundaries against the original file.
    """
    sc = SparkContext(appName='PythonKMeans', master="mesos://219.224.135.91:5050")
    lines = sc.textFile(input_file)
    data = lines.map(parseKV).cache()
    # ((tid, term), tf) records
    doc_term_tf = data.reduceByKey(add).cache()
    num_doc = doc_term_tf.map(lambda ((tid, term), tf): tid).distinct().count()
    # Despite the name, the values here are document frequencies (one count
    # per (document, term) record); the log is applied after filtering.
    initial_term_idf = doc_term_tf.map(lambda ((tid, term), tf): (term, 1.0)).reduceByKey(add)
    # filter
    initial_num_term = initial_term_idf.count()
    print 'initial_num_term', initial_num_term
    idf_sum = initial_term_idf.values().sum()
    print 'idf_sum', idf_sum
    idf_average = idf_sum / (initial_num_term * filter_scale)
    term_idf = initial_term_idf.filter(
        lambda (term, idf): idf_average < idf < (idf_average * (filter_scale - 1))).mapValues(
            lambda idf: math.log(float(num_doc) / (idf + 1)))
    terms_list = term_idf.keys().collect()
    num_term = len(terms_list)
    print 'num_term', num_term
    # The join drops records whose term did not survive the filter.
    tfidf_join = doc_term_tf.map(lambda ((tid, term), tf): (term, (tid, tf))).join(term_idf)
    # The position of a term in terms_list is its column in the document vector.
    tfidf = tfidf_join.map(lambda (term, ((tid, tf), idf)): (tid, (terms_list.index(term), tf * idf)))
    doc_vec = tfidf.groupByKey().mapValues(lambda feature: csr_matrix(
        Vectors.sparse(num_term, feature).toArray())).cache()
    # Sum of all document vectors divided by the number of documents.
    global_center = doc_vec.mapValues(lambda x: x / num_doc).values().reduce(
        add)
    g_length = vector_length(global_center)
    # initial 2-way clustering
    maximum_total_variance = 0
    best_kPoints = []
    print 'initial', now()
    for i in range(ini_iter):
        kPoints, tempDist, iter_count = clustering(doc_vec, K, con_dist,
                                                   clu_iter)
        # evaluation
        cluster_variance, total_variance = cluster_evaluation(doc_vec, kPoints)
        ex_value = external_evaluation(kPoints, global_center, g_length)
        obj_value = total_variance[0] / ex_value
        # choose the best initial cluster
        if obj_value > maximum_total_variance:
            maximum_total_variance = obj_value
            best_kPoints = kPoints
    # global_distance = sum(cosine_dist(best_kPoints[x][1], global_center, best_kPoints[x][2], g_length) for x in range(len(best_kPoints)))
    # Header line (iteration count, #documents, #terms) followed by the term list.
    f = open(result_file, 'w')
    f.write(
        str(iter_count) + "\t" + str(num_doc) + "\t" + str(num_term) + "\n")
    for index in range(len(terms_list)):
        f.write(terms_list[index].encode('utf-8') + '\t')
    """
    for (term, ((tid,tf), idf)) in tfidf_join.collect():
        f.write(term.encode('utf-8')+'\t'+str(tid)+'\t'+str(tf)+'\t'+str(idf)+'\n')
    print >> f, "%0.9f" % tempDist
    print >> f, "total_variance", total_variance[0], total_variance[1]
    print >> f, "global_dist", global_distance
    f.write("center:"+"\t")
    for dim in global_center:
        f.write(str(dim)+"\t")
    f.write("\n")
    for i in range(len(best_kPoints)):
        f.write(str(i))
        for unit in best_kPoints[i][1]:
            f.write("\t")
            f.write(str(unit))
        f.write("\n")
    for (index, (dist, num)) in cluster_variance.collect():
        f.write(str(index))
        f.write("\t")
        f.write(str(dist))
        f.write("\t")
        f.write(str(num))
        f.write("\n")
    """
    f.close()

    #repeated bisect
    #choose cluster
    # Both dicts are keyed by an improvement value: updated_dict maps it to the
    # cluster's documents, updated_points_dict to the centers found for it.
    updated_dict = {}
    updated_points_dict = {}
    total_delta_variance = 0
    updated_dict[total_delta_variance] = doc_vec
    updated_points_dict[total_delta_variance] = best_kPoints
    print 'repeated', now()
    for j in range(2, cluster_num + 1):
        if not (total_delta_variance in updated_dict):
            print "no cluster to divide"
            break
        print 'cluster to divide', total_delta_variance, updated_dict[
            total_delta_variance]
        # Take the cluster registered under the current best key and remove
        # it from the pool before splitting it.
        best_cluster = updated_dict[total_delta_variance]
        global_best_kPoints = updated_points_dict[total_delta_variance]
        del updated_dict[total_delta_variance]
        del updated_points_dict[total_delta_variance]
        closest = best_cluster.map(lambda (tid, feature): (closestPoint(
            feature, global_best_kPoints), (tid, feature))).cache()
        print 'total_count', closest.count()
        total_delta_variance = float("-inf")  # clear to zero
        # Re-derive the best key among the clusters still in the pool.
        for key in updated_dict:
            if key > total_delta_variance:
                total_delta_variance = key
        for i in range(K):
            single_cluster = closest.filter(
                lambda (index, (tid, feature)): index == i).values().cache()
            print 'count', i, single_cluster.count()
            maximum_total_variance = 0
            best_kPoints = []
            in_value = cal_cluster_variance(single_cluster)
            ex_value = cosine_dist(global_best_kPoints[i][1], global_center,
                                   global_best_kPoints[i][2], g_length)
            initial_distance = in_value / ex_value
            # NOTE: this loop reuses the name `j` of the enclosing loop.
            for j in range(rb_iter):
                # clustering
                kPoints, tempDist, iter_count = clustering(
                    single_cluster, K, con_dist, clu_iter)
                # evaluation
                cluster_variance, total_variance = cluster_evaluation(
                    single_cluster, kPoints)
                ex_value = external_evaluation(kPoints, global_center,
                                               g_length)
                obj_value = total_variance[0] / ex_value
                if obj_value > maximum_total_variance:
                    maximum_total_variance = obj_value
                    best_kPoints = kPoints
            improvement = maximum_total_variance - initial_distance
            updated_dict[improvement] = single_cluster  # update dict
            updated_points_dict[improvement] = best_kPoints
            print 'improvement', improvement, maximum_total_variance, initial_distance
            if improvement > total_delta_variance:
                total_delta_variance = improvement
    print 'length', cluster_variance.count()
    # Dump every remaining cluster into its own file under results/.
    count = 0
    for key in updated_dict:
        count += 1
        print 'key', key
        per_cluster = updated_dict[key]
        total_similarity = cal_cluster_variance(per_cluster)
        f = open('results/cluster_' + str(count), 'w')
        print >> f, key, total_similarity
        # Sum of all document vectors of the cluster, written as
        # (column, value) pairs for the non-zero entries.
        results_list = per_cluster.values().reduce(add).toarray()
        for row in results_list:
            for index in range(len(row)):
                value = row[index]
                if value != 0:
                    f.write('(' + str(index) + ',' + str(value) + ')\t')
        f.write('\n')
        # One line per document id in the cluster.
        for (tid, feature) in per_cluster.collect():
            f.write(tid)
            """
            for row in feature.toarray():
                for unit in range(len(row)):
                    f.write('\t')
                    f.write(str(row[unit]))
            """
            f.write('\n')
        f.close()
    sc.stop()
    return
from __future__ import print_function

# NOTE: PolynomialExpansion is a DataFrame-based (pyspark.ml) transformer, so
# its input column has to hold pyspark.ml.linalg vectors. The RDD-based
# pyspark.mllib.linalg vectors are a different type and are rejected by the
# transformer's schema check on Spark 2.x and later.

# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PolynomialExpansionExample")\
        .getOrCreate()

    # $example on$
    # Three two-dimensional feature vectors, one per row, in column "features".
    df = spark.createDataFrame([
        (Vectors.dense([-2.0, 2.3]),),
        (Vectors.dense([0.0, 0.0]),),
        (Vectors.dense([0.6, -1.1]),)
    ], ["features"])

    # Expand each vector into its degree-2 polynomial terms.
    px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
    polyDF = px.transform(df)

    for expanded in polyDF.select("polyFeatures").take(3):
        print(expanded)
    # $example off$

    spark.stop()
for i in range(1, k): if 'f:'+str(i) in line: indexList.append(i) valList.append(line['f:'+str(i)]) label = int(line['l:'+str(col)]) if label == -1: label = 0 features.append((Vectors.sparse(k, indexList, valList),label)) features = sc.parallelize(features) #sclines = sc.parallelize(lines) #features = sclines.map(featuresToSparseVecFromLine) featureDataFrame = spark.createDataFrame(features, ["features", "label"]) pca = PCA(k=100, inputCol="features", outputCol="pcaFeatures") model = pca.fit(featureDataFrame) #pcaresult = model.transform(featureDataFrame).select("pcaFeatures").collect() #lp = [] #c = 0 #for com in pcaresult: # lp.append(LabeledPoint(lines[c]['l:' + str(col)], mllibVectors.fromML(com.pcaFeatures))) # c += 1 #lp = sc.parallelize(lp) pcaresult = model.transform(featureDataFrame).rdd lp = pcaresult.map(lambda r: LabeledPoint(r.label, mllibVectors.fromML(r.pcaFeatures))) model = SVMWithSGD.train(lp) model.save(sc, "svm/SVM" + str(col)) labelsAndPreds = lp.map(lambda p: (p.label, model.predict(p.features))) err = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count()) print("err at node " + str(col) + " = " + str(err)) sc.stop()
wordsFiltered.append(w) txt = " ".join(wordsFiltered).lower() data = sc.parallelize([ txt ]).zipWithIndex().map(lambda val: Row(idd=val[1], words=val[0].split(" "))) docDF = spark.createDataFrame(data) Vector = CountVectorizer(inputCol="words", outputCol="vectors") model = Vector.fit(docDF) result = model.transform(docDF) corpus = result.select( "idd", "vectors").rdd.map(lambda val: [val[0], Vectors.fromML(val[1])]).cache() # Cluster the documents into three topics using LDA ldaModel = LDA.train(corpus, k=3, maxIterations=700, optimizer='online') topics = ldaModel.topicsMatrix() vocabArray = model.vocabulary wordNumbers = 5 # number of words per topic topicIndices = sc.parallelize( ldaModel.describeTopics(maxTermsPerTopic=wordNumbers)) def topic_render(topic): # specify vector id of words to actual words terms = topic[0] result = [] for i in range(wordNumbers):
sc = SparkContext(conf=conf) # row_data = sc.textFile( "/user-program/python/MachineLearningSpark/Data/ml-100k/u.data") row_ratings = row_data.map(lambda line: line.split('\t')).map( lambda r: Rating(int(r[0]), int(r[1]), float(r[2]))) print(row_ratings.first()) # row_ratings.cache() # als_model = ALS.train(row_ratings, 50, 10, 0.1) movie_factors = als_model.productFeatures().map(lambda (id, factor): (id, Vectors.dense(factor))) movie_vectors = movie_factors.map(lambda (id, vector): vector) #print(movie_vectors.first()) user_factors = als_model.userFeatures().map(lambda (id, factor): (id, Vectors.dense(factor))) user_vectors = user_factors.map((lambda (id, vector): vector)) #print(user_vectors.first()) # train movie_cluster_model = KMeans().train(movie_vectors, k=5, maxIterations=10, runs=3) print("movie cluster model kmeans :") print(movie_cluster_model) user_cluster_model = KMeans().train(user_vectors,
def parseTrainingData(line):
    """Build a two-element dense vector from a comma-separated line.

    Only the first two comma-separated fields are used; each is converted
    with ``float``. Any further fields are ignored.
    """
    fields = line.split(",")
    first = float(fields[0])
    second = float(fields[1])
    return Vectors.dense([first, second])
def __str__(self):
    """Render the point as ``(<label>,<features>)``.

    The label goes through ``str`` and the features through
    ``Vectors.stringify``; the two parts are joined by a single comma.
    """
    label_text = str(self.label)
    features_text = Vectors.stringify(self.features)
    inner = ",".join([label_text, features_text])
    return "(" + inner + ")"
.appName("KMeans") \ .config("spark.some.config.option", "Angadpreet-KMeans") \ .getOrCreate() today = dt.datetime.today() spark_df = sc.parallelize( spark.read.json("Data/yelp_academic_dataset_user.json").select( "review_count", "average_stars", "yelping_since").rdd.map(lambda x: (x[ 0], x[1], (today - par.parse(x[2])).days)).collect()[:1200]) scaler = MinMaxScaler(inputCol="_1",\ outputCol="scaled_1") # Getting the input data trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map( lambda x: (x, )).toDF() scalerModel = scaler.fit(trial_df) vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map( lambda x: Vectors.dense(x)) # Initialize GMM start = timer() gmm = GaussianMixture.train(vector_df, k=4, maxIterations=20, seed=2018) end = timer() print(end - start) df = pandas.DataFrame({'features': [], 'cluster': []}) i = 0 for v in vector_df.collect(): df.loc[i] = [[float(v[0]), float(v[1]), float(v[2])], int(gmm.predict(v))] i += 1 print df err = spark.createDataFrame(df).rdd.map(lambda x: (x[0], int(x[1]))).collect()
## Notice the differences between the uncorrelated(PCA uniform, PCA gaussian2) ## and source plots(Uniform, Gaussian). In case of Gaussian they look alike while ## uncorrelated Uniform needs a rotation to get there. By removing correlation ## in the gaussian case, we have achieved independence between variables. ## If the source variables are gaussian ICA is not required and PCA is sufficient. # Code for PCA and whitening the dataset. from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow, BlockMatrix from pyspark.mllib.feature import StandardScaler from pyspark.mllib.linalg import Vectors, DenseMatrix, Matrix from sklearn import datasets # create the standardizer model for standardizing the dataset X_rdd = sc.parallelize(X).map(lambda x:Vectors.dense(x) ) scaler = StandardScaler(withMean = True, withStd = False).fit(iris_rdd) X_sc = scaler.transform(X_rdd) #create the IndexedRowMatrix from rdd X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0]))) # compute the svd factorization of the matrix. First the number of columns and second a boolean stating whether # to compute U or not. svd_o = X_rm.computeSVD(X_rm.numCols(), True) # svd_o.V is of shape n * k not k * n(as in sklearn) P_comps = svd_o.V.toArray().copy()
# Cluster the item-factor vectors of the best cross-validated model with
# KMeans, then load the movie names for later use.
movie_factors = cvModel.bestModel.itemFactors
# Parenthesised form prints the same thing on Python 2 and is valid on Python 3.
print(movie_factors)
movie_factors.show()
movie_factors.registerTempTable('movie_factors')

midDF = sqlContext.sql(""" SELECT id, features FROM movie_factors """)
midRDD = midDF.rdd
#midRDD.collect()

# Each record is an (id, features) pair. Index it explicitly instead of using
# a tuple-parameter lambda, which Python 3 removed (PEP 3113).
vectorRDD = midRDD.map(
    lambda rec: Row(id=rec[0], features=Vectors.dense(rec[1]))).cache()
# The result is discarded; presumably this only forces the cached RDD to be
# materialised - TODO confirm it is still wanted.
vectorRDD.collect()
kmeans_input = sqlContext.createDataFrame(vectorRDD).cache()

# Fit a 50-cluster KMeans model and attach a "prediction" column.
kmeans = KMeans(featuresCol="features", predictionCol="prediction").setK(50)
kmeans_df = kmeans.fit(kmeans_input)
kmeans_transformed = kmeans_df.transform(kmeans_input)
kmeans_transformed.show()
kmeans_transformed.registerTempTable('kmeans_table')

# u.item is pipe-separated; the first two fields are used as the movie id
# and the movie name.
movie_items = sc.textFile("u.item")
movienameRDD = movie_items.map(lambda x: x.split('|')).map(
    lambda p: Row(movieId=int(p[0]), movieName=p[1]))
movienamesDF = sqlContext.createDataFrame(movienameRDD).cache()
# Vector assembler: concatenate the encoded columns into one "features" vector.
fAssembler = VectorAssembler(
    inputCols=["C1Vector", "C15Vector", "C16Vector", "C18Vector", "C19Vector",
               "C21Vector", "i_app_category_Vector", "i_device_type_Vector",
               "i_site_category_Vector"],
    outputCol="features")

# Pipeline chaining all the StringIndexers, the OneHotEncoders and the
# VectorAssembler, in that order.
data_P = Pipeline(stages=[c1I, c15I, c16I, c18I, c19I, c21I, appcatI, devtypeI,
                          sitecatI, c1E, c15E, c16E, c18E, c19E, c21E, appcatE,
                          devtypeE, sitecatE, fAssembler])
model = data_P.fit(df)
data_t = model.transform(df)

###### Part 1 ends here #####

# Make the labeled points used to train the logistic regression: the label is
# the `click` column and the features are densified.
parsedData = data_t.select('click', 'features').rdd.map(
    lambda row: LabeledPoint(float(row.click),
                             Vectors.dense((row.features).toArray())))

# Split the dataset 60/40. The seed is a plain int: the Python 2 long literal
# `11L` is a syntax error on Python 3 and carries the same value.
training, test = parsedData.randomSplit([0.6, 0.4], seed=11)
training.cache()

# Train a version of logistic regression that optimizes the parameters with
# Stochastic Gradient Descent (SGD). From here on `model` is the classifier,
# no longer the fitted pipeline.
model = LogisticRegressionWithSGD.train(training, step=0.1, miniBatchFraction=0.1, regType=None)

##### PART 3 ######
# Using the stochastic gradient descent solution
# Test the model using the test data - getting the Accuracy, FPR and AU-ROC
# 1- Accuracy
# Despite the name, each pair is ordered (prediction, label).
labelsAndPreds = test.map(lambda p: (float(model.predict(p.features)), p.label))
def parse_line(line):
    """Convert one comma-separated record into a ``LabeledPoint``.

    The last field is the label; every field before it becomes one entry of
    a dense feature vector. All fields are converted with ``float``.
    """
    columns = line.split(',')
    target = float(columns[-1])
    predictors = list(map(float, columns[:-1]))
    return LabeledPoint(target, Vectors.dense(predictors))
.filter(lambda year: year[17] in ['2015', '2014', '2013', '2012', '2011'])\ .map(lambda x: ((x[2][0:2] + x[2][5:10]), x[10])) # identify all beats beats = lines.map(lambda x: x[1])\ .distinct().collect() # key = beats, values = list of crime month/year unfilled = lines.reduceByKey(lambda x, y: x + "," + y)\ .map(lambda x: (x[0], x[1].split(","))) # count number of crimes per day per beat, fill no-crime values with zero filled = unfilled.map(lambda x: (x[0], fill(x[1], beats))) # convert to vectors vectors = filled.map(lambda x: Vectors.dense(x[1])) # calculate correlation pearsonCorr = Statistics.corr(vectors) # identify top 30 correlated beats pearsonCorr = pd.DataFrame(pearsonCorr, index = beats, columns = beats) unstacked = pearsonCorr.unstack() unstacked = pd.DataFrame(unstacked).reset_index() unstacked.columns = ["beat1", "beat2", "correlation"] unstacked = unstacked[unstacked.beat1 != unstacked.beat2] final = unstacked.nlargest(300, "correlation") # write final to csv final.to_csv("greenwood_2b.csv", index=False)
def to_sparse(v):
    """Return a sparse vector holding only the non-zero entries of ``v``.

    The result has ``v.size`` dimensions and is built from a mapping of
    position to value for every entry that is not equal to zero.
    """
    nonzero = {}
    for position, entry in enumerate(v):
        if entry != 0:
            nonzero[position] = entry
    return Vectors.sparse(v.size, nonzero)
from pyspark.ml.regression import RandomForestRegressor
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.tree import RandomForestModel
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.evaluation import MulticlassMetrics
from prettytable import PrettyTable

sc = SparkContext()
spark = SparkSession(sc)

# Load the semicolon-separated dataset; the last column is used as the label
# and every column before it as a feature.
inputDF = spark.read.csv('s3://assignmentcs643/TrainingDataset.csv',header='true', inferSchema='true', sep=';')
datadf = inputDF.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

# Score the dataset with the previously saved random-forest model.
model = RandomForestModel.load(sc,"s3://assignmentcs643/randomforestmodel.model")
predictions = model.predict(datadf.map(lambda x: x.features))
labels_and_predictions = datadf.map(lambda x: x.label).zip(predictions)

# Accuracy: share of records whose label equals the prediction.
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(datadf.count())

# MulticlassMetrics expects (prediction, label) pairs, so flip the
# (label, prediction) pairs before handing them over. With the original order
# the confusion matrix is transposed, which swaps per-class precision/recall.
predictions_and_labels = labels_and_predictions.map(lambda pair: (pair[1], pair[0]))
metrics = MulticlassMetrics(predictions_and_labels)
# NOTE(review): these are the no-argument (overall) forms; newer Spark
# releases appear to require an explicit label - confirm against the
# deployed version.
f1 = metrics.fMeasure()
recall = metrics.recall()
precision = metrics.precision()

#evaluation values
print("Model accuracy: %.3f%%" % (acc * 100))
if __name__ == "__main__":
    # Expect exactly two arguments: the input path and the output path.
    if len(sys.argv) != 3:
        print(
            "Usage: spark-submit generate_similarity_matrix.py <input path to hdfs file> <hdfs output path>",
            file=sys.stderr)
        # sys.exit rather than the interactive-only `exit` helper.
        sys.exit(-1)

    def processFeatures(raw):
        """Convert a raw whitespace-separated line to (book_id, [features]).

        The first token is the integer book id; every remaining token is
        converted to float.
        """
        features_str = raw.split()
        book_id = int(features_str[0])
        features = [float(token) for token in features_str[1:]]
        return (book_id, features)

    sc = SparkContext(appName="BookRecSystem")
    # Not referenced again below; presumably created for its side effect of
    # enabling RDD-to-DataFrame conversion - TODO confirm before removing.
    spark = SQLContext(sc)

    featureRdd = sc.textFile(sys.argv[1]).map(processFeatures)
    # (row index, dense vector) pairs, built in one pass over the parsed data.
    data = featureRdd.map(lambda x: (x[0], Vectors.dense(x[1])))

    # Convert to a block matrix so the matrix can be multiplied by its own
    # transpose.
    mat = IndexedRowMatrix(data).toBlockMatrix()

    # M * M^T holds the pairwise dot products of the rows.
    # NOTE(review): this equals pairwise cosine similarity only if the input
    # rows are already unit-normalised - confirm upstream.
    similarity_rows = mat.multiply(mat.transpose()).toIndexedRowMatrix().rows
    dot = similarity_rows.map(
        lambda x: (x.index, x.vector.toArray())).sortByKey().map(
            # A space separates the row index from the first value; without
            # it the two run together (index 1, value 0.5 -> "10.5 ...").
            lambda x: str(x[0]) + ' ' + ' '.join(map(str, x[1])))

    dot.saveAsTextFile(sys.argv[2])  # save output
    sc.stop()
# Train and persist the wine-quality random forest with the RDD-based MLlib API.
from pyspark.mllib.linalg import Vectors
from pyspark.ml.regression import RandomForestRegressor
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.tree import RandomForest


def _as_labeled_point(row):
    """Turn one row into a LabeledPoint: last column is the label, the rest are features."""
    label = row[-1]
    predictors = row[0:-1]
    return LabeledPoint(label, Vectors.dense(predictors))


spark_session = SparkSession.builder.appName('wine_model').getOrCreate()

# Semicolon-separated training data with a header row and inferred types.
file1 = spark_session.read.csv(
    's3://cloud-proj2/TrainingDataset.csv',
    header='true',
    inferSchema='true',
    sep=';')

# Every column name except 'quality'.
select_col = list(filter(lambda name: name != 'quality', file1.columns))

data_set = file1.rdd.map(_as_labeled_point)

# Random-forest hyper-parameters, passed through as keyword arguments.
forest_settings = dict(
    numClasses=10,
    categoricalFeaturesInfo={},
    numTrees=50,
    maxBins=64,
    maxDepth=20,
    seed=33)
model = RandomForest.trainClassifier(data_set, **forest_settings)

model.save(spark_session.sparkContext, "s3://cloud-proj2/model_created.model")
rating=temp['rating'])) cats = (set( pd.read_csv('yelp_dataset/cat100.csv', squeeze=True).unique()) - regions - {'Food', 'Restaurants'}) v = v[v['categories'].isin(cats)] le = LabelEncoder() v['categories'] = le.fit_transform(v['categories']) v2 = v.groupby(level=0).apply( lambda g: {x: y for x, y in zip(g['categories'], g['rating'])}) rdd = sc.parallelize( v2.tolist()).map(lambda x: Vectors.sparse(len(cats), x)) rdd.cache() mat = RowMatrix(rdd) svd = mat.computeSVD(len(regions), computeU=True) U = svd.U # The U factor is a RowMatrix. s = svd.s # The singular values are stored in a local dense vector. V = svd.V # The V factor is a local dense matrix. vectors = V.toArray() cat_df = pd.DataFrame( {'category': le.inverse_transform(np.arange(vectors.shape[0]))}) cluster = AgglomerativeClustering(n_clusters=len(regions), affinity='cosine', linkage='complete') cat_df = cat_df.assign(cat34_label=cluster.fit_predict( vectors)).set_index('category').cat34_label
# -*- coding: utf-8 -*- from pyspark import SparkContext from pyspark.mllib.clustering import LDA, LDAModel from pyspark.mllib.linalg import Vectors from pyspark.sql import SQLContext, Row sc = SparkContext() # input file is a term-document matrix, which is generated by make_tdm.py data = sc.textFile( "/Users/Zhen/Desktop/Courses/BigData/stackexchange/topicModeling/result/matrix.csv" ) header = data.first() #extract header data = data.filter(lambda x: x != header) data = data.map( lambda line: Vectors.dense([float(x) for x in line.strip().split(',')])) # Index documents with unique IDs corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache() # Cluster the documents into k topics using LDA ldaModel = LDA.train(corpus, k=30) # Output topics. Each is a distribution over words (matching word count vectors) print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):") topics = ldaModel.topicsMatrix() # for topic in range(3): # print("Topic " + str(topic) + ":") # for word in range(0, ldaModel.vocabSize()): # print(" " + str(topics[word])) import numpy
#
# Reference points and their station ids; the id at position k belongs to the
# point at position k. Presumably (latitude, longitude) pairs - TODO confirm.
# NOTE(review): the third point's second coordinate (-120.804151) is far from
# the others (about -122.x); verify it is not a typo.
_STATION_COORDS = [
    [37.7816834, -122.3887657],
    [37.7469112, -122.4821759],
    [37.7411022, -120.804151],
    [37.4834543, -122.3187302],
    [37.7576436, -122.3916382],
    [37.7970013, -122.4140409],
    [37.748496, -122.4567461],
    [37.7288155, -122.4210133],
    [37.5839487, -121.9499339],
    [37.7157156, -122.4145311],
    [37.7329613, -122.5051491],
    [37.7575891, -122.3923824],
    [37.7521169, -122.4497687],
]
_STATION_IDS = ["SF18", "SF04", "SF15", "SF17", "SF36", "SF37",
                "SF07", "SF11", "SF12", "SF14", "SF16", "SF19", "SF34"]


def _locate_partition(records):
    """Replace the `l` field of every (d, h, l) record with `locate`'s result.

    Yields (locate(l, tree, ids), d, h) for each record of one partition.
    """
    # Build the KD-tree once per partition instead of once per record; it is
    # constructed on the worker so it never has to be serialised.
    tree = spatial.KDTree(array(_STATION_COORDS))
    for d, h, l in records:
        yield (locate(l, tree, _STATION_IDS), d, h)


def _to_feature_vector(record):
    """Flatten one joined ((s, d, h), ((t, w), i)) record into the vector [t, w, i]."""
    # Explicit unpacking replaces the tuple-parameter lambda, which Python 3
    # removed (PEP 3113); the key fields themselves are not used.
    (_s, _d, _h), ((t, w), i) = record
    return Vectors.dense([t, w, i])


located = remapped.mapPartitions(_locate_partition)
# Count the records per (l, d, h) key.
counted = located.map(lambda rec: (tuple(rec), 1))
incidentsreduced = counted.reduceByKey(lambda a, b: a + b)
joined = windaveraged.join(incidentsreduced)

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

vecs = joined.map(_to_feature_vector)
# Correlation matrix between the three columns t, w and i.
print(Statistics.corr(vecs))
print( "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n###################################################################################################" ) print("Start Creating Customer Preferences Block Matrix") print( "###################################################################################################" ) index_ct = customer_persona.drop("analytic_id") index_anaId = customer_persona.select("id", "analytic_id") index_ct.registerTempTable("index_ct") ontop_pref_price = ontop_preferences.select("id", "Price_XS", "Price_S", "Price_M", "Price_L", "Price_XL") ontop_pref_price = ontop_pref_price.orderBy(asc("id")) bmB_1 = IndexedRowMatrix( ontop_pref_price.rdd.map(lambda x: IndexedRow(x[0], Vectors.dense(x[ 1:])))).toBlockMatrix(rowsPerBlock=222) count = customer_persona.count() print( "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n###################################################################################################" ) print("Finished Creating Customer Preferences Block Matrix") print( "###################################################################################################" ) loop = int(count / 200000) startId = 1 i = 0 res = index_ct del customer_persona print( "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n###################################################################################################"
# Train an RDD-based NaiveBayes model on three hand-written labeled points.
from pyspark import SparkConf, SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayesModel, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.rdd import RDD

conf = SparkConf().setAppName("myApp").setMaster("local")
sc = SparkContext(conf=conf)

# Six-dimensional example given as a dense vector.
vMale = Vectors.dense(1, 0, 1, 0, 1, 0)

# Six-dimensional example given as a sparse vector: the positions listed in
# `index` hold the matching entries of `values`.
length = 6
index = [0, 1, 2, 3, 5]
values = [1, 1, 1, 1, 1]
vFemale = Vectors.sparse(length, index, values)

# Label 1.0 for the first example, 2.0 for the other two.
train_one = LabeledPoint(1.0, vMale)
train_two = LabeledPoint(2.0, vFemale)
train_three = LabeledPoint(2.0, Vectors.dense(0, 1, 1, 1, 0, 1))

trains = list()
trains.append(train_one)
trains.append(train_two)
trains.append(train_three)
trainingRDD = sc.parallelize(trains)

# NOTE(review): `nb` is not used by the code visible here; training below goes
# through the class itself.
nb = NaiveBayes()
nb_model = NaiveBayes.train(trainingRDD)

# NOTE(review): this evaluator comes from the DataFrame-based pyspark.ml
# package while the model is from pyspark.mllib - confirm how it is used.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
dTest = [0, 1, 1, 0, 0, 1]