Python Vectors示例，pyspark.mllib.linalg.Vectors Python示例

示例#1

0

显示文件

文件： tests.py 项目： A7mech/spark

 def test_glr_summary(self):
     """Fit a weighted Gaussian GLR on two rows and inspect its training summary.

     Only verifies that every summary attribute is reachable and has the
     expected type or value, and that ``evaluate`` on the training data
     yields a summary with the same deviance.
     """
     from pyspark.mllib.linalg import Vectors
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                       fitIntercept=False)
     model = glr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     # (assertIsInstance reports the offending type on failure, unlike assertTrue(isinstance(...)))
     self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
     self.assertIsInstance(s.predictions, DataFrame)
     self.assertEqual(s.predictionCol, "prediction")
     self.assertIsInstance(s.residuals(), DataFrame)
     self.assertIsInstance(s.residuals("pearson"), DataFrame)
     coefStdErr = s.coefficientStandardErrors
     self.assertIsInstance(coefStdErr, list)
     self.assertIsInstance(coefStdErr[0], float)
     tValues = s.tValues
     self.assertIsInstance(tValues, list)
     self.assertIsInstance(tValues[0], float)
     pValues = s.pValues
     self.assertIsInstance(pValues, list)
     self.assertIsInstance(pValues[0], float)
     self.assertEqual(s.degreesOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedomNull, 2)
     self.assertEqual(s.rank, 1)
     self.assertIsInstance(s.solver, basestring)
     self.assertIsInstance(s.aic, float)
     self.assertIsInstance(s.deviance, float)
     self.assertIsInstance(s.nullDeviance, float)
     self.assertIsInstance(s.dispersion, float)
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.deviance, s.deviance)

示例#2

0

显示文件

文件： test_linalg.py 项目： drewrobb/spark

 def test_equals(self):
     """Compare one sparse encoding against several fully-indexed encodings via Vectors._equals."""
     sparse_idx = [1, 2, 4]
     sparse_vals = [1., 3., 2.]

     def matches(full_vals):
         # The second operand always uses the index list 0..4, rebuilt per call.
         return Vectors._equals(sparse_idx, sparse_vals, list(range(5)), full_vals)

     self.assertTrue(matches([0., 1., 3., 0., 2.]))
     self.assertFalse(matches([0., 3., 1., 0., 2.]))
     self.assertFalse(matches([0., 3., 0., 2.]))
     self.assertFalse(matches([0., 1., 3., 2., 2.]))

示例#3

0

显示文件

文件： tests.py 项目： Bella-Lin/spark

 def test_logistic_regression_summary(self):
     """Fit a weighted logistic regression on two rows and inspect its training summary.

     Only verifies that every summary attribute is reachable and has the
     expected type or value, and that ``evaluate`` on the training data
     yields a summary with the same area under ROC.
     """
     from pyspark.mllib.linalg import Vectors
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     # (assertIsInstance reports the offending type on failure, unlike assertTrue(isinstance(...)))
     self.assertIsInstance(s.predictions, DataFrame)
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     objHist = s.objectiveHistory
     self.assertIsInstance(objHist, list)
     self.assertIsInstance(objHist[0], float)
     self.assertGreater(s.totalIterations, 0)
     self.assertIsInstance(s.roc, DataFrame)
     self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
     self.assertIsInstance(s.pr, DataFrame)
     self.assertIsInstance(s.fMeasureByThreshold, DataFrame)
     self.assertIsInstance(s.precisionByThreshold, DataFrame)
     self.assertIsInstance(s.recallByThreshold, DataFrame)
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)

示例#4

0

显示文件

文件： tests.py 项目： Bella-Lin/spark

 def test_save_load(self):
     """Round-trip a CrossValidator and its fitted model through save/load.

     Checks that the reloaded estimator, evaluator and param maps match the
     originals, and that the reloaded model keeps the same best-model uid.
     The scratch directory is removed afterwards, even when an assertion fails.
     """
     from shutil import rmtree
     temp_path = tempfile.mkdtemp()
     try:
         sqlContext = SQLContext(self.sc)
         dataset = sqlContext.createDataFrame(
             [(Vectors.dense([0.0]), 0.0),
              (Vectors.dense([0.4]), 1.0),
              (Vectors.dense([0.5]), 0.0),
              (Vectors.dense([0.6]), 1.0),
              (Vectors.dense([1.0]), 1.0)] * 10,
             ["features", "label"])
         lr = LogisticRegression()
         grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
         evaluator = BinaryClassificationEvaluator()
         cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
         cvModel = cv.fit(dataset)
         # Persist and reload the (unfitted) validator.
         cvPath = temp_path + "/cv"
         cv.save(cvPath)
         loadedCV = CrossValidator.load(cvPath)
         self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
         self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
         self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())
         # Persist and reload the fitted model.
         cvModelPath = temp_path + "/cvModel"
         cvModel.save(cvModelPath)
         loadedModel = CrossValidatorModel.load(cvModelPath)
         self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
     finally:
         # Best-effort cleanup so repeated runs do not accumulate temp directories.
         rmtree(temp_path, ignore_errors=True)

示例#5

0

显示文件

文件： tests.py 项目： Bella-Lin/spark

 def test_save_load(self):
     """Round-trip a TrainValidationSplit and its fitted model through save/load.

     Checks that the reloaded estimator, evaluator and param maps match the
     originals, and that the reloaded model keeps the same best-model uid.
     The scratch directory is removed afterwards, even when an assertion fails.
     """
     from shutil import rmtree
     temp_path = tempfile.mkdtemp()
     try:
         sqlContext = SQLContext(self.sc)
         dataset = sqlContext.createDataFrame(
             [(Vectors.dense([0.0]), 0.0),
              (Vectors.dense([0.4]), 1.0),
              (Vectors.dense([0.5]), 0.0),
              (Vectors.dense([0.6]), 1.0),
              (Vectors.dense([1.0]), 1.0)] * 10,
             ["features", "label"])
         lr = LogisticRegression()
         grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
         evaluator = BinaryClassificationEvaluator()
         tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
         tvsModel = tvs.fit(dataset)
         # Persist and reload the (unfitted) validator.
         tvsPath = temp_path + "/tvs"
         tvs.save(tvsPath)
         loadedTvs = TrainValidationSplit.load(tvsPath)
         self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
         self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
         self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())
         # Persist and reload the fitted model.
         tvsModelPath = temp_path + "/tvsModel"
         tvsModel.save(tvsModelPath)
         loadedModel = TrainValidationSplitModel.load(tvsModelPath)
         self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
     finally:
         # Best-effort cleanup so repeated runs do not accumulate temp directories.
         rmtree(temp_path, ignore_errors=True)

示例#6

0

显示文件

文件： test_util.py 项目： drewrobb/spark

 def test_append_bias_with_sp_vector(self):
     """Appending a bias term to a sparse vector must yield a sparse vector with a trailing 1.0."""
     source = Vectors.sparse(3, {0: 2.0, 2: 2.0})
     with_bias = MLUtils.appendBias(source)
     self.assertEqual(with_bias, Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0}))
     # Returned value must be SparseVector
     self.assertEqual(type(with_bias), SparseVector)

示例#7

0

显示文件

文件： test_nn_classifier.py 项目： ru003ar/analytics-zoo

    def test_nnclassifier_in_pipeline(self):
        """Run an NNClassifier behind a MinMaxScaler in a Pipeline and check the output type.

        The body only executes when the Spark version string starts with "1";
        otherwise the test is a no-op.
        """
        if not self.sc.version.startswith("1"):
            return

        from pyspark.mllib.linalg import Vectors

        rows = [
            (Vectors.dense([2.0, 1.0]), 1.0),
            (Vectors.dense([1.0, 2.0]), 2.0),
            (Vectors.dense([2.0, 1.0]), 1.0),
            (Vectors.dense([1.0, 2.0]), 2.0),
        ]
        df = self.sqlContext.createDataFrame(rows, ["features", "label"])

        scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled")
        model = Sequential().add(Linear(2, 2))
        criterion = ClassNLLCriterion()
        # The classifier consumes the scaler's output column.
        classifier = (NNClassifier(model, criterion, MLlibVectorToTensor([2]))
                      .setBatchSize(4)
                      .setLearningRate(0.01)
                      .setMaxEpoch(1)
                      .setFeaturesCol("scaled"))

        fitted = Pipeline(stages=[scaler, classifier]).fit(df)
        res = fitted.transform(df)
        assert type(res).__name__ == 'DataFrame'

示例#8

0

显示文件

文件： tests.py 项目： bsangee/spark

 def test_persistence(self):
     """Test save/load for LDA, LocalLDAModel and DistributedLDAModel.

     Each object is saved under a fresh temp directory, reloaded, and
     compared with the original via ``self._compare``.  The temp directory
     is removed in a ``finally`` block so a failing comparison does not
     leave files behind.
     """
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     try:
         lda_path = path + "/lda"
         dist_model_path = path + "/distLDAModel"
         local_model_path = path + "/localLDAModel"
         # Test LDA
         lda.save(lda_path)
         lda2 = LDA.load(lda_path)
         self._compare(lda, lda2)
         # Test DistributedLDAModel
         distributedModel.save(dist_model_path)
         distributedModel2 = DistributedLDAModel.load(dist_model_path)
         self._compare(distributedModel, distributedModel2)
         # Test LocalLDAModel
         localModel.save(local_model_path)
         localModel2 = LocalLDAModel.load(local_model_path)
         self._compare(localModel, localModel2)
     finally:
         # Clean up (best effort: a failure to delete must not mask the test result)
         try:
             rmtree(path)
         except OSError:
             pass

示例#9

0

显示文件

文件： tests.py 项目： HodaAlemi/spark

    def test_model_transform(self):
        """ElementwiseProduct must scale dense and sparse inputs by the weight vector."""
        dense_in = Vectors.dense([4, 5, 6])
        sparse_in = Vectors.sparse(3, [0], [1])
        scaler = ElementwiseProduct(Vectors.dense([3, 2, 1]))

        dense_out = scaler.transform(dense_in)
        self.assertEqual(dense_out, DenseVector([12, 10, 6]))

        sparse_out = scaler.transform(sparse_in)
        self.assertEqual(sparse_out, SparseVector(3, [0], [3]))

示例#10

0

显示文件

文件： tests.py 项目： greatyan/spark

 def test_right_number_of_results(self):
     """chiSqTest on wide sparse data must return one result per column."""
     num_cols = 1001
     # (label, index of the single non-zero feature, its value)
     specs = [(0.0, 100, 2.0), (0.1, 200, 1.0)]
     sparse_data = [LabeledPoint(label, Vectors.sparse(num_cols, [(idx, val)]))
                    for label, idx, val in specs]
     results = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
     self.assertEqual(len(results), num_cols)
     self.assertIsNotNone(results[1000])

示例#11

0

显示文件

文件： tests.py 项目： HodaAlemi/spark

 def test_parse_vector(self):
     """Check string formatting and parsing round-trips for dense and sparse vectors.

     ``assertEqual`` is used throughout: ``assertTrue(x, y)`` treats ``y`` as
     the failure message and passes for any truthy ``x``, so it would not
     compare the two values at all.
     """
     a = DenseVector([3, 4, 6, 7])
     self.assertEqual(str(a), '[3.0,4.0,6.0,7.0]')
     self.assertEqual(Vectors.parse(str(a)), a)
     a = SparseVector(4, [0, 2], [3, 4])
     self.assertEqual(str(a), '(4,[0,2],[3.0,4.0])')
     self.assertEqual(Vectors.parse(str(a)), a)
     a = SparseVector(10, [0, 1], [4, 5])
     # Parsing must tolerate extra whitespace around the components.
     self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a)

示例#12

0

显示文件

文件： pipelines.py 项目： ngarneau/sentiment-analysis

 def _get_train_data(self):
     """Return a four-row DataFrame with columns id, features and label.

     Every row carries the same dense feature vector [1, 2, 3]; ids run
     from 1 to 4 and labels alternate 1.0 / 0.0.
     """
     ctx = SQLContext(self.sc)
     labels = (1.0, 0.0, 1.0, 0.0)
     rows = [(row_id, Vectors.dense([1, 2, 3]), label)
             for row_id, label in enumerate(labels, 1)]
     return ctx.createDataFrame(rows, ['id', 'features', 'label'])

示例#13

0

显示文件

文件： tests.py 项目： A7mech/spark

 def test_output_columns(self):
     """A fitted OneVsRest model must add only a 'prediction' column to its input."""
     rows = [(0.0, Vectors.dense(1.0, 0.8)),
             (1.0, Vectors.sparse(2, [], [])),
             (2.0, Vectors.dense(0.5, 0.5))]
     df = self.spark.createDataFrame(rows, ["label", "features"])
     base = LogisticRegression(maxIter=5, regParam=0.01)
     fitted = OneVsRest(classifier=base).fit(df)
     transformed = fitted.transform(df)
     self.assertEqual(transformed.columns, ["label", "features", "prediction"])

示例#14

0

显示文件

文件： tests.py 项目： HodaAlemi/spark

 def test_idf_model(self):
     """The fitted IDF vector must have one entry per input feature (11 here)."""
     raw_rows = [
         [1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3],
         [1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1],
         [1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0],
         [2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9],
     ]
     vectors = [Vectors.dense(row) for row in raw_rows]
     estimator = IDF()
     fitted = estimator.fit(self.sc.parallelize(vectors, 2))
     weights = fitted.idf()
     self.assertEqual(len(weights), 11)

示例#15

0

显示文件

文件： ml_pipeline_otto.py 项目： thisiskofi/elephas

def load_data_rdd(csv_file, shuffle=True, train=True):
    """Load a CSV file as an RDD of (dense feature vector, label string) pairs.

    The file is read from ``data_path + csv_file``; rows whose first field is
    'id' are dropped.  When ``shuffle`` is true, ``shuffle_csv`` is called on
    the file first.  For training data the last column is the label (with the
    'Class_' prefix stripped) and columns 1..-2 are features; otherwise every
    column after the first is a feature and the label is the constant "1".
    """
    if shuffle:
        shuffle_csv(csv_file)

    def to_train_pair(fields):
        vector = Vectors.dense(np.asarray(fields[1:-1]).astype(np.float32))
        return (vector, str(fields[-1]).replace('Class_', ''))

    def to_test_pair(fields):
        return (Vectors.dense(np.asarray(fields[1:]).astype(np.float32)), "1")

    raw_lines = sc.textFile(data_path + csv_file)
    rows = raw_lines.filter(lambda x: x.split(',')[0] != 'id').map(lambda line: line.split(','))
    return rows.map(to_train_pair if train else to_test_pair)

示例#16

0

显示文件

文件： consume_profiles_spark_2.py 项目： Froskekongen/content-consumption

def parseEntry(xx):
    """Parse one tab-separated record into a Row of user, time and page-view fields.

    Fields are read by position: 0 a_virtual, 1 browser, 2 referrer,
    3 a_user_key, 4 birth year, 5 gender, 6 registration date ('NAN' when
    missing), 7 device, 8 date (dd-mm-YYYY), 9 hour, 10+ integer metrics.
    Age is computed relative to 2015 and is -1 when the birth year cannot be
    parsed; days_since_registration is -1 when no registration date is given.
    """
    no_date = datetime.datetime(datetime.MINYEAR, 1, 1, 1, 1)
    fields = xx.split('\t')
    a_virtual = fields[0]
    browser = fields[1]
    referrer = fields[2]
    a_user_key = fields[3]

    try:
        age = 2015 - int(fields[4])
    except Exception:
        age = -1
    gender = fields[5]

    raw_reg_date = fields[6]
    reg_date = no_date if raw_reg_date == 'NAN' else datetime.datetime.strptime(raw_reg_date, '%Y-%m-%d')
    device = fields[7]

    # The timestamp is the calendar date shifted by the hour-of-day field.
    date = datetime.datetime.strptime(fields[8], '%d-%m-%Y') + datetime.timedelta(hours=int(fields[9]))
    day = date.day
    hour = int(fields[9])
    weekday = date.weekday()

    days_since_registration = (date - reg_date).days if reg_date > no_date else -1

    # Metric fields may carry a ',0' fragment that is removed before int().
    metrics = [int(raw.replace(',0', '')) for raw in fields[10:]]
    pageviews = metrics[2]
    pageview_nothome = metrics[3]
    pageview_betalt = metrics[4]

    # Both sparse vectors have a single entry at the (weekday, hour) slot.
    slot = intervalIndDict[(weekday, hour)]
    timegroup_pvs = Vectors.sparse(maxInd, [(slot, pageviews)])
    timegroup_visit = Vectors.sparse(maxInd, [(slot, 1.)])

    return Row(browser=browser, a_user_key=a_user_key, age=age,
               day=day, hour=hour, date=date, weekday=weekday, pv=pageviews,
               pv_nh=pageview_nothome, pv_bet=pageview_betalt, referrer=referrer,
               device=device, gender=gender, days_since_registration=days_since_registration,
               reg_date=reg_date, timegroup_pvs=timegroup_pvs, timegroup_visit=timegroup_visit,
               a_virtual=a_virtual)

示例#17

0

显示文件

文件： tests.py 项目： A7mech/spark

 def test_copy(self):
     """copy() with extra params must affect only the copy, for both OneVsRest and its model."""
     rows = [(0.0, Vectors.dense(1.0, 0.8)),
             (1.0, Vectors.sparse(2, [], [])),
             (2.0, Vectors.dense(0.5, 0.5))]
     df = self.spark.createDataFrame(rows, ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     ovr1 = ovr.copy({lr.maxIter: 10})
     # The original keeps maxIter=5; only the copy picks up the override.
     for estimator, expected_iter in ((ovr, 5), (ovr1, 10)):
         self.assertEqual(estimator.getClassifier().getMaxIter(), expected_iter)
     model = ovr.fit(df)
     model1 = model.copy({model.predictionCol: "indexed"})
     self.assertEqual(model1.getPredictionCol(), "indexed")

示例#18

0

显示文件

文件： spark101.py 项目： ChienHsiung/python

def load_data_frame(csv_file, shuffle=True, train=True):
    """Load a CSV file into a DataFrame with columns features, category and label.

    Rows whose first field is 'label' are dropped.  The first column becomes
    both the integer ``label`` and the string ``category`` ('class_' + value);
    the remaining columns become a dense float32 feature vector.  When
    ``shuffle`` is true, ``shuffle_csv`` is called on the file first.
    """
    if shuffle:
        shuffle_csv(csv_file)

    def to_record(fields):
        features = Vectors.dense(np.asarray(fields[1:]).astype(np.float32))
        return (features, 'class_' + str(fields[0]), int(fields[0]))

    # This is an RDD, which will later be transformed to a data frame
    lines = sc.textFile('/home/minglu/dist_spark/data/' + csv_file)
    rows = lines.filter(lambda x: x.split(',')[0] != 'label').map(lambda line: line.split(','))
    # NOTE(review): the train and test paths apply the same mapping; test rows
    # need the same structure as train rows, so `train` does not alter the result.
    if train:
        records = rows.map(to_record)
    else:
        records = rows.map(to_record)
    return sqlcontext.createDataFrame(records, ['features', 'category','label'])

示例#19

0

显示文件

文件： modeling_utils.py 项目： USF-ML2/SKYNET-

def create_rows_for_rdd(x):
    """
    Build a Row from a (meta_data, values) pair.

    :param x: indexable pair; x[0] is the metadata, x[1] is a sequence whose
        last element is the label and whose other elements are the features
    :return: Row with a float ``label`` and dense ``features`` / ``meta_data`` vectors
    """
    values = list(x[1])
    # The label is stored as the trailing element of the value sequence.
    label = float(values.pop())
    return Row(label=label,
               features=Vectors.dense(values),
               meta_data=Vectors.dense(x[0]))

示例#20

0

显示文件

文件： _model.py 项目： BabelTower/spark-timeseries

 def remove_time_dependent_effects(self, ts):
     """
     Given a time series, apply inverse operations to recover the original
     series of underlying errors.

     Parameters
     ----------
     ts:
         Time series of observations with this model's characteristics as a Numpy array

     returns the time series with removed time-dependent effects as a Numpy array
     """
     # Zero-filled destination vector of the same length as the input.
     zeros = Vectors.dense(np.array([0] * len(ts)))
     java_ts = _py2java(self._ctx, Vectors.dense(ts))
     java_dest = _py2java(self._ctx, zeros)
     result = self._jmodel.removeTimeDependentEffects(java_ts, java_dest)
     return _java2py(self._ctx, result.toArray())

示例#21

0

显示文件

文件： _model.py 项目： BabelTower/spark-timeseries

 def add_time_dependent_effects(self, ts):
     """
     Given a time series, apply this model to it.

     Parameters
     ----------
     ts:
         Time series of i.i.d. observations as a Numpy array

     returns the time series with added time-dependent effects as a Numpy array.
     """
     # Zero-filled destination vector of the same length as the input.
     zeros = Vectors.dense([0] * len(ts))
     java_ts = _py2java(self._ctx, Vectors.dense(ts))
     java_dest = _py2java(self._ctx, zeros)
     result = self._jmodel.addTimeDependentEffects(java_ts, java_dest)
     return _java2py(self._ctx, result.toArray())

示例#22

0

显示文件

文件： converter_test.py 项目： Anhmike/spark-sklearn

 def ztest_toPandas(self):
     """Convert a three-row vector DataFrame with the converter and inspect the first cell."""
     rows = [(Vectors.dense([0.1, 0.2]),),
             (Vectors.sparse(2, {0:0.3, 1:0.4}),),
             (Vectors.sparse(2, {0:0.5, 1:0.6}),)]
     df = self.sql.createDataFrame(rows, ["features"])
     self.assertEqual(df.count(), 3)
     frame = self.converter.toPandas(df)
     self.assertEqual(len(frame), 3)
     self.assertTrue(isinstance(frame.features[0], csr_matrix),
                     "Expected pd.features[0] to be csr_matrix but found: %s" %
                     type(frame.features[0]))
     expected_shape = (3, 2)
     for axis, extent in enumerate(expected_shape):
         self.assertEqual(frame.features[0].shape[axis], extent)
     self.assertEqual(frame.features[0][0,0], 0.1)
     self.assertEqual(frame.features[0][0,1], 0.2)

示例#23

0

显示文件

文件： run_v2.py 项目： lijiahong/spark_clustering

def add_svec(sv1, sv2):
    """Return the element-wise sum of two sparse vectors of equal length.

    Performs a two-pointer merge over the ``indices`` / ``values`` of both
    inputs: shared indices have their values added, all other entries are
    copied through in index order.
    """
    assert len(sv1) == len(sv2), "dimension mismatch"
    left_idx, left_val = sv1.indices, sv1.values
    right_idx, right_val = sv2.indices, sv2.values
    merged_idx = []
    merged_val = []
    a = b = 0
    while a < len(left_idx) and b < len(right_idx):
        if left_idx[a] < right_idx[b]:
            merged_idx.append(left_idx[a])
            merged_val.append(left_val[a])
            a += 1
        elif left_idx[a] > right_idx[b]:
            merged_idx.append(right_idx[b])
            merged_val.append(right_val[b])
            b += 1
        else:
            # Same position in both vectors: add the two values.
            merged_idx.append(left_idx[a])
            merged_val.append(left_val[a] + right_val[b])
            a += 1
            b += 1
    # At most one side still has entries; copy them over unchanged.
    for k in range(a, len(left_idx)):
        merged_idx.append(left_idx[k])
        merged_val.append(left_val[k])
    for k in range(b, len(right_idx)):
        merged_idx.append(right_idx[k])
        merged_val.append(right_val[k])
    return Vectors.sparse(len(sv1), merged_idx, merged_val)

示例#24

0

显示文件

文件： learnITQ.py 项目： rohitgirdhar/BigITQ

def save_pca_parameters(pca_model, data_dim):
    """Write the PCA embedding of the identity matrix to PCA_OUT_PATH as text.

    There is no good way to read the parameters directly in Python, so each
    row of a ``data_dim`` x ``data_dim`` identity matrix is embedded with
    ``pca_embed`` and the resulting 'pca' column is saved with 6 decimals.
    """
    identity_rows = np.eye(data_dim).tolist()
    features = [(Vectors.dense(row),) for row in identity_rows]
    identity_df = sqlContext.createDataFrame(features, ('features',))
    params = pca_embed(identity_df, pca_model)
    embedded = params.select('pca').rdd.map(lambda r: r[0]).collect()
    np.savetxt(PCA_OUT_PATH, np.matrix(embedded), fmt='%.6f')

示例#25

0

显示文件

文件： ARIMA.py 项目： pegli/spark-timeseries

 def forecast(self, ts, nfuture):
     """
     Provide fitted values for the time series ts as 1-step ahead forecasts, based on the
     current model parameters, followed by `nfuture` periods of forecast. AR terms prior to
     the start of the series are assumed equal to the model's intercept term (or 0.0, if fit
     without an intercept term). MA terms prior to the start are assumed to be 0.0. If
     there is differencing, the first d terms come from the original series.

     Parameters
     ----------
     ts:
         Time series to use as gold-standard. Each value (i) in the returned series
         is a 1-step ahead forecast of ts(i). The difference ts(i) - estimate(i) is
         the error at time i, which is used for the moving average terms. Numpy array.
     nfuture:
         Periods in the future to forecast (beyond length of ts)

     Returns a series consisting of fitted 1-step ahead forecasts for historicals and then
     `nfuture` periods of forecasts. Note that in the future values error terms become
     zero and prior predictions are used for any AR terms.

     """
     java_series = _py2java(self._ctx, Vectors.dense(ts))
     java_forecast = self._jmodel.forecast(java_series, nfuture)
     return _java2py(self._ctx, java_forecast)

示例#26

0

显示文件

文件： adapter.py 项目： ZhangAustin/elephas

def to_vector(np_array):
    ''' Wrap a one-dimensional numpy array in an MLlib dense vector.

    Raises Exception when the array does not have exactly one dimension.
    '''
    if len(np_array.shape) != 1:
        raise Exception("""An MLLib Vector can only be created
                        from a one-dimensional numpy array""")
    return Vectors.dense(np_array)

示例#27

0

显示文件

文件： __init__.py 项目： gadamc/simple-data-pipe-connector-flightstats

def buildLabeledPoint(s, classification):
    """Build a LabeledPoint from the '<attr>_1' and '<attr>_2' attributes of ``s``.

    For every name in the module-level ``attributes`` the '_1' variants are
    collected first, followed by all '_2' variants, in that order.
    """
    features = [getattr(s, attr + suffix)
                for suffix in ('_1', '_2')
                for attr in attributes]
    return LabeledPoint(classification, Vectors.dense(features))

示例#28

0

显示文件

文件： Qn8.py 项目： shaileshr/SentimentAnalysis

def createSparseVector(histogram):
	"""Turn (index, count) pairs into a size-2000 sparse vector, ordered by ``getKey``."""
	ordered = sorted(histogram, key=getKey)
	indexList = [histogramIndex for histogramIndex, _ in ordered]
	countList = [count for _, count in ordered]
	return Vectors.sparse(2000, indexList, countList)

示例#29

0

显示文件

文件： GMMclustering.py 项目： FlytxtRnD/GMM

    def scoreOnePoint(self, x):

        """
        Compute the log likelihood of 'x' being generated under the current model.
        Also returns the probability that 'x' is generated by each component of the
        mixture, plus two probability-weighted statistics of 'x'.

        Parameters
        ----------
        x : a single data point.
            When self.isSparse == 1 it is accessed through .toArray(), .size,
            .indices and .values; otherwise it is indexed as a 1-D array.
            NOTE(review): the original doc gave the shape as (1, n_dim) — confirm.

        Returns
        -------
        log_likelihood_x : Log likelihood of 'x'
        prob_x : Responsibility of each cluster for the data point 'x'
        temp_wt : product of prob_x (as a column) with x (as a row)
        temp_avg : product of prob_x (as a column) with the element-wise square of x (as a row)

        """
        # Per-component log density plus log mixture weight.
        lpr = (self.log_multivariate_normal_density_diag_Nd(x) + np.log(self.Weights))
        log_likelihood_x = logsumexp(lpr)
        # Normalise in log space to obtain the per-component responsibilities.
        prob_x = np.exp(lpr-log_likelihood_x)

        if self.isSparse == 1:
            temp_wt = np.dot(prob_x[:, np.newaxis], x.toArray()[np.newaxis, :])
            # Square only the stored values, keeping the same sparsity pattern.
            sqVec = Vectors.sparse(x.size, x.indices, x.values**2)
            temp_avg = np.dot(prob_x.T[:, np.newaxis], sqVec.toArray()[np.newaxis, :])

        else:
            temp_wt = np.dot(prob_x.T[:, np.newaxis],  x[np.newaxis, :])
            temp_avg = np.dot(prob_x.T[:, np.newaxis], (x*x)[np.newaxis, :])

        return log_likelihood_x, prob_x, temp_wt, temp_avg

示例#30

0

显示文件

文件： tf_idf.py 项目： lijiahong/spark_clustering

def load_cut_to_rdd(input_file, result_file, master="mesos://219.224.135.91:5050"):
    """Compute TF-IDF document vectors with Spark and dump them as dense text rows.

    ``input_file`` is parsed line by line with ``parseKV`` into
    ((doc id, term), tf) records.  The result file starts with a
    "<num_doc> <num_term>" header line, followed by one tab-separated dense
    vector per document.

    :param input_file: path of the text file read through ``sc.textFile``
    :param result_file: path of the local file the vectors are written to
    :param master: Spark master URL; defaults to the previously hard-coded cluster
    :return: None
    """
    sc = SparkContext(appName='PythonKMeans', master=master)
    try:
        lines = sc.textFile(input_file)
        data = lines.map(parseKV).cache()

        # Records are ((tid, term), tf).  Indexing replaces tuple-parameter
        # lambdas, which are a syntax error on Python 3.
        doc_term_tf = data.reduceByKey(add).cache()

        num_doc = doc_term_tf.map(lambda rec: rec[0][0]).distinct().count()
        terms_list = doc_term_tf.map(lambda rec: rec[0][1]).distinct().collect()
        num_term = len(terms_list)
        # Terms are distinct, so a dict gives the same positions as
        # terms_list.index(term) without a linear scan per record.
        term_index = dict((term, pos) for pos, term in enumerate(terms_list))

        term_idf = doc_term_tf.map(
                lambda rec: (rec[0][1], 1.0)
                ).reduceByKey(add).mapValues(lambda idf: math.log(float(num_doc) / (idf+1)))
        tfidf_join = doc_term_tf.map(
                lambda rec: (rec[0][1], (rec[0][0], rec[1]))).join(term_idf)
        # Joined records are (term, ((tid, tf), idf)).
        tfidf = tfidf_join.map(
                lambda rec: (rec[1][0][0], (term_index[rec[0]], rec[1][0][1] * rec[1][1])))

        doc_vec = tfidf.groupByKey().mapValues(lambda feature : Vectors.sparse(num_term, feature).toArray()).cache()

        # The context manager closes the file even if a write fails.
        with open(result_file, 'w') as f:
            f.write('%s %s\r\n'%(num_doc, num_term))
            for (tid, feature) in doc_vec.collect():
                for num in feature:
                    f.write(str(num)+"\t")
                f.write("\n")
    finally:
        # Always release the Spark context, including on failure.
        sc.stop()

    return

示例#31

0

显示文件

    # Evaluate the decision-tree predictions with RMSE.
    decisionTree_model_evaluator = RegressionEvaluator(
        labelCol="MPG", predictionCol="prediction", metricName="rmse")
    rmse = decisionTree_model_evaluator.evaluate(
        decisionTree_model_predictions)
    print(
        "Root Mean Squared Error (RMSE) for Decision Tree on test data = %g" %
        rmse)
    # Second evaluator for R^2.  The original chained assignment also bound a
    # misspelled name (`ecisionTree_model_evaluator`); only `r2_dt` is needed.
    r2_dt = RegressionEvaluator(
        labelCol="MPG", predictionCol="prediction", metricName="r2")
    print("R Squared (R2) for Decision Tree on test data = %g" %
          r2_dt.evaluate(decisionTree_model_predictions))

    ############################---RANDOM FOREST REGRESSION---##################################

    # Each row becomes a LabeledPoint: last column is the label, the rest are features.
    train_rdd_rf = train_df.rdd.map(
        lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))
    test_rdd_rf = test_df.rdd.map(
        lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

    RandomForest_model = RandomForest.trainRegressor(
        train_rdd_rf,
        categoricalFeaturesInfo={},
        numTrees=50,
        featureSubsetStrategy="auto",
        maxDepth=10,
        maxBins=100)

    # Predict on the test features and pair each true label with its prediction.
    predictions = RandomForest_model.predict(
        test_rdd_rf.map(lambda x: x.features))
    labelsAndPredictions = test_rdd_rf.map(lambda lp: lp.label).zip(
        predictions)

示例#32

0

显示文件

def parse(lp):
    """Parse a string holding '(label)' and '[v1,v2,...]' into a LabeledPoint.

    The label is the text between the first '(' and the first ')'; the
    features are the comma-separated items between the first '[' and ']'.
    """
    label_text = lp[lp.find('(') + 1:lp.find(')')]
    label = float(label_text)
    features_text = lp[lp.find('[') + 1:lp.find(']')]
    vec = Vectors.dense(features_text.split(','))
    return LabeledPoint(label, vec)

示例#33

0

显示文件

import sys
# $example off$

if __name__ == "__main__":
    # Script entry point: scale the features of a LibSVM file (path in argv[1]) two ways.
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, sys.argv[1])
    label = data.map(lambda point: point.label)
    features = data.map(lambda point: point.features)

    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))

    # data2 will be unit variance and zero mean.
    # The features are converted to dense vectors before applying scaler2.
    dense_features = features.map(lambda x: Vectors.dense(x.toArray()))
    data2 = label.zip(scaler2.transform(dense_features))
    # $example off$

    for title, scaled in (("data1:", data1), ("data2:", data2)):
        print(title)
        for each in scaled.collect():
            print(each)

    sc.stop()

示例#34

0

显示文件

# Persist the processed training data both as CSV (with a header row) and as Parquet.
df_train.write.options(
    header="true").csv("hdfs://node1:9000/user/root/exp4/procd_train_real.csv")
df_train.write.parquet(
    "hdfs://node1:9000/user/root/exp4/procd_train_real.parquet")

# %%
# Fill in missing values.
# The first strategy is to fill all null values of the last 8 features with 0.
df_train_filled = df_train.fillna(0)
df_train_filled.show()

# %%
# Convert the data into a suitable format.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
# First convert to an RDD.
df_train_rdd = df_train_filled.rdd
# Change it to the (label, features) format: column 2 is the label,
# columns 3 onwards are the features.
df_train_rdd = df_train_rdd.map(
    lambda line: LabeledPoint(line[2], Vectors.dense(line[3:])))

# %%
# Save in LibSVM file format so that it is convenient to use for training later.
from pyspark.mllib.util import MLUtils
MLUtils.saveAsLibSVMFile(df_train_rdd,
                         "hdfs://node1:9000/user/root/exp4/procd_train_real")

# %%
# Don't forget to stop the session.
spark.stop()

示例#35

0

显示文件

from test_helper import Test
# The first three labels of the zero-indexed DataFrame are expected to be 0.
Test.assertEquals(irisDFZeroIndex.select('label').map(lambda r: r[0]).take(3), [0, 0, 0],
                  'incorrect value for irisDFZeroIndex')

# COMMAND ----------

# MAGIC %md
# MAGIC You'll also notice that we have four values for features and that those values are stored as a `SparseVector`.  We'll reduce those down to two values (for visualization purposes) and convert them to a `DenseVector`.  To do that we'll need to create a `udf` and apply it to our dataset.  Here's a `udf` reference for [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.udf) and for [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.UserDefinedFunction).
# MAGIC  
# MAGIC Note that you can call the `toArray` method on a `SparseVector` to obtain an array, and you can convert an array into a `DenseVector` using the `Vectors.dense` method.

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import udf
# Note that VectorUDT and MatrixUDT are found in linalg while other types are in sql.types
# VectorUDT should be the return type of the udf
from pyspark.mllib.linalg import Vectors, VectorUDT

# Take the first two values from a SparseVector and convert them to a DenseVector
firstTwoFeatures = udf(lambda sv: Vectors.dense(sv.toArray()[:2]), VectorUDT())

# Replace the 'features' column with its two-value version, keep 'label', and cache the result.
irisTwoFeatures = irisDFZeroIndex.select(firstTwoFeatures('features').alias('features'), 'label').cache()
display(irisTwoFeatures)

# COMMAND ----------

# TEST
# The first row must render as a two-element DenseVector with label 0.0.
Test.assertEquals(str(irisTwoFeatures.first()), 'Row(features=DenseVector([-0.5556, 0.25]), label=0.0)',
                  'incorrect definition of firstTwoFeatures')

示例#36

0

显示文件

# See the xyz coordinates of each atom in the file
# (bare expression: only displays a value in an interactive session)
t.xyz

# Find the current shape of the data
t.xyz.shape

# Get the first 1000 frames of xyz data
t_1k = t.xyz[0:1000]

# Collect one single-column row per atom so PCA can be run using ML
data = []
# try to find a way to optimize the vectorization
from pyspark.mllib.linalg import Vectors
for frame in t_1k:
  for atom in frame:
    # Each atom's coordinates become a dense vector wrapped in a 1-tuple (one DataFrame column).
    data.append((Vectors.dense(atom),))

# Next, apply PCA with the following:
from pyspark.ml.feature import PCA
df = sqlContext.createDataFrame(data, ["features"])
pca = PCA(k=2, inputCol="features", outputCol="pca_features")
model = pca.fit(df)
# Inspect the projected features of the first row.
model.transform(df).collect()[0].pca_features


# NOTE(review): this rebinds `data` to a two-row toy example after the PCA run above.
data = [(Vectors.dense([1.0, 0.0]),), (Vectors.dense([0.0, -1.0]),)]


### NEW PCA MODEL TO GET COMPONENTS AND EIGENVALUES
import numpy as np

示例#37

0

显示文件

    lambda x: x).distinct().collect()
# Map every distinct feature to a 1-based position.
featur_index = {v: index for index, v in enumerate(featurs, 1)}
# The broadcast is created and its value read back immediately on the driver.
featur_index_value = sc.broadcast(featur_index).value

# Same 1-based mapping for the entries of chi_index.
chi_index_map = {v: index for index, v in enumerate(chi_index, 1)}
chi_index_value = sc.broadcast(chi_index_map).value

# Write "<label> <indexed features>" and "<tel> <indexed features>" lines as text files.
rdd.map(lambda x: x.label + ' ' + get_feature_index(
    x.feature, featur_index_value)).saveAsTextFile('/user/zlj/tmp/cat3_libsvm')
rdd.map(lambda x: x.tel + ' ' + get_feature_index(
    x.feature, featur_index_value)).saveAsTextFile(
        '/user/zlj/tmp/cat3_libsvm_tel')

# Build LabeledPoints with 40000-dimensional sparse features from the same lines.
lp=rdd.map(lambda x:x.label+' '+get_feature_index(x.feature,featur_index_value))\
    .map(lambda x:MLUtils._parse_libsvm_line(x))\
    .map(lambda x:LabeledPoint(x[0],Vectors.sparse(40000, x[1], x[2])))

model = ChiSqSelector(100).fit(lp)

# NOTE(review): the results of the next two expressions are not assigned or used.
lp.map(lambda x: (x[0], model.transform(x[1])))
model.transform(lp)

# NOTE(review): the `[0]` on the continuation line indexes the list returned by
# take(30), so a single line is passed to parallelize — confirm this is intended.
sc.parallelize(
    sc.textFile('/user/zlj/tmp/cat3_libsvm/part-00092').take(30)
    [0]).saveAsTextFile('/user/zlj/tmp/test1')

# Element [1] of the parsed libsvm tuple for the 4th of the first 20 records of t1.
values = MLUtils._parse_libsvm_line(t1.take(20)[3])[1]


def check(value):
    size = len(value)

示例#38

0

显示文件

文件： GMMclustering.py 项目： jianqiaoc/GMM

    def fit(self, data, n_components, n_iter=100, ct=1e-3):
        """
        Estimate model parameters with the expectation-maximization
        algorithm.

        Parameters
        ----------
        data : RDD of data points.  Points exposing an ``indices``
            attribute are treated as sparse vectors, anything else as dense.
        n_components : Number of mixture components.
        n_iter : Maximum number of EM iterations.  Defaults to 100.
        ct : Convergence threshold on the change of ``self.s1`` between two
            consecutive iterations.  Defaults to 1e-3.

        Attributes set on the instance
        ------------------------------
        min_covar : Constant (1e-3) added to every initial per-dimension
            variance.
        isSparse : 1 when the data points are sparse vectors, 0 otherwise.
        converged : True once the convergence criterion was met, False
            otherwise.
        Weights : array of length n_components, initialised to
            1 / n_components.
        Means : array built from the cluster centers of an MLlib KMeans run
            with n_components clusters.
        Covars : array of shape (n_components, n_dim) holding the diagonal
            of each component's covariance matrix (only diagonal
            covariances are modelled).

        Returns
        -------
        self

        Raises
        ------
        ValueError : if ``data`` is empty or holds fewer points than
            ``n_components``.
        """
        sc = data.context
        self.converged = False
        self.min_covar = 1e-3

        #  observation statistics
        self.s0 = 0
        self.s1 = 0

        # Validate the size first: data.first() on an empty RDD would raise
        # before the explicit, more helpful message below could be reached.
        n_points = data.count()
        if n_points == 0:
            raise ValueError('Dataset cannot be empty')
        if n_points < n_components:
            raise ValueError(
                'Not possible to make (%s) components from (%s) datapoints' %
                (n_components, n_points))

        # Fetch the first point once; it gives both the dimensionality and
        # the dense/sparse flavour of the data.
        first_point = data.first()
        n_dim = first_point.size

        # Initialize Covars (diagonal covariance matrix) with the
        # per-dimension variance of the whole dataset plus min_covar.
        if hasattr(first_point, 'indices'):
            self.isSparse = 1

            def convert_to_kvPair(eachV):
                # One (dimension, (value, value ** 2)) pair per stored entry.
                return [(index, (value, value * value))
                        for index, value in zip(eachV.indices, eachV.values)]

            def computeVariance(x):
                # x is (dimension, (sum of values, sum of squared values));
                # variance = E[v^2] - E[v]^2 over all n_points.
                mean = x[1][0] / n_points
                sumSq = x[1][1] / n_points
                return x[0], sumSq - mean * mean

            kvPair = data.flatMap(convert_to_kvPair)
            res = kvPair.reduceByKey(np.add).map(computeVariance)
            cov = (Vectors.sparse(n_dim, res.collectAsMap()).toArray() +
                   self.min_covar)
            self.Covars = np.tile(cov, (n_components, 1))

        else:
            self.isSparse = 0
            cov = []
            for i in range(n_dim):
                # variance() is an action, so the lambda is evaluated while
                # ``i`` still has the value of this iteration.
                cov.append(
                    data.map(lambda m: m[i]).variance() + self.min_covar)
            self.Covars = np.tile(cov, (n_components, 1))

        # Initialize Means using MLlib KMeans
        self.Means = np.array(KMeans().train(data,
                                             n_components).clusterCenters)
        # Initialize Weights with the value 1/n_components for each component
        self.Weights = np.tile(1.0 / n_components, n_components)

        #  EM algorithm
        # loop until number of iterations or convergence criteria is satisfied
        for i in range(n_iter):

            logging.info("GMM running iteration %s ", i)
            # broadcasting means, covars and weights
            self.meansBc = sc.broadcast(self.Means)
            self.covarBc = sc.broadcast(self.Covars)
            self.weightBc = sc.broadcast(self.Weights)
            # Expectation Step
            EstepOut = data.map(self.scoreOnePoint)
            # Maximization step: element-wise sum of the 4-tuples produced
            # by the E-step.  Written with indexing instead of a
            # tuple-parameter lambda, which Python 3 no longer accepts.
            MstepIn = EstepOut.reduce(
                lambda a, b: (a[0] + b[0], a[1] + b[1],
                              a[2] + b[2], a[3] + b[3]))
            # Keep the previous statistic so the change can be measured after
            # mStep (presumably mStep updates self.s1; verify in mStep).
            self.s0 = self.s1
            self.mStep(MstepIn[0], MstepIn[1], MstepIn[2], MstepIn[3])

            #  Check for convergence.
            if i > 0 and abs(self.s1 - self.s0) < ct:
                self.converged = True
                logging.info("Converged at iteration %s", i)
                break

        return self

示例#39

0

显示文件

def load_cut_to_rdd(input_file, result_file, cluster_num=CLUSTER_NUM, clu_iter=CLUSTERING_ITER,\
        ini_iter=INITIAL_ITER, rb_iter=RB_ITER, con_dist=convergeDist, filter_scale=FILTER_SCALE):
    """Cluster documents with repeated bisection over TF-IDF vectors.

    NOTE: this function is Python 2 code (print statements and
    tuple-parameter lambdas).

    Steps, as performed below:

    1. Start a SparkContext, read ``input_file`` and build per-document
       term frequencies keyed by ``(tid, term)``.
    2. Count in how many documents each term occurs, keep only terms whose
       count lies inside a window derived from ``filter_scale``, and turn
       the counts into IDF weights.
    3. Build one TF-IDF row per document (``doc_vec``) and the global
       center of all documents.
    4. Run the initial clustering ``ini_iter`` times and keep the best
       result; write a summary header and the term list to ``result_file``.
    5. Repeatedly pick the cluster with the largest recorded improvement
       and split it, trying ``rb_iter`` clusterings per sub-cluster.
    6. Write one ``results/cluster_<n>`` file per remaining cluster and
       stop the SparkContext.

    Parameters
    ----------
    input_file : path passed to ``sc.textFile``.
    result_file : path of the summary file written after the initial
        clustering.
    cluster_num : upper bound of the repeated-bisection loop
        (``range(2, cluster_num + 1)``).
    clu_iter : forwarded to ``clustering`` (presumably its iteration
        limit; verify against ``clustering``).
    ini_iter : number of initial clustering attempts.
    rb_iter : number of clustering attempts per sub-cluster.
    con_dist : forwarded to ``clustering`` (presumably its convergence
        distance; verify against ``clustering``).
    filter_scale : controls the document-frequency window used to filter
        terms.

    Returns
    -------
    None.  Results are written to ``result_file`` and to files under
    ``results/``.

    Helpers such as ``parseKV``, ``clustering``, ``cluster_evaluation``,
    ``external_evaluation``, ``cal_cluster_variance``, ``cosine_dist``,
    ``closestPoint``, ``vector_length``, ``now`` and the constant ``K``
    are defined outside this function.
    """
    sc = SparkContext(appName='PythonKMeans',
                      master="mesos://219.224.135.91:5050")
    lines = sc.textFile(input_file)
    data = lines.map(parseKV).cache()

    # Sum the values per key; the lambdas below unpack each record as
    # ((tid, term), tf).
    doc_term_tf = data.reduceByKey(add).cache()

    # Number of distinct document ids.
    num_doc = doc_term_tf.map(lambda ((tid, term), tf): tid).distinct().count()

    # Each (tid, term) key contributes 1.0, so the sum per term is the number
    # of documents containing that term.
    initial_term_idf = doc_term_tf.map(lambda ((tid, term), tf):
                                       (term, 1.0)).reduceByKey(add)
    # filter
    initial_num_term = initial_term_idf.count()
    print 'initial_num_term', initial_num_term
    idf_sum = initial_term_idf.values().sum()
    print 'idf_sum', idf_sum

    # Keep terms whose document count lies strictly between idf_average and
    # idf_average * (filter_scale - 1), then convert the count to
    # log(num_doc / (count + 1)).
    idf_average = idf_sum / (initial_num_term * filter_scale)
    term_idf = initial_term_idf.filter(
        lambda (term, idf): idf_average < idf <
        (idf_average * (filter_scale - 1))).mapValues(
            lambda idf: math.log(float(num_doc) / (idf + 1)))
    terms_list = term_idf.keys().collect()
    num_term = len(terms_list)
    print 'num_term', num_term

    # Join term frequencies with the surviving terms' IDF and re-key by
    # document: (tid, (term index, tf * idf)).
    # NOTE(review): terms_list.index(term) is a linear scan per record;
    # a term -> index dict would avoid that.
    tfidf_join = doc_term_tf.map(lambda ((tid, term), tf):
                                 (term, (tid, tf))).join(term_idf)
    tfidf = tfidf_join.map(lambda (term, ((tid, tf), idf)):
                           (tid, (terms_list.index(term), tf * idf)))

    # One csr_matrix row of length num_term per document.
    doc_vec = tfidf.groupByKey().mapValues(lambda feature: csr_matrix(
        Vectors.sparse(num_term, feature).toArray())).cache()
    # Sum of all document vectors, each divided by num_doc.
    global_center = doc_vec.mapValues(lambda x: x / num_doc).values().reduce(
        add)
    g_length = vector_length(global_center)

    # initial 2-way clustering
    maximum_total_variance = 0
    best_kPoints = []
    print 'initial', now()
    for i in range(ini_iter):
        kPoints, tempDist, iter_count = clustering(doc_vec, K, con_dist,
                                                   clu_iter)
        # evaluation
        cluster_variance, total_variance = cluster_evaluation(doc_vec, kPoints)
        ex_value = external_evaluation(kPoints, global_center, g_length)
        obj_value = total_variance[0] / ex_value

        # choose the best initial cluster
        if obj_value > maximum_total_variance:
            maximum_total_variance = obj_value
            best_kPoints = kPoints
    # global_distance = sum(cosine_dist(best_kPoints[x][1], global_center, best_kPoints[x][2], g_length) for x in range(len(best_kPoints)))

    # Write the summary header (iter_count of the last attempt, document
    # count, term count) followed by the tab-separated term list.
    # NOTE(review): iter_count is unbound here if ini_iter is 0.
    f = open(result_file, 'w')
    f.write(
        str(iter_count) + "\t" + str(num_doc) + "\t" + str(num_term) + "\n")
    for index in range(len(terms_list)):
        f.write(terms_list[index].encode('utf-8') + '\t')
    # The string literal below is disabled output code kept as a bare string.
    """
    for (term, ((tid,tf), idf)) in tfidf_join.collect():
        f.write(term.encode('utf-8')+'\t'+str(tid)+'\t'+str(tf)+'\t'+str(idf)+'\n')
    print >> f, "%0.9f" % tempDist
    print >> f, "total_variance", total_variance[0], total_variance[1]
    print >> f, "global_dist", global_distance
    f.write("center:"+"\t")
    for dim in global_center:
        f.write(str(dim)+"\t")
    f.write("\n")
    for i in range(len(best_kPoints)):
        f.write(str(i))
        for unit in best_kPoints[i][1]:
            f.write("\t")
            f.write(str(unit))
        f.write("\n")
    for (index, (dist, num)) in cluster_variance.collect():
        f.write(str(index))
        f.write("\t")
        f.write(str(dist))
        f.write("\t")
        f.write(str(num))
        f.write("\n")
    """
    f.close()
    # Repeated bisection: at each step choose the cluster to split.

    # Both dicts are keyed by the improvement value recorded for a cluster;
    # the whole corpus starts under key 0.
    # NOTE(review): two clusters with the same improvement value would share
    # a key and the later one would overwrite the earlier one.
    updated_dict = {}
    updated_points_dict = {}
    total_delta_variance = 0
    updated_dict[total_delta_variance] = doc_vec
    updated_points_dict[total_delta_variance] = best_kPoints

    print 'repeated', now()
    for j in range(2, cluster_num + 1):
        if not (total_delta_variance in updated_dict):
            print "no cluster to divide"
            break

        print 'cluster to divide', total_delta_variance, updated_dict[
            total_delta_variance]
        # Take the selected cluster (and its centers) out of the candidates.
        best_cluster = updated_dict[total_delta_variance]
        global_best_kPoints = updated_points_dict[total_delta_variance]
        del updated_dict[total_delta_variance]
        del updated_points_dict[total_delta_variance]
        # Tag every document with the index returned by closestPoint.
        closest = best_cluster.map(lambda (tid, feature): (closestPoint(
            feature, global_best_kPoints), (tid, feature))).cache()
        print 'total_count', closest.count()

        # Reset to -inf, then take the largest key among the remaining
        # candidates as the starting point for the next selection.
        total_delta_variance = float("-inf")  # reset before recomputing
        for key in updated_dict:
            if key > total_delta_variance:
                total_delta_variance = key

        for i in range(K):
            # Documents assigned to sub-cluster i; count() is called right
            # away, while ``i`` still has this iteration's value.
            single_cluster = closest.filter(
                lambda (index, (tid, feature)): index == i).values().cache()
            print 'count', i, single_cluster.count()

            maximum_total_variance = 0
            best_kPoints = []
            in_value = cal_cluster_variance(single_cluster)
            ex_value = cosine_dist(global_best_kPoints[i][1], global_center,
                                   global_best_kPoints[i][2], g_length)
            initial_distance = in_value / ex_value
            # NOTE(review): this loop reuses the name ``j`` of the enclosing
            # loop; the outer range iteration is unaffected, but the value of
            # ``j`` after this point is the inner one.
            for j in range(rb_iter):
                # clustering
                kPoints, tempDist, iter_count = clustering(
                    single_cluster, K, con_dist, clu_iter)
                # evaluation
                cluster_variance, total_variance = cluster_evaluation(
                    single_cluster, kPoints)
                ex_value = external_evaluation(kPoints, global_center,
                                               g_length)
                obj_value = total_variance[0] / ex_value

                if obj_value > maximum_total_variance:
                    maximum_total_variance = obj_value
                    best_kPoints = kPoints

            # Record how much splitting this sub-cluster would gain.
            improvement = maximum_total_variance - initial_distance
            updated_dict[improvement] = single_cluster  # update dict
            updated_points_dict[improvement] = best_kPoints
            print 'improvement', improvement, maximum_total_variance, initial_distance

            if improvement > total_delta_variance:
                total_delta_variance = improvement
                print 'length', cluster_variance.count()

    # Dump every remaining cluster to its own file under results/.
    count = 0
    for key in updated_dict:
        count += 1
        print 'key', key
        per_cluster = updated_dict[key]

        total_similarity = cal_cluster_variance(per_cluster)
        f = open('results/cluster_' + str(count), 'w')
        print >> f, key, total_similarity

        # Sum of the cluster's document vectors, written as
        # (index,value) pairs for the non-zero entries.
        results_list = per_cluster.values().reduce(add).toarray()
        for row in results_list:
            for index in range(len(row)):
                value = row[index]
                if value != 0:
                    f.write('(' + str(index) + ',' + str(value) + ')\t')
        f.write('\n')
        # One document id per line; the per-document vector dump is disabled
        # (kept as a bare string literal).
        for (tid, feature) in per_cluster.collect():
            f.write(tid)
            """
            for row in feature.toarray():
                for unit in range(len(row)):
                    f.write('\t')
                    f.write(str(row[unit]))
            """
            f.write('\n')
        f.close()

    sc.stop()
    return

示例#40

0

显示文件

文件： polynomial_expansion_example.py 项目： kmrhadoop/GitHub-Spark

from __future__ import print_function

# $example on$
from pyspark.ml.feature import PolynomialExpansion
# DataFrame-based (pyspark.ml) transformers operate on pyspark.ml.linalg
# vectors; the RDD-based pyspark.mllib.linalg vectors are a different type.
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Expand each 2-dimensional feature vector into its degree-2 polynomial
    # terms and print the expanded vectors.
    spark = SparkSession\
        .builder\
        .appName("PolynomialExpansionExample")\
        .getOrCreate()

    # $example on$
    df = spark\
        .createDataFrame([(Vectors.dense([-2.0, 2.3]),),
                          (Vectors.dense([0.0, 0.0]),),
                          (Vectors.dense([0.6, -1.1]),)],
                         ["features"])
    px = PolynomialExpansion(degree=2,
                             inputCol="features",
                             outputCol="polyFeatures")
    polyDF = px.transform(df)
    for expanded in polyDF.select("polyFeatures").take(3):
        print(expanded)
    # $example off$

    spark.stop()

示例#41

0

显示文件

        for i in range(1, k):
            if 'f:'+str(i) in line:
                indexList.append(i)
                valList.append(line['f:'+str(i)])
        label = int(line['l:'+str(col)])
        if label == -1:
            label = 0
        features.append((Vectors.sparse(k, indexList, valList),label))
    features = sc.parallelize(features)
    #sclines = sc.parallelize(lines)
    #features = sclines.map(featuresToSparseVecFromLine)
    featureDataFrame = spark.createDataFrame(features, ["features", "label"])
    pca = PCA(k=100, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(featureDataFrame)
    #pcaresult = model.transform(featureDataFrame).select("pcaFeatures").collect()
    #lp = []
    #c = 0
    #for com in pcaresult:
    #    lp.append(LabeledPoint(lines[c]['l:' + str(col)], mllibVectors.fromML(com.pcaFeatures)))
    #    c += 1
    #lp = sc.parallelize(lp)
    pcaresult = model.transform(featureDataFrame).rdd
    lp = pcaresult.map(lambda r: LabeledPoint(r.label, mllibVectors.fromML(r.pcaFeatures)))
    model = SVMWithSGD.train(lp)
    model.save(sc, "svm/SVM" + str(col))
    labelsAndPreds = lp.map(lambda p: (p.label, model.predict(p.features)))
    err = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    print("err at node " + str(col) + " = " + str(err))

sc.stop()

示例#42

0

显示文件

        wordsFiltered.append(w)

txt = " ".join(wordsFiltered).lower()

data = sc.parallelize([
    txt
]).zipWithIndex().map(lambda val: Row(idd=val[1], words=val[0].split(" ")))

docDF = spark.createDataFrame(data)
Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)

corpus = result.select(
    "idd",
    "vectors").rdd.map(lambda val: [val[0], Vectors.fromML(val[1])]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3, maxIterations=700, optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 5  # number of words per topic
topicIndices = sc.parallelize(
    ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))


def topic_render(topic):  # specify vector id of words to actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):

示例#43

0

显示文件

sc = SparkContext(conf=conf)

#
row_data = sc.textFile(
    "/user-program/python/MachineLearningSpark/Data/ml-100k/u.data")
row_ratings = row_data.map(lambda line: line.split('\t')).map(
    lambda r: Rating(int(r[0]), int(r[1]), float(r[2])))
print(row_ratings.first())

#
row_ratings.cache()

#
als_model = ALS.train(row_ratings, 50, 10, 0.1)
movie_factors = als_model.productFeatures().map(lambda (id, factor):
                                                (id, Vectors.dense(factor)))
movie_vectors = movie_factors.map(lambda (id, vector): vector)
#print(movie_vectors.first())
user_factors = als_model.userFeatures().map(lambda (id, factor):
                                            (id, Vectors.dense(factor)))
user_vectors = user_factors.map((lambda (id, vector): vector))
#print(user_vectors.first())

# train
movie_cluster_model = KMeans().train(movie_vectors,
                                     k=5,
                                     maxIterations=10,
                                     runs=3)
print("movie cluster model kmeans :")
print(movie_cluster_model)
user_cluster_model = KMeans().train(user_vectors,

示例#44

0

显示文件

 def parseTrainingData(line):
     """Build a dense vector from the first two comma-separated fields of ``line``."""
     fields = line.split(",")
     first_value = float(fields[0])
     second_value = float(fields[1])
     return Vectors.dense([first_value, second_value])

示例#45

0

显示文件

文件： regression.py 项目： ANUJA1/spark-tutorial-cn

 def __str__(self):
     """Return ``(label,features)`` with the features rendered by ``Vectors.stringify``."""
     label_text = str(self.label)
     features_text = Vectors.stringify(self.features)
     return "(" + label_text + "," + features_text + ")"

示例#46

0

显示文件

    .appName("KMeans") \
    .config("spark.some.config.option", "Angadpreet-KMeans") \
    .getOrCreate()
today = dt.datetime.today()
spark_df = sc.parallelize(
    spark.read.json("Data/yelp_academic_dataset_user.json").select(
        "review_count", "average_stars", "yelping_since").rdd.map(lambda x: (x[
            0], x[1], (today - par.parse(x[2])).days)).collect()[:1200])
scaler = MinMaxScaler(inputCol="_1",\
         outputCol="scaled_1")
# Getting the input data
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(
    lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map(
    lambda x: Vectors.dense(x))

# Initialize GMM
start = timer()
gmm = GaussianMixture.train(vector_df, k=4, maxIterations=20, seed=2018)
end = timer()
print(end - start)
df = pandas.DataFrame({'features': [], 'cluster': []})
i = 0
for v in vector_df.collect():
    df.loc[i] = [[float(v[0]), float(v[1]), float(v[2])], int(gmm.predict(v))]
    i += 1

print df

err = spark.createDataFrame(df).rdd.map(lambda x: (x[0], int(x[1]))).collect()

示例#47

0

显示文件

## Notice the differences between the uncorrelated(PCA uniform, PCA gaussian2)
## and source plots(Uniform, Gaussian). In case of Gaussian they look alike while 
## uncorrelated Uniform needs a rotation to get there. By removing correlation
## in the gaussian case, we have achieved independence between variables.
## If the source variables are gaussian ICA is not required and PCA is sufficient.
    
    
# Code for PCA and whitening the dataset.

from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow, BlockMatrix
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors, DenseMatrix, Matrix
from sklearn import datasets
# Create the standardizer model used to center the dataset (subtract the
# mean, leave the scale untouched).  It is fitted on X_rdd, the same RDD
# that is transformed below; the previous code fitted it on ``iris_rdd``,
# a name that is not defined in this section.
X_rdd = sc.parallelize(X).map(lambda x: Vectors.dense(x))
scaler = StandardScaler(withMean=True, withStd=False).fit(X_rdd)

X_sc = scaler.transform(X_rdd)


# Create the IndexedRowMatrix from the RDD: zipWithIndex yields
# (vector, index), which is swapped into (index, vector) rows.
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# Compute the SVD factorization of the matrix.  The first argument is the
# number of singular values to keep (here the number of columns); the second
# is a boolean stating whether to compute U or not.
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V is of shape n * k, not k * n (as in sklearn)

P_comps = svd_o.V.toArray().copy()

示例#48

0

显示文件

文件： als.py 项目： kavanshukla/spark-python

    movie_factors = cvModel.bestModel.itemFactors
    print movie_factors
    movie_factors.show()

    movie_factors.registerTempTable('movie_factors')

    midDF = sqlContext.sql("""
        SELECT id, features
        FROM movie_factors
        """)

    midRDD = midDF.rdd
    #midRDD.collect()
    vectorRDD = midRDD.map(
        lambda (x, y): Row(id=x, features=Vectors.dense(y))).cache()
    vectorRDD.collect()
    kmeans_input = sqlContext.createDataFrame(vectorRDD).cache()
    kmeans = KMeans(featuresCol="features",
                    predictionCol="prediction").setK(50)
    kmeans_df = kmeans.fit(kmeans_input)

    kmeans_transformed = kmeans_df.transform(kmeans_input)
    kmeans_transformed.show()

    kmeans_transformed.registerTempTable('kmeans_table')

    movie_items = sc.textFile("u.item")
    movienameRDD = movie_items.map(lambda x: x.split('|')).map(
        lambda p: Row(movieId=int(p[0]), movieName=p[1]))
    movienamesDF = sqlContext.createDataFrame(movienameRDD).cache()

示例#49

0

显示文件

#Vector assembler
fAssembler = VectorAssembler(
    inputCols=["C1Vector", "C15Vector", "C16Vector", "C18Vector", "C19Vector", "C21Vector", "i_app_category_Vector", "i_device_type_Vector", "i_site_category_Vector"],
    outputCol="features")

#pipeline to sum up all the stringIndexers and OneHotEncoders and VectorAssemebler
data_P = Pipeline(stages=[c1I, c15I, c16I, c18I, c19I, c21I, appcatI, devtypeI, sitecatI, 
	c1E, c15E, c16E, c18E, c19E, c21E, appcatE, devtypeE, sitecatE, fAssembler])

model = data_P.fit(df)
data_t = model.transform(df)

###### Part 1 ends here #####

# Making the labelpoints to train the data with LR
parsedData=data_t.select('click', 'features').rdd.map(lambda row: LabeledPoint(float(row.click),Vectors.dense((row.features).toArray())))

# split the dataset
training,test = parsedData.randomSplit([0.6, 0.4], seed=11L)
training.cache()

# Train the data using a version of logistic regression that optimizes the parameters with Stochastic Gradient Descent(SGD)
model = LogisticRegressionWithSGD.train(training, step=0.1, miniBatchFraction=0.1, regType=None)


##### PART 3 ######
# Using the stochastic gradient descent solution
# Test the model using the test data - Getting the Accuracy , FPR and AU - ROC

# 1- Accuracy
labelsAndPreds = test.map(lambda p: (float(model.predict(p.features)), p.label))

示例#50

0

显示文件

def parse_line(line):
    """Turn a comma-separated line into a LabeledPoint.

    The last field is the label; every field before it is a feature.
    """
    fields = line.split(',')
    label = float(fields[-1])
    feature_values = list(map(float, fields[:-1]))
    return LabeledPoint(label, Vectors.dense(feature_values))

示例#51

0

显示文件

文件： greenwood_2b.py 项目： spalmerg/PySpark-HW1

      .filter(lambda year: year[17] in ['2015', '2014', '2013', '2012', '2011'])\
      .map(lambda x: ((x[2][0:2] + x[2][5:10]), x[10]))
  
  # identify all beats
  beats = lines.map(lambda x: x[1])\
      .distinct().collect()
  
  # key = beats, values = list of crime month/year
  unfilled = lines.reduceByKey(lambda x, y: x + "," + y)\
      .map(lambda x: (x[0], x[1].split(",")))

  # count number of crimes per day per beat, fill no-crime values with zero
  filled = unfilled.map(lambda x: (x[0], fill(x[1], beats)))

  # convert to vectors
  vectors = filled.map(lambda x: Vectors.dense(x[1]))

  # calculate correlation
  pearsonCorr = Statistics.corr(vectors)

  # keep the 300 highest-correlation (beat1, beat2) rows (see nlargest(300) below)
  pearsonCorr = pd.DataFrame(pearsonCorr, index = beats, columns = beats)
  unstacked = pearsonCorr.unstack()
  unstacked = pd.DataFrame(unstacked).reset_index()
  unstacked.columns = ["beat1", "beat2", "correlation"]
  unstacked = unstacked[unstacked.beat1 != unstacked.beat2]
  final = unstacked.nlargest(300, "correlation")

  # write final to csv
  final.to_csv("greenwood_2b.csv", index=False)

示例#52

0

显示文件

def to_sparse(v):
  """Return a sparse vector of size ``v.size`` holding only the non-zero entries of ``v``."""
  nonzero_entries = {}
  for position, entry in enumerate(v):
    if entry != 0:
      nonzero_entries[position] = entry
  return Vectors.sparse(v.size, nonzero_entries)

示例#53

0

显示文件

from pyspark.ml.regression import RandomForestRegressor
# Vectors is used below to build the feature vectors; it was not imported
# in this section before.
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.tree import RandomForestModel
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.evaluation import MulticlassMetrics
from prettytable import PrettyTable

# Load a saved random-forest model and evaluate it on a CSV dataset.
sc = SparkContext()
spark = SparkSession(sc)
inputDF = spark.read.csv('s3://assignmentcs643/TrainingDataset.csv',header='true', inferSchema='true', sep=';')


# The last column is the label; all preceding columns are the features.
datadf = inputDF.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))
model = RandomForestModel.load(sc, "s3://assignmentcs643/randomforestmodel.model")

predictions = model.predict(datadf.map(lambda x: x.features))

# Pair every true label with the prediction made for the same row.
labels_and_predictions = datadf.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(datadf.count())


# MulticlassMetrics expects (prediction, label) pairs, so swap the
# (label, prediction) pairs before handing them over.
metrics = MulticlassMetrics(
    labels_and_predictions.map(lambda pair: (pair[1], pair[0])))
f1 = metrics.fMeasure()
recall = metrics.recall()
precision = metrics.precision()

# evaluation values
print("Model accuracy: %.3f%%" % (acc * 100))

示例#54

0

显示文件

def processFeatures(raw):
    """Parse one raw input line into ``(book_id, [features])``.

    The line is split on whitespace: the first field is the integer book
    id, every following field is a float feature.
    """
    fields = raw.split()
    return (int(fields[0]), [float(value) for value in fields[1:]])


def formatSimilarityRow(row):
    """Render an ``(index, values)`` pair as ``"index v0 v1 ..."``.

    The index and the values are all separated by single spaces, so the
    row id can be told apart from the first value when the file is read
    back.
    """
    index, values = row
    return ' '.join([str(index)] + [str(value) for value in values])


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(
            "Usage: spark-submit generate_similarity_matrix.py <input path to hdfs file> <hdfs output path>",
            file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is only
        # injected by the site module.
        sys.exit(-1)

    sc = SparkContext(appName="BookRecSystem")
    try:
        # NOTE(review): ``spark`` is not referenced again; presumably the
        # SQLContext is created for its side effect of enabling RDD.toDF,
        # which IndexedRowMatrix relies on -- confirm before removing.
        spark = SQLContext(sc)
        featureRdd = sc.textFile(sys.argv[1])
        featureRdd = featureRdd.map(processFeatures)
        labels = featureRdd.map(lambda x: x[0])  # label_rdd
        fvecs = featureRdd.map(lambda x: Vectors.dense(x[1]))  # feature_rdd
        data = labels.zip(fvecs)
        # Convert to a block matrix so it can be multiplied by its transpose.
        mat = IndexedRowMatrix(data).toBlockMatrix()
        # mat * mat^T holds the pairwise dot products of the rows.
        # NOTE(review): this equals the cosine similarity only if the input
        # feature vectors are already unit-normalised -- confirm upstream.
        dot = mat.multiply(mat.transpose()).toIndexedRowMatrix().rows.map(
            lambda x: (x.index, x.vector.toArray())).sortByKey().map(
                formatSimilarityRow)
        dot.saveAsTextFile(sys.argv[2])  # save output
    finally:
        # Release the Spark context even if a stage above fails.
        sc.stop()

示例#55

0

显示文件

文件： wine_modelling.py 项目： Kunj-97/Wine_Testing

#creation of model using mllib 
from pyspark.mllib.linalg import Vectors
from pyspark.ml.regression import RandomForestRegressor
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession	
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.tree import RandomForest






# Train a random-forest classifier with the RDD-based MLlib API on the
# training CSV and save the resulting model.
spark_session = SparkSession.builder.appName('wine_model').getOrCreate()
file1 = spark_session.read.csv('s3://cloud-proj2/TrainingDataset.csv',header='true', inferSchema='true', sep=';')

# The last column is the label; all preceding columns are the features.
data_set = file1.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))
model = RandomForest.trainClassifier(data_set, numClasses=10, categoricalFeaturesInfo={}, numTrees=50, maxBins=64, maxDepth=20, seed=33)
model.save(spark_session.sparkContext, "s3://cloud-proj2/model_created.model")

示例#56

0

显示文件

文件： flexiprocess.py 项目： Coldsp33d/Yelp-Challenge-2018

                    rating=temp['rating']))

        cats = (set(
            pd.read_csv('yelp_dataset/cat100.csv', squeeze=True).unique()) -
                regions - {'Food', 'Restaurants'})
        v = v[v['categories'].isin(cats)]

        le = LabelEncoder()
        v['categories'] = le.fit_transform(v['categories'])

        v2 = v.groupby(level=0).apply(
            lambda g: {x: y
                       for x, y in zip(g['categories'], g['rating'])})

        rdd = sc.parallelize(
            v2.tolist()).map(lambda x: Vectors.sparse(len(cats), x))
        rdd.cache()
        mat = RowMatrix(rdd)
        svd = mat.computeSVD(len(regions), computeU=True)
        U = svd.U  # The U factor is a RowMatrix.
        s = svd.s  # The singular values are stored in a local dense vector.
        V = svd.V  # The V factor is a local dense matrix.
        vectors = V.toArray()

        cat_df = pd.DataFrame(
            {'category': le.inverse_transform(np.arange(vectors.shape[0]))})
        cluster = AgglomerativeClustering(n_clusters=len(regions),
                                          affinity='cosine',
                                          linkage='complete')
        cat_df = cat_df.assign(cat34_label=cluster.fit_predict(
            vectors)).set_index('category').cat34_label

示例#57

0

显示文件

# -*- coding: utf-8 -*-
from pyspark import SparkContext
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors
from pyspark.sql import SQLContext, Row
sc = SparkContext()
# input file is a term-document matrix, which is generated by make_tdm.py
data = sc.textFile(
    "/Users/Zhen/Desktop/Courses/BigData/stackexchange/topicModeling/result/matrix.csv"
)
header = data.first()  #extract header
data = data.filter(lambda x: x != header)
data = data.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split(',')]))

# Index documents with unique IDs
corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into k topics using LDA
ldaModel = LDA.train(corpus, k=30)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
# for topic in range(3):
#     print("Topic " + str(topic) + ":")
#     for word in range(0, ldaModel.vocabSize()):
#         print(" " + str(topics[word]))

import numpy

示例#58

0

显示文件

#
located = remapped.map(lambda (d, h, l): (locate(l, \
spatial.KDTree(array( \
[[37.7816834,-122.3887657],\
[37.7469112,-122.4821759],\
[37.7411022,-120.804151],\
[37.4834543,-122.3187302],\
[37.7576436,-122.3916382],\
[37.7970013,-122.4140409],\
[37.748496,-122.4567461],\
[37.7288155,-122.4210133],\
[37.5839487,-121.9499339],\
[37.7157156,-122.4145311],\
[37.7329613,-122.5051491],\
[37.7575891,-122.3923824],\
[37.7521169,-122.4497687]])),
                                                 ["SF18", "SF04", "SF15", "SF17", "SF36", "SF37",\
"SF07", "SF11", "SF12", "SF14", "SF16", "SF19", "SF34"] ),d,h))

counted = located.map(lambda (l, d, h): ((l, d, h), 1))
incidentsreduced = counted.reduceByKey(lambda a, b: a + b)

joined = windaveraged.join(incidentsreduced)

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

vecs = joined.map(lambda ((s, d, h), ((t, w), i)): Vectors.dense([t, w, i]))
print(Statistics.corr(vecs))

示例#59

0

显示文件

 print(
     "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n###################################################################################################"
 )
 print("Start Creating Customer Preferences Block Matrix")
 print(
     "###################################################################################################"
 )
 index_ct = customer_persona.drop("analytic_id")
 index_anaId = customer_persona.select("id", "analytic_id")
 index_ct.registerTempTable("index_ct")
 ontop_pref_price = ontop_preferences.select("id", "Price_XS", "Price_S",
                                             "Price_M", "Price_L",
                                             "Price_XL")
 ontop_pref_price = ontop_pref_price.orderBy(asc("id"))
 bmB_1 = IndexedRowMatrix(
     ontop_pref_price.rdd.map(lambda x: IndexedRow(x[0], Vectors.dense(x[
         1:])))).toBlockMatrix(rowsPerBlock=222)
 count = customer_persona.count()
 print(
     "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n###################################################################################################"
 )
 print("Finished Creating Customer Preferences Block Matrix")
 print(
     "###################################################################################################"
 )
 loop = int(count / 200000)
 startId = 1
 i = 0
 res = index_ct
 del customer_persona
 print(
     "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n###################################################################################################"

示例#60

0

显示文件

from pyspark import SparkConf, SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayesModel, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.rdd import RDD

conf = SparkConf().setAppName("myApp").setMaster("local")
sc = SparkContext(conf=conf)

vMale = Vectors.dense(1, 0, 1, 0, 1, 0)
length = 6
index = [0, 1, 2, 3, 5]
values = [1, 1, 1, 1, 1]
vFemale = Vectors.sparse(length, index, values)

train_one = LabeledPoint(1.0, vMale)
train_two = LabeledPoint(2.0, vFemale)
train_three = LabeledPoint(2.0, Vectors.dense(0, 1, 1, 1, 0, 1))

trains = list()
trains.append(train_one)
trains.append(train_two)
trains.append(train_three)
trainingRDD = sc.parallelize(trains)
nb = NaiveBayes()
nb_model = NaiveBayes.train(trainingRDD)

evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

dTest = [0, 1, 1, 0, 0, 1]