def test_append_bias_with_sp_vector(self):
    data = Vectors.sparse(3, {0: 2.0, 2: 2.0})
    expected = Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0})
    # Returned value must be SparseVector
    ret = MLUtils.appendBias(data)
    self.assertEqual(ret, expected)
    self.assertEqual(type(ret), SparseVector)
def test_right_number_of_results(self):
    num_cols = 1001
    sparse_data = [
        LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
        LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
    ]
    chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
    self.assertEqual(len(chi), num_cols)
    self.assertIsNotNone(chi[1000])
def parseEntry(xx):
    mindate = datetime.datetime(datetime.MINYEAR, 1, 1, 1, 1)
    xx = xx.split('\t')
    a_virtual = xx[0]
    browser = xx[1]
    referrer = xx[2]
    a_user_key = xx[3]
    try:
        birthyear = int(xx[4])
        age = 2015 - birthyear
    except Exception as _:
        birthyear = xx[4]
        age = -1
    gender = xx[5]
    if xx[6] != 'NAN':
        reg_date = datetime.datetime.strptime(xx[6], '%Y-%m-%d')
    else:
        reg_date = mindate
    device = xx[7]
    date = datetime.datetime.strptime(xx[8], '%d-%m-%Y')
    tdiff = datetime.timedelta(hours=int(xx[9]))
    date = date + tdiff
    year = date.year
    month = date.month
    day = date.day
    hour = int(xx[9])
    weekday = date.weekday()
    if reg_date > mindate:
        days_since_registration = (date - reg_date).days
    else:
        days_since_registration = -1
    metrics = list([int(x.replace(',0', '')) for x in xx[10:]])
    visits = metrics[0]
    visits_betalt = metrics[1]
    pageviews = metrics[2]
    pageview_nothome = metrics[3]
    pageview_betalt = metrics[4]
    timegroup_pvs = Vectors.sparse(maxInd, [(intervalIndDict[(weekday, hour)], pageviews)])
    timegroup_visit = Vectors.sparse(maxInd, [(intervalIndDict[(weekday, hour)], 1.)])
    return Row(browser=browser, a_user_key=a_user_key, age=age,
               day=day, hour=hour, date=date, weekday=weekday, pv=pageviews,
               pv_nh=pageview_nothome, pv_bet=pageview_betalt, referrer=referrer,
               device=device, gender=gender, days_since_registration=days_since_registration,
               reg_date=reg_date, timegroup_pvs=timegroup_pvs, timegroup_visit=timegroup_visit,
               a_virtual=a_virtual)
def ztest_toPandas(self):
    data = [(Vectors.dense([0.1, 0.2]),),
            (Vectors.sparse(2, {0: 0.3, 1: 0.4}),),
            (Vectors.sparse(2, {0: 0.5, 1: 0.6}),)]
    df = self.sql.createDataFrame(data, ["features"])
    self.assertEqual(df.count(), 3)
    pd = self.converter.toPandas(df)
    self.assertEqual(len(pd), 3)
    self.assertTrue(isinstance(pd.features[0], csr_matrix),
                    "Expected pd.features[0] to be csr_matrix but found: %s" %
                    type(pd.features[0]))
    self.assertEqual(pd.features[0].shape[0], 3)
    self.assertEqual(pd.features[0].shape[1], 2)
    self.assertEqual(pd.features[0][0, 0], 0.1)
    self.assertEqual(pd.features[0][0, 1], 0.2)
def add_svec(sv1, sv2):
    assert len(sv1) == len(sv2), "dimension mismatch"
    indices = []
    values = []
    i, j = 0, 0
    while i < len(sv1.indices) and j < len(sv2.indices):
        if sv1.indices[i] == sv2.indices[j]:
            indices.append(sv1.indices[i])
            values.append(sv1.values[i] + sv2.values[j])
            i += 1
            j += 1
        elif sv1.indices[i] < sv2.indices[j]:
            indices.append(sv1.indices[i])
            values.append(sv1.values[i])
            i += 1
        else:
            indices.append(sv2.indices[j])
            values.append(sv2.values[j])
            j += 1
    while i < len(sv1.indices):
        indices.append(sv1.indices[i])
        values.append(sv1.values[i])
        i += 1
    while j < len(sv2.indices):
        indices.append(sv2.indices[j])
        values.append(sv2.values[j])
        j += 1
    return Vectors.sparse(len(sv1), indices, values)
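# Usage sketch for add_svec above (hypothetical values; assumes the
# pyspark.mllib.linalg import used by the other snippets in this collection).
# The two-pointer merge keeps the index list sorted, which Vectors.sparse expects.
from pyspark.mllib.linalg import Vectors

sv1 = Vectors.sparse(4, [0, 2], [1.0, 3.0])
sv2 = Vectors.sparse(4, [2, 3], [2.0, 5.0])
print(add_svec(sv1, sv2))  # SparseVector(4, {0: 1.0, 2: 5.0, 3: 5.0})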
def createSparseVector(histogram):
    indexList = []
    countList = []
    for histogramIndex, count in sorted(histogram, key=getKey):
        indexList.append(histogramIndex)
        countList.append(count)
    return Vectors.sparse(2000, indexList, countList)
def test_persistence(self):
    # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up
    try:
        rmtree(path)
    except OSError:
        pass
def scoreOnePoint(self, x):
    """
    Compute the log likelihood of 'x' being generated under the current model.
    Also returns the probability that 'x' is generated by each component of the mixture.

    Parameters
    ----------
    x : array of shape (1, n_dim)
        Corresponds to a single data point.

    Returns
    -------
    log_likelihood_x : log likelihood of 'x'
    prob_x : responsibility of each cluster for the data point 'x'
    """
    lpr = (self.log_multivariate_normal_density_diag_Nd(x) + np.log(self.Weights))
    log_likelihood_x = logsumexp(lpr)
    prob_x = np.exp(lpr - log_likelihood_x)
    if self.isSparse == 1:
        temp_wt = np.dot(prob_x[:, np.newaxis], x.toArray()[np.newaxis, :])
        sqVec = Vectors.sparse(x.size, x.indices, x.values ** 2)
        temp_avg = np.dot(prob_x.T[:, np.newaxis], sqVec.toArray()[np.newaxis, :])
    else:
        temp_wt = np.dot(prob_x.T[:, np.newaxis], x[np.newaxis, :])
        temp_avg = np.dot(prob_x.T[:, np.newaxis], (x * x)[np.newaxis, :])
    return log_likelihood_x, prob_x, temp_wt, temp_avg
def test_glr_summary(self):
    from pyspark.mllib.linalg import Vectors
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity",
                                      weightCol="weight", fitIntercept=False)
    model = glr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.residuals(), DataFrame))
    self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    self.assertEqual(s.degreesOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedomNull, 2)
    self.assertEqual(s.rank, 1)
    self.assertTrue(isinstance(s.solver, basestring))
    self.assertTrue(isinstance(s.aic, float))
    self.assertTrue(isinstance(s.deviance, float))
    self.assertTrue(isinstance(s.nullDeviance, float))
    self.assertTrue(isinstance(s.dispersion, float))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.deviance, s.deviance)
def test_logistic_regression_summary(self):
    from pyspark.mllib.linalg import Vectors
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.roc, DataFrame))
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def load_cut_to_rdd(input_file, result_file):
    sc = SparkContext(appName='PythonKMeans', master="mesos://219.224.135.91:5050")
    lines = sc.textFile(input_file)
    data = lines.map(parseKV).cache()
    doc_term_tf = data.reduceByKey(add).cache()
    num_doc = doc_term_tf.map(lambda ((tid, term), tf): tid).distinct().count()
    terms_list = doc_term_tf.map(lambda ((tid, term), tf): term).distinct().collect()
    num_term = len(terms_list)
    term_idf = doc_term_tf.map(
        lambda ((tid, term), tf): (term, 1.0)
    ).reduceByKey(add).mapValues(lambda idf: math.log(float(num_doc) / (idf + 1)))
    tfidf_join = doc_term_tf.map(
        lambda ((tid, term), tf): (term, (tid, tf))).join(term_idf)
    tfidf = tfidf_join.map(
        lambda (term, ((tid, tf), idf)): (tid, (terms_list.index(term), tf * idf)))
    doc_vec = tfidf.groupByKey().mapValues(
        lambda feature: Vectors.sparse(num_term, feature).toArray()).cache()
    nonzero_count = 0
    f = open(result_file, 'w')
    f.write('%s %s\r\n' % (num_doc, num_term))
    for (tid, feature) in doc_vec.collect():
        for num in feature:
            f.write(str(num) + "\t")
        f.write("\n")
    f.close()
    sc.stop()
    return
def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None, multiclass=None):
    """
    Loads labeled data in the LIBSVM format into an RDD of LabeledPoint.
    The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR.
    Each line represents a labeled sparse feature vector using the following format:

    label index1:value1 index2:value2 ...

    where the indices are one-based and in ascending order. This method parses
    each line into a LabeledPoint, where the feature indices are converted to
    zero-based.

    :param sc: Spark context
    :param path: file or directory path in any Hadoop-supported file system URI
    :param numFeatures: number of features, which will be determined from the
                        input data if a nonpositive value is given. This is
                        useful when the dataset is already split into multiple
                        files and you want to load them separately, because some
                        features may not be present in certain files, which
                        leads to inconsistent feature dimensions.
    :param minPartitions: min number of partitions
    @return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> from pyspark.mllib.regression import LabeledPoint
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
    >>> tempFile.flush()
    >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
    >>> tempFile.close()
    >>> type(examples[0]) == LabeledPoint
    True
    >>> print examples[0]
    (1.0,(6,[0,2,4],[1.0,2.0,3.0]))
    >>> type(examples[1]) == LabeledPoint
    True
    >>> print examples[1]
    (-1.0,(6,[],[]))
    >>> type(examples[2]) == LabeledPoint
    True
    >>> print examples[2]
    (-1.0,(6,[1,3,5],[4.0,5.0,6.0]))
    """
    from pyspark.mllib.regression import LabeledPoint
    if multiclass is not None:
        warnings.warn("deprecated", DeprecationWarning)

    lines = sc.textFile(path, minPartitions)
    parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
    if numFeatures <= 0:
        parsed.cache()
        numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
    return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))
def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None, multiclass=None):
    """
    Loads labeled data in the LIBSVM format into an RDD of LabeledPoint.
    The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR.
    Each line represents a labeled sparse feature vector using the following format:

    label index1:value1 index2:value2 ...

    where the indices are one-based and in ascending order. This method parses
    each line into a LabeledPoint, where the feature indices are converted to
    zero-based.

    :param sc: Spark context
    :param path: file or directory path in any Hadoop-supported file system URI
    :param numFeatures: number of features, which will be determined from the
                        input data if a nonpositive value is given. This is
                        useful when the dataset is already split into multiple
                        files and you want to load them separately, because some
                        features may not be present in certain files, which
                        leads to inconsistent feature dimensions.
    :param minPartitions: min number of partitions
    @return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> from pyspark.mllib.regression import LabeledPoint
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> _ = tempFile.write(b"+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
    >>> tempFile.flush()
    >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
    >>> tempFile.close()
    >>> examples[0]
    LabeledPoint(1.0, (6,[0,2,4],[1.0,2.0,3.0]))
    >>> examples[1]
    LabeledPoint(-1.0, (6,[],[]))
    >>> examples[2]
    LabeledPoint(-1.0, (6,[1,3,5],[4.0,5.0,6.0]))
    """
    from pyspark.mllib.regression import LabeledPoint
    if multiclass is not None:
        warnings.warn("deprecated", DeprecationWarning)

    lines = sc.textFile(path, minPartitions)
    parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
    if numFeatures <= 0:
        parsed.cache()
        numFeatures = parsed.map(
            lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
    return parsed.map(lambda x: LabeledPoint(
        x[0], Vectors.sparse(numFeatures, x[1], x[2])))
def termFreqsMapper(termFreqs):
    docTotalTerms = sum(termFreqs.values())
    termFreqsFiltered = filter(lambda tf: tf[0] in bIdTerms, termFreqs.items())
    termScores = map(
        lambda tf: (bIdTerms[tf[0]], bIdfs[tf[0]] * termFreqs[tf[0]] / docTotalTerms),
        termFreqsFiltered)
    return Vectors.sparse(len(bIdTerms), termScores)
def test_model_transform(self):
    weight = Vectors.dense([3, 2, 1])
    densevec = Vectors.dense([4, 5, 6])
    sparsevec = Vectors.sparse(3, [0], [1])
    eprod = ElementwiseProduct(weight)
    self.assertEqual(eprod.transform(densevec), DenseVector([12, 10, 6]))
    self.assertEqual(eprod.transform(sparsevec), SparseVector(3, [0], [3]))
def _get_data(self):
    sql_context = SQLContext(self.sc)
    l = [
        ("I dont know why people think this is such a bad movie.",
         Vectors.sparse(4, {1: 1.0, 2: 1.0, 3: 1.0})),
    ]
    return sql_context.createDataFrame(l, ['text', 'features'])
def mkFeatureVector(idxSizeArr):
    tempSize = 0
    featureArr = []
    valueArr = []
    for i in idxSizeArr:
        featureArr.append(i[0] + tempSize)
        valueArr.append(1)
        tempSize += i[1]
    return Vectors.sparse(tempSize, featureArr, valueArr)
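# Usage sketch for mkFeatureVector above (hypothetical field layout). Each
# (index, size) pair is a one-hot slot within one categorical field; the fields
# are concatenated by offsetting every index with the running total of the
# preceding field sizes.
from pyspark.mllib.linalg import Vectors

idx_size = [(1, 3), (0, 2), (4, 5)]   # value 1 of 3, value 0 of 2, value 4 of 5
print(mkFeatureVector(idx_size))      # SparseVector(10, {1: 1.0, 3: 1.0, 9: 1.0})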
def vectorDFtoIndexedMatrix(df, vecvar, idcol):
    '''applicable to a dataframe already having assembled vectors'''
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row[vecvar].size, row[vecvar].indices, row[vecvar].values)))
    return IndexedRowMatrix(df)
def vectorize(fc):
    if "size" not in fc and "type" not in fc:
        sv = fc
    elif "size" not in fc and "type" in fc and fc["type"] == 1:
        sv = fc["values"]
    else:
        sv = Vectors.sparse(fc["size"], list(zip(fc["indices"], fc["values"]))).toArray()
    return sv
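# Sketch of the three input shapes vectorize above handles (hypothetical data):
# a plain dense list, a {"type": 1, "values": [...]} dense record, and a sparse
# {"size", "indices", "values"} record, which is expanded via Vectors.sparse.
dense_fc = [0.0, 1.5, 0.0]
typed_fc = {"type": 1, "values": [0.0, 1.5, 0.0]}
sparse_fc = {"size": 5, "indices": [1, 3], "values": [2.0, 4.0]}
print(vectorize(dense_fc))   # [0.0, 1.5, 0.0] (returned as-is)
print(vectorize(typed_fc))   # [0.0, 1.5, 0.0] (the dense "values" list)
print(vectorize(sparse_fc))  # [0. 2. 0. 4. 0.] (expanded to a dense numpy array)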
def score_and_peptide(peptide, query_peaks_bc):
    """Given a peptide and a query, perform a dot product."""
    # get max vector size based on bins
    peptide_mz_bins = [peak[0] for peak in peptide[1]]
    query_mz_bins = [peak[0] for peak in query_peaks_bc.value]
    max_bin_peptide = max(peptide_mz_bins) + 1
    max_bin_query = max(query_mz_bins) + 1
    max_size = max(max_bin_peptide, max_bin_query)
    # Create a SparseVector for the peptide
    peptide_sv = Vectors.sparse(max_size, peptide[1])
    # Create a SparseVector for the query
    query_sv = Vectors.sparse(max_size, query_peaks_bc.value)
    # return peptide and dot product result
    return (peptide[0], peptide_sv.dot(query_sv))
def _pre_dot(self, A):
    size = self.size
    a = A.entries.map(
        lambda entry: (entry.j, (entry.i, entry.value))
    ).groupByKey().map(
        lambda x: (x[0], Vectors.sparse(
            size, *list(zip(*sorted(x[1].data, key=lambda y: y[0]))))))
    return a
def test_output_columns(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    output = model.transform(df)
    self.assertEqual(output.columns, ["label", "features", "prediction"])
def document_vector(document):
    id = document[1]
    counts = defaultdict(int)
    for token in document[0]:
        if token in vocabulary:
            token_id = vocabulary[token]
            counts[token_id] += 1
    counts = sorted(counts.items())
    keys = [x[0] for x in counts]
    values = [x[1] for x in counts]
    return (id, Vectors.sparse(len(vocabulary), keys, values))
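# Usage sketch for document_vector above. The `vocabulary` dict below is a
# hypothetical stand-in; in the original job it is built elsewhere and shared
# with the workers.
from collections import defaultdict
from pyspark.mllib.linalg import Vectors

vocabulary = {"spark": 0, "sparse": 1, "vector": 2}
doc = (["spark", "sparse", "sparse"], 42)   # (tokens, document id)
print(document_vector(doc))  # (42, SparseVector(3, {0: 1.0, 1: 2.0}))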
def create_sparse_vector(row):
    term = row[0]
    if type(row[1][0]) is int:
        dictionary = {}
        dictionary[row[1][0]] = row[1][1]
    else:
        documents = row[1]
        dictionary = {}
        for document in documents:
            dictionary[document[0]] = document[1]
    return Vectors.sparse(N, dictionary)
def dividev(v1, n):
    indices = set(v1.indices)
    v1d = dict(zip(v1.indices, v1.values))
    zero = np.float64(0)
    values = {i: v1d.get(i, zero) / n
              for i in indices
              if v1d.get(i, zero) / n != zero}
    return Vectors.sparse(v1.size, values)
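# Usage sketch for dividev above (hypothetical values; assumes numpy is
# imported as np, as the function itself requires). Entries that become exactly
# zero after the division are dropped from the result.
import numpy as np
from pyspark.mllib.linalg import Vectors

v = Vectors.sparse(3, [0, 2], [2.0, 4.0])
print(dividev(v, 2))  # SparseVector(3, {0: 1.0, 2: 2.0})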
def sparse_vector_mul(v1, v2):
    if not (is_none_or_instance(v1, SparseVector) and is_none_or_instance(v2, SparseVector)):
        raise TypeError('v1 and v2 are not SparseVectors')
    if v1.size != v2.size:
        raise ValueError('v1 and v2 are not of same size')
    d1 = dict(zip(v1.indices, v1.values))
    d2 = dict(zip(v2.indices, v2.values))
    indices = sorted(list(set(v1.indices) & set(v2.indices)))
    values = [d1[i] * d2[i] for i in indices]
    return Vectors.sparse(v1.size, indices, values)
def test_binary_term_freqs(self):
    hashingTF = HashingTF(100).setBinary(True)
    doc = "a a b c c c".split(" ")
    n = hashingTF.numFeatures
    output = hashingTF.transform(doc).toArray()
    expected = Vectors.sparse(n, {hashingTF.indexOf("a"): 1.0,
                                  hashingTF.indexOf("b"): 1.0,
                                  hashingTF.indexOf("c"): 1.0}).toArray()
    for i in range(0, n):
        self.assertAlmostEqual(output[i], expected[i], 14, "Error at " + str(i) +
                               ": expected " + str(expected[i]) +
                               ", got " + str(output[i]))
def sparse_vector_add(v1, v2):
    if not (is_none_or_instance(v1, SparseVector) and is_none_or_instance(v2, SparseVector)):
        raise TypeError('v1 and v2 are not SparseVectors')
    if v1.size != v2.size:
        raise ValueError('v1 and v2 are not of same size')
    d1 = dict(zip(v1.indices, v1.values))
    d2 = dict(zip(v2.indices, v2.values))
    zero = NP.float64(0)
    indices = sorted(list(set(v1.indices) | set(v2.indices)))
    values = [d1.get(i, zero) + d2.get(i, zero) for i in indices]
    return Vectors.sparse(v1.size, indices, values)
def load_sparse_data():
    tempDataLocalPath = mlsql.internal_system_param["tempDataLocalPath"]
    # train the model on the new data for a few epochs
    datafiles = [file for file in os.listdir(tempDataLocalPath) if file.endswith(".json")]
    row_n = []
    col_n = []
    data_n = []
    y = []
    feature_size = 0
    row_index = 0
    for file in datafiles:
        with open(tempDataLocalPath + "/" + file) as f:
            for line in f.readlines():
                obj = json.loads(line)
                fc = obj[featureCol]
                if "size" not in fc and "type" not in fc:
                    feature_size = len(fc)
                    dic = [(i, a) for i, a in enumerate(fc)]
                    sv = SparseVector(len(fc), dic)
                elif "size" not in fc and "type" in fc and fc["type"] == 1:
                    values = fc["values"]
                    feature_size = len(values)
                    dic = [(i, a) for i, a in enumerate(values)]
                    sv = SparseVector(len(values), dic)
                else:
                    feature_size = fc["size"]
                    sv = Vectors.sparse(fc["size"], list(zip(fc["indices"], fc["values"])))
                for c in sv.indices:
                    row_n.append(row_index)
                    col_n.append(c)
                    data_n.append(sv.values[list(sv.indices).index(c)])
                if type(obj[labelCol]) is list:
                    y.append(np.array(obj[labelCol]).argmax())
                else:
                    y.append(obj[labelCol])
                row_index += 1
                if row_index % 10000 == 0:
                    print("processing lines: %s, values: %s" % (str(row_index), str(len(row_n))))
                    # sys.stdout.flush()
    print("X matrix : %s %s row_n:%s col_n:%s classNum:%s" % (
        row_index, feature_size, len(row_n), len(col_n),
        ",".join([str(i) for i in list(set(y))])))
    sys.stdout.flush()
    return sp.csc_matrix((data_n, (row_n, col_n)), shape=(row_index, feature_size)), y
def get_sparseVector(x):
    ids = []
    for j in x:
        if j in cluster.keys():
            ids.append(cluster[j])
    bag_words = {}
    for i in ids:
        bag_words[i] = float(ids.count(i)) / len(ids)
    # Create a SparseVector
    sv = Vectors.sparse(2000, bag_words)
    return sv
def transform(self, document):
    """
    Transforms the input document (list of terms) to term frequency
    vectors, or transform the RDD of document to RDD of term
    frequency vectors.
    """
    if isinstance(document, RDD):
        return document.map(self.transform)

    freq = {}
    for term in document:
        i = self.indexOf(term)
        freq[i] = freq.get(i, 0) + 1.0
    return Vectors.sparse(self.numFeatures, freq.items())
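# Usage sketch for the transform method above, via the public
# pyspark.mllib.feature.HashingTF API. A small numFeatures keeps the output
# readable; with so few buckets, unrelated terms may of course collide.
from pyspark.mllib.feature import HashingTF

htf = HashingTF(numFeatures=16)
tf = htf.transform("a a b c".split(" "))   # a local list yields one SparseVector
# barring collisions, tf has 2.0 in bucket htf.indexOf("a") and 1.0 for "b" and "c"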
def test_copy(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    ovr1 = ovr.copy({lr.maxIter: 10})
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
    model = ovr.fit(df)
    model1 = model.copy({model.predictionCol: "indexed"})
    self.assertEqual(model1.getPredictionCol(), "indexed")
def add(v1, v2):
    assert isinstance(v1, SparseVector) and isinstance(v2, SparseVector)
    assert v1.size == v2.size
    indices = set(v1.indices).union(set(v2.indices))
    v1d = dict(zip(v1.indices, v1.values))
    v2d = dict(zip(v2.indices, v2.values))
    zero = np.float64(0)
    values = {i: v1d.get(i, zero) + v2d.get(i, zero)
              for i in indices
              if v1d.get(i, zero) + v2d.get(i, zero) != zero}
    return Vectors.sparse(v1.size, values)
def make_dataFrame(my_dict1, my_dict2, most_list):
    fre_keys = list(my_dict1.keys())
    index_list = list()
    for i in range(len(most_list)):
        for j in range(len(fre_keys)):
            if most_list[i] == fre_keys[j]:
                index_list.append(i)

    fre_keys2 = list(my_dict2.keys())
    index_list2 = list()
    for i in range(len(most_list)):
        for j in range(len(fre_keys2)):
            if most_list[i] == fre_keys2[j]:
                index_list2.append(i)

    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.linalg import Vectors

    tmp_dict = dict()
    for i in range(len(index_list)):
        tmp_dict[index_list[i]] = my_dict1[most_list[index_list[i]]]
    tmp_dict2 = dict()
    for i in range(len(index_list2)):
        tmp_dict2[index_list2[i]] = my_dict2[most_list[index_list2[i]]]

    p = [LabeledPoint(1, Vectors.sparse(20, tmp_dict)),
         LabeledPoint(0, Vectors.sparse(20, tmp_dict2))]
    trainDf = spark.createDataFrame(p)
    trainDf.show()
def test_ml_mllib_vector_conversion(self):
    # to ml
    # dense
    mllibDV = Vectors.dense([1, 2, 3])
    mlDV1 = newlinalg.Vectors.dense([1, 2, 3])
    mlDV2 = mllibDV.asML()
    self.assertEqual(mlDV2, mlDV1)
    # sparse
    mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5})
    mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
    mlSV2 = mllibSV.asML()
    self.assertEqual(mlSV2, mlSV1)
    # from ml
    # dense
    mllibDV1 = Vectors.dense([1, 2, 3])
    mlDV = newlinalg.Vectors.dense([1, 2, 3])
    mllibDV2 = Vectors.fromML(mlDV)
    self.assertEqual(mllibDV1, mllibDV2)
    # sparse
    mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})
    mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
    mllibSV2 = Vectors.fromML(mlSV)
    self.assertEqual(mllibSV1, mllibSV2)
def convertToSparse(a):
    itemNo = []
    item = []
    j = 0
    for i in a:
        j = j + 1
        if j % 2 == 0:
            item.append(int(i))
        else:
            itemNo.append(int(i) + 7)
    return Vectors.sparse(43, itemNo, item)
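# Usage sketch for convertToSparse above (a hypothetical record of alternating
# item-number / quantity tokens; the +7 offset and the fixed size of 43 come
# from the original data layout).
print(convertToSparse(["1", "2", "5", "3"]))  # SparseVector(43, {8: 2.0, 12: 3.0})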
def _dot1(self, S):
    if self.size != S.numRows():
        raise Exception(
            f"size mismatch ({self.size},) and ({S.numRows()},{S.numCols()})")
    size = self.size
    v = self.rdd.map(lambda entry: (1, entry)).groupByKey().map(
        lambda x: (x[0], Vectors.sparse(
            size, *list(zip(*sorted(x[1].data, key=lambda y: y[0]))))))
    a = self._pre_dot(S)
    c = v.cartesian(a).map(
        lambda x: (x[1][0], float(x[0][1].dot(x[1][1])))
    ).filter(lambda entry: entry[1] != 0.0)
    return SparseDistributedVector(c, S.numCols())
def sparseVectorTimesMatrixAlloc(sparseVector, matrix):
    matrixCols = matrix.shape[1]
    tupleList = {}
    for col in range(matrixCols):
        indices = sparseVector.indices
        dotRes = 0
        for index in indices:
            value = sparseVector[index]
            dotRes += matrix[index, col] * value
        if abs(dotRes) > PCAUtils.zero:
            tupleList[col] = dotRes
    # alloc space for the sparse vector
    sparseRet = Vectors.sparse(matrixCols, tupleList)
    return sparseRet
def DFtoIndexedMatrix(df, quantvars, idcol):
    '''
    Convert a numeric dataframe to an indexed row matrix with sparse vectors as
    its basic units; not applicable to a dataframe that already has assembled vectors.
    '''
    df = VectorAssembler(
        inputCols=quantvars,
        outputCol="features"
    ).transform(df).select([idcol, "features"])
    # VectorAssembler automatically produces sparse vectors, so the next line should be fine
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row.features.size, row.features.indices, row.features.values)))
    return IndexedRowMatrix(df)
def test_linear_regression_summary(self):
    from pyspark.mllib.linalg import Vectors
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal",
                          weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
    self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
    self.assertAlmostEqual(s.meanSquaredError, 0.0)
    self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
    self.assertAlmostEqual(s.r2, 1.0, 2)
    self.assertTrue(isinstance(s.residuals, DataFrame))
    self.assertEqual(s.numInstances, 2)
    devResiduals = s.devianceResiduals
    self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)
def get_sparse_vectors(documents):
    # get unique word list
    WordList = const_unique_word_list(documents)
    # convert to counter object
    documents_word_counter = documents.map(lambda words: sorted(
        Counter([WordList.index(word) for word in words if word in WordList]).items(),
        key=lambda pair: pair[0], reverse=False))
    # convert to sparse vector
    documents_sparse_vectors = documents_word_counter.map(
        lambda counter: Vectors.sparse(len(WordList),
                                       tuple([pair[0] for pair in counter]),
                                       tuple([pair[1] for pair in counter])))
    return documents_sparse_vectors
def docTopics(filepath, topicMatrix):
    # for each topic:
    #   sum the probability of the words in the corpus
    #   normalize so that the topic probabilities sum to 1
    n_vcb = topicMatrix.shape[0]
    data = sc.textFile(filepath)
    parsedData = data.map(lambda line: line.strip().split(' ')).map(
        lambda x: (int(x[0]) - 1, (int(x[1]) - 1, float(x[2])))).groupByKey().mapValues(list)
    corpus = parsedData.map(
        lambda x: [x[0], normalize(Vectors.sparse(n_vcb, x[1]).dot(topicMatrix))])
    return corpus.collect()
def _dot2(self, v):
    if self.numCols() != v.size:
        raise Exception(
            f"size mismatch ({self.numRows()},{self.numCols()}) and ({v.size},)")
    size = v.size
    sv = v.rdd.map(lambda entry: (1, entry)).groupByKey().map(
        lambda x: (x[0], Vectors.sparse(
            size, *list(zip(*sorted(x[1].data, key=lambda y: y[0]))))))
    a = self._pre_dot(self, size=size)
    c = sv.cartesian(a).map(
        lambda x: (x[1][0], float(x[0][1].dot(x[1][1])))
    ).filter(lambda entry: entry[1] != 0.0)
    return sdv.SparseDistributedVector(c, self.numRows())
def test_apply_binary_term_freqs(self):
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
    n = 100
    hashingTF = HashingTF()
    hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
    output = hashingTF.transform(df)
    features = output.select("features").first().features.toArray()
    expected = Vectors.sparse(n, {(ord("a") % n): 1.0,
                                  (ord("b") % n): 1.0,
                                  (ord("c") % n): 1.0}).toArray()
    for i in range(0, n):
        self.assertAlmostEqual(features[i], expected[i], 14, "Error at " + str(i) +
                               ": expected " + str(expected[i]) +
                               ", got " + str(features[i]))
def add(v1, v2):
    """Add two sparse vectors

    >>> v1 = Vectors.sparse(3, {0: 1.0, 2: 1.0})
    >>> v2 = Vectors.sparse(3, {1: 1.0})
    >>> add(v1, v2)
    SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0})
    """
    assert isinstance(v1, SparseVector) and isinstance(v2, SparseVector)
    assert v1.size == v2.size
    indices = set(v1.indices).union(set(v2.indices))
    v1d = dict(zip(v1.indices, v1.values))
    v2d = dict(zip(v2.indices, v2.values))
    zero = np.float64(0)
    values = {i: v1d.get(i, zero) + v2d.get(i, zero)
              for i in indices
              if v1d.get(i, zero) + v2d.get(i, zero) != zero}
    return Vectors.sparse(v1.size, values)
def get_top_movies_for_user(user_ratings=fake_user_ratings, ratings_rdd=ratingsRDD):
    movies_length = ratings_rdd.map(lambda x: x[1]).max() + 1
    user_ids_with_ratings_rdd = (
        ratings_rdd
        .map(lambda (user_id, movie_id, rating): (user_id, [(movie_id, rating)]))
        .reduceByKey(lambda a, b: a + b)
        .filter(lambda x: len(x[1]) > 25)
        .map(lambda x: (x[0], Vectors.sparse(movies_length, x[1]))))
    user_seen_movies_list = [x[0] for x in user_ratings]
    most_similar_for_user_rdd = create_most_similar_for_user_rdd(
        user_ratings, user_ids_with_ratings_rdd, movies_length)
    similar_users_and_similarity_rdd = create_similar_users_and_similarity_rdd(
        most_similar_for_user_rdd)
    top_movies_for_user = (
        user_ids_with_ratings_rdd
        .join(similar_users_and_similarity_rdd)
        .flatMap(lambda x: create_id_rating_tuples(x[1][1], x[1][0]))
        .filter(lambda x: x[0] not in user_seen_movies_list)
        .reduceByKey(lambda a, b: max(a, b))
        .takeOrdered(100, lambda x: -x[1]))
    return top_movies_for_user
def add_hashed_features(self, df, num_hash_buckets=2 ** 15):

    def hash_function(raw_feats, num_buckets, print_mapping=False):
        """Calculate a feature dictionary for an observation's features based on hashing.

        Note:
            Use print_mapping=True for debug purposes and to better understand how the
            hashing works.

        Args:
            raw_feats (list of (int, str)): A list of features for an observation.
                Represented as (featureID, value) tuples.
            num_buckets (int): Number of buckets to use as features.
            print_mapping (bool, optional): If true, the mappings of featureString to
                index will be printed.

        Returns:
            dict of int to float: The keys will be integers which represent the buckets
                that the features have been hashed to. The value for a given key will
                contain the count of the (featureID, value) tuples that have hashed to
                that key.
        """
        mapping = {category + ':' + str(ind):
                   int(int(hashlib.md5(category + ':' + str(ind)).hexdigest(), 16) % num_buckets)
                   for ind, category in raw_feats}
        if print_mapping:
            print(mapping)

        def map_update(l, r):
            l[r] += 1.0
            return l

        sparse_features = reduce(map_update, mapping.values(), defaultdict(float))
        return dict(sparse_features)

    """Return a DataFrame with labels and hashed features.

    Note:
        Make sure to cache the DataFrame that you are returning.

    Args:
        df (DataFrame with 'tuples' column): A DataFrame containing the tuples to be hashed.

    Returns:
        DataFrame: A DataFrame with a 'label' column and a 'features' column that
            contains a SparseVector of hashed features.
    """
    tuples_to_hash_features_udf = udf(
        lambda x: Vectors.sparse(num_hash_buckets, hash_function(x, num_hash_buckets)),
        VectorUDT())
    return df.select(df.label,
                     tuples_to_hash_features_udf(df.features).alias("features")).cache()
def change_to_sparse(line):
    keys = []
    values = []
    temp = line.split("{")[1]
    temp = re.sub('}', '', temp)
    for item in temp.split(',')[1:]:
        index = int(item.split(':')[0])
        value = float(item.split(':')[1])
        keys.append(index)
        values.append(value)
    return Vectors.sparse(Vocab_size, sorted(keys), values)
def log_multivariate_normal_density_diag_Nd(self, x):
    """Compute Gaussian log-density at x for a diagonal model."""
    n_features = x.size
    if self.isSparse == 1:
        t = Vectors.sparse(x.size, x.indices, x.values ** 2).dot((1 / self.covarBc.value).T)
    else:
        t = np.dot(x ** 2, (1 / self.covarBc.value).T)
    lpr = -0.5 * (n_features * np.log(2 * np.pi)
                  + np.sum(np.log(self.covarBc.value), 1)
                  + np.sum((self.meansBc.value ** 2) / self.covarBc.value, 1)
                  - 2 * x.dot((self.meansBc.value / self.covarBc.value).T)
                  + t)
    return lpr
def test_save_load(self):
    temp_path = tempfile.mkdtemp()
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    ovrPath = temp_path + "/ovr"
    ovr.save(ovrPath)
    loadedOvr = OneVsRest.load(ovrPath)
    self.assertEqual(loadedOvr.getFeaturesCol(), ovr.getFeaturesCol())
    self.assertEqual(loadedOvr.getLabelCol(), ovr.getLabelCol())
    self.assertEqual(loadedOvr.getClassifier().uid, ovr.getClassifier().uid)
    modelPath = temp_path + "/ovrModel"
    model.save(modelPath)
    loadedModel = OneVsRestModel.load(modelPath)
    for m, n in zip(model.models, loadedModel.models):
        self.assertEqual(m.uid, n.uid)