Example #1
File: test_util.py  Project: drewrobb/spark
 def test_append_bias_with_sp_vector(self):
     data = Vectors.sparse(3, {0: 2.0, 2: 2.0})
     expected = Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0})
     # Returned value must be SparseVector
     ret = MLUtils.appendBias(data)
     self.assertEqual(ret, expected)
     self.assertEqual(type(ret), SparseVector)
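For reference, Vectors.sparse accepts three input forms used throughout these examples: a dict of index to value, a list of (index, value) pairs, or parallel index and value lists. A minimal sketch, assuming a local PySpark install:

from pyspark.mllib.linalg import Vectors

# three equivalent ways to build the same 4-dimensional sparse vector
v_dict = Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0})
v_pairs = Vectors.sparse(4, [(0, 2.0), (2, 2.0), (3, 1.0)])
v_lists = Vectors.sparse(4, [0, 2, 3], [2.0, 2.0, 1.0])
assert v_dict == v_pairs == v_lists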
Example #2
File: tests.py  Project: greatyan/spark
 def test_right_number_of_results(self):
     num_cols = 1001
     sparse_data = [
         LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
         LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
     ]
     chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
     self.assertEqual(len(chi), num_cols)
     self.assertIsNotNone(chi[1000])
Example #3
def parseEntry(xx):

    mindate=datetime.datetime(datetime.MINYEAR, 1, 1,1,1)
    xx=xx.split('\t')
    a_virtual=xx[0]
    browser=xx[1]
    referrer=xx[2]
    a_user_key=xx[3]
    try:
        birthyear=int(xx[4])
        age=2015-birthyear
    except Exception as _:
        birthyear=xx[4]
        age=-1
    gender=xx[5]
    #print(xx)
    #print(xx[6])
    if xx[6]!='NAN':
        reg_date=datetime.datetime.strptime(xx[6],'%Y-%m-%d')
    else:
        reg_date=mindate
    device=xx[7]
    date=datetime.datetime.strptime(xx[8],'%d-%m-%Y')
    tdiff=datetime.timedelta(hours=int(xx[9]))
    date=date+tdiff
    year=date.year
    month=date.month
    day=date.day
    hour=int(xx[9])
    weekday=date.weekday()

    if reg_date>mindate:
        days_since_registration=(date-reg_date).days
    else:
        days_since_registration=-1

    metrics=list([int(x.replace(',0','')) for x in xx[10:]])
    visits=metrics[0]
    visits_betalt=metrics[1]
    pageviews=metrics[2]
    pageview_nothome=metrics[3]
    pageview_betalt=metrics[4]

    timegroup_pvs=Vectors.sparse(maxInd,[(intervalIndDict[(weekday,hour)],pageviews)])
    timegroup_visit=Vectors.sparse(maxInd,[(intervalIndDict[(weekday,hour)],1.)])

    return Row(browser=browser,a_user_key=a_user_key,age=age,\
               day=day,hour=hour,date=date,weekday=weekday,pv=pageviews,\
               pv_nh=pageview_nothome,pv_bet=pageview_betalt,referrer=referrer,\
               device=device,gender=gender,days_since_registration=days_since_registration,\
               reg_date=reg_date,timegroup_pvs=timegroup_pvs,timegroup_visit=timegroup_visit,\
               a_virtual=a_virtual)
Example #4
 def ztest_toPandas(self):
     data = [(Vectors.dense([0.1, 0.2]),),
             (Vectors.sparse(2, {0:0.3, 1:0.4}),),
             (Vectors.sparse(2, {0:0.5, 1:0.6}),)]
     df = self.sql.createDataFrame(data, ["features"])
     self.assertEqual(df.count(), 3)
     pd = self.converter.toPandas(df)
     self.assertEqual(len(pd), 3)
     self.assertTrue(isinstance(pd.features[0], csr_matrix),
                     "Expected pd.features[0] to be csr_matrix but found: %s" %
                     type(pd.features[0]))
     self.assertEqual(pd.features[0].shape[0], 3)
     self.assertEqual(pd.features[0].shape[1], 2)
     self.assertEqual(pd.features[0][0,0], 0.1)
     self.assertEqual(pd.features[0][0,1], 0.2)
Example #5
def add_svec(sv1, sv2):
    assert len(sv1) == len(sv2), "dimension mismatch"
    indices = []
    values = []
    i, j = 0, 0
    while i < len(sv1.indices) and j < len(sv2.indices):
        if sv1.indices[i] == sv2.indices[j]:
            indices.append(sv1.indices[i])
            values.append(sv1.values[i] + sv2.values[j])
            i += 1
            j += 1
        elif sv1.indices[i] < sv2.indices[j]:
            indices.append(sv1.indices[i])
            values.append(sv1.values[i])
            i += 1
        else:
            indices.append(sv2.indices[j])
            values.append(sv2.values[j])
            j += 1
    while i < len(sv1.indices):
        indices.append(sv1.indices[i])
        values.append(sv1.values[i])
        i += 1
    while j < len(sv2.indices):
        indices.append(sv2.indices[j])
        values.append(sv2.values[j])
        j += 1
    return Vectors.sparse(len(sv1), indices, values)
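A quick sanity check of add_svec with two hypothetical 5-dimensional vectors (a sketch, not part of the original project):

from pyspark.mllib.linalg import Vectors

v1 = Vectors.sparse(5, {0: 1.0, 3: 2.0})
v2 = Vectors.sparse(5, {1: 4.0, 3: 3.0})
result = add_svec(v1, v2)
print(result)  # expected indices [0, 1, 3] with values [1.0, 4.0, 5.0]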
Example #6
def createSparseVector(histogram):
	indexList = []
	countList = []
	for histogramIndex, count in sorted(histogram, key=getKey):
		indexList.append(histogramIndex)
		countList.append(count)
	return Vectors.sparse(2000, indexList,countList)
Example #7
File: tests.py  Project: bsangee/spark
 def test_persistence(self):
     # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     lda_path = path + "/lda"
     dist_model_path = path + "/distLDAModel"
     local_model_path = path + "/localLDAModel"
     # Test LDA
     lda.save(lda_path)
     lda2 = LDA.load(lda_path)
     self._compare(lda, lda2)
     # Test DistributedLDAModel
     distributedModel.save(dist_model_path)
     distributedModel2 = DistributedLDAModel.load(dist_model_path)
     self._compare(distributedModel, distributedModel2)
     # Test LocalLDAModel
     localModel.save(local_model_path)
     localModel2 = LocalLDAModel.load(local_model_path)
     self._compare(localModel, localModel2)
     # Clean up
     try:
         rmtree(path)
     except OSError:
         pass
Example #8
    def scoreOnePoint(self, x):

        """
        Compute the log likelihood of 'x' being generated under the current model.
        Also returns the probability that 'x' is generated by each component of the mixture.

        Parameters
        ----------
        x : array of shape (1,  n_dim)
            Corresponds to a single data point.

        Returns
        -------
        log_likelihood_x : Log likelihood of 'x'
        prob_x : Responsibility of each cluster for the data point 'x'

        """
        lpr = (self.log_multivariate_normal_density_diag_Nd(x) + np.log(self.Weights))
        log_likelihood_x = logsumexp(lpr)
        prob_x = np.exp(lpr-log_likelihood_x)

        if self.isSparse == 1:
            temp_wt = np.dot(prob_x[:, np.newaxis], x.toArray()[np.newaxis, :])
            sqVec = Vectors.sparse(x.size, x.indices, x.values**2)
            temp_avg = np.dot(prob_x.T[:, np.newaxis], sqVec.toArray()[np.newaxis, :])

        else:
            temp_wt = np.dot(prob_x.T[:, np.newaxis],  x[np.newaxis, :])
            temp_avg = np.dot(prob_x.T[:, np.newaxis], (x*x)[np.newaxis, :])

        return log_likelihood_x, prob_x, temp_wt, temp_avg
Example #9
File: tests.py  Project: A7mech/spark
 def test_glr_summary(self):
     from pyspark.mllib.linalg import Vectors
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                       fitIntercept=False)
     model = glr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.residuals(), DataFrame))
     self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
     coefStdErr = s.coefficientStandardErrors
     self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
     tValues = s.tValues
     self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
     pValues = s.pValues
     self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
     self.assertEqual(s.degreesOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedomNull, 2)
     self.assertEqual(s.rank, 1)
     self.assertTrue(isinstance(s.solver, basestring))
     self.assertTrue(isinstance(s.aic, float))
     self.assertTrue(isinstance(s.deviance, float))
     self.assertTrue(isinstance(s.nullDeviance, float))
     self.assertTrue(isinstance(s.dispersion, float))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.deviance, s.deviance)
Example #10
File: tests.py  Project: Bella-Lin/spark
 def test_logistic_regression_summary(self):
     from pyspark.mllib.linalg import Vectors
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.roc, DataFrame))
     self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
     self.assertTrue(isinstance(s.pr, DataFrame))
     self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
     self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
     self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
Example #11
def load_cut_to_rdd(input_file, result_file):
    sc = SparkContext(appName='PythonKMeans',master="mesos://219.224.135.91:5050")
    lines = sc.textFile(input_file)
    data = lines.map(parseKV).cache()

    doc_term_tf = data.reduceByKey(add).cache()

    num_doc = doc_term_tf.map(lambda ((tid, term), tf): tid).distinct().count()
    terms_list = doc_term_tf.map(lambda ((tid, term), tf): term).distinct().collect()
    num_term = len(terms_list)

    term_idf = doc_term_tf.map(
            lambda ((tid, term), tf): (term, 1.0)
            ).reduceByKey(add).mapValues(lambda idf: math.log(float(num_doc) / (idf+1)))
    tfidf_join = doc_term_tf.map(
            lambda ((tid, term), tf): (term, (tid, tf))).join(term_idf)
    tfidf = tfidf_join.map(lambda (term, ((tid, tf), idf)): (tid, (terms_list.index(term), tf*idf)))

    doc_vec = tfidf.groupByKey().mapValues(lambda feature : Vectors.sparse(num_term, feature).toArray()).cache()

    nonzero_count = 0
    f = open(result_file,'w')
    f.write('%s %s\r\n'%(num_doc, num_term))
    for (tid, feature) in doc_vec.collect():
        for num in feature:
            f.write(str(num)+"\t")
        f.write("\n")
    f.close()
    sc.stop()


    return
Example #12
File: util.py  Project: Amir-Github/spark
    def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None, multiclass=None):
        """
        Loads labeled data in the LIBSVM format into an RDD of
        LabeledPoint. The LIBSVM format is a text-based format used by
        LIBSVM and LIBLINEAR. Each line represents a labeled sparse
        feature vector using the following format:

        label index1:value1 index2:value2 ...

        where the indices are one-based and in ascending order. This
        method parses each line into a LabeledPoint, where the feature
        indices are converted to zero-based.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param numFeatures: number of features, which will be determined
                            from the input data if a nonpositive value
                            is given. This is useful when the dataset is
                            already split into multiple files and you
                            want to load them separately, because some
                            features may not be present in certain files,
                            which leads to inconsistent feature
                            dimensions.
        :param minPartitions: min number of partitions
        @return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
        >>> tempFile.flush()
        >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
        >>> tempFile.close()
        >>> type(examples[0]) == LabeledPoint
        True
        >>> print examples[0]
        (1.0,(6,[0,2,4],[1.0,2.0,3.0]))
        >>> type(examples[1]) == LabeledPoint
        True
        >>> print examples[1]
        (-1.0,(6,[],[]))
        >>> type(examples[2]) == LabeledPoint
        True
        >>> print examples[2]
        (-1.0,(6,[1,3,5],[4.0,5.0,6.0]))
        """
        from pyspark.mllib.regression import LabeledPoint
        if multiclass is not None:
            warnings.warn("deprecated", DeprecationWarning)

        lines = sc.textFile(path, minPartitions)
        parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
        if numFeatures <= 0:
            parsed.cache()
            numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
        return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))
Example #13
    def loadLibSVMFile(sc,
                       path,
                       numFeatures=-1,
                       minPartitions=None,
                       multiclass=None):
        """
        Loads labeled data in the LIBSVM format into an RDD of
        LabeledPoint. The LIBSVM format is a text-based format used by
        LIBSVM and LIBLINEAR. Each line represents a labeled sparse
        feature vector using the following format:

        label index1:value1 index2:value2 ...

        where the indices are one-based and in ascending order. This
        method parses each line into a LabeledPoint, where the feature
        indices are converted to zero-based.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param numFeatures: number of features, which will be determined
                            from the input data if a nonpositive value
                            is given. This is useful when the dataset is
                            already split into multiple files and you
                            want to load them separately, because some
                            features may not be present in certain files,
                            which leads to inconsistent feature
                            dimensions.
        :param minPartitions: min number of partitions
        @return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> _ = tempFile.write(b"+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
        >>> tempFile.flush()
        >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
        >>> tempFile.close()
        >>> examples[0]
        LabeledPoint(1.0, (6,[0,2,4],[1.0,2.0,3.0]))
        >>> examples[1]
        LabeledPoint(-1.0, (6,[],[]))
        >>> examples[2]
        LabeledPoint(-1.0, (6,[1,3,5],[4.0,5.0,6.0]))
        """
        from pyspark.mllib.regression import LabeledPoint
        if multiclass is not None:
            warnings.warn("deprecated", DeprecationWarning)

        lines = sc.textFile(path, minPartitions)
        parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
        if numFeatures <= 0:
            parsed.cache()
            numFeatures = parsed.map(
                lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
        return parsed.map(lambda x: LabeledPoint(
            x[0], Vectors.sparse(numFeatures, x[1], x[2])))
Example #14
 def termFreqsMapper(termFreqs):
     docTotalTerms = sum(termFreqs.values())
     termFreqsFiltered = filter(lambda tf: tf[0] in bIdTerms,
                                termFreqs.items())
     termScores = map(
         lambda tf:
         (bIdTerms[tf[0]], bIdfs[tf[0]] * termFreqs[tf[0]] / docTotalTerms),
         termFreqsFiltered)
     return Vectors.sparse(len(bIdTerms), termScores)
Example #15
    def test_model_transform(self):
        weight = Vectors.dense([3, 2, 1])

        densevec = Vectors.dense([4, 5, 6])
        sparsevec = Vectors.sparse(3, [0], [1])
        eprod = ElementwiseProduct(weight)
        self.assertEqual(eprod.transform(densevec), DenseVector([12, 10, 6]))
        self.assertEqual(
            eprod.transform(sparsevec), SparseVector(3, [0], [3]))
Example #16
 def _get_data(self):
     sql_context = SQLContext(self.sc)
     l = [
         (
         "I dont know why people think this is such a bad movie.",
         Vectors.sparse(3, {0: 1.0, 1: 1.0, 2: 1.0})  # indices must be 0-based and smaller than the size
         ),
     ]
     return sql_context.createDataFrame(l, ['text', 'features'])
Example #17
def mkFeatureVector(idxSizeArr):
    tempSize = 0
    featureArr = []
    valueArr = []
    for i in idxSizeArr:
        featureArr.append(i[0] + tempSize)
        valueArr.append(1)
        tempSize += i[1]
    return Vectors.sparse(tempSize, featureArr, valueArr)
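mkFeatureVector effectively one-hot encodes a sequence of categorical fields, offsetting each field's index by the cumulative sizes of the preceding fields. A hypothetical call (input values assumed for illustration):

# each entry is (value index within its field, field cardinality)
fv = mkFeatureVector([(1, 3), (0, 2), (2, 4)])
print(fv)  # expected: a 9-dimensional vector with ones at offsets 1, 3 and 7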
Example #18
def vectorDFtoIndexedMatrix(df, vecvar, idcol):
    '''
    Applicable to a dataframe that already has assembled vectors.
    '''
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row[vecvar].size, row[vecvar].indices,
                            row[vecvar].values)))
    return IndexedRowMatrix(df)
Example #19
File: tests.py  Project: HodaAlemi/spark
    def test_model_transform(self):
        weight = Vectors.dense([3, 2, 1])

        densevec = Vectors.dense([4, 5, 6])
        sparsevec = Vectors.sparse(3, [0], [1])
        eprod = ElementwiseProduct(weight)
        self.assertEqual(eprod.transform(densevec), DenseVector([12, 10, 6]))
        self.assertEqual(
            eprod.transform(sparsevec), SparseVector(3, [0], [3]))
Example #20
def vectorize(fc):
    if "size" not in fc and "type" not in fc:
        sv = fc
    elif "size" not in fc and "type" in fc and fc["type"] == 1:
        sv = fc["values"]
    else:
        sv = Vectors.sparse(fc["size"], list(zip(fc["indices"],
                                                 fc["values"]))).toArray()
    return sv
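vectorize expects the JSON form of a Spark vector column; a hypothetical sparse input matching the keys checked above (size, indices, values):

fc = {"size": 5, "indices": [1, 3], "values": [2.0, 4.0]}
print(vectorize(fc))  # expected dense array: [0. 2. 0. 4. 0.]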
Example #21
def score_and_peptide(peptide, query_peaks_bc):
    """
    Given a peptide and a query, perform a dot product
    """
    # get max vector size based on bins
    peptide_mz_bins = [peak[0] for peak in peptide[1]]
    query_mz_bins = [peak[0] for peak in query_peaks_bc.value]
    max_bin_peptide = max(peptide_mz_bins) + 1
    max_bin_query = max(query_mz_bins) + 1
    max_size = max(max_bin_peptide, max_bin_query)

    # Create SparseVector for peptide
    peptide_sv = Vectors.sparse(max_size, peptide[1])

    # Create a SparseVector for the query
    query_sv = Vectors.sparse(max_size, query_peaks_bc.value)

    # return peptide and dot product result
    return (peptide[0], peptide_sv.dot(query_sv))
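A hypothetical call, assuming peptide is a (name, [(bin, intensity), ...]) tuple and the query peaks arrive as a Spark broadcast variable:

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
query_bc = sc.broadcast([(2, 1.0), (5, 2.0)])
peptide = ("PEPTIDE_A", [(2, 3.0), (7, 1.0)])
# only bin 2 overlaps, so the dot product is 3.0 * 1.0 = 3.0
print(score_and_peptide(peptide, query_bc))  # ('PEPTIDE_A', 3.0)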
Example #22
    def _pre_dot(self, A):
        size = self.size

        a = A.entries.map(
            lambda entry: (entry.j, (entry.i, entry.value))
        ).groupByKey().map(
            lambda x: (x[0], Vectors.sparse(
                size, *list(zip(*sorted(x[1].data, key=lambda y: y[0])))))
        )
        return a
Example #23
 def test_output_columns(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     output = model.transform(df)
     self.assertEqual(output.columns, ["label", "features", "prediction"])
Example #24
def score_and_peptide(peptide, query_peaks_bc):
    """
    Given a peptide and a query, perform a dot product
    """
    # get max vector size based on bins
    peptide_mz_bins = [peak[0] for peak in peptide[1]]
    query_mz_bins = [peak[0] for peak in query_peaks_bc.value]
    max_bin_peptide = max(peptide_mz_bins)+1
    max_bin_query = max(query_mz_bins)+1
    max_size = max(max_bin_peptide,max_bin_query)

    # Create SparseVector for peptide
    peptide_sv = Vectors.sparse(max_size, peptide[1])

    # Create a SparseVector for the query
    query_sv = Vectors.sparse(max_size, query_peaks_bc.value)

    # return peptide and dot product result
    return (peptide[0], peptide_sv.dot(query_sv))
Example #25
File: tests.py  Project: A7mech/spark
 def test_output_columns(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     output = model.transform(df)
     self.assertEqual(output.columns, ["label", "features", "prediction"])
Example #26
def document_vector(document):
    id = document[1]
    counts = defaultdict(int)
    for token in document[0]:
        if token in vocabulary:
            token_id = vocabulary[token]
            counts[token_id] += 1
    counts = sorted(counts.items())
    keys = [x[0] for x in counts]
    values = [x[1] for x in counts]
    return (id, Vectors.sparse(len(vocabulary), keys, values))
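document_vector expects (tokens, id) tuples and a module-level vocabulary dict mapping token to column index; a hypothetical run (names and values assumed for illustration):

vocabulary = {"spark": 0, "python": 1, "mllib": 2}  # assumed module-level mapping
doc = (["spark", "python", "spark", "scala"], 42)   # (tokens, document id)
# "scala" is not in the vocabulary and is dropped
print(document_vector(doc))  # (42, SparseVector(3, {0: 2.0, 1: 1.0}))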
Example #27
def create_sparse_vector(row):
    term = row[0]
    if type(row[1][0]) is int:
        dictionary = {}
        dictionary[row[1][0]] = row[1][1]
    else:
        documents = row[1]
        dictionary = {}
        for document in documents:
            dictionary[document[0]] = document[1]
    return Vectors.sparse(N, dictionary)
Example #28
def dividev(v1, n):
    indices = set(v1.indices)
    v1d = dict(zip(v1.indices, v1.values))
    #     print(v1d)
    zero = np.float64(0)
    #     print(zero)
    values = {
        i: v1d.get(i, zero) / n
        for i in indices if v1d.get(i, zero) / n != zero
    }
    return Vectors.sparse(v1.size, values)
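dividev divides every stored value by n and drops entries that become zero; a small sketch with assumed values:

from pyspark.mllib.linalg import Vectors

v = Vectors.sparse(4, {1: 2.0, 3: 6.0})
print(dividev(v, 2.0))  # expected: values 1.0 and 3.0 at indices 1 and 3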
Example #29
def sparse_vector_mul(v1, v2):
    if not (is_none_or_instance(v1, SparseVector) and
            is_none_or_instance(v2, SparseVector)):
        raise TypeError('v1 and v2 are not SparseVectors')
    if v1.size != v2.size:
        raise ValueError('v1 and v2 are not of same size')
    d1 = dict(zip(v1.indices, v1.values))
    d2 = dict(zip(v2.indices, v2.values))
    indices = sorted(list(set(v1.indices) & set(v2.indices)))
    values = [d1[i] * d2[i] for i in indices]
    return Vectors.sparse(v1.size, indices, values)
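sparse_vector_mul keeps only the indices where both vectors are non-zero (an element-wise product); a sketch, assuming the is_none_or_instance helper from the same module is in scope:

from pyspark.mllib.linalg import Vectors

v1 = Vectors.sparse(4, {0: 2.0, 2: 3.0})
v2 = Vectors.sparse(4, {2: 4.0, 3: 1.0})
print(sparse_vector_mul(v1, v2))  # expected: 12.0 at index 2 only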
Example #30
 def test_binary_term_freqs(self):
     hashingTF = HashingTF(100).setBinary(True)
     doc = "a a b c c c".split(" ")
     n = hashingTF.numFeatures
     output = hashingTF.transform(doc).toArray()
     expected = Vectors.sparse(n, {hashingTF.indexOf("a"): 1.0,
                                   hashingTF.indexOf("b"): 1.0,
                                   hashingTF.indexOf("c"): 1.0}).toArray()
     for i in range(0, n):
         self.assertAlmostEqual(output[i], expected[i], 14, "Error at " + str(i) +
                                ": expected " + str(expected[i]) + ", got " + str(output[i]))
Example #31
 def test_binary_term_freqs(self):
     hashingTF = HashingTF(100).setBinary(True)
     doc = "a a b c c c".split(" ")
     n = hashingTF.numFeatures
     output = hashingTF.transform(doc).toArray()
     expected = Vectors.sparse(n, {hashingTF.indexOf("a"): 1.0,
                                   hashingTF.indexOf("b"): 1.0,
                                   hashingTF.indexOf("c"): 1.0}).toArray()
     for i in range(0, n):
         self.assertAlmostEqual(output[i], expected[i], 14, "Error at " + str(i) +
                                ": expected " + str(expected[i]) + ", got " + str(output[i]))
Example #32
def sparse_vector_add(v1, v2):
    if not (is_none_or_instance(v1, SparseVector) and
            is_none_or_instance(v2, SparseVector)):
        raise TypeError('v1 and v2 are not SparseVectors')
    if v1.size != v2.size:
        raise ValueError('v1 and v2 are not of same size')
    d1 = dict(zip(v1.indices, v1.values))
    d2 = dict(zip(v2.indices, v2.values))
    zero = NP.float64(0)
    indices = sorted(list(set(v1.indices) | set(v2.indices)))
    values = [d1.get(i, zero) + d2.get(i, zero) for i in indices]
    return Vectors.sparse(v1.size, indices, values)
Example #33
 def ztest_toPandas(self):
     data = [(Vectors.dense([0.1, 0.2]), ),
             (Vectors.sparse(2, {
                 0: 0.3,
                 1: 0.4
             }), ), (Vectors.sparse(2, {
                 0: 0.5,
                 1: 0.6
             }), )]
     df = self.sql.createDataFrame(data, ["features"])
     self.assertEqual(df.count(), 3)
     pd = self.converter.toPandas(df)
     self.assertEqual(len(pd), 3)
     self.assertTrue(
         isinstance(pd.features[0], csr_matrix),
         "Expected pd.features[0] to be csr_matrix but found: %s" %
         type(pd.features[0]))
     self.assertEqual(pd.features[0].shape[0], 3)
     self.assertEqual(pd.features[0].shape[1], 2)
     self.assertEqual(pd.features[0][0, 0], 0.1)
     self.assertEqual(pd.features[0][0, 1], 0.2)
Example #34
def load_sparse_data():
    tempDataLocalPath = mlsql.internal_system_param["tempDataLocalPath"]
    # train the model on the new data for a few epochs
    datafiles = [
        file for file in os.listdir(tempDataLocalPath)
        if file.endswith(".json")
    ]
    row_n = []
    col_n = []
    data_n = []
    y = []
    feature_size = 0
    row_index = 0
    for file in datafiles:
        with open(tempDataLocalPath + "/" + file) as f:
            for line in f.readlines():
                obj = json.loads(line)
                fc = obj[featureCol]
                if "size" not in fc and "type" not in fc:
                    feature_size = len(fc)
                    dic = [(i, a) for i, a in enumerate(fc)]
                    sv = SparseVector(len(fc), dic)
                elif "size" not in fc and "type" in fc and fc["type"] == 1:
                    values = fc["values"]
                    feature_size = len(values)
                    dic = [(i, a) for i, a in enumerate(values)]
                    sv = SparseVector(len(values), dic)

                else:
                    feature_size = fc["size"]
                    sv = Vectors.sparse(fc["size"],
                                        list(zip(fc["indices"], fc["values"])))

                for c in sv.indices:
                    row_n.append(row_index)
                    col_n.append(c)
                    data_n.append(sv.values[list(sv.indices).index(c)])

                if type(obj[labelCol]) is list:
                    y.append(np.array(obj[labelCol]).argmax())
                else:
                    y.append(obj[labelCol])
                row_index += 1
                if row_index % 10000 == 0:
                    print("processing lines: %s, values: %s" %
                          (str(row_index), str(len(row_n))))
                    # sys.stdout.flush()
    print("X matrix : %s %s  row_n:%s col_n:%s classNum:%s" %
          (row_index, feature_size, len(row_n), len(col_n), ",".join(
              [str(i) for i in list(set(y))])))
    sys.stdout.flush()
    return sp.csc_matrix((data_n, (row_n, col_n)),
                         shape=(row_index, feature_size)), y
Example #35
def get_sparseVector(x):
    ids=[]
    for j in x:
        if j in cluster.keys():
            ids.append(cluster[j])

    bag_words = {}
    for i in ids:
        bag_words[i]=(float(ids.count(i))/len(ids))
     # Create a SparseVector
    sv = Vectors.sparse(2000, bag_words)
    return sv
Example #36
    def transform(self, document):
        """
        Transforms the input document (list of terms) to a term frequency vector,
        or transforms an RDD of documents to an RDD of term frequency vectors.
        """
        if isinstance(document, RDD):
            return document.map(self.transform)

        freq = {}
        for term in document:
            i = self.indexOf(term)
            freq[i] = freq.get(i, 0) + 1.0
        return Vectors.sparse(self.numFeatures, freq.items())
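A short sketch of using this transform on a single tokenized document, assuming a local PySpark install:

from pyspark.mllib.feature import HashingTF

htf = HashingTF(numFeatures=100)
vec = htf.transform("a a b c".split(" "))  # SparseVector of length 100
# typically 2.0 ("a" occurs twice), barring a hash collision with "b" or "c"
print(vec[htf.indexOf("a")])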
Example #37
 def test_copy(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     ovr1 = ovr.copy({lr.maxIter: 10})
     self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
     self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
     model = ovr.fit(df)
     model1 = model.copy({model.predictionCol: "indexed"})
     self.assertEqual(model1.getPredictionCol(), "indexed")
Example #38
def add(v1, v2):
    assert isinstance(v1, SparseVector) and isinstance(v2, SparseVector)
    assert v1.size == v2.size
    indices = set(v1.indices).union(set(v2.indices))
    v1d = dict(zip(v1.indices, v1.values))
    v2d = dict(zip(v2.indices, v2.values))
    zero = np.float64(0)
    values = {
        i: v1d.get(i, zero) + v2d.get(i, zero)
        for i in indices if v1d.get(i, zero) + v2d.get(i, zero) != zero
    }

    return Vectors.sparse(v1.size, values)
Example #39
    def transform(self, document):
        """
        Transforms the input document (list of terms) to a term frequency vector,
        or transforms an RDD of documents to an RDD of term frequency vectors.
        """
        if isinstance(document, RDD):
            return document.map(self.transform)

        freq = {}
        for term in document:
            i = self.indexOf(term)
            freq[i] = freq.get(i, 0) + 1.0
        return Vectors.sparse(self.numFeatures, freq.items())
Example #40
File: tests.py  Project: A7mech/spark
 def test_copy(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     ovr1 = ovr.copy({lr.maxIter: 10})
     self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
     self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
     model = ovr.fit(df)
     model1 = model.copy({model.predictionCol: "indexed"})
     self.assertEqual(model1.getPredictionCol(), "indexed")
Example #41
def make_dataFrame(my_dict1, my_dict2, most_list):
    fre_keys = list(my_dict1.keys())
    index_list = list()

    for i in range(len(most_list)):
        for j in range(len(fre_keys)):
            if most_list[i] == fre_keys[j]:
                index_list.append(i)
            
    fre_keys2 = list(my_dict2.keys())
    index_list2 = list()

    for i in range(len(most_list)):
        for j in range(len(fre_keys2)):
            if most_list[i] == fre_keys2[j]:
                index_list2.append(i)

    #print index_list
    #print index_list2

    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.linalg import Vectors

    tmp_dict = dict()
    for i in range(len(index_list)):
        tmp_dict[index_list[i]] = my_dict1[most_list[index_list[i]]]

    tmp_dict2 = dict()
    for i in range(len(index_list2)):
        tmp_dict2[index_list2[i]] = my_dict2[most_list[index_list2[i]]]

    p = [
        LabeledPoint(1, Vectors.sparse(20, tmp_dict )),
        LabeledPoint(0, Vectors.sparse(20, tmp_dict2 ))
    ]

    trainDf = spark.createDataFrame(p)
    trainDf.show()
Example #42
 def test_ml_mllib_vector_conversion(self):
     # to ml
     # dense
     mllibDV = Vectors.dense([1, 2, 3])
     mlDV1 = newlinalg.Vectors.dense([1, 2, 3])
     mlDV2 = mllibDV.asML()
     self.assertEqual(mlDV2, mlDV1)
     # sparse
     mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5})
     mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
     mlSV2 = mllibSV.asML()
     self.assertEqual(mlSV2, mlSV1)
     # from ml
     # dense
     mllibDV1 = Vectors.dense([1, 2, 3])
     mlDV = newlinalg.Vectors.dense([1, 2, 3])
     mllibDV2 = Vectors.fromML(mlDV)
     self.assertEqual(mllibDV1, mllibDV2)
     # sparse
     mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})
     mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
     mllibSV2 = Vectors.fromML(mlSV)
     self.assertEqual(mllibSV1, mllibSV2)
Example #43
def convertToSparse(a):

    itemNo=[]
    item=[]
    j=0

    for i in a:
       j=j+1
       if j%2==0:
           item.append(int(i))
       else:
           itemNo.append(int(i)+7)

    return Vectors.sparse(43,itemNo,item)
Example #44
 def _dot1(self, S):
     if self.size != S.numRows():
         raise Exception(
             f"size mismatch ({self.size},) and ({S.numRows()},{S.numCols()})"
         )
     size = self.size
     v = self.rdd.map(lambda entry: (1, entry)).groupByKey().map(lambda x: (
         x[0],
         Vectors.sparse(
             size, *list(zip(*sorted(x[1].data, key=lambda y: y[0]))))))
     a = self._pre_dot(S)
     c = v.cartesian(a).map(lambda x: (x[1][0], float(x[0][1].dot(x[1][1])))
                            ).filter(lambda entry: entry[1] != 0.0)
     return SparseDistributedVector(c, S.numCols())
Example #45
File: PCAUtils.py  Project: caunion/sparkFA
 def sparseVectorTimesMatrixAlloc(sparseVector, matrix):
     matrixCols = matrix.shape[1]
     tupleList = {}
     for col in range(matrixCols):
         indices = sparseVector.indices
         dotRes = 0
         for index in indices:
             value = sparseVector[index]
             dotRes += matrix[index, col] * value
         if ( abs(dotRes) > PCAUtils.zero):
             tupleList[col] = dotRes
     # alloc space for the sparse vector
     sparseRet = Vectors.sparse(matrixCols, tupleList)
     return sparseRet
Example #46
def DFtoIndexedMatrix(df, quantvars, idcol):
    '''
    Convert a numeric dataframe to an IndexedRowMatrix built from sparse vectors.
    Not applicable to a dataframe that already has assembled vectors.
    '''
    # VectorAssembler produces sparse vectors automatically, so the conversion below should be fine
    df = VectorAssembler(
        inputCols=quantvars, outputCol="features"
    ).transform(df).select([idcol, "features"])
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row.features.size, row.features.indices,
                            row.features.values)))
    return IndexedRowMatrix(df)
Example #47
File: tests.py  Project: yoavfreund/spark
 def test_linear_regression_summary(self):
     from pyspark.mllib.linalg import Vectors
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame(
         [(1.0, 2.0, Vectors.dense(1.0)),
          (0.0, 2.0, Vectors.sparse(1, [], []))],
         ["label", "weight", "features"])
     lr = LinearRegression(maxIter=5,
                           regParam=0.0,
                           solver="normal",
                           weightCol="weight",
                           fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.predictionCol, "prediction")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     objHist = s.objectiveHistory
     self.assertTrue(
         isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
     self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
     self.assertAlmostEqual(s.meanSquaredError, 0.0)
     self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
     self.assertAlmostEqual(s.r2, 1.0, 2)
     self.assertTrue(isinstance(s.residuals, DataFrame))
     self.assertEqual(s.numInstances, 2)
     devResiduals = s.devianceResiduals
     self.assertTrue(
         isinstance(devResiduals, list)
         and isinstance(devResiduals[0], float))
     coefStdErr = s.coefficientStandardErrors
     self.assertTrue(
         isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
     tValues = s.tValues
     self.assertTrue(
         isinstance(tValues, list) and isinstance(tValues[0], float))
     pValues = s.pValues
     self.assertTrue(
         isinstance(pValues, list) and isinstance(pValues[0], float))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.explainedVariance,
                            s.explainedVariance)
Example #48
def get_sparse_vectors(documents):
    # get unique word list
    WordList = const_unique_word_list(documents)
    # convert to counter object
    documents_word_counter = documents.map(lambda words: sorted(
        Counter([WordList.index(word) for word in words
                 if word in WordList]).items(),
        key=lambda pair: pair[0],
        reverse=False))
    # convert to sparse vector
    documents_sparse_vectors = documents_word_counter.map(
        lambda counter: Vectors.sparse(len(WordList),
                                       tuple([pair[0] for pair in counter]),
                                       tuple([pair[1] for pair in counter])))
    return documents_sparse_vectors
Example #49
def docTopics(filepath, topicMatrix):
    # for each topic
    # sum probability of words in corpus
    # normalize so that probabilities of topics sum to 1

    n_vcb = topicMatrix.shape[0]
    data = sc.textFile(filepath)
    parsedData = data.map(lambda line: line.strip().split(' ')).map(
        lambda x: (int(x[0]) - 1,
                   (int(x[1]) - 1, float(x[2])))).groupByKey().mapValues(list)
    corpus = parsedData.map(
        lambda x:
        [x[0], normalize(Vectors.sparse(n_vcb, x[1]).dot(topicMatrix))])

    return corpus.collect()
Example #50
    def _dot2(self,v):
        if self.numCols() != v.size:
            raise Exception(f"size mismatch ({self.numRows()},{self.numCols()}) and ({v.size},)")
        size = v.size
        sv = v.rdd.map(lambda entry: (1, entry)).groupByKey().map(
            lambda x: (x[0], Vectors.sparse(size, *list(zip(*sorted(x[1].data, key = lambda y: y[0])))))
        )
        a = self._pre_dot(self, size = size)

        c = sv.cartesian(a).map(
            lambda x: (x[1][0], float(x[0][1].dot(x[1][1])))
        ).filter(
            lambda entry: entry[1] != 0.0
        )
        return sdv.SparseDistributedVector(c, self.numRows())
Example #51
    def test_apply_binary_term_freqs(self):
        sqlContext = SQLContext(self.sc)

        df = sqlContext.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
        n = 100
        hashingTF = HashingTF()
        hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
        output = hashingTF.transform(df)
        features = output.select("features").first().features.toArray()
        expected = Vectors.sparse(n, {(ord("a") % n): 1.0,
                                      (ord("b") % n): 1.0,
                                      (ord("c") % n): 1.0}).toArray()
        for i in range(0, n):
            self.assertAlmostEqual(features[i], expected[i], 14, "Error at " + str(i) +
                                   ": expected " + str(expected[i]) + ", got " + str(features[i]))
Example #52
def add(v1, v2):
    """Add two sparse vectors
    >>> v1 = Vectors.sparse(3, {0: 1.0, 2: 1.0})
    >>> v2 = Vectors.sparse(3, {1: 1.0})
    >>> add(v1, v2)
    SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0})
    """
    assert isinstance(v1, SparseVector) and isinstance(v2, SparseVector)
    assert v1.size == v2.size
    indices = set(v1.indices).union(set(v2.indices))
    v1d = dict(zip(v1.indices, v1.values))
    v2d = dict(zip(v2.indices, v2.values))
    zero = np.float64(0)
    values = {i: v1d.get(i, zero) + v2d.get(i, zero) for i in indices if v1d.get(i, zero) + v2d.get(i, zero) != zero}

    return Vectors.sparse(v1.size, values)
Example #53
def load_sparse_data():
    tempDataLocalPath = mlsql.internal_system_param["tempDataLocalPath"]
    # train the model on the new data for a few epochs
    datafiles = [file for file in os.listdir(tempDataLocalPath) if file.endswith(".json")]
    row_n = []
    col_n = []
    data_n = []
    y = []
    feature_size = 0
    row_index = 0
    for file in datafiles:
        with open(tempDataLocalPath + "/" + file) as f:
            for line in f.readlines():
                obj = json.loads(line)
                fc = obj[featureCol]
                if "size" not in fc and "type" not in fc:
                    feature_size = len(fc)
                    dic = [(i, a) for i, a in enumerate(fc)]
                    sv = SparseVector(len(fc), dic)
                elif "size" not in fc and "type" in fc and fc["type"] == 1:
                    values = fc["values"]
                    feature_size = len(values)
                    dic = [(i, a) for i, a in enumerate(values)]
                    sv = SparseVector(len(values), dic)

                else:
                    feature_size = fc["size"]
                    sv = Vectors.sparse(fc["size"], list(zip(fc["indices"], fc["values"])))

                for c in sv.indices:
                    row_n.append(row_index)
                    col_n.append(c)
                    data_n.append(sv.values[list(sv.indices).index(c)])

                if type(obj[labelCol]) is list:
                    y.append(np.array(obj[labelCol]).argmax())
                else:
                    y.append(obj[labelCol])
                row_index += 1
                if row_index % 10000 == 0:
                    print("processing lines: %s, values: %s" % (str(row_index), str(len(row_n))))
                    # sys.stdout.flush()
    print("X matrix : %s %s  row_n:%s col_n:%s classNum:%s" % (
        row_index, feature_size, len(row_n), len(col_n), ",".join([str(i) for i in list(set(y))])))
    sys.stdout.flush()
    return sp.csc_matrix((data_n, (row_n, col_n)), shape=(row_index, feature_size)), y
Example #54
def get_top_movies_for_user(user_ratings=fake_user_ratings, ratings_rdd=ratingsRDD):
    movies_length = ratings_rdd.map(lambda x: x[1]).max() + 1
    user_ids_with_ratings_rdd = (ratings_rdd
                                 .map(lambda (user_id, movie_id, rating): (user_id, [(movie_id, rating)]))
                                 .reduceByKey(lambda a, b: a + b)
                                 .filter(lambda x: len(x[1]) > 25)
                                 .map(lambda x: (x[0], Vectors.sparse(movies_length, x[1]))))
    user_seen_movies_list = [x[0] for x in user_ratings]
    most_similar_for_user_rdd = create_most_similar_for_user_rdd(user_ratings, user_ids_with_ratings_rdd, movies_length)
    similar_users_and_similarity_rdd = create_similar_users_and_similarity_rdd(most_similar_for_user_rdd)
    top_movies_for_user = (user_ids_with_ratings_rdd
                           .join(similar_users_and_similarity_rdd)
                           .flatMap(lambda x: create_id_rating_tuples(x[1][1], x[1][0]))
                           .filter(lambda x: x[0] not in user_seen_movies_list)
                           .reduceByKey(lambda a, b: max(a, b))
                           .takeOrdered(100, lambda x: -x[1]))
    return top_movies_for_user
Example #55
	def add_hashed_features(self,df,num_hash_buckets = 2 ** 15):

		def hash_function(raw_feats, num_buckets, print_mapping=False):
			"""Calculate a feature dictionary for an observation's features based on hashing.

			Note:
				Use print_mapping=True for debug purposes and to better understand how the hashing works.

			Args:
				raw_feats (list of (int, str)): A list of features for an observation.  Represented as
					(featureID, value) tuples.
				num_buckets (int): Number of buckets to use as features.
				print_mapping (bool, optional): If true, the mappings of featureString to index will be
					printed.
			Returns:
				dict of int to float:  The keys will be integers which represent the buckets that the
					features have been hashed to.  The value for a given key will contain the count of the
					(featureID, value) tuples that have hashed to that key.
			"""
			mapping = {category + ':' + str(ind):
						   int(int(hashlib.md5(category + ':' + str(ind)).hexdigest(), 16) % num_buckets)
					   for ind, category in raw_feats}
			if (print_mapping): print mapping

			def map_update(l, r):
				l[r] += 1.0
				return l

			sparse_features = reduce(map_update, mapping.values(), defaultdict(float))
			return dict(sparse_features)
		"""Return a DataFrame with labels and hashed features.
		Note:
			Make sure to cache the DataFrame that you are returning.

		Args:
			df (DataFrame with 'tuples' column): A DataFrame containing the tuples to be hashed.

		Returns:
			DataFrame: A DataFrame with a 'label' column and a 'features' column that contains a
				SparseVector of hashed features.
		"""
		tuples_to_hash_features_udf = udf(lambda x: Vectors.sparse(num_hash_buckets, hash_function(x, num_hash_buckets)), VectorUDT())

		return df.select(df.label,tuples_to_hash_features_udf(df.features).alias("features")).cache()
Example #56
def change_to_sparse(line):
    keys = []
    values = []
    #logger = logging.getLogger("py4j")
    #logger.setLevel(logging.INFO)
    #logger.addHandler(logging.StreamHandler())
    #logger.info("<><<><><><><><><>")

    temp = line.split("{")[1]
    temp = re.sub('}','',temp)
    for item in temp.split(',')[1:]:
        index = int(item.split(':')[0])
        value = float(item.split(':')[1])
        keys.append(index)
        values.append(value)
    #print keys
    #print values
    #logger.info(max(keys))
    return Vectors.sparse(Vocab_size, sorted(keys),values)
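change_to_sparse appears to parse lines of the form 'id {label,index:value,index:value,...}' (the format is inferred from the parsing above, not documented) and relies on a module-level Vocab_size; a hypothetical line:

Vocab_size = 10  # assumed module-level vocabulary size
line = "doc7 {0,1:2.0,4:3.5}"
print(change_to_sparse(line))  # expected: 2.0 at index 1 and 3.5 at index 4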
Example #57
    def log_multivariate_normal_density_diag_Nd(self, x):
        """
        Compute Gaussian log-density at x for a diagonal model

        """

        n_features = x.size

        if self.isSparse == 1:
            t = Vectors.sparse(x.size, x.indices, x.values**2).dot((1/self.covarBc.value).T)

        else:
            t = np.dot(x**2, (1/self.covarBc.value).T)

        lpr = -0.5 * (n_features*np.log(2*np.pi) + np.sum(np.log(self.covarBc.value), 1) +
                      np.sum((self.meansBc.value ** 2) / self.covarBc.value, 1)
                      - 2 * x.dot((self.meansBc.value/self.covarBc.value).T) + t)

        return lpr
Example #58
File: tests.py  Project: A7mech/spark
 def test_save_load(self):
     temp_path = tempfile.mkdtemp()
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     loadedOvr = OneVsRest.load(ovrPath)
     self.assertEqual(loadedOvr.getFeaturesCol(), ovr.getFeaturesCol())
     self.assertEqual(loadedOvr.getLabelCol(), ovr.getLabelCol())
     self.assertEqual(loadedOvr.getClassifier().uid, ovr.getClassifier().uid)
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     loadedModel = OneVsRestModel.load(modelPath)
     for m, n in zip(model.models, loadedModel.models):
         self.assertEqual(m.uid, n.uid)