def test_parse_vector(self):
    """Round-trip dense and sparse vectors through str() and parse()."""
    # Empty dense vector.
    empty_dense = DenseVector([])
    self.assertEqual(str(empty_dense), '[]')
    self.assertEqual(Vectors.parse(str(empty_dense)), empty_dense)
    # Non-empty dense vector.
    dense = DenseVector([3, 4, 6, 7])
    self.assertEqual(str(dense), '[3.0,4.0,6.0,7.0]')
    self.assertEqual(Vectors.parse(str(dense)), dense)
    # Empty sparse vector.
    empty_sparse = SparseVector(4, [], [])
    self.assertEqual(str(empty_sparse), '(4,[],[])')
    self.assertEqual(SparseVector.parse(str(empty_sparse)), empty_sparse)
    # Non-empty sparse vector.
    sparse = SparseVector(4, [0, 2], [3, 4])
    self.assertEqual(str(sparse), '(4,[0,2],[3.0,4.0])')
    self.assertEqual(Vectors.parse(str(sparse)), sparse)
    # Parsing tolerates embedded whitespace.
    padded = SparseVector(10, [0, 1], [4, 5])
    self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), padded)
def test_parse_vector(self):
    """Round-trip vectors through str()/parse().

    Fixed: the original used self.assertTrue(x, y), whose second argument
    is the failure *message*, not a comparand — it only checks that x is
    truthy, so these comparisons could never fail.  Replaced each with
    self.assertEqual so the round-trip is actually verified.
    """
    a = DenseVector([3, 4, 6, 7])
    self.assertEqual(str(a), "[3.0,4.0,6.0,7.0]")
    self.assertEqual(Vectors.parse(str(a)), a)
    a = SparseVector(4, [0, 2], [3, 4])
    self.assertEqual(str(a), "(4,[0,2],[3.0,4.0])")
    self.assertEqual(Vectors.parse(str(a)), a)
    # Parsing tolerates embedded whitespace.
    a = SparseVector(10, [0, 1], [4, 5])
    self.assertEqual(SparseVector.parse(" (10, [0,1 ],[ 4.0,5.0] )"), a)
def cosineSimilarity(a, b):
    """Return the cosine similarity of sparse vectors `a` and `b`.

    Returns 0 when either vector has zero magnitude (avoids division by
    zero).  `a` is round-tripped through str()/parse before the dot
    product, as in the original implementation.
    """
    a_magnitude = math.sqrt(float(sum(v ** 2 for v in a.values)))
    b_magnitude = math.sqrt(float(sum(v ** 2 for v in b.values)))
    # Re-parse `a` from its string form before taking the dot product.
    reparsed_a = SparseVector.parse(str(a))
    numerator = reparsed_a.dot(b)
    denominator = a_magnitude * b_magnitude
    if denominator == 0:
        return 0
    return numerator / denominator
def cosine_pre_process(line):
    """Yield ordered id pairs whose vectors have cosine similarity > 0.60.

    `line` is a (key, matches) pair where `matches` is a sequence of
    (doc_id, sparse_vector_string) tuples — assumed from the indexing
    below; TODO confirm against callers.

    For every pair i < j of matches whose parsed vectors exceed the 0.60
    cosine-similarity threshold, yields (smaller_id, larger_id).

    Fixed: the original built two throwaway SparseVector(1, [0], [1])
    instances solely to call the static `parse` through them, initialized
    unused loop counters, and emulated `xrange(i + 1, n)` with a manual
    while loop.  The sequence of yielded pairs is unchanged.
    """
    matches = line[1]
    n = len(matches)
    # Compare every unordered pair (i, j) with i < j.
    for i in xrange(n - 1):
        for j in xrange(i + 1, n):
            # `parse` is a static factory on SparseVector.
            sf = SparseVector.parse(matches[i][1])
            ss = SparseVector.parse(matches[j][1])
            dotp = sf.dot(ss)
            # Product of the two vector magnitudes (cosine denominator).
            # NOTE(review): a zero-magnitude vector still divides by zero
            # here, exactly as before — confirm inputs are never all-zero.
            rss = np.sqrt(sum(np.square(sf.values))) * np.sqrt(
                sum(np.square(ss.values)))
            if dotp / rss > .60:
                # Emit the pair with the smaller id first.
                if matches[i][0] < matches[j][0]:
                    yield matches[i][0], matches[j][0]
                else:
                    yield matches[j][0], matches[i][0]
def _parse_to_libsvm(self, param):
    """Convert a libsvm-style feature string ('i:v i:v ...') to a SparseVector.

    Indices in `param` are 1-based and are shifted to 0-based.  The declared
    vector size is twice the number of features, as in the original.
    """
    fields = param.split(' ')
    size = str(len(fields) * 2)
    indices = []
    values = []
    for field in fields:
        parts = field.split(':')
        indices.append(str(int(parts[0]) - 1))  # shift to 0-based
        values.append(parts[1])
    # Build the "(size, [indices],[values])" form SparseVector.parse expects.
    parsed_str = '(' + size + ', [' + ','.join(indices) + '],[' + ','.join(values) + '])'
    return SparseVector.parse(parsed_str)
def main():
    """Driver: load pre-trained KMeans and Word2Vec models, build one
    normalized cluster-histogram vector per review, then train and evaluate
    LinearRegressionWithSGD, printing the final RMSE and best step size.
    """
    # Command-line arguments: model locations and the input JSON file.
    k_input_model = sys.argv[1] #read kmean model from this location
    w_input_model = sys.argv[2] #read word2vec model from this location
    input_file = sys.argv[3] #read input file
    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    # Fail fast on older runtimes.  NOTE(review): this is a lexicographic
    # string comparison (e.g. '1.10.0' < '1.5.1') — confirm acceptable.
    assert sc.version >= '1.5.1'
    sqlContext = SQLContext(sc)
    '''sbaronia - load both kmean and Word2Vec model'''
    kmean_model = KMeansModel.load(sc,k_input_model)
    word2vec_model = Word2VecModel.load(sc,w_input_model)
    '''sbaronia - select fields from json and make data frame zipped with index'''
    review = sqlContext.read.json(input_file).select('reviewText','overall','reviewTime').cache()
    # Drop reviews with empty text.
    review_df = review.filter(review.reviewText != "").cache()
    # rdd_zip (defined elsewhere in this file) presumably pairs each element
    # with its index — confirm against its definition.
    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()
    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()
    # Tokenized/cleaned words per review, materialized on the driver.
    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
    clean_list = clean_words_rdd.collect()
    '''sbaronia - make a list of all words in our model'''
    keys = sqlContext.read.parquet(w_input_model+"/data")
    keys_list = keys.rdd.map(lambda line: line.word).collect()
    '''sbaronia - here we create one vector per review, where vector contains the number of times a cluster is assinged to a word in a review. 
 We make a SparseVector compatible format'''
    features = []
    for i in range(len(clean_list)):
        # One 2000-bin histogram per review: counts of cluster assignments.
        # NOTE(review): `word in keys_list` is an O(n) list scan per word;
        # a set would be much faster — left unchanged here.
        histogram = [0] * 2000
        for word in clean_list[i]:
            if word in keys_list:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                # Both branches amount to incrementing the bin by one.
                if histogram[clust] > 0:
                    histogram[clust] = histogram[clust] + 1
                else:
                    histogram[clust] = 1
        # (size, indices, values) triple in SparseVector-compatible shape.
        features.append((2000,range(2000),histogram))
    '''sbaronia - create a normalized SparseVector rdd'''
    # L1 normalization of each histogram vector.
    nor = Normalizer(1)
    # NOTE(review): SparseVector.parse is handed the tuple directly —
    # confirm parse accepts non-string input (it may stringify internally).
    features_rdd = rdd_zip(sc.parallelize(features) \
                    .map(lambda line: nor.transform(SparseVector.parse(line))) \
                    .cache()).cache()
    '''sbaronia - make a dataframe with rating, year and vector per review'''
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()
    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                      .drop(features_df.index).cache()
    '''sbaronia - create training and testing data based on year'''
    # Train on pre-2014 reviews, test on 2014 reviews.
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                .select('rating','feature') \
                .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                .coalesce(1) \
                .cache()
    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
               .select('rating','feature') \
               .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
               .coalesce(1) \
               .cache()
    '''sbaronia - find best step using validation and run LinearRegressionWithSGD with that step and report final RMSE'''
    step_best_norm = validation(train_rdd)
    RMSE_norm = regression_and_error(train_rdd,test_rdd,step_best_norm)
    print("Final RMSE(Normalization) = " + str(RMSE_norm) + " Best Step size = " + str(step_best_norm))
def to_labeledpoint(line):
    """Parse a 'label :: sparse-vector' text line into a LabeledPoint."""
    fields = line.split(' :: ')
    # fields[0] is the label, fields[1] the serialized sparse vector.
    return LabeledPoint(fields[0], SparseVector.parse(fields[1]))
def normalized_labeledpoint(line, nor):
    """Parse a 'label :: sparse-vector' line into a LabeledPoint whose
    features are transformed by the normalizer `nor`."""
    fields = line.split(' :: ')
    normalized = nor.transform(SparseVector.parse(fields[1]))
    return LabeledPoint(fields[0], normalized)
# combined RDD is of [(4, ((2, 2.0), 3.4))] form normalizedRatingRDD = combinedRDD.map(lambda (x, y): (x, (y[0][0], y[0][1] - y[1]))) sparseRatingRDD = normalizedRatingRDD.groupByKey().map( lambda (x, y): (x, Vectors.sparse(numUsers, y))) ## Step 5 - Perform Recommendation for i in range(0, 10): ## 1) select a random movie, and associated rating vector randomMovieTuple = sparseRatingRDD.takeSample( False, 1)[0] # tuple ( movieID, sparseVector ) randMovieId = randomMovieTuple[0] # extract movieId randMovieVector = SparseVector.parse(str( randomMovieTuple[1])) # SparseVector associated with that movie # From this movie vector, we will randomly select a userId and set their rating to zero. # The idea would be to try to predict that rating and see how close we come to the actual value predVecValues = randMovieVector.values predVecIndices = randMovieVector.indices index = random.randint(0, len(predVecValues) - 1) predVecValues[index] = 0 # set rating to zero randUserId = predVecIndices[index] ## 2) compute cosine simularity with "randMovieVector" and each vector in RDD # result is a RDD of (cosSimValue, movieId) result = sparseRatingRDD.map(lambda v : (v[0], cosineSimilarity(v[1], randMovieVector)))\ .map(lambda x: (x[1], x[0]))\ .sortByKey(ascending=False)