Python SparseVector示例，pyspark.mllib.linalg.SparseVector Python示例

示例#1

0

显示文件

文件： click_through_rate_prediction.py 项目： tcoatale/Click_through_rate_prediction

def parseHashPoint(point, numBuckets):
    """Create a LabeledPoint for this observation using hashing.

    Args:
        point (str): A comma separated string where the first value is the label and the rest are
            features.
        numBuckets: The number of buckets to hash to.

    Returns:
        LabeledPoint: A LabeledPoint with a label (0.0 or 1.0) and a SparseVector of hashed
            features.
    """
    # Split once; the first field is the label, the remaining fields are raw features.
    fields = point.split(",")
    label = fields[0]

    # Pair every raw feature with its column position before hashing.
    keyed_features = list(enumerate(fields[1:]))

    hashed = hashFunction(numBuckets, keyed_features, True)
    # Sort the (index, value) pairs together so each value stays attached to its
    # own index; sorting only the keys while taking values() in dict order can
    # assign values to the wrong buckets.
    features = SparseVector(numBuckets, sorted(hashed.items()))

    return LabeledPoint(label, features)

示例#2

0

显示文件

文件： test_linalg.py 项目： drewrobb/spark

 def test_squared_distance(self):
     """Check squared_distance of a dense and a sparse vector against a SciPy column."""
     from scipy.sparse import lil_matrix
     # 4x1 sparse column holding 3 at row 1 and 2 at row 3.
     column = lil_matrix((4, 1))
     column[1, 0] = 3
     column[3, 0] = 2
     dense = DenseVector(array([1., 2., 3., 4.]))
     sparse = SparseVector(4, {0: 1, 1: 2, 2: 3, 3: 4})
     expected = 15.0
     for vector in (dense, sparse):
         self.assertEqual(expected, vector.squared_distance(column))

示例#3

0

显示文件

文件： tests.py 项目： vidur89/spark

 def test_dot(self):
     """Verify dot products of sparse, dense and list-backed vectors with a vector and a 2-D array."""
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
     lst = DenseVector([1, 2, 3, 4])
     # Four identical rows [1, 2, 3, 4].
     mat = array([[1.0, 2.0, 3.0, 4.0],
                  [1.0, 2.0, 3.0, 4.0],
                  [1.0, 2.0, 3.0, 4.0],
                  [1.0, 2.0, 3.0, 4.0]])
     # assertEqual is used instead of the assertEquals alias, which is
     # deprecated and no longer exists in Python 3.12.
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))

示例#4

0

显示文件

文件： test_linalg.py 项目： drewrobb/spark

    def test_norms(self):
        """Check the 1-, 2- and infinity-norms of a dense and a sparse vector."""
        a = DenseVector([0, 2, 3, -1])
        self.assertAlmostEqual(a.norm(2), 3.742, 3)
        # assertTrue(value, expected) only checks that value is truthy and uses
        # the second argument as the failure message, so it never compared the
        # norm with its expected value. assertEqual performs the real check.
        self.assertEqual(a.norm(1), 6)
        self.assertEqual(a.norm(inf), 3)
        a = SparseVector(4, [0, 2], [3, -4])
        self.assertAlmostEqual(a.norm(2), 5)
        self.assertEqual(a.norm(1), 7)
        self.assertEqual(a.norm(inf), 4)

        # An explicitly stored zero must not be counted as a non-zero entry.
        tmp = SparseVector(4, [0, 2], [3, 0])
        self.assertEqual(tmp.numNonzeros(), 1)

示例#5

0

显示文件

文件： learn.py 项目： Sapphirine/LeaguePredictor

	def f(champ):
		"""Return copies of partialVect, each extended with one extra (index, sign) entry.

		One copy is built for every index champ, champ + stride, champ + 2 * stride, ...
		that is below len(partialVect), where stride is max(champions) + 1.
		"""
		stride = max(champions) + 1
		limit = len(partialVect)
		extended_vectors = []

		position = champ
		while position < limit:
			candidate = SparseVector(limit, partialVect.indices, partialVect.values)
			# Append the new entry to the copy's index and value arrays.
			candidate.indices = numpy.append(candidate.indices, [position])
			candidate.values = numpy.append(candidate.values, [sign])
			extended_vectors.append(candidate)
			position += stride

		return extended_vectors

示例#6

0

显示文件

文件： test_linalg.py 项目： drewrobb/spark

 def test_parse_vector(self):
     """Round-trip dense and sparse vectors through str() and the parse helpers."""
     empty_dense = DenseVector([])
     self.assertEqual(str(empty_dense), '[]')
     self.assertEqual(Vectors.parse(str(empty_dense)), empty_dense)

     dense = DenseVector([3, 4, 6, 7])
     self.assertEqual(str(dense), '[3.0,4.0,6.0,7.0]')
     self.assertEqual(Vectors.parse(str(dense)), dense)

     empty_sparse = SparseVector(4, [], [])
     self.assertEqual(str(empty_sparse), '(4,[],[])')
     self.assertEqual(SparseVector.parse(str(empty_sparse)), empty_sparse)

     sparse = SparseVector(4, [0, 2], [3, 4])
     self.assertEqual(str(sparse), '(4,[0,2],[3.0,4.0])')
     self.assertEqual(Vectors.parse(str(sparse)), sparse)

     # Extra whitespace inside the textual form must be accepted by the parser.
     spaced = SparseVector(10, [0, 1], [4, 5])
     self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), spaced)

示例#7

0

显示文件

文件： test_linalg.py 项目： drewrobb/spark

 def test_dot(self):
     """Exercise dot() for sparse, dense and list-backed vectors against several operand types."""
     sparse = SparseVector(4, {1: 1, 3: 2})
     dense = DenseVector(array([1., 2., 3., 4.]))
     from_list = DenseVector([1, 2, 3, 4])
     row = [1., 2., 3., 4.]
     matrix = array([row, row, row, row])
     typed_array = pyarray.array('d', [0, 1, 2, 3])

     # Vector-with-vector and vector-with-matrix products, one operand kind at a time.
     self.assertEqual(10.0, sparse.dot(dense))
     self.assertTrue(array_equal(array([3., 6., 9., 12.]), sparse.dot(matrix)))

     self.assertEqual(30.0, dense.dot(dense))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), dense.dot(matrix)))

     self.assertEqual(30.0, from_list.dot(dense))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), from_list.dot(matrix)))

     # A stdlib typed array is also accepted as the right-hand operand.
     self.assertEqual(7.0, sparse.dot(typed_array))

示例#8

0

显示文件

文件： tests.py 项目： LakeCarrot/EC2_Initializing

 def test_parse_vector(self):
     """Round-trip dense and sparse vectors through str() and the parse helpers."""
     # assertTrue(x, y) treats y as the failure message and passes for any
     # truthy x, so these checks never compared anything. assertEqual does.
     a = DenseVector([3, 4, 6, 7])
     self.assertEqual(str(a), "[3.0,4.0,6.0,7.0]")
     self.assertEqual(Vectors.parse(str(a)), a)
     a = SparseVector(4, [0, 2], [3, 4])
     self.assertEqual(str(a), "(4,[0,2],[3.0,4.0])")
     self.assertEqual(Vectors.parse(str(a)), a)
     # Extra whitespace inside the textual form must be accepted by the parser.
     a = SparseVector(10, [0, 1], [4, 5])
     self.assertEqual(SparseVector.parse(" (10, [0,1 ],[ 4.0,5.0] )"), a)

示例#9

0

显示文件

    from pyspark.mllib.feature import HashingTF, IDF

    hashingTF = HashingTF()
    tf = hashingTF.transform(Positive_Reviews_body)

    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    tfidf_T = tfidf \
        .zipWithIndex() \
        .flatMap(explode) \
        .map(lambda x: (x[1], [x[0], x[2]])) \
        .reduceByKey(lambda x, y: np.vstack([x, y])) \
        .map(lambda x: (x[0], np.array(x[1]).reshape(-1, 2))) \
        .map(lambda x: (x[0], x[1][x[1][:, 0].argsort()])) \
        .map(lambda x: (x[0], SparseVector(num_Postive_Sentence, x[1][:, 0], x[1][:, 1])))

    cosine_similarity = IndexedRowMatrix(tfidf_T).columnSimilarities()

    sim_matrix_full = cosine_similarity.entries \
        .flatMap(lambda x: ((x.j, x.i, x.value), (x.i, x.j, x.value)))

    avg_dist_each = sim_matrix_full \
        .map(lambda x: (x[0], (1, 1-x[2]))) \
        .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) \
        .map(lambda x: (x[0], (x[1][1] + num_Postive_Sentence - x[1][0] - 1)/ (num_Postive_Sentence - 1)))

    avg_dist_each_vector = np.array(avg_dist_each.collect())
    avg_dist_overall = np.mean(avg_dist_each_vector)

    center_index = avg_dist_each_vector[avg_dist_each_vector[:,

示例#10

0

显示文件

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.linalg import SparseVector
from pyspark import SparkContext, SparkConf

# Run Spark with the "local" master under the application name "My App".
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

# Four labelled training points, each carrying a size-2 sparse feature vector.
sparse_data = [
    LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
    LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
    LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
    LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
]

data = sc.parallelize(sparse_data)

# Train a gradient-boosted regressor for 10 iterations. The empty dict is the
# second positional argument (categoricalFeaturesInfo in PySpark's documented
# signature — confirm against the installed version).
model = GradientBoostedTrees.trainRegressor(data, {}, numIterations=10)
# The next calls are bare expressions: their results are discarded when this
# runs as a script and are only shown in an interactive session.
model.numTrees()

model.totalNumNodes()

model.predict(SparseVector(2, {1: 1.0}))

model.predict(SparseVector(2, {0: 1.0}))

# Predict for an RDD of dense feature lists and print the collected results.
rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
print(model.predict(rdd).collect())

# Persist the trained model under the relative path 'model'.
model.save(sc, 'model')

示例#11

0

显示文件

from pyspark.ml.feature import OneHotEncoder

# Expose finalDF2 to Spark SQL as "dfData" and keep only the columns used below.
finalDF2.registerTempTable("dfData")
finalDF2 = spark.sql(
    "SELECT name, Norm_views, Norm_bytes, Norm_inputNum, project_index FROM dfData"
)

# One-hot encode project_index into project_Vec, keeping the last category
# (dropLast=False).
encoder = OneHotEncoder(dropLast=False,
                        inputCol="project_index",
                        outputCol="project_Vec")
encoded = encoder.transform(finalDF2)

# Re-register under the same table name so the next query sees the encoded column.
encoded.registerTempTable("dfData")
finalDF3 = spark.sql(
    "SELECT Norm_views, Norm_bytes, Norm_inputNum, project_Vec FROM dfData")

from pyspark.mllib.linalg import SparseVector
import numpy as np
# The vector size 824 (and the fixed slots 821-823 used for the three numeric
# columns) must be revised according to your one-hot encoding result.
# NOTE(review): this presumably assumes the one-hot indices stay below 821 —
# confirm, since a larger index would collide with the appended slots.
RDD = finalDF3.rdd.map(lambda line: SparseVector(
    824, line["project_Vec"].indices.tolist() + [821, 822, 823], line[
        "project_Vec"].values.tolist() +
    [line["Norm_views"], line["Norm_bytes"], line["Norm_inputNum"]])).cache()

from pyspark.mllib.clustering import KMeans

# Cluster the combined vectors into 2 groups.
# NOTE(review): the `runs` keyword may not be accepted by newer PySpark
# releases — confirm against the version in use.
clusters = KMeans.train(RDD,
                        2,
                        maxIterations=10,
                        runs=10,
                        initializationMode="k-means||")

示例#12

0

显示文件

文件： term_doc.py 项目： luogantt/pyspark-examples

'''

from pyspark.mllib.linalg import SparseVector
from collections import Counter

from pyspark import SparkContext

if __name__ == "__main__":
    # Build a term-document matrix: one SparseVector of term counts per document.

    sc = SparkContext('local', 'term_doc')
    corpus = sc.parallelize([
        "It is the east, and Juliet is the sun.", "A dish fit for the gods.",
        "Brevity is the soul of wit."
    ])

    # Whitespace tokenisation; cached because the RDD is traversed twice below.
    tokens = corpus.map(lambda raw_text: raw_text.split()).cache()
    # Map every distinct token to an integer index and collect that to the driver.
    local_vocab_map = tokens.flatMap(
        lambda token: token).distinct().zipWithIndex().collectAsMap()

    # Broadcast the vocabulary and its size so the lambdas below read them
    # through .value.
    vocab_map = sc.broadcast(local_vocab_map)
    vocab_size = sc.broadcast(len(local_vocab_map))

    # Count tokens per document, re-key the counts by vocabulary index, then
    # wrap each {index: count} dict in a SparseVector of vocabulary size.
    term_document_matrix = tokens \
                         .map(Counter) \
                         .map(lambda counts: {vocab_map.value[token]: float(counts[token]) for token in counts}) \
                         .map(lambda index_counts: SparseVector(vocab_size.value, index_counts))

    for doc in term_document_matrix.collect():
        # The call form is valid on Python 3 and prints the same text on
        # Python 2 for a single argument; the bare `print doc` statement is a
        # syntax error on Python 3.
        print(doc)

示例#13

0

显示文件

# coding=utf-8

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Positive example: label 1.0 with the features given as a plain Python list.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])
# Negative example: label 0.0 with a size-3 sparse vector holding 1.0 at
# index 0 and 3.0 at index 2.
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))

print(pos)
print(neg)

示例#14

0

显示文件

文件： transform_run.py 项目： nokia/kubeflow-pipelines

          for v in values:
            if v in vocab[col]:
              word_indices.append(start_index + vocab[col].index(v))
          for k, v in sorted(six.iteritems(Counter(word_indices))):
            feature_indices.append(k)
            feature_values.append(float(v))
        start_index += len(vocab[col])
      if col == target_col:
        label = vocab[col].index(col_value) if classification else col_value
    return {"label": label, "indices": feature_indices, "values": feature_values}
  
  return process_rows


# Build the per-row transformer once from the column lists, vocabulary and stats.
process_row_fn = make_process_rows_fn(
    classification, args.target, text_columns, category_columns, number_columns, vocab, stats)

# Collect (name, DataFrame) pairs for whichever of the train/eval inputs were given.
dfs = []
if args.train:
  dfTrain = spark.read.schema(schema).csv(args.train)
  dfs.append(("train", dfTrain))
if args.eval:
  dfEval = spark.read.schema(schema).csv(args.eval)
  dfs.append(("eval", dfEval))

# Turn every row into a LabeledPoint over a sparse vector of size feature_size
# and write each dataset in LibSVM format under <args.output>/<name>.
for name, df in dfs:
  rdd = df.rdd.map(process_row_fn).map(
      lambda row: LabeledPoint(row["label"],
                               SparseVector(feature_size, row["indices"], row["values"])))
  MLUtils.saveAsLibSVMFile(rdd, os.path.join(args.output, name))

示例#15

0

显示文件

文件： original.py 项目： lorenmh/coen281-project-old

def to_sparse_vector(t):
    """Wrap *t* in a SparseVector whose length is the module-level ``size``."""
    vector = SparseVector(size, t)
    return vector

示例#16

0

显示文件

文件： main_firstTry_cluster.py 项目： mathieu-lechine/sentiment_analysis_with_spark

def vectorizeUni(tokens):
    """Return a binary SparseVector marking which dictionaryUni entries occur in *tokens*.

    Every token is looked up in ``dictionaryUni``; a token missing from it
    raises KeyError.
    """
    present = {dictionaryUni[token]: 1 for token in tokens}
    return SparseVector(len(dictionaryUni), present)

示例#17

0

显示文件

文件： clusterindex_vec.py 项目： gitofsid/MyBigDataCode

def main():
    """Build per-review cluster histograms and report the regression RMSE.

    Command-line arguments (read from sys.argv):
        1: location of the saved k-means model
        2: location of the saved Word2Vec model
        3: input JSON file containing the reviews

    Reviews before 2014 form the training set and reviews from 2014 the test
    set; the best step size and final RMSE are printed at the end.
    """
    k_input_model = sys.argv[1]  # read kmean model from this location
    w_input_model = sys.argv[2]  # read word2vec model from this location
    input_file = sys.argv[3]  # read input file

    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    # NOTE(review): this compares version strings lexicographically, so a
    # version such as '1.10.0' would be rejected — confirm whether a numeric
    # comparison is wanted.
    assert sc.version >= '1.5.1'

    sqlContext = SQLContext(sc)

    # sbaronia - load both kmean and Word2Vec model
    kmean_model = KMeansModel.load(sc, k_input_model)
    word2vec_model = Word2VecModel.load(sc, w_input_model)

    # sbaronia - select fields from json and make data frame zipped with index
    review = sqlContext.read.json(input_file).select('reviewText', 'overall', 'reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()

    clean_list = clean_words_rdd.collect()

    # sbaronia - collect all words known to the Word2Vec model. A set gives
    # constant-time membership tests in the loop below instead of scanning a
    # list for every word of every review.
    keys = sqlContext.read.parquet(w_input_model + "/data")
    known_words = set(keys.rdd.map(lambda line: line.word).collect())

    # sbaronia - here we create one vector per review, where the vector
    # contains the number of times a cluster is assigned to a word in
    # a review. We make a SparseVector compatible format.
    # The histogram width is fixed at 2000; presumably this matches the number
    # of clusters in the k-means model — TODO confirm.
    num_clusters = 2000
    features = []

    for words in clean_list:
        histogram = [0] * num_clusters
        for word in words:
            if word in known_words:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                # Counts start at 0, so a plain increment covers both the
                # first and every later occurrence.
                histogram[clust] += 1
        features.append((num_clusters, range(num_clusters), histogram))

    # sbaronia - create a normalized SparseVector rdd
    # NOTE(review): each element passed to SparseVector.parse here is a
    # (size, indices, values) tuple, not a string — confirm parse accepts that
    # in the Spark version in use.
    nor = Normalizer(1)
    features_rdd = rdd_zip(sc.parallelize(features) \
                             .map(lambda line: nor.transform(SparseVector.parse(line))) \
                             .cache()).cache()

    # sbaronia - make a dataframe with rating, year and vector per review
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()

    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                                 .drop(features_df.index).cache()

    # sbaronia - create training and testing data based on year
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                            .select('rating', 'feature') \
                            .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                            .coalesce(1) \
                            .cache()

    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
                           .select('rating', 'feature') \
                           .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                           .coalesce(1) \
                           .cache()

    # sbaronia - find best step using validation and run LinearRegressionWithSGD
    # with that step and report final RMSE
    step_best_norm = validation(train_rdd)

    RMSE_norm = regression_and_error(train_rdd, test_rdd, step_best_norm)

    print("Final RMSE(Normalization) = " + str(RMSE_norm) + "  Best Step size = " + str(step_best_norm))

示例#18

0

显示文件

文件： model_sell.py 项目： stubird/onlinePred

# Run Spark with the "local" master and create an SQLContext on top of it.
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf = conf)
sqlc = SQLContext(sc)

# Debug output: prints the StructField class object followed by a marker string.
print(StructField , "go go go")

#df = sqlc.read.csv("hdfs://hadoop1:9000/home/hadoop/test.csv",header=True)

# The same file is loaded twice: once with pandas (only its column names are
# printed) and once as a Spark DataFrame.
sel = pd.read_csv("selldata/selldata")

df = sqlc.read.csv("selldata/selldata",header=True)
print(sel.columns)
print(df)

# Per row: column 9 is the label and columns 2..8 become a fully populated
# size-7 SparseVector (every index 0..6 is present).
tran = df.rdd.map(lambda x:LabeledPoint(list(x)[9], SparseVector(7, [i for i in range(7)],list(x)[2:9])))

# Materialise the RDD on the driver and print how many points were built.
print(len(tran.collect()))


#tran = data.rdd.map(lambda x:LabeledPoint(list(x)[17], SparseVector(16, [i for i in range(16)],list(x)[1:17])))

#model = GradientBoostedTrees.trainRegressor(tran, {}, numIterations=10)

#model.save(sc,"./gbdymodelonlionev1")
#
# a = StructType([
#     StructField("ID",StringType(),False),
#     StructField("cust_type", StringType(), True),
#     StructField("cust_level", IntegerType(), True),
#     StructField("ID",StringType(),False),

示例#19

0

显示文件

    terms = tags.split()

    # filter words that not exist in the vocabulary
    terms = [x for x in list(set(terms)) if x in list(set(vocabulary))]

    indices = list(map(lambda x: vocabulary.index(x), list(set(terms))))
    indices.sort()
    occurrences = list(
        map(lambda x: float(terms.count(vocabulary[x])), indices))

    return [len(vocabulary), indices, occurrences]


# Spark configuration for the prediction job.
conf = SparkConf()
conf.setAppName("NaiveBaye")
conf.set('spark.driver.memory', '6g')
conf.set('spark.executor.memory', '6g')
conf.set('spark.cores.max', 156)

#load tags passed as parameter
tags = sys.argv[1]
# NOTE: this rebinds the name `bow` from the function to its result, so the
# function cannot be called again after this line.
bow = bow(tags)  #bag of words of that tags

sc = SparkContext(conf=conf)  # SparkContext

# Load the previously saved model from the relative path "model".
model = NaiveBayesModel.load(sc, "model")

# bow is [vocabulary size, sorted indices, occurrence counts].
result = model.predict(SparseVector(bow[0], bow[1], bow[2]))

# The call form is valid on Python 3 and prints the same text on Python 2 for
# a single argument; the bare `print ...` statement is a syntax error on
# Python 3.
print(str(classValues[result]))

示例#20

0

显示文件

文件： ps_training_testing_oct_Dec18.py 项目： stevezheng22/predictive_segment

def main():
    """Train and evaluate a feed-forward classifier on data loaded through Spark.

    Reads a tab-separated file through Spark, expands each record's
    ``index:value`` features into a dense row, splits the rows into train and
    test sets with fSplitTrainAndTest, then trains a TensorFlow network with
    normalised hidden activations and dropout for 100 epochs. Every 10th epoch
    it prints cross entropy, accuracy and AUC for both sets.

    Returns:
        int: always 1.
    """
    st_time = time.time()

    train_percentage = 0.67
    conf = (SparkConf().setMaster('local[*]').set(
        'spark.executor.memory',
        '4G').set('spark.driver.memory',
                  '45G').set('spark.driver.maxResultSize', '10G'))
    sc = SparkContext(conf=conf)
    # Disabled alternative loader for a comma-separated training set whose
    # last column is the label. Kept for reference; it never runs.
    if False:
        cid = 0  # representing ps data
        filename = '../ps_data/ps_oct/training_set'

        data = sc.textFile(filename)
        labels_sca = data.map(lambda line: line.split(',')).map(
            lambda y: float(y[len(y) - 1]))
        nbr_samples = data.count()
        l_sca = np.array(labels_sca.take(nbr_samples))
        # Two-column labels: [label, 1 - label].
        l = np.column_stack([np.array(l_sca), 1 - np.array(l_sca)])

        features = data.map(lambda line: line.split(',')).map(
            lambda y: [float(y[i]) for i in range(len(y) - 1)])
        X = np.array(features.take(nbr_samples))
        nbr_feature = len(X[0])
        print('nbr of features: ' + str(nbr_feature))

        data_train, data_test = fSplitTrainAndTest(X, l, l_sca,
                                                   train_percentage)

    # Active loader: tab-separated file whose second field holds the label
    # character followed by space-separated "index:value" features.
    nbr_feature = 600
    filename_test_new = '../ps_data/part-r-01088'
    new_data_test = sc.textFile(filename_test_new)
    nbr_samples_test = new_data_test.count()
    data2 = new_data_test.map(lambda line: line.split('\t')).map(
        lambda x: x[1])
    # The first character of the field is parsed as the label; the feature
    # text starts at offset 2.
    labels = data2.map(lambda x: float(x[0]))
    feature_str = data2.map(lambda x: x[2:])
    t2 = feature_str.map(lambda lines: lines.split(' '))
    # Build a sparse vector from the index:value pairs, then densify it.
    features = t2.map(lambda x: DenseVector(
        SparseVector(nbr_feature,
                     {int(i.split(':')[0]): float(i.split(':')[1])
                      for i in x})))
    l_sca_test = np.array(labels.take(nbr_samples_test))
    # Two-column labels: [label, 1 - label].
    l_test = np.column_stack([np.array(l_sca_test), 1 - np.array(l_sca_test)])
    X_test = np.array(features.take(nbr_samples_test))

    data_train, data_test = fSplitTrainAndTest(X_test, l_test, l_sca_test,
                                               train_percentage)

    n = len(data_train.X)  # total number of training samples
    d = len(data_train.X[0])  # number of features
    ll = len(data_train.labels[0])  # output dimension

    # Create the model
    x = tf.placeholder(tf.float32, [None, d])
    keep_prob = tf.placeholder(tf.float32)

    nbr_of_layers = 2
    nbr_layer1 = 250
    nbr_layer2 = 350
    epsilon = 1e-3  # added to the variance before the square root

    x_drop = tf.nn.dropout(x, keep_prob)  # adding dropout in the input layer
    W1 = weight_variable([d, nbr_layer1])
    b1 = bias_variable([nbr_layer1])
    z1 = tf.matmul(x_drop, W1) + b1
    # Normalise the pre-activations with the batch mean and variance, then
    # apply a learned scale and shift before the ReLU.
    batch_mean1, batch_var1 = tf.nn.moments(z1, [0])
    z1_hat = (z1 - batch_mean1) / tf.sqrt(batch_var1 + epsilon)
    scale1 = tf.Variable(tf.ones([nbr_layer1]))
    beta1 = tf.Variable(tf.zeros([nbr_layer1]))
    h1 = tf.nn.relu(scale1 * z1_hat + beta1)
    h1_drop = tf.nn.dropout(h1, keep_prob)
    if nbr_of_layers == 2:
        # Single hidden layer: project straight to the output logits.
        W2 = weight_variable([nbr_layer1, ll])
        b2 = bias_variable([ll])
        y = tf.matmul(h1_drop, W2) + b2
    else:
        # Second hidden layer, normalised the same way, then the output layer.
        W2 = weight_variable([nbr_layer1, nbr_layer2])
        b2 = bias_variable([nbr_layer2])
        z2 = tf.matmul(h1_drop, W2) + b2
        batch_mean2, batch_var2 = tf.nn.moments(z2, [0])
        z2_hat = (z2 - batch_mean2) / tf.sqrt(batch_var2 + epsilon)
        scale2 = tf.Variable(tf.ones([nbr_layer2]))
        beta2 = tf.Variable(tf.zeros([nbr_layer2]))
        h2 = tf.nn.relu(scale2 * z2_hat + beta2)
        h2_drop = tf.nn.dropout(h2, keep_prob)

        W3 = weight_variable([nbr_layer2, ll])
        b3 = bias_variable([ll])

        y = tf.matmul(h2_drop, W3) + b3

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, ll])

    tf.summary.histogram('W1', W1)
    tf.summary.histogram('W2', W2)
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    starter_learning_rate = 0.01
    global_step = tf.Variable(0, trainable=False)
    # Learning rate is multiplied by 0.95 every 5000 steps (staircase decay).
    learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                               global_step,
                                               decay_steps=5000,
                                               decay_rate=0.95,
                                               staircase=True,
                                               name=None)

    train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        cross_entropy, global_step=global_step)
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    # NOTE(review): the predicted class is passed as the first argument and
    # the true class as the second — confirm this matches the (labels,
    # predictions) order tf.metrics.auc expects.
    auc_ftrain = tf.metrics.auc(tf.cast(tf.argmax(y, 1), tf.float32),
                                tf.cast(tf.argmax(y_, 1), tf.float32))
    auc_ftest = tf.metrics.auc(tf.cast(tf.argmax(y, 1), tf.float32),
                               tf.cast(tf.argmax(y_, 1), tf.float32))
    softmaxed_logits = tf.nn.softmax(y)

    # The AUC metrics create local variables, which need their own
    # initialiser. One run is enough; the former second call through the
    # deprecated tf.initialize_local_variables() repeated the same work.
    tf.local_variables_initializer().run()
    tf.summary.scalar('cross_entropy', cross_entropy)
    tf.summary.scalar('accuracy', accuracy)
    tf.summary.scalar('auc_ftrain', auc_ftrain[0])
    tf.summary.scalar('auc_ftest', auc_ftest[0])

    train_writer = tf.summary.FileWriter("/tmp/histogram_example/train",
                                         sess.graph)
    test_writer = tf.summary.FileWriter("/tmp/histogram_example/test")

    summaries = tf.summary.merge_all()

    batch_size = 100

    for i in range(100):

        # train the whole epoch (first shuffle the data)
        idx = np.arange(0, n)
        np.random.shuffle(idx)
        X_shuffle = [data_train.X[k] for k in idx]
        labels_shuffle = [data_train.labels[k] for k in idx]

        for j in range(n // batch_size):
            # Slice ends are exclusive, so [start:stop] already yields exactly
            # batch_size rows; the former "- 1" silently dropped the last
            # sample of every batch.
            start = j * batch_size
            stop = start + batch_size
            batch_xs = X_shuffle[start:stop]
            batch_ys = labels_shuffle[start:stop]
            sess.run(train_step,
                     feed_dict={
                         x: batch_xs,
                         y_: batch_ys,
                         keep_prob: 0.5
                     })

        # Every 10th epoch, evaluate on the full train and test sets with
        # dropout disabled (keep_prob 1.0). Value equality is used here; an
        # identity test on an int literal only works by implementation accident.
        if i % 10 == 0:
            print(i)

            soft_logits_train, summary_train, ca_train_i, ac_train_i, auc_train_i = sess.run(
                [
                    softmaxed_logits, summaries, cross_entropy, accuracy,
                    auc_ftrain
                ],
                feed_dict={
                    x: data_train.X,
                    y_: data_train.labels,
                    keep_prob: 1.0
                })

            soft_logits_test, summary_test, ca_test_i, ac_test_i, auc_test_i = sess.run(
                [
                    softmaxed_logits, summaries, cross_entropy, accuracy,
                    auc_ftest
                ],
                feed_dict={
                    x: data_test.X,
                    y_: data_test.labels,
                    keep_prob: 1.0
                })

            # Cross-check the TensorFlow AUC with scikit-learn on the softmax outputs.
            sk_auc_train = metrics.roc_auc_score(
                y_true=np.array(data_train.labels),
                y_score=np.array(soft_logits_train))
            sk_auc_test = metrics.roc_auc_score(
                y_true=np.array(data_test.labels),
                y_score=np.array(soft_logits_test))
            print('learning rate: ' + str(sess.run(learning_rate)))

            print('train cross entropy: ' + str(ca_train_i))
            print('test cross entropy: ' + str(ca_test_i))

            print('train accuracy: ' + str(ac_train_i))
            print('test accuracy: ' + str(ac_test_i))

            print('train auc: ' + str(auc_train_i[0]))
            print('test auc: ' + str(auc_test_i[0]))

            print('train sk auc: ' + str(sk_auc_train))
            print('test sk auc: ' + str(sk_auc_test))

    sess.close()
    sc.stop()

    end_time = time.time()
    print('run time: ' + str(round(end_time - st_time)) + ' seconds')
    print('tensorboard --logdir=/tmp/histogram_example')
    return 1

示例#21

0

显示文件

import numpy as np
from scipy.sparse import csr_matrix
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# A 3x3 SciPy CSR matrix; it is not used by the statements below.
M = csr_matrix([[4, 1, 0], [4, 0, 3], [0, 0, 1]])
label = 0.0
# Labelled point over a size-3 sparse vector: 1.0 at index 0 and 3.0 at index 2.
point = LabeledPoint(label, SparseVector(3, [0, 2], [1.0, 3.0]))

# `sc` must already be a live SparkContext; it is not created in this snippet.
textRDD = sc.textFile("README.md")
# The call form is valid on Python 3 and prints the same text on Python 2 for
# a single argument; the bare `print ...` statement is a syntax error on
# Python 3.
print(textRDD.count())

示例#22

0

显示文件

文件： randomforest.py 项目： gitofsid/MyBigDataCode

def to_labeledpoint(line):
    """Parse a ``label :: vector`` text line into a LabeledPoint.

    The line is split on ' :: '; the first piece is the label and the second
    is the textual SparseVector handed to SparseVector.parse.
    """
    pieces = line.split(' :: ')
    label_text = pieces[0]
    features = SparseVector.parse(pieces[1])
    return LabeledPoint(label_text, features)

示例#23

0

显示文件

 def change_labelPoint(rdd, total_feas):
     """Build a LabeledPoint from a record with 'label', 'pos' and 'val' entries.

     'pos' and 'val' become the indices and values of a SparseVector of size
     total_feas.
     """
     label = rdd['label']
     features = SparseVector(total_feas, rdd['pos'], rdd['val'])
     return LabeledPoint(label, features)

示例#24

0

显示文件

文件： main_firstTry_cluster.py 项目： mathieu-lechine/sentiment_analysis_with_spark

def vectorizeBi(tokens):
    """Return a binary SparseVector marking which dictionaryBigrams entries occur in *tokens*.

    Every token is looked up in ``dictionaryBigrams``; a token missing from it
    raises KeyError.
    """
    present = {dictionaryBigrams[token]: 1 for token in tokens}
    return SparseVector(len(dictionaryBigrams), present)

示例#25

0

显示文件

文件： Ex3a.0.py 项目： wel51x/Machine_Learning_and_Spark

# Index the string column 'type' into the numeric column 'type_idx'.
indexer = StringIndexer(inputCol='type', outputCol='type_idx')

# Assign index values to strings
indexer = indexer.fit(cars)
# Create column with index values
cars = indexer.transform(cars)

# Widen the pandas display so sampled rows print without column truncation.
pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)
#print(cars.toPandas().sample(12))

# Check column data types
print('\n', cars.dtypes, '\n')

# Narrow to the columns needed to inspect the indexing result.
kars = cars.select('name', 'type', 'type_idx')

# Print 12 randomly sampled rows via pandas.
print(kars.toPandas().sample(12))

# One-hot encode the index column into 'type_dummy', then show the distinct
# (type, type_idx, type_dummy) combinations ordered by index.
onehot = OneHotEncoderEstimator(inputCols=['type_idx'],
                                outputCols=['type_dummy'])
onehot = onehot.fit(kars)
kars = onehot.transform(kars)
kars.select('type', 'type_idx',
            'type_dummy').distinct().sort('type_idx').show()

# The same 8-element vector written densely and sparsely (non-zeros at 0 and 5).
print("DenseVector:", DenseVector([1, 0, 0, 0, 0, 7, 0, 0]))
print("SparseVector:", SparseVector(8, {0: 1.0, 5: 7.0}))

spark.stop()

示例#26

0

显示文件

文件： KMeans_collaborative.py 项目： Sandy4321/Big-Data

def vectorize(ratings, numMovies):
    """Group rating triples by their first field into one SparseVector per key.

    Each record is re-keyed as (record[0], (record[1], record[2])); the grouped
    (index, value) pairs of every key are wrapped in a SparseVector of size
    numMovies.
    """
    def to_keyed_pair(record):
        return (record[0], (record[1], record[2]))

    def to_sparse(pairs):
        return SparseVector(numMovies, pairs)

    keyed = ratings.map(to_keyed_pair)
    return keyed.groupByKey().mapValues(to_sparse)

示例#27

0

显示文件

 def test_serialize(self):
     """Run the shared serialization check on several dense inputs and one sparse vector."""
     # Dense vector built from a range.
     from_range = DenseVector(range(10))
     self._test_serialize(from_range)
     # Dense vector built from a NumPy array.
     from_numpy = DenseVector(array([1., 2., 3., 4.]))
     self._test_serialize(from_numpy)
     # Dense vector built from a stdlib typed array of doubles.
     from_typed_array = DenseVector(pyarray.array('d', range(10)))
     self._test_serialize(from_typed_array)
     # Sparse vector built from an {index: value} dict.
     sparse = SparseVector(4, {1: 1, 3: 2})
     self._test_serialize(sparse)

示例#28

0

显示文件

dv1
#array([ 2.,  0.,  5.])

# Sparse vector uses integer indices and double values.

# The declared size must be larger than the biggest index used. Index 3
# needs a size of at least 4; a size of 2 cannot hold it.
sv1 = Vectors.sparse(4, [0, 3], [5.0, 1.0])
sv1
#SparseVector(4, {0: 5.0, 3: 1.0})

# Labeled Point:  This can be dense or Sparse vector with a label used in supervised learning.
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Labeled point with a positive label and a dense feature vector
lp_pos = LabeledPoint(1.0, [4.0, 0.0, 2.0])
lp_pos
# LabeledPoint(1.0, [4.0,0.0,2.0])

# Labeled point with a negative label and a sparse feature vector
lp_neg = LabeledPoint(0.0, SparseVector(5, [1, 2], [3.0, 5.0]))
lp_neg
#LabeledPoint(0.0, (5,[1,2],[3.0,5.0]))

# Local Matrix:  This is a matrix with integer type indices and double type values.  This is also stored on single machine.
from pyspark.mllib.linalg import Matrix, Matrices

# Dense matrix ((1.0, 2.0, 3.0), (4.0, 5.0, 6.0))
dMatrix = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])
# Sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
# Arguments after the shape are column pointers, row indices and values.
sMatrix = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

示例#29

0

显示文件

    LabeledPoint(6.0, [3.0, 4.0])
]  # 训练集
# Fit a linear regression with SGD on the training data, starting from weights [1.0, 1.0].
lrm = LinearRegressionWithSGD.train(sc.parallelize(data),
                                    iterations=100,
                                    initialWeights=np.array([1.0, 1.0]))
print(lrm.predict(np.array([2.0, 1.0])))  # predict with the trained regression model

import os, tempfile
from pyspark.mllib.regression import LinearRegressionModel
from pyspark.mllib.linalg import SparseVector

path = tempfile.mkdtemp()
lrm.save(sc, path)  # save the model to external storage
sameModel = LinearRegressionModel.load(sc, path)  # load the model back
print(sameModel.predict(SparseVector(2, {
    0: 100.0,
    1: 150
})))  # a sparse vector as input returns a single predicted value
# Build a 100 x 100 grid of two-feature sparse vectors to predict in bulk.
test_set = []
for i in range(100):
    for j in range(100):
        test_set.append(SparseVector(2, {0: i, 1: j}))
print(sameModel.predict(sc.parallelize(test_set)).collect())  # predicting many values returns an RDD
print(sameModel.weights)  # the model's weights

# -----------------岭回归------------------

from pyspark.mllib.regression import RidgeRegressionWithSGD

data = [
    LabeledPoint(1.0, [1.0, 1.0]),
    LabeledPoint(4.0, [1.0, 3.0]),

示例#30

0

显示文件

sampleOHEDictManual[(2, 'salmon')] = 6

# COMMAND ----------

# MAGIC %md #### ** (1b) Sparse vectors **
# MAGIC #### Data points can typically be represented with a small number of non-zero OHE features relative to the total number of features that occur in the dataset.  By leveraging this sparsity and using sparse vector representations of OHE data, we can reduce storage and computational burdens.  Below are a few sample vectors represented as dense numpy arrays.

# COMMAND ----------

import numpy as np
from pyspark.mllib.linalg import SparseVector

# COMMAND ----------

# Each dense array below is paired with the SparseVector that encodes the same
# values: SparseVector(size, indices_of_stored_entries, stored_values).
aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(4, [1, 3], [3., 4.])

bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(4, [3], [1.])

# The dense and sparse representations should give the same dot product with w.
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the statement form is a syntax error on Python 3.
w = np.array([0.4, 3.1, -1.4, -.5])
print(aDense.dot(w))
print(aSparse.dot(w))
print(bDense.dot(w))
print(bSparse.dot(w))

# COMMAND ----------

# MAGIC %md #### **(1c) OHE features as sparse vectors** Any feature that occurs in a point should have the value 1.0.  For example, the `DenseVector` for a point with features 2 and 4 would be `[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0]`.

# COMMAND ----------

示例#31

0

显示文件

def vectorizeBi(row,dico):
    """Encode the bigrams of ``row`` as a binary sparse vector.

    Args:
        row: Object exposing ``bigrams`` (an iterable) and ``label``.
        dico: Mapping from bigram to its index in the vector.

    Returns:
        tuple: ``(row.label, SparseVector)`` where the vector has size
        ``len(dico)`` and holds 1 at the index of every bigram of the row
        that appears in ``dico``; bigrams missing from ``dico`` are ignored.
    """
    active = {dico[bigram]: 1 for bigram in row.bigrams if bigram in dico}
    return (row.label, SparseVector(len(dico), active))

示例#32

0

显示文件

文件： topicExtractionSpark.py 项目： dineshdharme/SparkTestProject

        # print (Vectors.sparse(row[0][0],row[0][1],row[0][2]) )
        print(row)
        break

    exit()
    """

    # zipping the word with its probability distribution for that topic
    termsRDD = topicsRDD.map(lambda topic:
                             (zip(itemgetter(*topic[0])(vocablist), topic[1])))

    zippedRDD = topicsRDD.map(lambda topic: (zip(topic[0], topic[1])))

    # for Every topic, sparse vector of distribution over words
    docCalcs = zippedRDD.map(lambda topic: DenseVector(
        (SparseVector(vocabSize, topic)).toArray()))

    #schema = StructType([StructField("topicwordDistribution", Vector(), False)])

    #df = sqlContext.applySchema(docCalcs, schema)

    #docCalcs = docCalcs.map(lambda l: Row(l))

    docCalcs = docCalcs.zipWithIndex()

    #docCalcs = docCalcs.collect()

    docCalcs = sqlContext.createDataFrame(docCalcs,
                                          ['topicwordDistribution', 'topicId'])

    #print(type(docCalcs))

示例#33

0

显示文件

def get_training_vector(classification, term_list, classifications,
                        number_of_terms):
    """Build a binary-labelled training example from a term list.

    Args:
        classification: Class of the current example.
        term_list: Sparse entries handed unchanged to ``SparseVector``.
        classifications: Container of classes that count as positive.
        number_of_terms: Size of the resulting sparse feature vector.

    Returns:
        LabeledPoint: label 1 when ``classification`` is in
        ``classifications``, otherwise 0, with the sparse features.
    """
    if classification in classifications:
        label = 1
    else:
        label = 0
    features = SparseVector(number_of_terms, term_list)
    return LabeledPoint(label, features)

示例#34

0

显示文件

# A SciPy csc_matrix with exactly one column can stand in for a sparse vector.
# The constructor takes the CSC triple (data, row indices, column pointers).
sv2 = sps.csc_matrix(
    (
        np.array([1.0, 3.0]),  # stored values
        np.array([0, 2]),      # row index of each stored value
        np.array([0, 2]),      # column 0 covers stored entries [0, 2)
    ),
    shape=(3, 1),
)





# Example 11-3: labeled points with dense and sparse feature vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Create a labeled point with a positive label and a dense feature vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])

# Create a labeled point with a negative label and a sparse feature vector.
# SparseVector arguments: size 3, stored indices [0, 2], matching values [1.0, 3.0].
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))






# Example 11-5: two ways of creating the same dense vector
from numpy import array
from pyspark.mllib.linalg import Vectors

# Create the dense vector <1.0, 2.0, 3.0>
denseVec1 = array([1.0, 2.0, 3.0]) # NumPy arrays can be passed directly to MLlib
denseVec2 = Vectors.dense([1.0, 2.0, 3.0]) # .. or you can use the Vectors class

# Create the sparse vector <1.0, 0.0, 2.0, 0.0>; the methods for this take only

示例#35

0

显示文件

文件： test_linalg.py 项目： zyyjalyt/spark

 def test_sparse_vector_iteration(self):
     # Iterating a SparseVector must yield every position, with 0.0 for
     # positions that have no stored entry.
     no_entries = SparseVector(3, [], [])
     self.assertListEqual(list(no_entries), [0.0, 0.0, 0.0])

     two_entries = SparseVector(5, [0, 3], [1.0, 2.0])
     self.assertListEqual(list(two_entries), [1.0, 0.0, 0.0, 2.0, 0.0])

示例#36

0

显示文件

    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4])

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print('model accuracy {}'.format(accuracy))

    # Save and load model
    output_dir = 'output/'
    shutil.rmtree(output_dir, ignore_errors=True)
    model.save(sc, output_dir)
    sameModel = NaiveBayesModel.load(sc, output_dir)
    predictionAndLabel = test.map(lambda p:
                                  (sameModel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print('sameModel accuracy {}'.format(accuracy))

    from pyspark.mllib.linalg import SparseVector
    testsparsevector = SparseVector(692, [5, 6], [5.0, 6.0])

    print(sameModel.predict(testsparsevector))

    # $example off$
    sc.stop()

示例#37

0

显示文件

文件： ClickThroughPrediction.py 项目： samkujovich/SparkExperience

# Verify two individual dictionary entries against their expected hashes.
Test.assertEqualsHashed(
    sampleOHEDictManual[(2, 'mouse')],
    'ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4',
    "incorrect value for sampleOHEDictManual[(2,'mouse')]")
Test.assertEqualsHashed(
    sampleOHEDictManual[(2, 'salmon')],
    'c1dfd96eea8cc2b62785275bca38ac261256e278',
    "incorrect value for sampleOHEDictManual[(2,'salmon')]")
# The manual dictionary must hold exactly seven (featureID, value) keys.
Test.assertEquals(
    len(sampleOHEDictManual.keys()),
    7,
    'incorrect number of keys in sampleOHEDictManual')


# ** Sparse vectors **
import numpy as np
from pyspark.mllib.linalg import SparseVector

# A sparse vector should list only its non-zero entries as (index, value)
# pairs; storing explicit zeros defeats the purpose of the representation.
aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(4, [(1, 3.), (3, 4.)])

bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(4, [(3, 1.)])

# The dense and sparse representations should give the same dot product with w.
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the statement form is a syntax error on Python 3.
w = np.array([0.4, 3.1, -1.4, -.5])
print(aDense.dot(w))
print(aSparse.dot(w))
print(bDense.dot(w))
print(bSparse.dot(w))


# TEST Sparse Vectors
# Each failure message names the variable that is actually being checked.
Test.assertTrue(isinstance(aSparse, SparseVector), 'aSparse needs to be an instance of SparseVector')
Test.assertTrue(isinstance(bSparse, SparseVector), 'bSparse needs to be an instance of SparseVector')
Test.assertTrue(aDense.dot(w) == aSparse.dot(w),

示例#38

0

显示文件

文件： test_linalg.py 项目： MitchellTesla/spark

class VectorUDTTests(MLlibTestCase):
    """Tests for VectorUDT round-tripping and for building distributed
    matrices from DataFrames and RDDs."""

    # Shared fixtures: dense and sparse vectors, with and without stored
    # values, plus the UDT instance under test.
    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        # The UDT must survive a jsonValue() -> fromJson() round trip.
        restored = VectorUDT.fromJson(self.udt.jsonValue())
        self.assertEqual(restored, self.udt)

    def test_serialization(self):
        # serialize() followed by deserialize() must give back an equal vector.
        for vec in (self.dv0, self.dv1, self.sv0, self.sv1):
            encoded = self.udt.serialize(vec)
            self.assertEqual(vec, self.udt.deserialize(encoded))

    def test_infer_schema(self):
        # A DataFrame built from LabeledPoints must type "features" as VectorUDT
        # and hand back the original vectors.
        points = [LabeledPoint(1.0, self.dv1),
                  LabeledPoint(0.0, self.sv1)]
        frame = self.sc.parallelize(points).toDF()
        features_field = [fld for fld in frame.schema.fields
                          if fld.name == "features"][0]
        self.assertEqual(features_field.dataType, self.udt)

        collected = frame.rdd.map(lambda point: point.features).collect()
        self.assertEqual(len(collected), 2)
        for vec in collected:
            if isinstance(vec, SparseVector):
                expected = self.sv1
            elif isinstance(vec, DenseVector):
                expected = self.dv1
            else:
                raise TypeError("expecting a vector but got %r of type %r" %
                                (vec, type(vec)))
            self.assertEqual(vec, expected)

    def test_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException
        frame = self.spark.createDataFrame([Row(Vectors.dense(1))])
        mat = RowMatrix(frame)
        self.assertEqual(mat.numRows(), 1)
        self.assertEqual(mat.numCols(), 1)
        # A column that does not hold vectors must be rejected.
        with self.assertRaises(IllegalArgumentException):
            RowMatrix(frame.selectExpr("'monkey'"))

    def test_indexed_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException
        frame = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
        indexed = IndexedRowMatrix(frame)
        self.assertEqual(indexed.numRows(), 1)
        self.assertEqual(indexed.numCols(), 1)
        # Without the "_1" column the frame must be rejected.
        with self.assertRaises(IllegalArgumentException):
            IndexedRowMatrix(frame.drop("_1"))

    def test_row_matrix_invalid_type(self):
        # multiply() must raise TypeError for a non-matrix argument, for both
        # RowMatrix and IndexedRowMatrix.
        rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
        invalid_type = ""
        matrix = RowMatrix(rows)
        with self.assertRaises(TypeError):
            matrix.multiply(invalid_type)

        irows = self.sc.parallelize(
            [IndexedRow(0, [1, 2, 3]),
             IndexedRow(1, [4, 5, 6])])
        imatrix = IndexedRowMatrix(irows)
        with self.assertRaises(TypeError):
            imatrix.multiply(invalid_type)

示例#39

0

显示文件

文件： tf_idf_amazon_linearreg.py 项目： gitofsid/MyBigDataCode

def normalized_labeledpoint(line,nor):
	"""Turn one ``label :: vector`` text line into a normalised LabeledPoint.

	Args:
		line: Text whose fields are separated by ``' :: '``; the first field
			is used as the label and the second is parsed with
			``SparseVector.parse``.
		nor: Object whose ``transform`` method is applied to the parsed vector.

	Returns:
		LabeledPoint: the label field paired with the transformed vector.
	"""
	fields = line.split(' :: ')
	label = fields[0]
	scaled = nor.transform(SparseVector.parse(fields[1]))
	return LabeledPoint(label, scaled)

示例#40

0

显示文件

def parseVector(line):
    """Parse one tab-separated record into a SparseVector of size TOTAL_DOCS.

    Args:
        line (str): Record with exactly two tab-separated fields. The first
            field is ignored; the second is the Python literal of the sparse
            entries (presumably a list of (index, value) tuples -- confirm
            against the producer of this data).

    Returns:
        SparseVector: Vector of size ``TOTAL_DOCS`` built from the entries.

    Raises:
        ValueError: If the line does not have exactly two fields, or the
            second field is not a plain Python literal.
        SyntaxError: If the second field is not valid Python syntax.
    """
    import ast  # local import so the function is self-contained when shipped to workers

    _, indices_tuple_ls = line.split('\t')
    # SECURITY: the records come from external data, so they are parsed with
    # ast.literal_eval, which only accepts literals (lists, tuples, numbers,
    # strings, ...).  eval() would execute arbitrary code embedded in a record.
    indices_tuple_ls = ast.literal_eval(indices_tuple_ls)
    return SparseVector(TOTAL_DOCS, indices_tuple_ls)

示例#41

0

显示文件

# Created by Raju Kumar Mishra
# Book PySpark Recipes
# Chapter 9
# Recipe 9-2.  Create a Sparse Vector.
# Run following PySpark code lines, line by line in PySpark shell

from pyspark.mllib.linalg import SparseVector
# Values to store; every position not listed in the index list reads as 0.0.
sparseDataList = [1.0, 3.2]
# Size-8 sparse vector holding the values above at indices 0 and 7.
sparseDataVector = SparseVector(8, [0, 7], sparseDataList)
sparseDataVector  # bare expression: echoes the vector in the interactive shell
sparseDataVector[1]  # index 1 has no stored entry
sparseDataVector[7]  # index 7 holds the second stored value (3.2)
sparseDataVector.numNonzeros()  # count of non-zero entries
# Second size-8 sparse vector, with four stored values.
sparseDataList1 = [3.0, 1.4, 2.5, 1.2]
sparseDataVector1 = SparseVector(8, [0, 3, 4, 6], sparseDataList1)
# Squared distance between the two sparse vectors.
squaredDistance = sparseDataVector.squared_distance(sparseDataVector1)
squaredDistance  # echo the result in the shell

示例#42

0

显示文件

文件： cooccur_predict.py 项目： zsy2504/Break12306Captcha

def add_sparse_vector(vec1, vec2):
    """Add two vectors element-wise and return the sum as a SparseVector.

    Both inputs are expanded with ``toArray()``, summed, and the sum is
    handed to ``get_sparse_index`` (presumably yielding the indices and
    values of the entries to keep -- confirm against that helper). The
    result always has size ``NUMBER_OF_CATEGORY``.
    """
    dense_sum = vec1.toArray() + vec2.toArray()
    indices, values = get_sparse_index(dense_sum)
    return SparseVector(NUMBER_OF_CATEGORY, indices, values)