Example #1

from pyspark.mllib.linalg import SparseVector
from collections import Counter

from pyspark import SparkContext

if __name__ == "__main__":

    sc = SparkContext('local', 'term_doc')
    corpus = sc.parallelize([
        "It is the east, and Juliet is the sun.", "A dish fit for the gods.",
        "Brevity is the soul of wit."
    ])

    tokens = corpus.map(lambda raw_text: raw_text.split()).cache()
    local_vocab_map = tokens.flatMap(
        lambda token: token).distinct().zipWithIndex().collectAsMap()

    vocab_map = sc.broadcast(local_vocab_map)
    vocab_size = sc.broadcast(len(local_vocab_map))

    term_document_matrix = tokens \
                         .map(Counter) \
                         .map(lambda counts: {vocab_map.value[token]: float(counts[token]) for token in counts}) \
                         .map(lambda index_counts: SparseVector(vocab_size.value, index_counts))

    for doc in term_document_matrix.collect():
        print(doc)
# filter out those records that don't have similar items
item_item_matrix_haveRatings = item_item_matrix.filter(lambda x: ',' in x)
item_item_matrix = (item_item_matrix_haveRatings.map(
    lambda x: parser_iiMatrix_and_compute_SparseMatrix(
        x, broadcast_Mapping_item.value, broadcast_Mapping_user.value)))


def sparseAdd(sv1, sv2, length):
    from pyspark.mllib.linalg import Vectors
    combinedV = sv1.toArray() + sv2.toArray()
    nonzeroes = combinedV.nonzero()[0]
    return Vectors.sparse(length, nonzeroes, combinedV[nonzeroes])


# Test sparseAdd function
o = SparseVector(2241, [771, 806, 1209, 1574], [1.0, 1.0, 1.0, 1.0])
k = SparseVector(2241, [305, 1253, 1254], [1.0, 1.0, 1.0])
Test.assertEquals(SparseVector(2241, [305, 1253, 1254], [2.0, 2.0, 2.0]),
                  sparseAdd(k, k, k.size), 'sparseAdd function malfunc')
Test.assertEquals(
    SparseVector(2241, [771, 806, 1209, 1574], [2.0, 2.0, 2.0, 2.0]),
    sparseAdd(o, o, o.size), 'sparseAdd function malfunc')
Test.assertEquals(
    SparseVector(2241, [305, 771, 806, 1209, 1253, 1254, 1574],
                 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]), sparseAdd(o, k, o.size),
    'sparseAdd function malfunc')


def matrixMultiplication(sv, item_item_matrix):
    indices = sv.indices
    #     return item_item_matrix.filter(lambda (x,y):x in indices).map(lambda (x,y):y).reduce(lambda x,y:sparseAdd(x,y,y.size))
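    # The commented-out line above uses Python 2 tuple-unpacking lambdas, which are a
    # syntax error in Python 3. A sketch of the equivalent Python 3 body (the snippet is
    # truncated here, so the exact intended behaviour is an assumption):
    return item_item_matrix \
        .filter(lambda kv: kv[0] in indices) \
        .map(lambda kv: kv[1]) \
        .reduce(lambda a, b: sparseAdd(a, b, b.size))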
Example #3
          for v in values:
            if v in vocab[col]:
              word_indices.append(start_index + vocab[col].index(v))
          for k, v in sorted(six.iteritems(Counter(word_indices))):
            feature_indices.append(k)
            feature_values.append(float(v))
        start_index += len(vocab[col])
      if col == target_col:
        label = vocab[col].index(col_value) if classification else col_value
    return {"label": label, "indices": feature_indices, "values": feature_values}
  
  return process_rows


process_row_fn = make_process_rows_fn(
    classification, args.target, text_columns, category_columns, number_columns, vocab, stats)

dfs = []
if args.train:
  dfTrain = spark.read.schema(schema).csv(args.train)
  dfs.append(("train", dfTrain))
if args.eval:
  dfEval = spark.read.schema(schema).csv(args.eval)
  dfs.append(("eval", dfEval))

for name, df in dfs:
  rdd = df.rdd.map(process_row_fn).map(
      lambda row: LabeledPoint(row["label"],
                               SparseVector(feature_size, row["indices"], row["values"])))
  MLUtils.saveAsLibSVMFile(rdd, os.path.join(args.output, name))
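
# A minimal read-back check (not part of the original script), assuming the same `spark`
# session, `args`, and imports as above; MLUtils.loadLibSVMFile returns an RDD of
# LabeledPoint with SparseVector features.
if args.train:
  check = MLUtils.loadLibSVMFile(spark.sparkContext, os.path.join(args.output, "train"))
  print(check.first())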
Example #4
# coding=utf-8

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))

print(pos)
print(neg)
Example #5
class VectorUDTTests(MLlibTestCase):

    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for v in [self.dv0, self.dv1, self.sv0, self.sv1]:
            self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize(
            [LabeledPoint(1.0, self.dv1),
             LabeledPoint(0.0, self.sv1)])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" %
                                (v, type(v)))

    def test_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException
        df = self.spark.createDataFrame([Row(Vectors.dense(1))])
        row_matrix = RowMatrix(df)
        self.assertEqual(row_matrix.numRows(), 1)
        self.assertEqual(row_matrix.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            RowMatrix(df.selectExpr("'monkey'"))

    def test_indexed_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException
        df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
        matrix = IndexedRowMatrix(df)
        self.assertEqual(matrix.numRows(), 1)
        self.assertEqual(matrix.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            IndexedRowMatrix(df.drop("_1"))

    def test_row_matrix_invalid_type(self):
        rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
        invalid_type = ""
        matrix = RowMatrix(rows)
        self.assertRaises(TypeError, matrix.multiply, invalid_type)

        irows = self.sc.parallelize(
            [IndexedRow(0, [1, 2, 3]),
             IndexedRow(1, [4, 5, 6])])
        imatrix = IndexedRowMatrix(irows)
        self.assertRaises(TypeError, imatrix.multiply, invalid_type)
indexer = StringIndexer(inputCol='type', outputCol='type_idx')

# Assign index values to strings
indexer = indexer.fit(cars)
# Create column with index values
cars = indexer.transform(cars)

pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)
#print(cars.toPandas().sample(12))

# Check column data types
print('\n', cars.dtypes, '\n')

kars = cars.select('name', 'type', 'type_idx')

print(kars.toPandas().sample(12))

onehot = OneHotEncoderEstimator(inputCols=['type_idx'],
                                outputCols=['type_dummy'])
onehot = onehot.fit(kars)
kars = onehot.transform(kars)
kars.select('type', 'type_idx',
            'type_dummy').distinct().sort('type_idx').show()

print("DenseVector:", DenseVector([1, 0, 0, 0, 0, 7, 0, 0]))
print("SparseVector:", SparseVector(8, {0: 1.0, 5: 7.0}))

spark.stop()
Example #7
import numpy as np
from scipy.sparse import csr_matrix
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

M = csr_matrix([[4, 1, 0], [4, 0, 3], [0, 0, 1]])
label = 0.0
point = LabeledPoint(label, SparseVector(3, [0, 2], [1.0, 3.0]))
 
textRDD = sc.textFile("README.md")
print(textRDD.count())
Example #8
# Created by Raju Kumar Mishra
# Book PySpark Recipes
# Chapter 9
# Recipe 9-2.  Create a Sparse Vector.
# Run the following PySpark code lines, one by one, in the PySpark shell

from pyspark.mllib.linalg import SparseVector
sparseDataList = [1.0, 3.2]
sparseDataVector = SparseVector(8, [0, 7], sparseDataList)
sparseDataVector
sparseDataVector[1]
sparseDataVector[7]
sparseDataVector.numNonzeros()
sparseDataList1 = [3.0, 1.4, 2.5, 1.2]
sparseDataVector1 = SparseVector(8, [0, 3, 4, 6], sparseDataList1)
squaredDistance = sparseDataVector.squared_distance(sparseDataVector1)
squaredDistance
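# Expected results for the lines above (not part of the original recipe; float values are approximate):
#   sparseDataVector[1]            -> 0.0  (index 1 is not stored, so it reads as zero)
#   sparseDataVector[7]            -> 3.2
#   sparseDataVector.numNonzeros() -> 2
#   squaredDistance                -> (1.0-3.0)**2 + 1.4**2 + 2.5**2 + 1.2**2 + 3.2**2 ≈ 23.89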
def add_sparse_vector(vec1, vec2):
    t = vec1.toArray() + vec2.toArray()
    idx, val = get_sparse_index(t)
    return SparseVector(NUMBER_OF_CATEGORY, idx, val)
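
# get_sparse_index and NUMBER_OF_CATEGORY are not defined in this snippet; a minimal
# sketch of such a helper, assuming it should return the nonzero indices and values of a
# NumPy array (an assumption, not the original implementation):
def get_sparse_index(arr):
    import numpy as np
    arr = np.asarray(arr)
    idx = arr.nonzero()[0]   # positions of the nonzero entries
    return idx, arr[idx]     # their indices and corresponding values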
Example #10
def to_sparse_vector(t):
    return SparseVector(size, t)
        # print (Vectors.sparse(row[0][0],row[0][1],row[0][2]) )
        print(row)
        break

    exit()
    """

    # zipping the word with its probability distribution for that topic
    termsRDD = topicsRDD.map(lambda topic:
                             (zip(itemgetter(*topic[0])(vocablist), topic[1])))

    zippedRDD = topicsRDD.map(lambda topic: (zip(topic[0], topic[1])))

    # for Every topic, sparse vector of distribution over words
    docCalcs = zippedRDD.map(lambda topic: DenseVector(
        (SparseVector(vocabSize, topic)).toArray()))

    #schema = StructType([StructField("topicwordDistribution", Vector(), False)])

    #df = sqlContext.applySchema(docCalcs, schema)

    #docCalcs = docCalcs.map(lambda l: Row(l))

    docCalcs = docCalcs.zipWithIndex()

    #docCalcs = docCalcs.collect()

    docCalcs = sqlContext.createDataFrame(docCalcs,
                                          ['topicwordDistribution', 'topicId'])

    #print(type(docCalcs))
Example #12
    from pyspark.mllib.feature import HashingTF, IDF

    hashingTF = HashingTF()
    tf = hashingTF.transform(Positive_Reviews_body)

    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    tfidf_T = tfidf \
        .zipWithIndex() \
        .flatMap(explode) \
        .map(lambda x: (x[1], [x[0], x[2]])) \
        .reduceByKey(lambda x, y: np.vstack([x, y])) \
        .map(lambda x: (x[0], np.array(x[1]).reshape(-1, 2))) \
        .map(lambda x: (x[0], x[1][x[1][:, 0].argsort()])) \
        .map(lambda x: (x[0], SparseVector(num_Postive_Sentence, x[1][:, 0], x[1][:, 1])))

    cosine_similarity = IndexedRowMatrix(tfidf_T).columnSimilarities()

    sim_matrix_full = cosine_similarity.entries \
        .flatMap(lambda x: ((x.j, x.i, x.value), (x.i, x.j, x.value)))

    avg_dist_each = sim_matrix_full \
        .map(lambda x: (x[0], (1, 1-x[2]))) \
        .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) \
        .map(lambda x: (x[0], (x[1][1] + num_Postive_Sentence - x[1][0] - 1)/ (num_Postive_Sentence - 1)))

    avg_dist_each_vector = np.array(avg_dist_each.collect())
    avg_dist_overall = np.mean(avg_dist_each_vector)

    center_index = avg_dist_each_vector[avg_dist_each_vector[:,
Example #13
 def change_labelPoint(rdd, total_feas):
     label_point = LabeledPoint(
         rdd['label'], SparseVector(total_feas, rdd['pos'], rdd['val']))
     return label_point
Example #14
sampleOHEDictManual[(2, 'salmon')] = 6

# COMMAND ----------

# MAGIC %md #### ** (1b) Sparse vectors **
# MAGIC #### Data points can typically be represented with a small number of non-zero OHE features relative to the total number of features that occur in the dataset.  By leveraging this sparsity and using sparse vector representations of OHE data, we can reduce storage and computational burdens.  Below are a few sample vectors represented as dense numpy arrays.

# COMMAND ----------

import numpy as np
from pyspark.mllib.linalg import SparseVector

# COMMAND ----------

aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(4, [1, 3], [3., 4.])

bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(4, [3], [1.])

w = np.array([0.4, 3.1, -1.4, -.5])
print(aDense.dot(w))
print(aSparse.dot(w))
print(bDense.dot(w))
print(bSparse.dot(w))
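# Expected values (not in the original lab): each dense/sparse pair agrees,
# a.dot(w) = 3.*3.1 + 4.*(-.5) ≈ 7.3 and b.dot(w) = 1.*(-.5) = -0.5.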

# COMMAND ----------

# MAGIC %md #### **(1c) OHE features as sparse vectors **
# MAGIC #### Any feature that occurs in a point should have the value 1.0.  For example, the `DenseVector` for a point with features 2 and 4 would be `[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0]`.

# COMMAND ----------
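# A minimal sketch (not part of the original lab) of the sparse OHE encoding described in
# (1c): a hypothetical point with features 2 and 4 in a 7-dimensional feature space.
from pyspark.mllib.linalg import SparseVector
sampleOHESparse = SparseVector(7, [2, 4], [1.0, 1.0])  # same vector as the dense [0., 0., 1., 0., 1., 0., 0.]

# COMMAND ----------
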
def main():
    st_time = time.time()

    train_percentage = 0.67
    conf = (SparkConf().setMaster('local[*]').set(
        'spark.executor.memory',
        '4G').set('spark.driver.memory',
                  '45G').set('spark.driver.maxResultSize', '10G'))
    sc = SparkContext(conf=conf)
    if False:
        cid = 0  # representing ps data
        # filename = 'ps_train.svm'
        # sc = SparkContext("local", "Simple App")
        # filename = 'hdfs://jetblue-nn1.blue.ygrid.yahoo.com:8020/projects/predseg/models/2017-09-29/ps.51/training_set'
        filename = '../ps_data/ps_oct/training_set'
        # sc = SparkContext(conf=SparkConf().setAppName("ps_spark_grid")
        # conf = (SparkConf().set('spark.yarn.executor.memoryOverhead', '4096').set('spark.kryoserializer.buffer.max.mb', '2047').set('spark.driver.maxResultSize','2g'))

        data = sc.textFile(filename)
        # labels_sca = data.map(lambda x: int(x[0])) # int type
        labels_sca = data.map(lambda line: line.split(',')).map(
            lambda y: float(y[len(y) - 1]))
        nbr_samples = data.count()
        # nbr_samples = 10000
        l_sca = np.array(labels_sca.take(nbr_samples))
        #l, _ = fOnehot_encode(labels_sca.take(nbr_samples))
        l = np.column_stack([np.array(l_sca), 1 - np.array(l_sca)])

        # features = data.map(lambda x: x.split(' ')).map(lambda y: [int(y[i][-1]) for i in range(902)])
        features = data.map(lambda line: line.split(',')).map(
            lambda y: [float(y[i]) for i in range(len(y) - 1)])
        X = np.array(features.take(nbr_samples))
        nbr_feature = len(X[0])
        print('nbr of features: ' + str(nbr_feature))

        # data_train, _ = fSplitTrainAndTest(X, l, l_sca, train_percentage)
        data_train, data_test = fSplitTrainAndTest(X, l, l_sca,
                                                   train_percentage)

    ##### uncomment this block if trying another testing set
    nbr_feature = 600
    # filename_test_new = 'hdfs://jetblue-nn1.blue.ygrid.yahoo.com:8020/projects/predseg/xg/test_data/2017-09-20/ps.51/part-r-01088'
    filename_test_new = '../ps_data/part-r-01088'
    new_data_test = sc.textFile(filename_test_new)
    nbr_samples_test = new_data_test.count()
    # nbr_samples_test = 10000
    data2 = new_data_test.map(lambda line: line.split('\t')).map(
        lambda x: x[1])
    labels = data2.map(lambda x: float(x[0]))
    feature_str = data2.map(lambda x: x[2:])
    t2 = feature_str.map(lambda lines: lines.split(' '))
    features = t2.map(lambda x: DenseVector(
        SparseVector(nbr_feature,
                     {int(i.split(':')[0]): float(i.split(':')[1])
                      for i in x})))
    l_sca_test = np.array(labels.take(nbr_samples_test))
    l_test = np.column_stack([np.array(l_sca_test), 1 - np.array(l_sca_test)])
    X_test = np.array(features.take(nbr_samples_test))
    # data_test = Data(X_test, l_test, l_sca_test)

    data_train, data_test = fSplitTrainAndTest(X_test, l_test, l_sca_test,
                                               train_percentage)
    # # ####

    # data_train = Data(X, l, l_sca)
    n = len(data_train.X)  # total number of training samples
    d = len(data_train.X[0])  # number of features
    ll = len(data_train.labels[0])  #output dimension
    # print (n)
    # print (d)
    # print (ll)

    # Create the model
    x = tf.placeholder(tf.float32, [None, d])
    keep_prob = tf.placeholder(tf.float32)

    # if False:
    # 	y = deepnn(x, d, ll)
    # else:
    # y = deepnn_withBN(x, d, ll, 3, keep_prob)
    nbr_of_layers = 2
    nbr_layer1 = 250
    nbr_layer2 = 350
    epsilon = 1e-3

    x_drop = tf.nn.dropout(x, keep_prob)  # adding dropout in the input layer
    # x_drop = x # no dropout on input layer
    W1 = weight_variable([d, nbr_layer1])
    b1 = bias_variable([nbr_layer1])
    z1 = tf.matmul(x_drop, W1) + b1
    batch_mean1, batch_var1 = tf.nn.moments(z1, [0])
    z1_hat = (z1 - batch_mean1) / tf.sqrt(batch_var1 + epsilon)
    scale1 = tf.Variable(tf.ones([nbr_layer1]))
    beta1 = tf.Variable(tf.zeros([nbr_layer1]))
    #b1 = bias_variable([nbr_layer1])
    h1 = tf.nn.relu(scale1 * z1_hat + beta1)
    h1_drop = tf.nn.dropout(h1, keep_prob)
    if nbr_of_layers == 2:

        W2 = weight_variable([nbr_layer1, ll])
        b2 = bias_variable([ll])
        y = tf.matmul(h1_drop, W2) + b2
    #h1 = tf.nn.sigmoid(scale1*z1_hat + beta1)
    else:
        W2 = weight_variable([nbr_layer1, nbr_layer2])
        b2 = bias_variable([nbr_layer2])
        z2 = tf.matmul(h1_drop, W2) + b2
        batch_mean2, batch_var2 = tf.nn.moments(z2, [0])
        z2_hat = (z2 - batch_mean2) / tf.sqrt(batch_var2 + epsilon)
        scale2 = tf.Variable(tf.ones([nbr_layer2]))
        beta2 = tf.Variable(tf.zeros([nbr_layer2]))
        h2 = tf.nn.relu(scale2 * z2_hat + beta2)
        h2_drop = tf.nn.dropout(h2, keep_prob)
        #h2 = tf.nn.sigmoid(scale2*z2_hat + beta2)

        W3 = weight_variable([nbr_layer2, ll])
        b3 = bias_variable([ll])

        y = tf.matmul(h2_drop, W3) + b3

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, ll])

    tf.summary.histogram('W1', W1)
    tf.summary.histogram('W2', W2)
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    starter_learning_rate = 0.01
    global_step = tf.Variable(0, trainable=False)
    # train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
    learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                               global_step,
                                               decay_steps=5000,
                                               decay_rate=0.95,
                                               staircase=True,
                                               name=None)
    # train_step = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(cross_entropy, global_step = global_step)

    train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        cross_entropy, global_step=global_step)
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    auc_ftrain = tf.metrics.auc(tf.cast(tf.argmax(y, 1), tf.float32),
                                tf.cast(tf.argmax(y_, 1), tf.float32))
    auc_ftest = tf.metrics.auc(tf.cast(tf.argmax(y, 1), tf.float32),
                               tf.cast(tf.argmax(y_, 1), tf.float32))
    softmaxed_logits = tf.nn.softmax(y)

    tf.local_variables_initializer().run()
    sess.run(tf.initialize_local_variables())
    tf.summary.scalar('cross_entropy', cross_entropy)
    tf.summary.scalar('accuracy', accuracy)
    tf.summary.scalar('auc_ftrain', auc_ftrain[0])
    tf.summary.scalar('auc_ftest', auc_ftest[0])

    train_writer = tf.summary.FileWriter("/tmp/histogram_example/train",
                                         sess.graph)
    test_writer = tf.summary.FileWriter("/tmp/histogram_example/test")

    # writer = tf.summary.FileWriter("/tmp/histogram_example")
    summaries = tf.summary.merge_all()
    # save
    st = np.array([])

    ac_train = np.array([])
    ca_train = np.array([])
    auc_train = np.array([])

    ac_test = np.array([])
    ca_test = np.array([])
    auc_test = np.array([])

    batch_size = 100

    for i in range(100):

        # train the whole epoch (first shuffle the data)
        idx = np.arange(0, n)
        np.random.shuffle(idx)
        X_shuffle = [data_train.X[k] for k in idx]
        labels_shuffle = [data_train.labels[k] for k in idx]

        for j in range(int(n / batch_size)):
            batch_xs = X_shuffle[j * batch_size:(j + 1) * batch_size - 1]
            batch_ys = labels_shuffle[j * batch_size:(j + 1) * batch_size - 1]
            sess.run(train_step,
                     feed_dict={
                         x: batch_xs,
                         y_: batch_ys,
                         keep_prob: 0.5
                     })

        # finish training, try on testing data
        if i % 10 == 0:
            print(i)

            soft_logits_train, summary_train, ca_train_i, ac_train_i, auc_train_i = sess.run(
                [
                    softmaxed_logits, summaries, cross_entropy, accuracy,
                    auc_ftrain
                ],
                feed_dict={
                    x: data_train.X,
                    y_: data_train.labels,
                    keep_prob: 1.0
                })

            soft_logits_test, summary_test, ca_test_i, ac_test_i, auc_test_i = sess.run(
                [
                    softmaxed_logits, summaries, cross_entropy, accuracy,
                    auc_ftest
                ],
                feed_dict={
                    x: data_test.X,
                    y_: data_test.labels,
                    keep_prob: 1.0
                })
            # [ca_test_i, ac_test_i,auc_test_i] = [0, 0, [0, 0]]
            #train_writer.add_summary(summary_train, i)
            #test_writer.add_summary(summary_test, i)

            # print (soft_logits_train)
            # print (data_train.labels)
            sk_auc_train = metrics.roc_auc_score(
                y_true=np.array(data_train.labels),
                y_score=np.array(soft_logits_train))
            sk_auc_test = metrics.roc_auc_score(
                y_true=np.array(data_test.labels),
                y_score=np.array(soft_logits_test))
            print('learning rate: ' + str(sess.run(learning_rate)))

            print('train cross entropy: ' + str(ca_train_i))
            print('test cross entropy: ' + str(ca_test_i))

            print('train accuracy: ' + str(ac_train_i))
            print('test accuracy: ' + str(ac_test_i))

            print('train auc: ' + str(auc_train_i[0]))
            print('test auc: ' + str(auc_test_i[0]))

            print('train sk auc: ' + str(sk_auc_train))
            print('test sk auc: ' + str(sk_auc_test))
            # print ('train auc sk' + str(auc_sk_train))
            # print ('test auc sk' + str(auc_sk_test))

        # ca_test, ac_test, auc_test = sess.run([cross_entropy, accuracy, auc], feed_dict={x: data_test.X, y_: data_test.labels, keep_prob: 1.0})
        # print ('test cross entropy: ' + str(ca_test))
        # print ('test accuracy: ' + str(ac_test))
        # print ('test auc: '+ str(auc_test[0]))
    sess.close()
    sc.stop()

    end_time = time.time()
    print('run time: ' + str(round(end_time - st_time)) + ' seconds')
    print('tensorboard --logdir=/tmp/histogram_example')
    return 1
Example #16
# Use a single-column SciPy csc_matrix as a sparse vector.
sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape=(3, 1))
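# MLlib's Python API also accepts a single-column scipy.sparse matrix like sv2 wherever a
# vector is expected, alongside NumPy arrays, Python lists, and MLlib's own Vector types.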





# Example 11-3
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Create a labeled point with a positive label and a dense feature vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])

# Create a labeled point with a negative label and a sparse feature vector.
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))






# Example 11-5
from numpy import array
from pyspark.mllib.linalg import Vectors

# Create the dense vector <1.0, 2.0, 3.0> 
denseVec1 = array([1.0, 2.0, 3.0]) # NumPy arrays can be passed directly to MLlib
denseVec2 = Vectors.dense([1.0, 2.0, 3.0]) # .. or you can use the Vectors class

# Create the sparse vector <1.0, 0.0, 2.0, 0.0>; the methods for this take only
Example #17
    terms = tags.split()

    # filter out words that do not exist in the vocabulary
    terms = [x for x in list(set(terms)) if x in list(set(vocabulary))]

    indices = list(map(lambda x: vocabulary.index(x), list(set(terms))))
    indices.sort()
    occurrences = list(
        map(lambda x: float(terms.count(vocabulary[x])), indices))

    return [len(vocabulary), indices, occurrences]


conf = SparkConf()
conf.setAppName("NaiveBaye")
conf.set('spark.driver.memory', '6g')
conf.set('spark.executor.memory', '6g')
conf.set('spark.cores.max', 156)

#load tags passed as parameter
tags = sys.argv[1]
bow = bow(tags)  #bag of words of that tags

sc = SparkContext(conf=conf)  # SparkContext

model = NaiveBayesModel.load(sc, "model")

result = model.predict(SparseVector(bow[0], bow[1], bow[2]))

print(str(classValues[result]))
Example #18
    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4])

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda xv: xv[0] == xv[1]).count() / test.count()
    print('model accuracy {}'.format(accuracy))

    # Save and load model
    output_dir = 'output/'
    shutil.rmtree(output_dir, ignore_errors=True)
    model.save(sc, output_dir)
    sameModel = NaiveBayesModel.load(sc, output_dir)
    predictionAndLabel = test.map(lambda p:
                                  (sameModel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda xv: xv[0] == xv[1]).count() / test.count()
    print('sameModel accuracy {}'.format(accuracy))

    from pyspark.mllib.linalg import SparseVector
    testsparsevector = SparseVector(692, [5, 6], [5.0, 6.0])

    print(sameModel.predict(testsparsevector))

    # $example off$
    sc.stop()
Example #19
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf = conf)
sqlc = SQLContext(sc)

print(StructField , "go go go")

#df = sqlc.read.csv("hdfs://hadoop1:9000/home/hadoop/test.csv",header=True)

sel = pd.read_csv("selldata/selldata")

df = sqlc.read.csv("selldata/selldata",header=True)
print(sel.columns)
print(df)

tran = df.rdd.map(lambda x:LabeledPoint(list(x)[9], SparseVector(7, [i for i in range(7)],list(x)[2:9])))

print(len(tran.collect()))


#tran = data.rdd.map(lambda x:LabeledPoint(list(x)[17], SparseVector(16, [i for i in range(16)],list(x)[1:17])))

#model = GradientBoostedTrees.trainRegressor(tran, {}, numIterations=10)

#model.save(sc,"./gbdymodelonlionev1")
#
# a = StructType([
#     StructField("ID",StringType(),False),
#     StructField("cust_type", StringType(), True),
#     StructField("cust_level", IntegerType(), True),
#     StructField("ID",StringType(),False),
Example #20
def parseVector(line):
    _,indices_tuple_ls = line.split('\t')
    indices_tuple_ls = eval(indices_tuple_ls) # Convert to a real python list.
    return SparseVector(TOTAL_DOCS,indices_tuple_ls)
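# Note on parseVector above: if the input file is untrusted, ast.literal_eval(indices_tuple_ls)
# is a safer alternative to eval (it requires `import ast`).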
def vectorizeUni(tokens):
    vector_dict = {}
    for w in tokens:
        vector_dict[dictionaryUni[w]] = 1
    return SparseVector(len(dictionaryUni), vector_dict)
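
# A hypothetical usage sketch of vectorizeUni (dictionaryUni does not appear in this
# snippet; it is assumed to map each unigram to a column index):
dictionaryUni = {'spark': 0, 'sparse': 1, 'vector': 2}
print(vectorizeUni(['spark', 'vector']))  # a 3-dimensional vector with 1.0 at indices 0 and 2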
Example #22
inputNum_min = float(finalDF1.select(min('inputNum').alias('min_inputNum')).collect()[0]['min_inputNum'])
inputNum_max = float(finalDF1.select(max('inputNum').alias('max_inputNum')).collect()[0]['max_inputNum'])
Min_v = inputNum_min
Max_v = inputNum_max
Norm_inputNum_function = udf(lambda v: (float(v) - Min_v) / (Max_v - Min_v), DoubleType())
finalDF2 = finalDF2.withColumn('Norm_inputNum', Norm_inputNum_function(finalDF2.inputNum))


%pyspark
#conduct OneHotEncoder for project_index column
from pyspark.ml.feature import OneHotEncoder

finalDF2.registerTempTable("dfData")
finalDF2 = spark.sql("SELECT name, Norm_views, Norm_bytes, Norm_inputNum, project_index FROM dfData")

encoder = OneHotEncoder(dropLast=False, inputCol="project_index", outputCol="project_Vec")
encoded = encoder.transform(finalDF2)

%pyspark
encoded.registerTempTable("dfData")
finalDF3 = spark.sql("SELECT Norm_views, Norm_bytes, Norm_inputNum, project_Vec FROM dfData")

from pyspark.mllib.linalg import SparseVector
import numpy as np
# 824 should be adjusted to match your one-hot encoding result (the project_Vec dimensions plus the 3 appended numeric features)
RDD = finalDF3.rdd.map(lambda line: SparseVector(824, line["project_Vec"].indices.tolist() + [821, 822, 823], line["project_Vec"].values.tolist() + [line["Norm_views"], line["Norm_bytes"], line["Norm_inputNum"]])).cache()

%pyspark
from pyspark.mllib.clustering import KMeans
clusters = KMeans.train(RDD, 2, maxIterations=10, runs=10, initializationMode="k-means||")
def vectorizeBi(tokens):
    vector_dict = {}
    for w in tokens:
        vector_dict[dictionaryBigrams[w]] = 1
    return SparseVector(len(dictionaryBigrams), vector_dict)
Example #24
 def test_serialize(self):
     self._test_serialize(DenseVector(range(10)))
     self._test_serialize(DenseVector(array([1., 2., 3., 4.])))
     self._test_serialize(DenseVector(pyarray.array('d', range(10))))
     self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
Example #25
def vectorize(ratings, numMovies):
    return ratings.map(lambda x: (x[0], (x[1], x[2]))).groupByKey().mapValues(lambda x: SparseVector(numMovies, x))
Example #26
    LabeledPoint(6.0, [3.0, 4.0])
]  # training set
lrm = LinearRegressionWithSGD.train(sc.parallelize(data),
                                    iterations=100,
                                    initialWeights=np.array([1.0, 1.0]))
print(lrm.predict(np.array([2.0, 1.0])))  # predict with the trained regression model

import os, tempfile
from pyspark.mllib.regression import LinearRegressionModel
from pyspark.mllib.linalg import SparseVector

path = tempfile.mkdtemp()
lrm.save(sc, path)  # save the model to external storage
sameModel = LinearRegressionModel.load(sc, path)  # load the model back
print(sameModel.predict(SparseVector(2, {
    0: 100.0,
    1: 150
})))  # predict a single value using a sparse vector as input
test_set = []
for i in range(100):
    for j in range(100):
        test_set.append(SparseVector(2, {0: i, 1: j}))
print(sameModel.predict(sc.parallelize(test_set)).collect())  # predict many values, returning an RDD
print(sameModel.weights)  # model parameters (weights)

# ----------------- Ridge regression ------------------

from pyspark.mllib.regression import RidgeRegressionWithSGD

data = [
    LabeledPoint(1.0, [1.0, 1.0]),
    LabeledPoint(4.0, [1.0, 3.0]),
Example #27
dv1
#array([ 2.,  0.,  5.])

# Sparse vector uses integer indices and double values.

sv1 = Vectors.sparse(4, [0, 3], [5.0, 1.0])
sv1
#SparseVector(4, {0: 5.0, 3: 1.0})

# Labeled point: a dense or sparse vector paired with a label, used in supervised learning.
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Labeled point with a positive label and a dense feature vector
lp_pos = LabeledPoint(1.0, [4.0, 0.0, 2.0])
lp_pos
# LabeledPoint(1.0, [4.0,0.0,2.0])

# Labeled point with a negative label and a sparse feature vector
lp_neg = LabeledPoint(0.0, SparseVector(5, [1, 2], [3.0, 5.0]))
lp_neg
#LabeledPoint(0.0, (5,[1,2],[3.0,5.0]))

# Local matrix: a matrix with integer-typed indices and double-typed values, also stored on a single machine.
from pyspark.mllib.linalg import Matrix, Matrices

# Dense matrix ((1.0, 3.0, 5.0), (2.0, 4.0, 6.0)) -- values are stored in column-major order
dMatrix = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])
# Sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
sMatrix = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
Example #28
def vectorizeBi(row,dico):
    vector_dict={}
    for w in row.bigrams:
        if w in dico:
            vector_dict[dico[w]]=1
    return (row.label,SparseVector(len(dico),vector_dict))
Example #29
pdline = pandas.read_csv("/Users/xiejinxin/datafloder/test/copy.csv")

spkdf = sqlc.createDataFrame(pdline)

print(pdline)

print(spkdf)
spkdf.createOrReplaceTempView("tmp1")
data = sqlc.sql(
    "select row_number() over(order by relation_year) as id, cust_level,relation_year,sex,age,cust_status,is_xyk_kk,eva_bal_rmb,raroc_bal_rmb,cnt,transamt,liucun_bal,aum_bal,s_aum_bal,h_aum_bal,d_aum_bal,loan_bal,if(finance_bal > 0,1,0) as label from tmp1"
)

data.show()
tran = data.rdd.map(lambda x: LabeledPoint(
    list(x)[17], SparseVector(16, [i for i in range(16)],
                              list(x)[1:17])))
#cust_level,relation_year,sex,age,eva_bal_rmb,cnt,transamt,liucun_bal
#cust_level,relation_year,sex,age,cust_status,is_xyk_kk,degree,eva_bal_rmb,raroc_bal_rmb,cnt,transamt,liucun_bal,aum_bal,s_aum_bal,h_aum_bal,d_aum_bal,loan_bal,finance_bal,finance_bal_bb,finance_bal_fbb,invest_bal,ldjj_bal,gz_aum_bal,b_aum_bal,gold_bal,trust_bal,insurance_bal,third_bal,loan_house_bal,loan_car_bal,loan_mana_bal,loan_stuty_bal,loan_other_bal,ola_aum_bal,b_z_cd_aum_bal,loan_z_cd,zhc_aum_bal,jer_bal,dly_bal,hxlc_bal,jeqj_bal,jegd_bal,jewy_bal,dzzh_bal,decd_bal,xfc_aum_bal,jj_tot_vol,card_xy_bal_last_m_avg,card_xy_bal_last_m_avg_y,card_swing_bal_avg,card_swing_bal_avg_y,card_swing_num_avg,card_swing_num_avg_y,corpname,tran_amt_1m,tran_num_1m,tran_amt_3m,tran_num_3m,tran_amt_6m,tran_num_6m,day_cnt,tran_wy_amt_1m,tran_wy_num_1m,tran_wy_amt_3m,tran_wy_num_3m,tran_wy_amt_6m,tran_wy_num_6m,day_wy_cnt,tran_dz_amt_1m,tran_dz_num_1m,tran_dz_amt_3m,tran_dz_num_3m,tran_dz_amt_6m,tran_dz_num_6m,day_dz_cnt,tran_atm_amt_1m,tran_atm_num_1m,tran_atm_amt_3m,tran_atm_num_3m,tran_atm_amt_6m,tran_atm_num_6m,day_atm_cnt,tran_gt_amt_1m,tran_gt_num_1m,tran_gt_amt_3m,tran_gt_num_3m,tran_gt_amt_6m,tran_gt_num_6m,day_gt_cnt,tran_pos_amt_1m,tran_pos_num_1m,tran_pos_amt_3m,tran_pos_num_3m,tran_pos_amt_6m,tran_pos_num_6m,day_pos_cnt,tran_sj_amt_1m,tran_sj_num_1m,tran_sj_amt_3m,tran_sj_num_3m,tran_sj_amt_6m,tran_sj_num_6m,day_sj_cnt,tran_dh_amt_1m,tran_dh_num_1m,tran_dh_amt_3m,tran_dh_num_3m,tran_dh_amt_6m,tran_dh_num_6m,day_dh_cnt,is_despoit,is_fixed,is_finance,is_fund,is_gz_aum,is_insurance,is_gold,is_third,is_trust,is_loan,is_cbank,is_xyk,is_finance_bb,is_finance_fbb,is_ldjj,is_loan_house,is_loan_car,is_loan_mana,is_loan_stuty,is_loan_other,is_ola_aum,is_zhc_aum,is_jer,is_dly,is_hxlc,is_jeqj,is_jewy,is_decd,is_xfc_aum,'

model = GradientBoostedTrees.trainRegressor(tran, {}, numIterations=10)

model.save(sc, "./gbdymodelonlionev1")

a = StructType([
    StructField("ID", StringType(), False),
    StructField("cust_type", StringType(), True),
    StructField("cust_level", IntegerType(), True)
])

print(a)