''' from pyspark.mllib.linalg import SparseVector from collections import Counter from pyspark import SparkContext if __name__ == "__main__": sc = SparkContext('local', 'term_doc') corpus = sc.parallelize([ "It is the east, and Juliet is the sun.", "A dish fit for the gods.", "Brevity is the soul of wit." ]) tokens = corpus.map(lambda raw_text: raw_text.split()).cache() local_vocab_map = tokens.flatMap( lambda token: token).distinct().zipWithIndex().collectAsMap() vocab_map = sc.broadcast(local_vocab_map) vocab_size = sc.broadcast(len(local_vocab_map)) term_document_matrix = tokens \ .map(Counter) \ .map(lambda counts: {vocab_map.value[token]: float(counts[token]) for token in counts}) \ .map(lambda index_counts: SparseVector(vocab_size.value, index_counts)) for doc in term_document_matrix.collect(): print doc
# filter out those records that don't have simialr items item_item_matrix_haveRatings = item_item_matrix.filter(lambda x: ',' in x) item_item_matrix = (item_item_matrix_haveRatings.map( lambda x: parser_iiMatrix_and_compute_SparseMatrix( x, broadcast_Mapping_item.value, broadcast_Mapping_user.value))) def sparseAdd(sv1, sv2, length): from pyspark.mllib.linalg import Vectors combinedV = sv1.toArray() + sv2.toArray() nonzeroes = combinedV.nonzero()[0] return Vectors.sparse(length, nonzeroes, combinedV[nonzeroes]) # Test sparseAdd function o = SparseVector(2241, [771, 806, 1209, 1574], [1.0, 1.0, 1.0, 1.0]) k = SparseVector(2241, [305, 1253, 1254], [1.0, 1.0, 1.0]) Test.assertEquals(SparseVector(2241, [305, 1253, 1254], [2.0, 2.0, 2.0]), sparseAdd(k, k, k.size), 'sparseAdd function malfunc') Test.assertEquals( SparseVector(2241, [771, 806, 1209, 1574], [2.0, 2.0, 2.0, 2.0]), sparseAdd(o, o, o.size), 'sparseAdd function malfunc') Test.assertEquals( SparseVector(2241, [305, 771, 806, 1209, 1253, 1254, 1574], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]), sparseAdd(o, k, o.size), 'sparseAdd function malfunc') def matrixMultiplication(sv, item_item_matrix): indices = sv.indices # return item_item_matrix.filter(lambda (x,y):x in indices).map(lambda (x,y):y).reduce(lambda x,y:sparseAdd(x,y,y.size))
for v in values: if v in vocab[col]: word_indices.append(start_index + vocab[col].index(v)) for k, v in sorted(six.iteritems(Counter(word_indices))): feature_indices.append(k) feature_values.append(float(v)) start_index += len(vocab[col]) if col == target_col: label = vocab[col].index(col_value) if classification else col_value return {"label": label, "indices": feature_indices, "values": feature_values} return process_rows process_row_fn = make_process_rows_fn( classification, args.target, text_columns, category_columns, number_columns, vocab, stats) dfs = [] if args.train: dfTrain = spark.read.schema(schema).csv(args.train) dfs.append(("train", dfTrain)) if args.eval: dfEval = spark.read.schema(schema).csv(args.eval) dfs.append(("eval", dfEval)) for name, df in dfs: rdd = df.rdd.map(process_row_fn).map( lambda row: LabeledPoint(row["label"], SparseVector(feature_size, row["indices"], row["values"]))) MLUtils.saveAsLibSVMFile(rdd, os.path.join(args.output, name))
# coding=utf-8
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Positive example: label 1.0 with features given as a plain (dense) list.
pos = LabeledPoint(
    1.0,
    [1.0, 0.0, 3.0],
)

# Negative example: label 0.0 with the same three features stored sparsely
# (size 3, values 1.0 and 3.0 at indices 0 and 2).
neg = LabeledPoint(
    0.0,
    SparseVector(3, [0, 2], [1.0, 3.0]),
)

print(pos)
print(neg)
class VectorUDTTests(MLlibTestCase):
    """Tests for ``VectorUDT`` and for building distributed matrices from DataFrames."""

    # Shared fixtures: an empty and a populated vector of each kind, plus the UDT.
    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        """A UDT rebuilt from its own JSON value compares equal to the original."""
        rebuilt = VectorUDT.fromJson(self.udt.jsonValue())
        self.assertEqual(rebuilt, self.udt)

    def test_serialization(self):
        """serialize() followed by deserialize() gives back an equal vector."""
        for vector in (self.dv0, self.dv1, self.sv0, self.sv1):
            round_tripped = self.udt.deserialize(self.udt.serialize(vector))
            self.assertEqual(vector, round_tripped)

    def test_infer_schema(self):
        """The 'features' column of a LabeledPoint DataFrame is typed with the UDT."""
        points = [LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)]
        df = self.sc.parallelize(points).toDF()
        feature_fields = [f for f in df.schema.fields if f.name == "features"]
        self.assertEqual(feature_fields[0].dataType, self.udt)

        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            # Each collected vector must match the fixture of its own kind.
            if isinstance(v, SparseVector):
                expected = self.sv1
            elif isinstance(v, DenseVector):
                expected = self.dv1
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))
            self.assertEqual(v, expected)

    def test_row_matrix_from_dataframe(self):
        """RowMatrix accepts a one-vector-column DataFrame and rejects a string column."""
        from pyspark.sql.utils import IllegalArgumentException

        frame = self.spark.createDataFrame([Row(Vectors.dense(1))])
        matrix = RowMatrix(frame)
        self.assertEqual(matrix.numRows(), 1)
        self.assertEqual(matrix.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            RowMatrix(frame.selectExpr("'monkey'"))

    def test_indexed_row_matrix_from_dataframe(self):
        """IndexedRowMatrix needs the index column; dropping it must be rejected."""
        from pyspark.sql.utils import IllegalArgumentException

        frame = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
        indexed = IndexedRowMatrix(frame)
        self.assertEqual(indexed.numRows(), 1)
        self.assertEqual(indexed.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            IndexedRowMatrix(frame.drop("_1"))

    def test_row_matrix_invalid_type(self):
        """multiply() raises TypeError when handed something that is not a matrix."""
        invalid_type = ""

        rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
        matrix = RowMatrix(rows)
        with self.assertRaises(TypeError):
            matrix.multiply(invalid_type)

        irows = self.sc.parallelize(
            [IndexedRow(0, [1, 2, 3]), IndexedRow(1, [4, 5, 6])])
        imatrix = IndexedRowMatrix(irows)
        with self.assertRaises(TypeError):
            imatrix.multiply(invalid_type)
indexer = StringIndexer(inputCol='type', outputCol='type_idx') # Assign index values to strings indexer = indexer.fit(cars) # Create column with index values cars = indexer.transform(cars) pd.set_option('display.max_columns', None) # all cols pd.set_option('display.width', 161) #print(cars.toPandas().sample(12)) # Check column data types print('\n', cars.dtypes, '\n') kars = cars.select('name', 'type', 'type_idx') print(kars.toPandas().sample(12)) onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy']) onehot = onehot.fit(kars) kars = onehot.transform(kars) kars.select('type', 'type_idx', 'type_dummy').distinct().sort('type_idx').show() print("DenseVector:", DenseVector([1, 0, 0, 0, 0, 7, 0, 0])) print("SparseVector:", SparseVector(8, {0: 1.0, 5: 7.0})) spark.stop()
import numpy as np
from scipy.sparse import csr_matrix
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# 3x3 SciPy matrix stored in compressed-sparse-row form.
M = csr_matrix([[4, 1, 0], [4, 0, 3], [0, 0, 1]])

# Labeled point: label 0.0 with a size-3 sparse feature vector
# (values 1.0 and 3.0 at indices 0 and 2).
label = 0.0
point = LabeledPoint(label, SparseVector(3, [0, 2], [1.0, 3.0]))

# `sc` is the SparkContext supplied by the surrounding session.
textRDD = sc.textFile("README.md")
# Call form of print: the bare `print expr` statement is a SyntaxError on
# Python 3, while a single parenthesised argument prints the same on Python 2.
print(textRDD.count())
# Created by Raju Kumar Mishra # Book PySpark Recipes # Chapter 9 # Recipe 9-2. Create a Sparse Vector. # Run following PySpark code lines, line by line in PySpark shell from pyspark.mllib.linalg import SparseVector sparseDataList = [1.0, 3.2] sparseDataVector = SparseVector(8, [0, 7], sparseDataList) sparseDataVector sparseDataVector[1] sparseDataVector[7] sparseDataVector.numNonzeros() sparseDataList1 = [3.0, 1.4, 2.5, 1.2] sparseDataVector1 = SparseVector(8, [0, 3, 4, 6], sparseDataList1) squaredDistance = sparseDataVector.squared_distance(sparseDataVector1) squaredDistance
def add_sparse_vector(vec1, vec2):
    """Add two vectors and return the sum as a ``SparseVector``.

    Both operands are expanded with ``toArray()`` and added element-wise.
    ``get_sparse_index`` then supplies the (indices, values) pair for the
    result (presumably the non-zero positions -- verify against that helper).
    The result is always sized ``NUMBER_OF_CATEGORY``.
    """
    dense_sum = vec1.toArray() + vec2.toArray()
    indices, values = get_sparse_index(dense_sum)
    return SparseVector(NUMBER_OF_CATEGORY, indices, values)
def to_sparse_vector(t):
    """Build a ``SparseVector`` of length ``size`` from the entries in *t*.

    ``size`` is read from the enclosing scope; *t* is handed to the
    ``SparseVector`` constructor unchanged.
    """
    vector = SparseVector(size, t)
    return vector
# print (Vectors.sparse(row[0][0],row[0][1],row[0][2]) ) print(row) break exit() """ # zipping the word with its probability distribution for that topic termsRDD = topicsRDD.map(lambda topic: (zip(itemgetter(*topic[0])(vocablist), topic[1]))) zippedRDD = topicsRDD.map(lambda topic: (zip(topic[0], topic[1]))) # for Every topic, sparse vector of distribution over words docCalcs = zippedRDD.map(lambda topic: DenseVector( (SparseVector(vocabSize, topic)).toArray())) #schema = StructType([StructField("topicwordDistribution", Vector(), False)]) #df = sqlContext.applySchema(docCalcs, schema) #docCalcs = docCalcs.map(lambda l: Row(l)) docCalcs = docCalcs.zipWithIndex() #docCalcs = docCalcs.collect() docCalcs = sqlContext.createDataFrame(docCalcs, ['topicwordDistribution', 'topicId']) #print(type(docCalcs))
from pyspark.mllib.feature import HashingTF, IDF hashingTF = HashingTF() tf = hashingTF.transform(Positive_Reviews_body) idf = IDF().fit(tf) tfidf = idf.transform(tf) tfidf_T = tfidf \ .zipWithIndex() \ .flatMap(explode) \ .map(lambda x: (x[1], [x[0], x[2]])) \ .reduceByKey(lambda x, y: np.vstack([x, y])) \ .map(lambda x: (x[0], np.array(x[1]).reshape(-1, 2))) \ .map(lambda x: (x[0], x[1][x[1][:, 0].argsort()])) \ .map(lambda x: (x[0], SparseVector(num_Postive_Sentence, x[1][:, 0], x[1][:, 1]))) cosine_similarity = IndexedRowMatrix(tfidf_T).columnSimilarities() sim_matrix_full = cosine_similarity.entries \ .flatMap(lambda x: ((x.j, x.i, x.value), (x.i, x.j, x.value))) avg_dist_each = sim_matrix_full \ .map(lambda x: (x[0], (1, 1-x[2]))) \ .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) \ .map(lambda x: (x[0], (x[1][1] + num_Postive_Sentence - x[1][0] - 1)/ (num_Postive_Sentence - 1))) avg_dist_each_vector = np.array(avg_dist_each.collect()) avg_dist_overall = np.mean(avg_dist_each_vector) center_index = avg_dist_each_vector[avg_dist_each_vector[:,
def change_labelPoint(rdd, total_feas):
    """Convert one record into a ``LabeledPoint`` with sparse features.

    Args:
        rdd: a single record (despite the name) supporting lookup of the
            keys ``'label'``, ``'pos'`` and ``'val'``.
        total_feas: size declared for the sparse feature vector.

    Returns:
        ``LabeledPoint(label, SparseVector(total_feas, pos, val))``.
    """
    label = rdd['label']
    features = SparseVector(total_feas, rdd['pos'], rdd['val'])
    return LabeledPoint(label, features)
sampleOHEDictManual[(2, 'salmon')] = 6

# COMMAND ----------

# MAGIC %md #### ** (1b) Sparse vectors **
# MAGIC #### Data points can typically be represented with a small number of non-zero OHE features relative to the total number of features that occur in the dataset. By leveraging this sparsity and using sparse vector representations of OHE data, we can reduce storage and computational burdens. Below are a few sample vectors represented as dense numpy arrays.

# COMMAND ----------

import numpy as np
from pyspark.mllib.linalg import SparseVector

# COMMAND ----------

# Each vector is written twice, densely and sparsely, so the dot products
# printed below can be compared pair by pair.
aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(4, [1, 3], [3., 4.])

bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(4, [3], [1.])

w = np.array([0.4, 3.1, -1.4, -.5])
# Call form of print: the bare `print expr` statement is a SyntaxError on
# Python 3, while a single parenthesised argument prints the same on Python 2.
print(aDense.dot(w))
print(aSparse.dot(w))
print(bDense.dot(w))
print(bSparse.dot(w))

# COMMAND ----------

# MAGIC %md #### **(1c) OHE features as sparse vectors **Any feature that occurs in a point should have the value 1.0. For example, the `DenseVector` for a point with features 2 and 4 would be `[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0]`.

# COMMAND ----------
def main():
    """Load a data set through Spark, then train and evaluate a small MLP.

    The network has one hidden layer (a second is built when
    ``nbr_of_layers`` is not 2), batch normalisation on the hidden
    pre-activations, and dropout on the input and hidden layers. It is
    trained with Adam under an exponentially decayed learning rate.
    Cross-entropy, accuracy and AUC for the train and test splits are
    printed every 10 epochs.

    Changes from the previous revision:
      * each mini-batch now holds ``batch_size`` samples (the slice end used
        to be ``(j + 1) * batch_size - 1``, dropping one sample per batch);
      * the epoch check uses ``==`` instead of ``is`` on an int literal;
      * ``tf.metrics.auc`` receives labels and predictions in the documented
        order (they were swapped);
      * the duplicate, deprecated ``tf.initialize_local_variables()`` call
        and the unused accumulator arrays are gone.

    Returns:
        1 once training and evaluation have finished.
    """
    st_time = time.time()
    train_percentage = 0.67
    conf = (SparkConf().setMaster('local[*]').set(
        'spark.executor.memory', '4G').set('spark.driver.memory', '45G').set(
            'spark.driver.maxResultSize', '10G'))
    sc = SparkContext(conf=conf)

    # Disabled alternative data source (comma-separated file, label in the
    # last column).
    # NOTE(review): this branch is assumed to end at the fSplitTrainAndTest
    # call below -- confirm against the original layout.
    if False:
        cid = 000000  # representing ps data
        # filename = 'ps_train.svm'
        # sc = SparkContext("local", "Simple App")
        # filename = 'hdfs://jetblue-nn1.blue.ygrid.yahoo.com:8020/projects/predseg/models/2017-09-29/ps.51/training_set'
        filename = '../ps_data/ps_oct/training_set'
        # sc = SparkContext(conf=SparkConf().setAppName("ps_spark_grid")
        # conf = (SparkConf().set('spark.yarn.executor.memoryOverhead', '4096').set('spark.kryoserializer.buffer.max.mb', '2047').set('spark.driver.maxResultSize','2g'))
        data = sc.textFile(filename)
        # labels_sca = data.map(lambda x: int(x[0])) # int type
        labels_sca = data.map(lambda line: line.split(',')).map(
            lambda y: float(y[len(y) - 1]))
        nbr_samples = data.count()
        # nbr_samples = 10000
        l_sca = np.array(labels_sca.take(nbr_samples))
        #l, _ = fOnehot_encode(labels_sca.take(nbr_samples))
        l = np.column_stack([np.array(l_sca), 1 - np.array(l_sca)])
        # features = data.map(lambda x: x.split(' ')).map(lambda y: [int(y[i][-1]) for i in range(902)])
        features = data.map(lambda line: line.split(',')).map(
            lambda y: [float(y[i]) for i in range(len(y) - 1)])
        X = np.array(features.take(nbr_samples))
        nbr_feature = len(X[0])
        print('nbr of features: ' + str(nbr_feature))
        # data_train, _ = fSplitTrainAndTest(X, l, l_sca, train_percentage)
        data_train, data_test = fSplitTrainAndTest(X, l, l_sca,
                                                   train_percentage)

    ##### uncomment this if try using another testing set
    nbr_feature = 600
    # filename_test_new = 'hdfs://jetblue-nn1.blue.ygrid.yahoo.com:8020/projects/predseg/xg/test_data/2017-09-20/ps.51/part-r-01088'
    filename_test_new = '../ps_data/part-r-01088'
    new_data_test = sc.textFile(filename_test_new)
    nbr_samples_test = new_data_test.count()
    # nbr_samples_test = 10000
    # Second tab-separated field: its first character is read as the label
    # and everything from the third character on as space-separated
    # "index:value" feature tokens.
    data2 = new_data_test.map(lambda line: line.split('\t')).map(
        lambda x: x[1])
    labels = data2.map(lambda x: float(x[0]))
    feature_str = data2.map(lambda x: x[2:])
    t2 = feature_str.map(lambda lines: lines.split(' '))
    features = t2.map(lambda x: DenseVector(
        SparseVector(nbr_feature,
                     {int(i.split(':')[0]): float(i.split(':')[1])
                      for i in x})))
    l_sca_test = np.array(labels.take(nbr_samples_test))
    # Two-column target: column 0 is the label, column 1 its complement.
    l_test = np.column_stack([np.array(l_sca_test),
                              1 - np.array(l_sca_test)])
    X_test = np.array(features.take(nbr_samples_test))
    # data_test = Data(X_test, l_test, l_sca_test)
    data_train, data_test = fSplitTrainAndTest(X_test, l_test, l_sca_test,
                                               train_percentage)
    # # #### # data_train = Data(X, l, l_sca)

    n = len(data_train.X)  # total number of training samples
    d = len(data_train.X[0])  # number of features
    ll = len(data_train.labels[0])  #output dimension
    # print (n)
    # print (d)
    # print (ll)

    # Create the model
    x = tf.placeholder(tf.float32, [None, d])
    keep_prob = tf.placeholder(tf.float32)
    # if False:
    #     y = deepnn(x, d, ll)
    # else:
    #     y = deepnn_withBN(x, d, ll, 3, keep_prob)
    nbr_of_layers = 2
    nbr_layer1 = 250
    nbr_layer2 = 350
    epsilon = 1e-3
    x_drop = tf.nn.dropout(x, keep_prob)  # adding dropout in the input layer
    # x_drop = x # no dropout on input layer

    # Hidden layer 1: affine -> batch normalisation -> ReLU -> dropout.
    W1 = weight_variable([d, nbr_layer1])
    b1 = bias_variable([nbr_layer1])
    z1 = tf.matmul(x_drop, W1) + b1
    batch_mean1, batch_var1 = tf.nn.moments(z1, [0])
    z1_hat = (z1 - batch_mean1) / tf.sqrt(batch_var1 + epsilon)
    scale1 = tf.Variable(tf.ones([nbr_layer1]))
    beta1 = tf.Variable(tf.zeros([nbr_layer1]))
    #b1 = bias_variable([nbr_layer1])
    h1 = tf.nn.relu(scale1 * z1_hat + beta1)
    h1_drop = tf.nn.dropout(h1, keep_prob)
    if nbr_of_layers == 2:
        # Output layer directly on top of hidden layer 1.
        W2 = weight_variable([nbr_layer1, ll])
        b2 = bias_variable([ll])
        y = tf.matmul(h1_drop, W2) + b2
        #h1 = tf.nn.sigmoid(scale1*z1_hat + beta1)
    else:
        # Hidden layer 2 (same structure as layer 1), then the output layer.
        W2 = weight_variable([nbr_layer1, nbr_layer2])
        b2 = bias_variable([nbr_layer2])
        z2 = tf.matmul(h1_drop, W2) + b2
        batch_mean2, batch_var2 = tf.nn.moments(z2, [0])
        z2_hat = (z2 - batch_mean2) / tf.sqrt(batch_var2 + epsilon)
        scale2 = tf.Variable(tf.ones([nbr_layer2]))
        beta2 = tf.Variable(tf.zeros([nbr_layer2]))
        h2 = tf.nn.relu(scale2 * z2_hat + beta2)
        h2_drop = tf.nn.dropout(h2, keep_prob)
        #h2 = tf.nn.sigmoid(scale2*z2_hat + beta2)
        W3 = weight_variable([nbr_layer2, ll])
        b3 = bias_variable([ll])
        y = tf.matmul(h2_drop, W3) + b3

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, ll])
    tf.summary.histogram('W1', W1)
    tf.summary.histogram('W2', W2)
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    starter_learning_rate = 0.01
    global_step = tf.Variable(0, trainable=False)
    # train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
    learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                               global_step,
                                               decay_steps=5000,
                                               decay_rate=0.95,
                                               staircase=True,
                                               name=None)
    # train_step = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(cross_entropy, global_step = global_step)
    train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        cross_entropy, global_step=global_step)

    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    # tf.metrics.auc takes (labels, predictions); keyword arguments keep the
    # ground truth (y_) and the model output (y) from being swapped.
    true_class = tf.cast(tf.argmax(y_, 1), tf.float32)
    predicted_class = tf.cast(tf.argmax(y, 1), tf.float32)
    auc_ftrain = tf.metrics.auc(labels=true_class,
                                predictions=predicted_class)
    auc_ftest = tf.metrics.auc(labels=true_class,
                               predictions=predicted_class)
    softmaxed_logits = tf.nn.softmax(y)
    # The AUC metrics keep their counters in local variables.
    tf.local_variables_initializer().run()

    tf.summary.scalar('cross_entropy', cross_entropy)
    tf.summary.scalar('accuracy', accuracy)
    tf.summary.scalar('auc_ftrain', auc_ftrain[0])
    tf.summary.scalar('auc_ftest', auc_ftest[0])
    train_writer = tf.summary.FileWriter("/tmp/histogram_example/train",
                                         sess.graph)
    test_writer = tf.summary.FileWriter("/tmp/histogram_example/test")
    # writer = tf.summary.FileWriter("/tmp/histogram_example")
    summaries = tf.summary.merge_all()

    batch_size = 100
    for i in range(100):
        # train the whole epoch (first shuffle the data)
        idx = np.arange(0, n)
        np.random.shuffle(idx)
        X_shuffle = [data_train.X[k] for k in idx]
        labels_shuffle = [data_train.labels[k] for k in idx]
        # Only full batches are used; a trailing partial batch is skipped.
        for j in range(n // batch_size):
            start = j * batch_size
            stop = start + batch_size  # slice end is exclusive
            batch_xs = X_shuffle[start:stop]
            batch_ys = labels_shuffle[start:stop]
            sess.run(train_step,
                     feed_dict={
                         x: batch_xs,
                         y_: batch_ys,
                         keep_prob: 0.5
                     })

        # finish training, try on testing data
        if i % 10 == 0:
            print(i)
            # Running the (value, update_op) AUC pair also updates the
            # streaming counters; element 0 is the value printed below.
            soft_logits_train, summary_train, ca_train_i, ac_train_i, auc_train_i = sess.run(
                [
                    softmaxed_logits, summaries, cross_entropy, accuracy,
                    auc_ftrain
                ],
                feed_dict={
                    x: data_train.X,
                    y_: data_train.labels,
                    keep_prob: 1.0
                })
            soft_logits_test, summary_test, ca_test_i, ac_test_i, auc_test_i = sess.run(
                [
                    softmaxed_logits, summaries, cross_entropy, accuracy,
                    auc_ftest
                ],
                feed_dict={
                    x: data_test.X,
                    y_: data_test.labels,
                    keep_prob: 1.0
                })
            # [ca_test_i, ac_test_i,auc_test_i] = [0, 0, [0, 0]]
            #train_writer.add_summary(summary_train, i)
            #test_writer.add_summary(summary_test, i)
            # print (soft_logits_train)
            # print (data_train.labels)
            sk_auc_train = metrics.roc_auc_score(
                y_true=np.array(data_train.labels),
                y_score=np.array(soft_logits_train))
            sk_auc_test = metrics.roc_auc_score(
                y_true=np.array(data_test.labels),
                y_score=np.array(soft_logits_test))
            print('learning rate: ' + str(sess.run(learning_rate)))
            print('train cross entropy: ' + str(ca_train_i))
            print('test cross entropy: ' + str(ca_test_i))
            print('train accuracy: ' + str(ac_train_i))
            print('test accuracy: ' + str(ac_test_i))
            print('train auc: ' + str(auc_train_i[0]))
            print('test auc: ' + str(auc_test_i[0]))
            print('train sk auc: ' + str(sk_auc_train))
            print('test sk auc: ' + str(sk_auc_test))
            # print ('train auc sk' + str(auc_sk_train))
            # print ('test auc sk' + str(auc_sk_test))

    # ca_test, ac_test, auc_test = sess.run([cross_entropy, accuracy, auc], feed_dict={x: data_test.X, y_: data_test.labels, keep_prob: 1.0})
    # print ('test cross entropy: ' + str(ca_test))
    # print ('test accuracy: ' + str(ac_test))
    # print ('test auc: '+ str(auc_test[0]))
    sess.close()
    sc.stop()
    end_time = time.time()
    print('run time: ' + str(round(end_time - st_time)) + ' seconds')
    print('tensorboard --logdir=/tmp/histogram_example')
    return 1
# Use a single-column SciPy csc_matrix as a sparse vector. sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape=(3, 1)) # Example 11-3 from pyspark.mllib.linalg import SparseVector from pyspark.mllib.regression import LabeledPoint # Create a labeled point with a positive label and a dense feature vector. pos = LabeledPoint(1.0, [1.0, 0.0, 3.0]) # Create a labeled point with a negative label and a sparse feature vector. neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0])) # Example 11-5 from numpy import array from pyspark.mllib.linalg import Vectors # Create the dense vector <1.0, 2.0, 3.0> denseVec1 = array([1.0, 2.0, 3.0]) # NumPy arrays can be passed directly to MLlib denseVec2 = Vectors.dense([1.0, 2.0, 3.0]) # .. or you can use the Vectors class # Create the sparse vector <1.0, 0.0, 2.0, 0.0>; the methods for this take only
terms = tags.split() # filter words that not exist in the vocabulary terms = [x for x in list(set(terms)) if x in list(set(vocabulary))] indices = list(map(lambda x: vocabulary.index(x), list(set(terms)))) indices.sort() occurrences = list( map(lambda x: float(terms.count(vocabulary[x])), indices)) return [len(vocabulary), indices, occurrences] conf = SparkConf() conf.setAppName("NaiveBaye") conf.set('spark.driver.memory', '6g') conf.set('spark.executor.memory', '6g') conf.set('spark.cores.max', 156) #load tags passed as parameter tags = sys.argv[1] bow = bow(tags) #bag of words of that tags sc = SparkContext(conf=conf) # SparkContext model = NaiveBayesModel.load(sc, "model") result = model.predict(SparseVector(bow[0], bow[1], bow[2])) print str(classValues[result])
# Split data approximately into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4])

# Train a naive Bayes model.
# (the second positional argument, 1.0, is the smoothing parameter)
model = NaiveBayes.train(training, 1.0)

# Make prediction and test accuracy.
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
# Index the (prediction, label) pair instead of unpacking it in the lambda
# signature: tuple parameters are a SyntaxError on Python 3.
accuracy = 1.0 * predictionAndLabel.filter(
    lambda pl: pl[0] == pl[1]).count() / test.count()
print('model accuracy {}'.format(accuracy))

# Save and load model
output_dir = 'output/'
# Remove any earlier output first; a missing directory is not an error.
shutil.rmtree(output_dir, ignore_errors=True)
model.save(sc, output_dir)
sameModel = NaiveBayesModel.load(sc, output_dir)
predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(
    lambda pl: pl[0] == pl[1]).count() / test.count()
print('sameModel accuracy {}'.format(accuracy))

from pyspark.mllib.linalg import SparseVector

# Predict for one hand-built size-692 vector with two non-zero entries.
testsparsevector = SparseVector(692, [5, 6], [5.0, 6.0])
print(sameModel.predict(testsparsevector))

# $example off$

sc.stop()
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)
sqlc = SQLContext(sc)
print(StructField, "go go go")

#df = sqlc.read.csv("hdfs://hadoop1:9000/home/hadoop/test.csv",header=True)
# The same file is read twice: once with pandas, once as a Spark DataFrame.
sel = pd.read_csv("selldata/selldata")
df = sqlc.read.csv("selldata/selldata", header=True)
print(sel.columns)
print(df)


def _row_to_labeled_point(x):
    """Turn one row into a LabeledPoint: field 9 is the label, fields 2..8 the features."""
    fields = list(x)
    return LabeledPoint(fields[9], SparseVector(7, list(range(7)), fields[2:9]))


tran = df.rdd.map(_row_to_labeled_point)
print(len(tran.collect()))

#tran = data.rdd.map(lambda x:LabeledPoint(list(x)[17], SparseVector(16, [i for i in range(16)],list(x)[1:17])))
#model = GradientBoostedTrees.trainRegressor(tran, {}, numIterations=10)
#model.save(sc,"./gbdymodelonlionev1")
#
# a = StructType([
# StructField("ID",StringType(),False),
# StructField("cust_type", StringType(), True),
# StructField("cust_level", IntegerType(), True),
# StructField("ID",StringType(),False),
def parseVector(line):
    """Parse one tab-separated record into a ``SparseVector``.

    The line must hold exactly two tab-separated fields. The first is
    ignored; the second is the textual form of a Python literal holding the
    vector's entries, and is handed to ``SparseVector`` after parsing.

    Args:
        line: one text record of the form ``"<key>\\t<literal>"``.

    Returns:
        ``SparseVector(TOTAL_DOCS, entries)``.

    Raises:
        ValueError: if the line does not split into exactly two fields, or
            the second field is not a plain Python literal.
        SyntaxError: if the second field is not valid Python syntax.
    """
    import ast  # local import keeps this block self-contained

    _, indices_tuple_ls = line.split('\t')
    # SECURITY: this used to be eval(), which executes arbitrary code found
    # in the input file. literal_eval accepts only literals (lists, tuples,
    # numbers, strings, dicts, ...), which is all a list of entries needs.
    indices_tuple_ls = ast.literal_eval(indices_tuple_ls)  # Convert to a real python list.
    return SparseVector(TOTAL_DOCS, indices_tuple_ls)
def vectorizeUni(tokens):
    """Encode *tokens* as a binary sparse vector over ``dictionaryUni``.

    Every token is looked up in ``dictionaryUni`` and the resulting position
    is set to 1, however often the token occurs. A token missing from the
    dictionary raises ``KeyError``.

    Returns:
        A ``SparseVector`` of size ``len(dictionaryUni)``.
    """
    positions = (dictionaryUni[token] for token in tokens)
    return SparseVector(len(dictionaryUni), dict.fromkeys(positions, 1))
inputNum_min = float(finalDF1.select(min('inputNum').alias('min_inputNum')).collect()[0]['min_inputNum']) inputNum_max = float(finalDF1.select(max('inputNum').alias('max_inputNum')).collect()[0]['max_inputNum']) Min_v = inputNum_min Max_v = inputNum_max Norm_inputNum_function = udf(lambda v: (float(v) - Min_v) / (Max_v - Min_v), DoubleType()) finalDF2 = finalDF2.withColumn('Norm_inputNum', Norm_inputNum_function(finalDF2.inputNum)) %pyspark #conduct OneHotEncoder for project_index column from pyspark.ml.feature import OneHotEncoder finalDF2.registerTempTable("dfData") finalDF2 = spark.sql("SELECT name, Norm_views, Norm_bytes, Norm_inputNum, project_index FROM dfData") encoder = OneHotEncoder(dropLast=False, inputCol="project_index", outputCol="project_Vec") encoded = encoder.transform(finalDF2) %pyspark encoded.registerTempTable("dfData") finalDF3 = spark.sql("SELECT Norm_views, Norm_bytes, Norm_inputNum, project_Vec FROM dfData") from pyspark.mllib.linalg import SparseVector import numpy as np #824 should be revised according to your onehotcode result RDD = finalDF3.rdd.map(lambda line: SparseVector(824, line["project_Vec"].indices.tolist() + [821, 822, 823], line["project_Vec"].values.tolist() + [line["Norm_views"], line["Norm_bytes"], line["Norm_inputNum"]])).cache() %pyspark from pyspark.mllib.clustering import KMeans clusters = KMeans.train(RDD, 2, maxIterations=10, runs=10, initializationMode="k-means||")
def vectorizeBi(tokens):
    """Encode *tokens* as a binary sparse vector over ``dictionaryBigrams``.

    Every token is looked up in ``dictionaryBigrams`` and the resulting
    position is set to 1, however often the token occurs. A token missing
    from the dictionary raises ``KeyError``.

    Returns:
        A ``SparseVector`` of size ``len(dictionaryBigrams)``.
    """
    positions = (dictionaryBigrams[token] for token in tokens)
    return SparseVector(len(dictionaryBigrams), dict.fromkeys(positions, 1))
def test_serialize(self):
    """Run the serialization check on dense vectors built from several inputs and on a sparse vector."""
    check = self._test_serialize
    # Dense vectors built from a range, a NumPy array and a typed 'd' array.
    check(DenseVector(range(10)))
    check(DenseVector(array([1., 2., 3., 4.])))
    check(DenseVector(pyarray.array('d', range(10))))
    # Sparse vector built from an {index: value} mapping.
    check(SparseVector(4, {1: 1, 3: 2}))
def vectorize(ratings, numMovies):
    """Group rating triples by their first field into sparse vectors.

    Each record ``x`` is re-keyed as ``(x[0], (x[1], x[2]))``. The pairs
    sharing a key are then grouped and wrapped in a ``SparseVector`` of size
    ``numMovies``.

    Returns:
        An RDD of ``(key, SparseVector)`` pairs.
    """
    keyed_pairs = ratings.map(lambda x: (x[0], (x[1], x[2])))
    grouped = keyed_pairs.groupByKey()
    return grouped.mapValues(lambda x: SparseVector(numMovies, x))
LabeledPoint(6.0, [3.0, 4.0]) ] # 训练集 lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=100, initialWeights=np.array([1.0, 1.0])) print(lrm.predict(np.array([2.0, 1.0]))) # 利用训练出的回归模型进行预测 import os, tempfile from pyspark.mllib.regression import LinearRegressionModel from pyspark.mllib.linalg import SparseVector path = tempfile.mkdtemp() lrm.save(sc, path) # 将模型保存至外存 sameModel = LinearRegressionModel.load(sc, path) # 读取模型 print(sameModel.predict(SparseVector(2, { 0: 100.0, 1: 150 }))) # 利用稀疏向量作为数据结构,返回单个预测值 test_set = [] for i in range(100): for j in range(100): test_set.append(SparseVector(2, {0: i, 1: j})) print(sameModel.predict(sc.parallelize(test_set)).collect()) # 预测多值,返回一个RDD数据集 print(sameModel.weights) # 返回参数 # -----------------岭回归------------------ from pyspark.mllib.regression import RidgeRegressionWithSGD data = [ LabeledPoint(1.0, [1.0, 1.0]), LabeledPoint(4.0, [1.0, 3.0]),
dv1  #array([ 2., 0., 5.])

# Sparse vector uses integer indices and double values.
# The declared size must be larger than the highest index (3), so the vector
# needs 4 slots; the earlier size of 2 described an invalid vector.
sv1 = Vectors.sparse(4, [0, 3], [5.0, 1.0])
sv1  #SparseVector(4, {0: 5.0, 3: 1.0})

# Labeled Point: This can be dense or Sparse vector with a label used in supervised learning.
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Labeled point with a positive label and a dense feature vector
lp_pos = LabeledPoint(1.0, [4.0, 0.0, 2.0])
lp_pos  # LabeledPoint(1.0, [4.0,0.0,2.0])

# Labeled point with a negative label and a sparse feature vector
lp_neg = LabeledPoint(0.0, SparseVector(5, [1, 2], [3.0, 5.0]))
lp_neg  #LabeledPoint(0.0, (5,[1,2],[3.0,5.0]))

# Local Matrix: This is a matrix with integer type indices and double type values. This is also stored on single machine.
from pyspark.mllib.linalg import Matrix, Matrices

# Dense matrix ((1.0, 3.0, 5.0), (2.0, 4.0, 6.0)) -- values are listed in column-major order
dMatrix = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])
# Sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
# (arguments: rows, cols, column pointers, row indices, values)
sMatrix = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
def vectorizeBi(row, dico):
    """Encode the bigrams of *row* as a labelled binary sparse vector.

    Bigrams that are not keys of *dico* are ignored. Every known bigram sets
    its position to 1, however often it occurs.

    Args:
        row: record exposing ``bigrams`` (iterable) and ``label``.
        dico: mapping from bigram to vector position.

    Returns:
        ``(row.label, SparseVector(len(dico), {position: 1, ...}))``.
    """
    present = {dico[bigram]: 1 for bigram in row.bigrams if bigram in dico}
    return (row.label, SparseVector(len(dico), present))
# Load the CSV with pandas, then hand the frame to Spark.
pdline = pandas.read_csv("/Users/xiejinxin/datafloder/test/copy.csv")
spkdf = sqlc.createDataFrame(pdline)
print(pdline)
print(spkdf)
spkdf.createOrReplaceTempView("tmp1")
# Select a generated id, 16 feature columns, and a binary label that is 1
# when finance_bal is positive.
data = sqlc.sql(
    "select row_number() over(order by relation_year) as id, cust_level,relation_year,sex,age,cust_status,is_xyk_kk,eva_bal_rmb,raroc_bal_rmb,cnt,transamt,liucun_bal,aum_bal,s_aum_bal,h_aum_bal,d_aum_bal,loan_bal,if(finance_bal > 0,1,0) as label from tmp1"
)
data.show()
# Row layout: field 0 is the id, fields 1..16 the features, field 17 the label.
# Every index 0..15 is populated, so the sparse vector holds all 16 features.
tran = data.rdd.map(lambda x: LabeledPoint(
    list(x)[17], SparseVector(16, [i for i in range(16)], list(x)[1:17])))
#cust_level,relation_year,sex,age,eva_bal_rmb,cnt,transamt,liucun_bal
#cust_level,relation_year,sex,age,cust_status,is_xyk_kk,degree,eva_bal_rmb,raroc_bal_rmb,cnt,transamt,liucun_bal,aum_bal,s_aum_bal,h_aum_bal,d_aum_bal,loan_bal,finance_bal,finance_bal_bb,finance_bal_fbb,invest_bal,ldjj_bal,gz_aum_bal,b_aum_bal,gold_bal,trust_bal,insurance_bal,third_bal,loan_house_bal,loan_car_bal,loan_mana_bal,loan_stuty_bal,loan_other_bal,ola_aum_bal,b_z_cd_aum_bal,loan_z_cd,zhc_aum_bal,jer_bal,dly_bal,hxlc_bal,jeqj_bal,jegd_bal,jewy_bal,dzzh_bal,decd_bal,xfc_aum_bal,jj_tot_vol,card_xy_bal_last_m_avg,card_xy_bal_last_m_avg_y,card_swing_bal_avg,card_swing_bal_avg_y,card_swing_num_avg,card_swing_num_avg_y,corpname,tran_amt_1m,tran_num_1m,tran_amt_3m,tran_num_3m,tran_amt_6m,tran_num_6m,day_cnt,tran_wy_amt_1m,tran_wy_num_1m,tran_wy_amt_3m,tran_wy_num_3m,tran_wy_amt_6m,tran_wy_num_6m,day_wy_cnt,tran_dz_amt_1m,tran_dz_num_1m,tran_dz_amt_3m,tran_dz_num_3m,tran_dz_amt_6m,tran_dz_num_6m,day_dz_cnt,tran_atm_amt_1m,tran_atm_num_1m,tran_atm_amt_3m,tran_atm_num_3m,tran_atm_amt_6m,tran_atm_num_6m,day_atm_cnt,tran_gt_amt_1m,tran_gt_num_1m,tran_gt_amt_3m,tran_gt_num_3m,tran_gt_amt_6m,tran_gt_num_6m,day_gt_cnt,tran_pos_amt_1m,tran_pos_num_1m,tran_pos_amt_3m,tran_pos_num_3m,tran_pos_amt_6m,tran_pos_num_6m,day_pos_cnt,tran_sj_amt_1m,tran_sj_num_1m,tran_sj_amt_3m,tran_sj_num_3m,tran_sj_amt_6m,tran_sj_num_6m,day_sj_cnt,tran_dh_amt_1m,tran_dh_num_1m,tran_dh_amt_3m,tran_dh_num_3m,tran_dh_amt_6m,tran_dh_num_6m,day_dh_cnt,is_despoit,is_fixed,is_finance,is_fund,is_gz_aum,is_insurance,is_gold,is_third,is_trust,is_loan,is_cbank,is_xyk,is_finance_bb,is_finance_fbb,is_ldjj,is_loan_house,is_loan_car,is_loan_mana,is_loan_stuty,is_loan_other,is_ola_aum,is_zhc_aum,is_jer,is_dly,is_hxlc,is_jeqj,is_jewy,is_decd,is_xfc_aum,'
# Train a gradient-boosted-trees regressor; the empty dict is passed as the
# second argument (categorical feature info), and 10 boosting iterations are run.
model = GradientBoostedTrees.trainRegressor(tran, {}, numIterations=10)
model.save(sc, "./gbdymodelonlionev1")
# Schema literal that is only printed; it is not applied to any DataFrame here.
a = StructType([
    StructField("ID", StringType(), False),
    StructField("cust_type", StringType(), True),
    StructField("cust_level", IntegerType(), True)
])
print(a)