def parseHashPoint(point, numBuckets):
    """Create a LabeledPoint for this observation using hashing.

    Args:
        point (str): A comma separated string where the first value is the
            label and the rest are features.
        numBuckets: The number of buckets to hash to.

    Returns:
        LabeledPoint: A LabeledPoint with a label (0.0 or 1.0) and a
            SparseVector of hashed features.
    """
    fields = point.split(",")
    label = fields[0]
    # Key each raw feature by its column position: (featureID, value).
    keyed_features = list(enumerate(fields[1:]))
    hashed = hashFunction(numBuckets, keyed_features, True)
    # Sort the (bucket, count) pairs together.  Sorting only the keys while
    # taking values() in mapping order could attach counts to the wrong
    # bucket.
    features = SparseVector(numBuckets, sorted(hashed.items()))
    return LabeledPoint(label, features)
def test_squared_distance(self):
    """Dense and sparse vectors agree on the squared distance to a SciPy column."""
    from scipy.sparse import lil_matrix

    # Single-column sparse matrix with entries at rows 1 and 3.
    column = lil_matrix((4, 1))
    for row, value in ((1, 3), (3, 2)):
        column[row, 0] = value

    dense = DenseVector(array([1., 2., 3., 4.]))
    sparse = SparseVector(4, {0: 1, 1: 2, 2: 3, 3: 4})
    for vec in (dense, sparse):
        self.assertEqual(15.0, vec.squared_distance(column))
def test_dot(self):
    """Check dot products of sparse/dense vectors against vectors and a matrix."""
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
    lst = DenseVector([1, 2, 3, 4])
    mat = array([[1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0]])
    # assertEqual replaces the deprecated assertEquals alias, which no
    # longer exists on Python 3.12+.
    self.assertEqual(10.0, sv.dot(dv))
    self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
    self.assertEqual(30.0, dv.dot(dv))
    self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
    self.assertEqual(30.0, lst.dot(dv))
    self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))
def test_norms(self):
    """Check the 1-, 2- and infinity-norms of a dense and a sparse vector."""
    a = DenseVector([0, 2, 3, -1])
    self.assertAlmostEqual(a.norm(2), 3.742, 3)
    # assertTrue(value, expected) treats `expected` as the failure message
    # and passes for any truthy value; compare explicitly instead.
    self.assertEqual(a.norm(1), 6)
    self.assertEqual(a.norm(inf), 3)

    a = SparseVector(4, [0, 2], [3, -4])
    self.assertAlmostEqual(a.norm(2), 5)
    self.assertEqual(a.norm(1), 7)
    self.assertEqual(a.norm(inf), 4)

    # An explicitly stored zero must not be counted as a non-zero entry.
    tmp = SparseVector(4, [0, 2], [3, 0])
    self.assertEqual(tmp.numNonzeros(), 1)
def f(champ):
    """Return copies of ``partialVect`` extended with one extra entry each.

    One copy is produced for every slot ``champ + i * (max(champions) + 1)``
    (i = 0, 1, ...) that is smaller than ``len(partialVect)``; the slot is
    appended to the copy's indices and ``sign`` to its values.
    ``champions``, ``partialVect`` and ``sign`` come from the enclosing scope.
    """
    stride = max(champions) + 1
    size = len(partialVect)
    extended = []
    step = 0
    while True:
        slot = champ + step * stride
        if not slot < size:
            break
        candidate = SparseVector(size, partialVect.indices, partialVect.values)
        candidate.indices = numpy.append(candidate.indices, [slot])
        candidate.values = numpy.append(candidate.values, [sign])
        extended.append(candidate)
        step += 1
    return extended
def test_parse_vector(self):
    """str() output and the parse helpers round-trip for dense and sparse vectors."""
    # (vector, expected text form, parser used for the round trip)
    cases = [
        (DenseVector([]), '[]', Vectors.parse),
        (DenseVector([3, 4, 6, 7]), '[3.0,4.0,6.0,7.0]', Vectors.parse),
        (SparseVector(4, [], []), '(4,[],[])', SparseVector.parse),
        (SparseVector(4, [0, 2], [3, 4]), '(4,[0,2],[3.0,4.0])', Vectors.parse),
    ]
    for vec, text, parser in cases:
        self.assertEqual(str(vec), text)
        self.assertEqual(parser(str(vec)), vec)

    # Extra whitespace inside the textual form must be tolerated.
    expected = SparseVector(10, [0, 1], [4, 5])
    self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), expected)
def test_dot(self):
    """Dot products against a dense vector, a 4x4 matrix and an array.array."""
    sparse = SparseVector(4, {1: 1, 3: 2})
    dense = DenseVector(array([1., 2., 3., 4.]))
    from_list = DenseVector([1, 2, 3, 4])
    row = [1., 2., 3., 4.]
    mat = array([row, row, row, row])
    arr = pyarray.array('d', [0, 1, 2, 3])

    # (left operand, expected dot with `dense`, expected dot with `mat`)
    expectations = (
        (sparse, 10.0, array([3., 6., 9., 12.])),
        (dense, 30.0, array([10., 20., 30., 40.])),
        (from_list, 30.0, array([10., 20., 30., 40.])),
    )
    for vec, scalar, vector in expectations:
        self.assertEqual(scalar, vec.dot(dense))
        self.assertTrue(array_equal(vector, vec.dot(mat)))

    self.assertEqual(7.0, sparse.dot(arr))
def test_parse_vector(self):
    """Check that vector string forms are exact and parse back to equal vectors."""
    # The original used assertTrue(actual, expected), which only checks that
    # `actual` is truthy and uses `expected` as the failure message, so
    # nothing was verified.  assertEqual performs the intended comparison.
    a = DenseVector([3, 4, 6, 7])
    self.assertEqual(str(a), "[3.0,4.0,6.0,7.0]")
    self.assertEqual(Vectors.parse(str(a)), a)

    a = SparseVector(4, [0, 2], [3, 4])
    self.assertEqual(str(a), "(4,[0,2],[3.0,4.0])")
    self.assertEqual(Vectors.parse(str(a)), a)

    # Whitespace inside the textual form must be tolerated by the parser.
    a = SparseVector(10, [0, 1], [4, 5])
    self.assertEqual(SparseVector.parse(" (10, [0,1 ],[ 4.0,5.0] )"), a)
from pyspark.mllib.feature import HashingTF, IDF hashingTF = HashingTF() tf = hashingTF.transform(Positive_Reviews_body) idf = IDF().fit(tf) tfidf = idf.transform(tf) tfidf_T = tfidf \ .zipWithIndex() \ .flatMap(explode) \ .map(lambda x: (x[1], [x[0], x[2]])) \ .reduceByKey(lambda x, y: np.vstack([x, y])) \ .map(lambda x: (x[0], np.array(x[1]).reshape(-1, 2))) \ .map(lambda x: (x[0], x[1][x[1][:, 0].argsort()])) \ .map(lambda x: (x[0], SparseVector(num_Postive_Sentence, x[1][:, 0], x[1][:, 1]))) cosine_similarity = IndexedRowMatrix(tfidf_T).columnSimilarities() sim_matrix_full = cosine_similarity.entries \ .flatMap(lambda x: ((x.j, x.i, x.value), (x.i, x.j, x.value))) avg_dist_each = sim_matrix_full \ .map(lambda x: (x[0], (1, 1-x[2]))) \ .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) \ .map(lambda x: (x[0], (x[1][1] + num_Postive_Sentence - x[1][0] - 1)/ (num_Postive_Sentence - 1))) avg_dist_each_vector = np.array(avg_dist_each.collect()) avg_dist_overall = np.mean(avg_dist_each_vector) center_index = avg_dist_each_vector[avg_dist_each_vector[:,
# Demo script: train a gradient-boosted-trees regressor on four sparse
# labeled points, query it, and save the model.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.linalg import SparseVector
from pyspark import SparkContext, SparkConf

# Local Spark context for the demo.
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

# Two-feature training set: label 0.0 when feature 0 is set, 1.0 when
# feature 1 is set.
sparse_data = [
    LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
    LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
    LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
    LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
]
data = sc.parallelize(sparse_data)

# The empty dict is the second positional argument of trainRegressor
# (presumably the categorical-features map; confirm against the MLlib docs).
model = GradientBoostedTrees.trainRegressor(data, {}, numIterations=10)

# The results of the next four calls are discarded; they are only useful
# when run interactively.
model.numTrees()
model.totalNumNodes()
model.predict(SparseVector(2, {1: 1.0}))
model.predict(SparseVector(2, {0: 1.0}))

# Batch prediction over an RDD of dense feature lists.
rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
print(model.predict(rdd).collect())

# Persist the trained model under the relative path 'model'.
model.save(sc, 'model')
# Script: one-hot encode `project_index`, append three normalised numeric
# columns to the encoded vector, and cluster the rows with KMeans.
# `finalDF2` and `spark` are defined outside this snippet.
from pyspark.ml.feature import OneHotEncoder

finalDF2.registerTempTable("dfData")
finalDF2 = spark.sql(
    "SELECT name, Norm_views, Norm_bytes, Norm_inputNum, project_index FROM dfData"
)

# dropLast=False is passed so that no category is dropped from the encoding.
encoder = OneHotEncoder(dropLast=False,
                        inputCol="project_index",
                        outputCol="project_Vec")
encoded = encoder.transform(finalDF2)

# Re-register under the same table name so the next query sees the
# encoded column.
encoded.registerTempTable("dfData")
finalDF3 = spark.sql(
    "SELECT Norm_views, Norm_bytes, Norm_inputNum, project_Vec FROM dfData")

from pyspark.mllib.linalg import SparseVector
import numpy as np

# 824 is the total feature count and must be revised to match the one-hot
# encoding result.  The last three slots (821, 822, 823) hold Norm_views,
# Norm_bytes and Norm_inputNum; the slots before them are assumed to be the
# one-hot columns (TODO confirm the encoder produces 821 of them).
RDD = finalDF3.rdd.map(lambda line: SparseVector(
    824, line["project_Vec"].indices.tolist() + [821, 822, 823], line[
        "project_Vec"].values.tolist() + [line["Norm_views"], line["Norm_bytes"], line["Norm_inputNum"]])).cache()

from pyspark.mllib.clustering import KMeans

# Cluster into 2 groups.
clusters = KMeans.train(RDD,
                        2,
                        maxIterations=10,
                        runs=10,
                        initializationMode="k-means||")
''' from pyspark.mllib.linalg import SparseVector from collections import Counter from pyspark import SparkContext if __name__ == "__main__": sc = SparkContext('local', 'term_doc') corpus = sc.parallelize([ "It is the east, and Juliet is the sun.", "A dish fit for the gods.", "Brevity is the soul of wit." ]) tokens = corpus.map(lambda raw_text: raw_text.split()).cache() local_vocab_map = tokens.flatMap( lambda token: token).distinct().zipWithIndex().collectAsMap() vocab_map = sc.broadcast(local_vocab_map) vocab_size = sc.broadcast(len(local_vocab_map)) term_document_matrix = tokens \ .map(Counter) \ .map(lambda counts: {vocab_map.value[token]: float(counts[token]) for token in counts}) \ .map(lambda index_counts: SparseVector(vocab_size.value, index_counts)) for doc in term_document_matrix.collect(): print doc
# coding=utf-8 from pyspark.mllib.linalg import SparseVector from pyspark.mllib.regression import LabeledPoint pos = LabeledPoint(1.0, [1.0, 0.0, 3.0]) neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0])) print(pos) print(neg)
for v in values: if v in vocab[col]: word_indices.append(start_index + vocab[col].index(v)) for k, v in sorted(six.iteritems(Counter(word_indices))): feature_indices.append(k) feature_values.append(float(v)) start_index += len(vocab[col]) if col == target_col: label = vocab[col].index(col_value) if classification else col_value return {"label": label, "indices": feature_indices, "values": feature_values} return process_rows process_row_fn = make_process_rows_fn( classification, args.target, text_columns, category_columns, number_columns, vocab, stats) dfs = [] if args.train: dfTrain = spark.read.schema(schema).csv(args.train) dfs.append(("train", dfTrain)) if args.eval: dfEval = spark.read.schema(schema).csv(args.eval) dfs.append(("eval", dfEval)) for name, df in dfs: rdd = df.rdd.map(process_row_fn).map( lambda row: LabeledPoint(row["label"], SparseVector(feature_size, row["indices"], row["values"]))) MLUtils.saveAsLibSVMFile(rdd, os.path.join(args.output, name))
def to_sparse_vector(t):
    """Wrap ``t`` in a SparseVector whose length is the module-level ``size``."""
    vector = SparseVector(size, t)
    return vector
def vectorizeUni(tokens):
    """Encode ``tokens`` as a binary SparseVector over ``dictionaryUni``.

    Every token is looked up in ``dictionaryUni`` (a KeyError propagates for
    unknown tokens) and its position is set to 1; repeated tokens still
    yield 1.
    """
    presence = {dictionaryUni[token]: 1 for token in tokens}
    return SparseVector(len(dictionaryUni), presence)
def main():
    """Fit a rating regression on per-review histograms of word clusters.

    Command-line arguments:
        sys.argv[1]: location of the saved KMeans model.
        sys.argv[2]: location of the saved Word2Vec model.
        sys.argv[3]: JSON input file with ``reviewText``, ``overall`` and
            ``reviewTime`` fields.

    Each review is turned into a histogram counting how often each KMeans
    cluster is assigned to one of its words.  The histograms are normalised,
    joined with rating and year, split into training (year < 2014) and test
    (year == 2014) sets, and passed to ``validation`` and
    ``regression_and_error``.  The final RMSE and best step size are printed.
    """
    k_input_model = sys.argv[1]  # read kmean model from this location
    w_input_model = sys.argv[2]  # read word2vec model from this location
    input_file = sys.argv[3]  # read input file

    # Histogram length; cluster ids returned by the KMeans model index into it.
    num_clusters = 2000

    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    # NOTE(review): this is a lexicographic string comparison, so e.g.
    # '1.10.0' would compare as older than '1.5.1'.  Left unchanged.
    assert sc.version >= '1.5.1'

    sqlContext = SQLContext(sc)

    # sbaronia - load both kmean and Word2Vec model
    kmean_model = KMeansModel.load(sc, k_input_model)
    word2vec_model = Word2VecModel.load(sc, w_input_model)

    # sbaronia - select fields from json and make data frame zipped with index
    review = sqlContext.read.json(input_file).select('reviewText', 'overall', 'reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
    clean_list = clean_words_rdd.collect()

    # sbaronia - make a collection of all words in our model.  A set gives
    # O(1) membership tests in the loop below instead of a scan per word.
    keys = sqlContext.read.parquet(w_input_model + "/data")
    known_words = set(keys.rdd.map(lambda line: line.word).collect())

    # sbaronia - here we create one vector per review, where the vector
    # contains the number of times a cluster is assigned to a word in a
    # review.  We make a SparseVector compatible format:
    # (size, indices, values).
    features = []
    for words in clean_list:
        histogram = [0] * num_clusters
        for word in words:
            if word in known_words:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                histogram[clust] += 1
        features.append((num_clusters, range(num_clusters), histogram))

    # sbaronia - create a normalized SparseVector rdd.  Each element is a
    # (size, indices, values) tuple, so it is passed to the SparseVector
    # constructor; SparseVector.parse expects the textual form instead.
    nor = Normalizer(1)
    features_rdd = rdd_zip(sc.parallelize(features)
                           .map(lambda line: nor.transform(SparseVector(*line)))
                           .cache()).cache()

    # sbaronia - make a dataframe with rating, year and vector per review
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()

    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                                 .drop(features_df.index).cache()

    # sbaronia - create training and testing data based on year
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                               .select('rating', 'feature') \
                               .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                               .coalesce(1) \
                               .cache()

    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
                              .select('rating', 'feature') \
                              .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                              .coalesce(1) \
                              .cache()

    # sbaronia - find best step using validation and run
    # LinearRegressionWithSGD with that step and report final RMSE
    step_best_norm = validation(train_rdd)

    RMSE_norm = regression_and_error(train_rdd, test_rdd, step_best_norm)

    print("Final RMSE(Normalization) = " + str(RMSE_norm) + " Best Step size = " + str(step_best_norm))
# Script: load the sell-data CSV with both pandas and Spark and turn each
# Spark row into a LabeledPoint.
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf = conf)
sqlc = SQLContext(sc)
# Debug output: prints the StructField class object followed by a marker.
print(StructField , "go go go")
#df = sqlc.read.csv("hdfs://hadoop1:9000/home/hadoop/test.csv",header=True)
# The same file is read twice: once with pandas (used only to print the
# column names) and once with Spark.
sel = pd.read_csv("selldata/selldata")
df = sqlc.read.csv("selldata/selldata",header=True)
print(sel.columns)
print(df)
# Column 9 is the label and columns 2..8 are the seven features, stored
# with every index 0..6 present.
tran = df.rdd.map(lambda x:LabeledPoint(list(x)[9], SparseVector(7, [i for i in range(7)],list(x)[2:9])))
# collect() pulls every row to the driver just to count them.
print(len(tran.collect()))
#tran = data.rdd.map(lambda x:LabeledPoint(list(x)[17], SparseVector(16, [i for i in range(16)],list(x)[1:17])))
#model = GradientBoostedTrees.trainRegressor(tran, {}, numIterations=10)
#model.save(sc,"./gbdymodelonlionev1")
#
# a = StructType([
# StructField("ID",StringType(),False),
# StructField("cust_type", StringType(), True),
# StructField("cust_level", IntegerType(), True),
# StructField("ID",StringType(),False),
terms = tags.split() # filter words that not exist in the vocabulary terms = [x for x in list(set(terms)) if x in list(set(vocabulary))] indices = list(map(lambda x: vocabulary.index(x), list(set(terms)))) indices.sort() occurrences = list( map(lambda x: float(terms.count(vocabulary[x])), indices)) return [len(vocabulary), indices, occurrences] conf = SparkConf() conf.setAppName("NaiveBaye") conf.set('spark.driver.memory', '6g') conf.set('spark.executor.memory', '6g') conf.set('spark.cores.max', 156) #load tags passed as parameter tags = sys.argv[1] bow = bow(tags) #bag of words of that tags sc = SparkContext(conf=conf) # SparkContext model = NaiveBayesModel.load(sc, "model") result = model.predict(SparseVector(bow[0], bow[1], bow[2])) print str(classValues[result])
def main():
    """Train and evaluate a small batch-normalised MLP on Spark-loaded data.

    The function loads libsvm-style rows through Spark, densifies them into
    NumPy arrays and splits them with ``fSplitTrainAndTest``.  It then
    builds a TensorFlow 1.x graph (input dropout, one or two hidden layers
    with hand-written batch normalisation, softmax cross-entropy, Adam with
    exponentially decayed learning rate).  Training runs for 100 epochs with
    mini-batches of 100, and cross-entropy, accuracy and AUC are printed for
    both splits every 10 epochs.

    Returns:
        int: always 1.
    """
    st_time = time.time()
    train_percentage = 0.67
    conf = (SparkConf().setMaster('local[*]').set(
        'spark.executor.memory',
        '4G').set('spark.driver.memory',
                  '45G').set('spark.driver.maxResultSize', '10G'))
    sc = SparkContext(conf=conf)

    # NOTE(review): the original indentation was lost.  The disabled branch
    # is assumed to cover the whole first data-loading path, up to its
    # fSplitTrainAndTest call, because the second path below overwrites
    # every result of it.  Confirm against the original file.
    if False:
        cid = 000000  # representing ps data
        # filename = 'ps_train.svm'
        # sc = SparkContext("local", "Simple App")
        # filename = 'hdfs://jetblue-nn1.blue.ygrid.yahoo.com:8020/projects/predseg/models/2017-09-29/ps.51/training_set'
        filename = '../ps_data/ps_oct/training_set'
        # sc = SparkContext(conf=SparkConf().setAppName("ps_spark_grid")
        # conf = (SparkConf().set('spark.yarn.executor.memoryOverhead', '4096').set('spark.kryoserializer.buffer.max.mb', '2047').set('spark.driver.maxResultSize','2g'))
        data = sc.textFile(filename)
        # labels_sca = data.map(lambda x: int(x[0])) # int type
        labels_sca = data.map(lambda line: line.split(',')).map(
            lambda y: float(y[len(y) - 1]))
        nbr_samples = data.count()
        # nbr_samples = 10000
        l_sca = np.array(labels_sca.take(nbr_samples))
        #l, _ = fOnehot_encode(labels_sca.take(nbr_samples))
        l = np.column_stack([np.array(l_sca), 1 - np.array(l_sca)])
        # features = data.map(lambda x: x.split(' ')).map(lambda y: [int(y[i][-1]) for i in range(902)])
        features = data.map(lambda line: line.split(',')).map(
            lambda y: [float(y[i]) for i in range(len(y) - 1)])
        X = np.array(features.take(nbr_samples))
        nbr_feature = len(X[0])
        print('nbr of features: ' + str(nbr_feature))
        # data_train, _ = fSplitTrainAndTest(X, l, l_sca, train_percentage)
        data_train, data_test = fSplitTrainAndTest(X, l, l_sca,
                                                   train_percentage)

    ##### uncomment this if try using another testing set
    nbr_feature = 600
    # filename_test_new = 'hdfs://jetblue-nn1.blue.ygrid.yahoo.com:8020/projects/predseg/xg/test_data/2017-09-20/ps.51/part-r-01088'
    filename_test_new = '../ps_data/part-r-01088'
    new_data_test = sc.textFile(filename_test_new)
    nbr_samples_test = new_data_test.count()
    # nbr_samples_test = 10000
    # Second tab-separated field holds "<label> <idx>:<val> <idx>:<val> ...".
    data2 = new_data_test.map(lambda line: line.split('\t')).map(
        lambda x: x[1])
    labels = data2.map(lambda x: float(x[0]))
    feature_str = data2.map(lambda x: x[2:])
    t2 = feature_str.map(lambda lines: lines.split(' '))
    features = t2.map(lambda x: DenseVector(
        SparseVector(nbr_feature,
                     {int(i.split(':')[0]): float(i.split(':')[1])
                      for i in x})))
    l_sca_test = np.array(labels.take(nbr_samples_test))
    # Two-column one-hot style labels: [label, 1 - label].
    l_test = np.column_stack([np.array(l_sca_test), 1 - np.array(l_sca_test)])
    X_test = np.array(features.take(nbr_samples_test))
    # data_test = Data(X_test, l_test, l_sca_test)
    data_train, data_test = fSplitTrainAndTest(X_test, l_test, l_sca_test,
                                               train_percentage)
    # data_train = Data(X, l, l_sca)

    n = len(data_train.X)  # total number of training samples
    d = len(data_train.X[0])  # number of features
    ll = len(data_train.labels[0])  # output dimension

    # Create the model
    x = tf.placeholder(tf.float32, [None, d])
    keep_prob = tf.placeholder(tf.float32)
    # if False:
    #     y = deepnn(x, d, ll)
    # else:
    #     y = deepnn_withBN(x, d, ll, 3, keep_prob)
    nbr_of_layers = 2
    nbr_layer1 = 250
    nbr_layer2 = 350
    epsilon = 1e-3  # numerical-stability term for the batch normalisation

    x_drop = tf.nn.dropout(x, keep_prob)  # adding dropout in the input layer
    # x_drop = x # no dropout on input layer

    # Hidden layer 1: affine -> batch norm over the batch axis -> ReLU -> dropout.
    W1 = weight_variable([d, nbr_layer1])
    b1 = bias_variable([nbr_layer1])
    z1 = tf.matmul(x_drop, W1) + b1
    batch_mean1, batch_var1 = tf.nn.moments(z1, [0])
    z1_hat = (z1 - batch_mean1) / tf.sqrt(batch_var1 + epsilon)
    scale1 = tf.Variable(tf.ones([nbr_layer1]))
    beta1 = tf.Variable(tf.zeros([nbr_layer1]))
    h1 = tf.nn.relu(scale1 * z1_hat + beta1)
    h1_drop = tf.nn.dropout(h1, keep_prob)

    if nbr_of_layers == 2:
        # Output layer directly on top of hidden layer 1.
        W2 = weight_variable([nbr_layer1, ll])
        b2 = bias_variable([ll])
        y = tf.matmul(h1_drop, W2) + b2
    else:
        # Hidden layer 2, same structure as layer 1, then the output layer.
        W2 = weight_variable([nbr_layer1, nbr_layer2])
        b2 = bias_variable([nbr_layer2])
        z2 = tf.matmul(h1_drop, W2) + b2
        batch_mean2, batch_var2 = tf.nn.moments(z2, [0])
        z2_hat = (z2 - batch_mean2) / tf.sqrt(batch_var2 + epsilon)
        scale2 = tf.Variable(tf.ones([nbr_layer2]))
        beta2 = tf.Variable(tf.zeros([nbr_layer2]))
        h2 = tf.nn.relu(scale2 * z2_hat + beta2)
        h2_drop = tf.nn.dropout(h2, keep_prob)
        W3 = weight_variable([nbr_layer2, ll])
        b3 = bias_variable([ll])
        y = tf.matmul(h2_drop, W3) + b3

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, ll])
    tf.summary.histogram('W1', W1)
    tf.summary.histogram('W2', W2)
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    starter_learning_rate = 0.01
    global_step = tf.Variable(0, trainable=False)
    # train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
    learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                               global_step,
                                               decay_steps=5000,
                                               decay_rate=0.95,
                                               staircase=True,
                                               name=None)
    # train_step = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(cross_entropy, global_step = global_step)
    train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        cross_entropy, global_step=global_step)

    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    auc_ftrain = tf.metrics.auc(tf.cast(tf.argmax(y, 1), tf.float32),
                                tf.cast(tf.argmax(y_, 1), tf.float32))
    auc_ftest = tf.metrics.auc(tf.cast(tf.argmax(y, 1), tf.float32),
                               tf.cast(tf.argmax(y_, 1), tf.float32))
    softmaxed_logits = tf.nn.softmax(y)

    # tf.metrics.auc creates local variables, so initialise them after the
    # metric ops exist.  The original followed this with a second, deprecated
    # tf.initialize_local_variables() call doing the same thing; it has been
    # removed.
    tf.local_variables_initializer().run()

    tf.summary.scalar('cross_entropy', cross_entropy)
    tf.summary.scalar('accuracy', accuracy)
    tf.summary.scalar('auc_ftrain', auc_ftrain[0])
    tf.summary.scalar('auc_ftest', auc_ftest[0])
    train_writer = tf.summary.FileWriter("/tmp/histogram_example/train",
                                         sess.graph)
    test_writer = tf.summary.FileWriter("/tmp/histogram_example/test")
    # writer = tf.summary.FileWriter("/tmp/histogram_example")
    summaries = tf.summary.merge_all()

    batch_size = 100
    for i in range(100):
        # train the whole epoch (first shuffle the data)
        idx = np.arange(0, n)
        np.random.shuffle(idx)
        X_shuffle = [data_train.X[k] for k in idx]
        labels_shuffle = [data_train.labels[k] for k in idx]
        for j in range(n // batch_size):
            # Slice ends are exclusive, so the batch is [j*bs, (j+1)*bs).
            # The original subtracted 1 and silently dropped one sample
            # from every batch.
            start = j * batch_size
            stop = start + batch_size
            batch_xs = X_shuffle[start:stop]
            batch_ys = labels_shuffle[start:stop]
            sess.run(train_step,
                     feed_dict={
                         x: batch_xs,
                         y_: batch_ys,
                         keep_prob: 0.5
                     })

        # finish training, try on testing data
        # `==` rather than `is`: identity comparison on ints relies on
        # interpreter caching.
        if i % 10 == 0:
            print(i)
            soft_logits_train, summary_train, ca_train_i, ac_train_i, auc_train_i = sess.run(
                [
                    softmaxed_logits, summaries, cross_entropy, accuracy,
                    auc_ftrain
                ],
                feed_dict={
                    x: data_train.X,
                    y_: data_train.labels,
                    keep_prob: 1.0
                })
            soft_logits_test, summary_test, ca_test_i, ac_test_i, auc_test_i = sess.run(
                [
                    softmaxed_logits, summaries, cross_entropy, accuracy,
                    auc_ftest
                ],
                feed_dict={
                    x: data_test.X,
                    y_: data_test.labels,
                    keep_prob: 1.0
                })
            # [ca_test_i, ac_test_i,auc_test_i] = [0, 0, [0, 0]]
            #train_writer.add_summary(summary_train, i)
            #test_writer.add_summary(summary_test, i)
            sk_auc_train = metrics.roc_auc_score(
                y_true=np.array(data_train.labels),
                y_score=np.array(soft_logits_train))
            sk_auc_test = metrics.roc_auc_score(
                y_true=np.array(data_test.labels),
                y_score=np.array(soft_logits_test))
            print('learning rate: ' + str(sess.run(learning_rate)))
            print('train cross entropy: ' + str(ca_train_i))
            print('test cross entropy: ' + str(ca_test_i))
            print('train accuracy: ' + str(ac_train_i))
            print('test accuracy: ' + str(ac_test_i))
            print('train auc: ' + str(auc_train_i[0]))
            print('test auc: ' + str(auc_test_i[0]))
            print('train sk auc: ' + str(sk_auc_train))
            print('test sk auc: ' + str(sk_auc_test))

    sess.close()
    sc.stop()
    end_time = time.time()
    print('run time: ' + str(round(end_time - st_time)) + ' seconds')
    print('tensorboard --logdir=/tmp/histogram_example')
    return 1
# Scratch script: build a SciPy CSR matrix and a sparse LabeledPoint, then
# count the lines of README.md through Spark.  `sc` is expected to exist
# already (for example in the PySpark shell).
import numpy as np
from scipy.sparse import csr_matrix
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# 3x3 sparse matrix in compressed-sparse-row form.
M = csr_matrix([[4, 1, 0], [4, 0, 3], [0, 0, 1]])

label = 0.0
# Size-3 sparse feature vector with 1.0 at index 0 and 3.0 at index 2.
point = LabeledPoint(label, SparseVector(3, [0, 2], [1.0, 3.0]))

textRDD = sc.textFile("README.md")
# Function-call form of print works on both Python 2 and Python 3; the
# original print statement is a SyntaxError on Python 3.
print(textRDD.count())
def to_labeledpoint(line):
    """Parse a ``"<label> :: <sparse vector text>"`` line into a LabeledPoint.

    The first field is used as the label and the second is parsed with
    ``SparseVector.parse``; any further `` :: ``-separated fields are ignored.
    """
    fields = line.split(' :: ')
    label = fields[0]
    features = SparseVector.parse(fields[1])
    return LabeledPoint(label, features)
def change_labelPoint(rdd, total_feas):
    """Build a LabeledPoint from a record with 'label', 'pos' and 'val' entries.

    ``rdd['pos']`` and ``rdd['val']`` become the indices and values of a
    SparseVector of length ``total_feas``.
    """
    features = SparseVector(total_feas, rdd['pos'], rdd['val'])
    return LabeledPoint(rdd['label'], features)
def vectorizeBi(tokens):
    """Encode ``tokens`` as a binary SparseVector over ``dictionaryBigrams``.

    Every token is looked up in ``dictionaryBigrams`` (a KeyError propagates
    for unknown tokens) and its position is set to 1.
    """
    presence = {dictionaryBigrams[token]: 1 for token in tokens}
    return SparseVector(len(dictionaryBigrams), presence)
indexer = StringIndexer(inputCol='type', outputCol='type_idx') # Assign index values to strings indexer = indexer.fit(cars) # Create column with index values cars = indexer.transform(cars) pd.set_option('display.max_columns', None) # all cols pd.set_option('display.width', 161) #print(cars.toPandas().sample(12)) # Check column data types print('\n', cars.dtypes, '\n') kars = cars.select('name', 'type', 'type_idx') print(kars.toPandas().sample(12)) onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy']) onehot = onehot.fit(kars) kars = onehot.transform(kars) kars.select('type', 'type_idx', 'type_dummy').distinct().sort('type_idx').show() print("DenseVector:", DenseVector([1, 0, 0, 0, 0, 7, 0, 0])) print("SparseVector:", SparseVector(8, {0: 1.0, 5: 7.0})) spark.stop()
def vectorize(ratings, numMovies):
    """Group 3-element rating records by their first field into SparseVectors.

    Each record ``x`` is re-keyed as ``(x[0], (x[1], x[2]))``.  The pairs
    collected for a key are handed to ``SparseVector(numMovies, pairs)``.
    """
    keyed = ratings.map(lambda rec: (rec[0], (rec[1], rec[2])))
    grouped = keyed.groupByKey()
    return grouped.mapValues(lambda pairs: SparseVector(numMovies, pairs))
def test_serialize(self):
    """Run the serialization check over dense vectors from several sources and a sparse one."""
    samples = (
        DenseVector(range(10)),
        DenseVector(array([1., 2., 3., 4.])),
        DenseVector(pyarray.array('d', range(10))),
        SparseVector(4, {1: 1, 3: 2}),
    )
    for vec in samples:
        self._test_serialize(vec)
dv1  #array([ 2., 0., 5.])

# Sparse vector uses integer indices and double values.
# The declared size must exceed the largest index; the original declared
# size 2 while using index 3, which is not a valid vector.
sv1 = Vectors.sparse(4, [0, 3], [5.0, 1.0])
sv1  #SparseVector(4, {0: 5.0, 3: 1.0})

# Labeled Point: This can be dense or Sparse vector with a label used in supervised learning.
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Labeled point with a positive label and a dense feature vector
lp_pos = LabeledPoint(1.0, [4.0, 0.0, 2.0])
lp_pos  # LabeledPoint(1.0, [4.0,0.0,2.0])

# Labeled point with a negative label and a sparse feature vector
lp_neg = LabeledPoint(0.0, SparseVector(5, [1, 2], [3.0, 5.0]))
lp_neg  #LabeledPoint(0.0, (5,[1,2],[3.0,5.0]))

# Local Matrix: This is a matrix with integer type indices and double type values. This is also stored on single machine.
from pyspark.mllib.linalg import Matrix, Matrices

# Dense matrix ((1.0, 2.0, 3.0), (4.0, 5.0, 6.0))
dMatrix = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])

# Sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
sMatrix = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
LabeledPoint(6.0, [3.0, 4.0]) ] # 训练集 lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=100, initialWeights=np.array([1.0, 1.0])) print(lrm.predict(np.array([2.0, 1.0]))) # 利用训练出的回归模型进行预测 import os, tempfile from pyspark.mllib.regression import LinearRegressionModel from pyspark.mllib.linalg import SparseVector path = tempfile.mkdtemp() lrm.save(sc, path) # 将模型保存至外存 sameModel = LinearRegressionModel.load(sc, path) # 读取模型 print(sameModel.predict(SparseVector(2, { 0: 100.0, 1: 150 }))) # 利用稀疏向量作为数据结构,返回单个预测值 test_set = [] for i in range(100): for j in range(100): test_set.append(SparseVector(2, {0: i, 1: j})) print(sameModel.predict(sc.parallelize(test_set)).collect()) # 预测多值,返回一个RDD数据集 print(sameModel.weights) # 返回参数 # -----------------岭回归------------------ from pyspark.mllib.regression import RidgeRegressionWithSGD data = [ LabeledPoint(1.0, [1.0, 1.0]), LabeledPoint(4.0, [1.0, 3.0]),
sampleOHEDictManual[(2, 'salmon')] = 6

# COMMAND ----------

# MAGIC %md #### ** (1b) Sparse vectors **
# MAGIC #### Data points can typically be represented with a small number of non-zero OHE features relative to the total number of features that occur in the dataset. By leveraging this sparsity and using sparse vector representations of OHE data, we can reduce storage and computational burdens. Below are a few sample vectors represented as dense numpy arrays.

# COMMAND ----------

import numpy as np
from pyspark.mllib.linalg import SparseVector

# COMMAND ----------

# Each dense array is paired with a SparseVector holding the same values.
aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(4, [1, 3], [3., 4.])

bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(4, [3], [1.])

w = np.array([0.4, 3.1, -1.4, -.5])
# Function-call form of print works on both Python 2 and Python 3.  Each
# dense/sparse pair should print the same dot product.
print(aDense.dot(w))
print(aSparse.dot(w))
print(bDense.dot(w))
print(bSparse.dot(w))

# COMMAND ----------

# MAGIC %md #### **(1c) OHE features as sparse vectors **Any feature that occurs in a point should have the value 1.0. For example, the `DenseVector` for a point with features 2 and 4 would be `[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0]`.

# COMMAND ----------
def vectorizeBi(row,dico):
    """Return ``(row.label, vector)`` marking which bigrams of ``row`` occur in ``dico``.

    Bigrams missing from ``dico`` are skipped.  Every known bigram sets
    position ``dico[bigram]`` of the SparseVector (length ``len(dico)``) to 1.
    """
    presence = {dico[bigram]: 1 for bigram in row.bigrams if bigram in dico}
    features = SparseVector(len(dico), presence)
    return (row.label, features)
# print (Vectors.sparse(row[0][0],row[0][1],row[0][2]) ) print(row) break exit() """ # zipping the word with its probability distribution for that topic termsRDD = topicsRDD.map(lambda topic: (zip(itemgetter(*topic[0])(vocablist), topic[1]))) zippedRDD = topicsRDD.map(lambda topic: (zip(topic[0], topic[1]))) # for Every topic, sparse vector of distribution over words docCalcs = zippedRDD.map(lambda topic: DenseVector( (SparseVector(vocabSize, topic)).toArray())) #schema = StructType([StructField("topicwordDistribution", Vector(), False)]) #df = sqlContext.applySchema(docCalcs, schema) #docCalcs = docCalcs.map(lambda l: Row(l)) docCalcs = docCalcs.zipWithIndex() #docCalcs = docCalcs.collect() docCalcs = sqlContext.createDataFrame(docCalcs, ['topicwordDistribution', 'topicId']) #print(type(docCalcs))
def get_training_vector(classification, term_list, classifications,
                        number_of_terms):
    """Build a binary-labelled training point from a term list.

    The label is 1 when ``classification`` is a member of
    ``classifications`` and 0 otherwise.  The features are
    ``SparseVector(number_of_terms, term_list)``.
    """
    is_member = classification in classifications
    label = int(is_member)
    features = SparseVector(number_of_terms, term_list)
    return LabeledPoint(label, features)
# Use a single-column SciPy csc_matrix as a sparse vector. sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape=(3, 1)) # Example 11-3 from pyspark.mllib.linalg import SparseVector from pyspark.mllib.regression import LabeledPoint # Create a labeled point with a positive label and a dense feature vector. pos = LabeledPoint(1.0, [1.0, 0.0, 3.0]) # Create a labeled point with a negative label and a sparse feature vector. neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0])) # Example 11-5 from numpy import array from pyspark.mllib.linalg import Vectors # Create the dense vector <1.0, 2.0, 3.0> denseVec1 = array([1.0, 2.0, 3.0]) # NumPy arrays can be passed directly to MLlib denseVec2 = Vectors.dense([1.0, 2.0, 3.0]) # .. or you can use the Vectors class # Create the sparse vector <1.0, 0.0, 2.0, 0.0>; the methods for this take only
def test_sparse_vector_iteration(self):
    """Iterating a SparseVector yields every position, zeros included."""
    all_zero = SparseVector(3, [], [])
    self.assertListEqual(list(all_zero), [0.0, 0.0, 0.0])

    two_entries = SparseVector(5, [0, 3], [1.0, 2.0])
    self.assertListEqual(list(two_entries), [1.0, 0.0, 0.0, 2.0, 0.0])
# Split data approximately into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4])

# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)

# Make prediction and test accuracy.
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
# Tuple parameters in lambdas (`lambda (x, v): ...`) were removed in
# Python 3; index into the pair instead, which works on Python 2 as well.
accuracy = 1.0 * predictionAndLabel.filter(
    lambda pl: pl[0] == pl[1]).count() / test.count()
print('model accuracy {}'.format(accuracy))

# Save and load model
output_dir = 'output/'
# Clear any previous output directory before saving.
shutil.rmtree(output_dir, ignore_errors=True)
model.save(sc, output_dir)
sameModel = NaiveBayesModel.load(sc, output_dir)
predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(
    lambda pl: pl[0] == pl[1]).count() / test.count()
print('sameModel accuracy {}'.format(accuracy))

# Predict a single hand-built sparse vector with the reloaded model.
from pyspark.mllib.linalg import SparseVector
testsparsevector = SparseVector(692, [5, 6], [5.0, 6.0])
print(sameModel.predict(testsparsevector))
# $example off$

sc.stop()
Test.assertEqualsHashed(sampleOHEDictManual[(2,'mouse')], 'ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4', "incorrect value for sampleOHEDictManual[(2,'mouse')]") Test.assertEqualsHashed(sampleOHEDictManual[(2,'salmon')], 'c1dfd96eea8cc2b62785275bca38ac261256e278', "incorrect value for sampleOHEDictManual[(2,'salmon')]") Test.assertEquals(len(sampleOHEDictManual.keys()), 7, 'incorrect number of keys in sampleOHEDictManual') # ** Sparse vectors ** import numpy as np from pyspark.mllib.linalg import SparseVector aDense = np.array([0., 3., 0., 4.]) aSparse = SparseVector(4, [[0,0.], [1,3.], [2,0.], [3,4.]]) bDense = np.array([0., 0., 0., 1.]) bSparse = SparseVector(4, [[0,0.], [1,0.], [2,0.], [3,1.]]) w = np.array([0.4, 3.1, -1.4, -.5]) print aDense.dot(w) print aSparse.dot(w) print bDense.dot(w) print bSparse.dot(w) # TEST Sparse Vectors Test.assertTrue(isinstance(aSparse, SparseVector), 'aSparse needs to be an instance of SparseVector') Test.assertTrue(isinstance(bSparse, SparseVector), 'aSparse needs to be an instance of SparseVector') Test.assertTrue(aDense.dot(w) == aSparse.dot(w),
class VectorUDTTests(MLlibTestCase):
    """Tests for VectorUDT round-trips, schema inference and matrix construction."""

    # Fixtures: one empty and one populated vector of each kind.
    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        """The UDT survives a jsonValue/fromJson round trip."""
        restored = VectorUDT.fromJson(self.udt.jsonValue())
        self.assertEqual(restored, self.udt)

    def test_serialization(self):
        """serialize followed by deserialize returns an equal vector."""
        for vec in (self.dv0, self.dv1, self.sv0, self.sv1):
            round_tripped = self.udt.deserialize(self.udt.serialize(vec))
            self.assertEqual(vec, round_tripped)

    def test_infer_schema(self):
        """toDF() infers VectorUDT for 'features' and preserves the vectors."""
        points = [LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)]
        rdd = self.sc.parallelize(points)
        df = rdd.toDF()
        feature_fields = [fld for fld in df.schema.fields
                          if fld.name == "features"]
        self.assertEqual(feature_fields[0].dataType, self.udt)

        collected = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(collected), 2)
        for vec in collected:
            if isinstance(vec, SparseVector):
                expected = self.sv1
            elif isinstance(vec, DenseVector):
                expected = self.dv1
            else:
                raise TypeError("expecting a vector but got %r of type %r" %
                                (vec, type(vec)))
            self.assertEqual(vec, expected)

    def test_row_matrix_from_dataframe(self):
        """RowMatrix accepts a vector DataFrame and rejects a string column."""
        from pyspark.sql.utils import IllegalArgumentException

        df = self.spark.createDataFrame([Row(Vectors.dense(1))])
        row_matrix = RowMatrix(df)
        self.assertEqual(row_matrix.numRows(), 1)
        self.assertEqual(row_matrix.numCols(), 1)
        bad_df = df.selectExpr("'monkey'")
        with self.assertRaises(IllegalArgumentException):
            RowMatrix(bad_df)

    def test_indexed_row_matrix_from_dataframe(self):
        """IndexedRowMatrix needs the index column alongside the vector."""
        from pyspark.sql.utils import IllegalArgumentException

        df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
        matrix = IndexedRowMatrix(df)
        self.assertEqual(matrix.numRows(), 1)
        self.assertEqual(matrix.numCols(), 1)
        without_index = df.drop("_1")
        with self.assertRaises(IllegalArgumentException):
            IndexedRowMatrix(without_index)

    def test_row_matrix_invalid_type(self):
        """multiply() raises TypeError for a non-matrix argument."""
        invalid_type = ""

        rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
        matrix = RowMatrix(rows)
        self.assertRaises(TypeError, matrix.multiply, invalid_type)

        irows = self.sc.parallelize(
            [IndexedRow(0, [1, 2, 3]),
             IndexedRow(1, [4, 5, 6])])
        imatrix = IndexedRowMatrix(irows)
        self.assertRaises(TypeError, imatrix.multiply, invalid_type)
def normalized_labeledpoint(line,nor):
    """Parse a ``"<label> :: <sparse vector text>"`` line and normalise its features.

    The second field is parsed with ``SparseVector.parse`` and passed through
    ``nor.transform``.  The first field becomes the label.
    """
    fields = line.split(' :: ')
    label = fields[0]
    parsed = SparseVector.parse(fields[1])
    return LabeledPoint(label, nor.transform(parsed))
def parseVector(line):
    """Parse a ``"<key>\\t<list literal>"`` line into a SparseVector.

    The text after the tab is the printed form of a Python list and is
    handed to ``SparseVector(TOTAL_DOCS, ...)``.  The key before the tab is
    ignored.  Lines without exactly one tab raise ValueError from the
    unpacking.
    """
    import ast

    _, indices_tuple_ls = line.split('\t')
    # Convert to a real python list.  ast.literal_eval only accepts Python
    # literals, so file content cannot execute arbitrary code as it could
    # with eval().
    indices_tuple_ls = ast.literal_eval(indices_tuple_ls)
    return SparseVector(TOTAL_DOCS, indices_tuple_ls)
# Created by Raju Kumar Mishra # Book PySpark Recipes # Chapter 9 # Recipe 9-2. Create a Sparse Vector. # Run following PySpark code lines, line by line in PySpark shell from pyspark.mllib.linalg import SparseVector sparseDataList = [1.0, 3.2] sparseDataVector = SparseVector(8, [0, 7], sparseDataList) sparseDataVector sparseDataVector[1] sparseDataVector[7] sparseDataVector.numNonzeros() sparseDataList1 = [3.0, 1.4, 2.5, 1.2] sparseDataVector1 = SparseVector(8, [0, 3, 4, 6], sparseDataList1) squaredDistance = sparseDataVector.squared_distance(sparseDataVector1) squaredDistance
def add_sparse_vector(vec1, vec2):
    """Add two vectors element-wise and return the sum as a SparseVector.

    Both inputs are densified with ``toArray()`` and summed.  The sum is
    passed to ``get_sparse_index`` to obtain indices and values for a
    SparseVector of length ``NUMBER_OF_CATEGORY``.
    """
    dense_sum = vec1.toArray() + vec2.toArray()
    indices, values = get_sparse_index(dense_sum)
    return SparseVector(NUMBER_OF_CATEGORY, indices, values)