def parseHashPoint(point, numBuckets):
    """Create a LabeledPoint for this observation using hashing.

    Args:
        point (str): A comma separated string where the first value is the label and the rest are
            features.
        numBuckets: The number of buckets to hash to.

    Returns:
        LabeledPoint: A LabeledPoint with a label (0.0 or 1.0) and a SparseVector of hashed
            features.
    """
    fields = point.split(",")
    label = fields[0]

    # Key every raw feature by its column position: (0, f0), (1, f1), ...
    keyed_features = list(enumerate(fields[1:]))

    # NOTE(review): the third positional argument is passed as True exactly as
    # before; its meaning is defined by hashFunction, which is not visible here.
    hashed = hashFunction(numBuckets, keyed_features, True)

    # Look each value up through its own index.  Sorting only the keys while
    # taking values() in mapping order would attach values to the wrong
    # buckets whenever the mapping is not already ordered by key.
    indices = sorted(hashed.keys())
    values = [hashed[bucket] for bucket in indices]
    features = SparseVector(numBuckets, indices, values)

    return LabeledPoint(label, features)
示例#2
0
 def test_squared_distance(self):
     """Dense and sparse vectors give the same squared distance to a SciPy column."""
     from scipy.sparse import lil_matrix
     # 4x1 sparse column holding 3 at row 1 and 2 at row 3.
     column = lil_matrix((4, 1))
     column[1, 0] = 3
     column[3, 0] = 2
     dense = DenseVector(array([1., 2., 3., 4.]))
     sparse = SparseVector(4, {0: 1, 1: 2, 2: 3, 3: 4})
     # (1-0)^2 + (2-3)^2 + (3-0)^2 + (4-2)^2 == 15
     for vector in (dense, sparse):
         self.assertEqual(15.0, vector.squared_distance(column))
示例#3
0
文件: tests.py 项目: vidur89/spark
 def test_dot(self):
     """Check dot products between sparse/dense vectors and a 2-D array."""
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]])
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))
示例#4
0
    def test_norms(self):
        """Check the 1-, 2- and infinity-norms of dense and sparse vectors."""
        a = DenseVector([0, 2, 3, -1])
        self.assertAlmostEqual(a.norm(2), 3.742, 3)
        # assertTrue(value, expected) would treat `expected` as the failure
        # message and pass for any non-zero norm, so compare explicitly.
        self.assertEqual(a.norm(1), 6)
        self.assertEqual(a.norm(inf), 3)
        a = SparseVector(4, [0, 2], [3, -4])
        self.assertAlmostEqual(a.norm(2), 5)
        self.assertEqual(a.norm(1), 7)
        self.assertEqual(a.norm(inf), 4)

        # An explicitly stored zero must not be counted as a non-zero entry.
        tmp = SparseVector(4, [0, 2], [3, 0])
        self.assertEqual(tmp.numNonzeros(), 1)
示例#5
0
	def f(champ):
		"""Return copies of partialVect, each extended with one extra entry.

		For slot = 0, 1, 2, ... the position champ + slot * (max(champions) + 1)
		is appended to a fresh copy's indices and `sign` to its values, for as
		long as that position stays below len(partialVect).
		"""
		stride = max(champions) + 1
		extended = []
		slot = 0

		while champ + slot * stride < len(partialVect):
			position = champ + slot * stride
			vect = SparseVector(len(partialVect), partialVect.indices, partialVect.values)
			vect.indices = numpy.append(vect.indices, [position])
			vect.values = numpy.append(vect.values, [sign])
			extended.append(vect)
			slot += 1

		return extended
示例#6
0
 def test_parse_vector(self):
     """Round-trip dense and sparse vectors through their string form."""
     empty_dense = DenseVector([])
     self.assertEqual(str(empty_dense), '[]')
     self.assertEqual(Vectors.parse(str(empty_dense)), empty_dense)

     dense = DenseVector([3, 4, 6, 7])
     self.assertEqual(str(dense), '[3.0,4.0,6.0,7.0]')
     self.assertEqual(Vectors.parse(str(dense)), dense)

     empty_sparse = SparseVector(4, [], [])
     self.assertEqual(str(empty_sparse), '(4,[],[])')
     self.assertEqual(SparseVector.parse(str(empty_sparse)), empty_sparse)

     sparse = SparseVector(4, [0, 2], [3, 4])
     self.assertEqual(str(sparse), '(4,[0,2],[3.0,4.0])')
     self.assertEqual(Vectors.parse(str(sparse)), sparse)

     # Extra whitespace inside the textual form must be tolerated.
     spaced = SparseVector(10, [0, 1], [4, 5])
     self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), spaced)
示例#7
0
 def test_dot(self):
     """Check dot products against vectors, a 2-D array and a typed array."""
     sparse = SparseVector(4, {1: 1, 3: 2})
     dense = DenseVector(array([1., 2., 3., 4.]))
     from_list = DenseVector([1, 2, 3, 4])
     # Four identical rows [1, 2, 3, 4].
     matrix = array([[1., 2., 3., 4.]] * 4)
     typed = pyarray.array('d', [0, 1, 2, 3])

     ten_to_forty = array([10., 20., 30., 40.])

     self.assertEqual(10.0, sparse.dot(dense))
     self.assertTrue(array_equal(array([3., 6., 9., 12.]), sparse.dot(matrix)))
     self.assertEqual(30.0, dense.dot(dense))
     self.assertTrue(array_equal(ten_to_forty, dense.dot(matrix)))
     self.assertEqual(30.0, from_list.dot(dense))
     self.assertTrue(array_equal(ten_to_forty, from_list.dot(matrix)))
     self.assertEqual(7.0, sparse.dot(typed))
示例#8
0
 def test_parse_vector(self):
     """Round-trip dense and sparse vectors through their string form."""
     # assertTrue(x, y) only checks that x is truthy (y is the failure
     # message), so every comparison here uses assertEqual instead.
     a = DenseVector([3, 4, 6, 7])
     self.assertEqual(str(a), "[3.0,4.0,6.0,7.0]")
     self.assertEqual(Vectors.parse(str(a)), a)
     a = SparseVector(4, [0, 2], [3, 4])
     self.assertEqual(str(a), "(4,[0,2],[3.0,4.0])")
     self.assertEqual(Vectors.parse(str(a)), a)
     # Extra whitespace inside the textual form must be tolerated.
     a = SparseVector(10, [0, 1], [4, 5])
     self.assertEqual(SparseVector.parse(" (10, [0,1 ],[ 4.0,5.0] )"), a)
示例#9
0
    from pyspark.mllib.feature import HashingTF, IDF

    hashingTF = HashingTF()
    tf = hashingTF.transform(Positive_Reviews_body)

    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    tfidf_T = tfidf \
        .zipWithIndex() \
        .flatMap(explode) \
        .map(lambda x: (x[1], [x[0], x[2]])) \
        .reduceByKey(lambda x, y: np.vstack([x, y])) \
        .map(lambda x: (x[0], np.array(x[1]).reshape(-1, 2))) \
        .map(lambda x: (x[0], x[1][x[1][:, 0].argsort()])) \
        .map(lambda x: (x[0], SparseVector(num_Postive_Sentence, x[1][:, 0], x[1][:, 1])))

    cosine_similarity = IndexedRowMatrix(tfidf_T).columnSimilarities()

    sim_matrix_full = cosine_similarity.entries \
        .flatMap(lambda x: ((x.j, x.i, x.value), (x.i, x.j, x.value)))

    avg_dist_each = sim_matrix_full \
        .map(lambda x: (x[0], (1, 1-x[2]))) \
        .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) \
        .map(lambda x: (x[0], (x[1][1] + num_Postive_Sentence - x[1][0] - 1)/ (num_Postive_Sentence - 1)))

    avg_dist_each_vector = np.array(avg_dist_each.collect())
    avg_dist_overall = np.mean(avg_dist_each_vector)

    center_index = avg_dist_each_vector[avg_dist_each_vector[:,
示例#10
0
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.linalg import SparseVector
from pyspark import SparkContext, SparkConf

# Script: train a gradient-boosted-trees regressor on four sparse labeled
# points, query it, and save it under the path 'model'.
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

# Two-dimensional sparse points: label 0.0 when feature 0 is set,
# label 1.0 when feature 1 is set.
sparse_data = [
    LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
    LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
    LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
    LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
]

data = sc.parallelize(sparse_data)

# The empty dict is the second positional argument of trainRegressor
# (presumably categoricalFeaturesInfo - confirm against the pyspark API).
model = GradientBoostedTrees.trainRegressor(data, {}, numIterations=10)
# The results of the next four calls are discarded; they only show up when
# run in an interactive session.
model.numTrees()

model.totalNumNodes()

model.predict(SparseVector(2, {1: 1.0}))

model.predict(SparseVector(2, {0: 1.0}))

# Predicting on an RDD of feature lists; the collected result is printed.
rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
print(model.predict(rdd).collect())

model.save(sc, 'model')
示例#11
0
from pyspark.ml.feature import OneHotEncoder

# Script: one-hot encode project_index, append three normalised numeric
# columns to each encoded vector, and cluster the result with KMeans (k=2).
finalDF2.registerTempTable("dfData")
finalDF2 = spark.sql(
    "SELECT name, Norm_views, Norm_bytes, Norm_inputNum, project_index FROM dfData"
)

# dropLast=False is passed so the encoder is not asked to drop the last category.
encoder = OneHotEncoder(dropLast=False,
                        inputCol="project_index",
                        outputCol="project_Vec")
encoded = encoder.transform(finalDF2)

# Re-register under the same table name so the next query sees project_Vec.
encoded.registerTempTable("dfData")
finalDF3 = spark.sql(
    "SELECT Norm_views, Norm_bytes, Norm_inputNum, project_Vec FROM dfData")

from pyspark.mllib.linalg import SparseVector
import numpy as np
# The vector size 824 and the trailing slots 821-823 are hard-coded; revise
# them to match the width of your own one-hot encoding result.
# NOTE(review): this assumes every one-hot index is below 821 - confirm.
RDD = finalDF3.rdd.map(lambda line: SparseVector(
    824, line["project_Vec"].indices.tolist() + [821, 822, 823], line[
        "project_Vec"].values.tolist() +
    [line["Norm_views"], line["Norm_bytes"], line["Norm_inputNum"]])).cache()

from pyspark.mllib.clustering import KMeans

# NOTE(review): `runs` may not be accepted by newer pyspark releases - confirm
# against the installed version.
clusters = KMeans.train(RDD,
                        2,
                        maxIterations=10,
                        runs=10,
                        initializationMode="k-means||")
示例#12
0
'''

from pyspark.mllib.linalg import SparseVector
from collections import Counter

from pyspark import SparkContext

if __name__ == "__main__":
    # Build a term-document matrix: one SparseVector of token counts per
    # document, indexed by a vocabulary collected from the whole corpus.

    sc = SparkContext('local', 'term_doc')
    corpus = sc.parallelize([
        "It is the east, and Juliet is the sun.", "A dish fit for the gods.",
        "Brevity is the soul of wit."
    ])

    # Whitespace tokenisation; cached because it is traversed twice below.
    tokens = corpus.map(lambda raw_text: raw_text.split()).cache()
    # token -> integer index, collected to the driver as a plain dict.
    local_vocab_map = tokens.flatMap(
        lambda token: token).distinct().zipWithIndex().collectAsMap()

    # Broadcast the vocabulary and its size for use inside the map closures.
    vocab_map = sc.broadcast(local_vocab_map)
    vocab_size = sc.broadcast(len(local_vocab_map))

    term_document_matrix = tokens \
                         .map(Counter) \
                         .map(lambda counts: {vocab_map.value[token]: float(counts[token]) for token in counts}) \
                         .map(lambda index_counts: SparseVector(vocab_size.value, index_counts))

    for doc in term_document_matrix.collect():
        # Function-call form prints the same text on Python 2 and is valid
        # syntax on Python 3, unlike the `print doc` statement.
        print(doc)
示例#13
0
# coding=utf-8

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Positive example (label 1.0) built from a plain list of feature values.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])
# Negative example (label 0.0) built from a size-3 sparse vector holding
# 1.0 at index 0 and 3.0 at index 2.
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))

print(pos)
print(neg)
示例#14
0
          for v in values:
            if v in vocab[col]:
              word_indices.append(start_index + vocab[col].index(v))
          for k, v in sorted(six.iteritems(Counter(word_indices))):
            feature_indices.append(k)
            feature_values.append(float(v))
        start_index += len(vocab[col])
      if col == target_col:
        label = vocab[col].index(col_value) if classification else col_value
    return {"label": label, "indices": feature_indices, "values": feature_values}
  
  return process_rows


# Build the per-row converter once; it is applied to every dataset below.
process_row_fn = make_process_rows_fn(
    classification, args.target, text_columns, category_columns, number_columns, vocab, stats)

# Collect (name, DataFrame) pairs for whichever inputs were supplied.
dfs = []
if args.train:
  dfTrain = spark.read.schema(schema).csv(args.train)
  dfs.append(("train", dfTrain))
if args.eval:
  dfEval = spark.read.schema(schema).csv(args.eval)
  dfs.append(("eval", dfEval))

# Convert each row into a LabeledPoint with sparse features and write the
# dataset in LibSVM format under <args.output>/<name>.
for name, df in dfs:
  rdd = df.rdd.map(process_row_fn).map(
      lambda row: LabeledPoint(row["label"],
                               SparseVector(feature_size, row["indices"], row["values"])))
  MLUtils.saveAsLibSVMFile(rdd, os.path.join(args.output, name))
示例#15
0
def to_sparse_vector(t):
    """Wrap `t` in a SparseVector whose length is the module-level `size`."""
    vector = SparseVector(size, t)
    return vector
def vectorizeUni(tokens):
    """Return a binary presence SparseVector for `tokens`.

    Each token is looked up in `dictionaryUni` to obtain its index; that index
    is set to 1.  A token missing from the dictionary raises KeyError.
    """
    presence = {dictionaryUni[token]: 1 for token in tokens}
    return SparseVector(len(dictionaryUni), presence)
示例#17
0
def main():
    """Build per-review cluster histograms and fit a rating regressor.

    Reads three positional command-line arguments: the saved KMeans model
    location, the saved Word2Vec model location, and the JSON reviews file.
    Every review becomes a 2000-slot histogram counting how often each cluster
    id was predicted for its words.  The histograms are normalised, joined
    with rating and year by row index, split into training (year < 2014) and
    test (year == 2014) sets, and passed to `validation` and
    `regression_and_error`.  The final RMSE and chosen step size are printed.
    """
    import re  # needed only for the numeric version check below

    k_input_model = sys.argv[1]  # read kmean model from this location
    w_input_model = sys.argv[2]  # read word2vec model from this location
    input_file = sys.argv[3]  # read input file

    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    # Compare version components numerically: as plain strings '1.10.0'
    # sorts before '1.5.1' and would be rejected by mistake.
    spark_version = tuple(int(part) for part in re.findall(r'\d+', sc.version)[:3])
    assert spark_version >= (1, 5, 1)

    sqlContext = SQLContext(sc)

    # sbaronia - load both kmean and Word2Vec model
    kmean_model = KMeansModel.load(sc, k_input_model)
    word2vec_model = Word2VecModel.load(sc, w_input_model)

    # sbaronia - select fields from json and make data frame zipped with index
    review = sqlContext.read.json(input_file).select('reviewText', 'overall', 'reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()

    clean_list = clean_words_rdd.collect()

    # sbaronia - make a list of all words in our model
    keys = sqlContext.read.parquet(w_input_model + "/data")
    keys_list = keys.rdd.map(lambda line: line.word).collect()
    # A set gives O(1) membership tests inside the nested loops below.
    known_words = set(keys_list)

    # sbaronia - here we create one vector per review, where the vector
    # contains the number of times a cluster is assigned to a word in
    # a review. We make a SparseVector compatible format.
    num_clusters = 2000
    features = []

    for words in clean_list:
        histogram = [0] * num_clusters
        for word in words:
            if word in known_words:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                histogram[clust] += 1
        features.append((num_clusters, range(num_clusters), histogram))

    # sbaronia - create a normalized SparseVector rdd
    # NOTE(review): each element handed to SparseVector.parse is a
    # (size, indices, values) tuple, not a string - confirm that parse accepts
    # this with the pyspark version in use.
    nor = Normalizer(1)
    features_rdd = rdd_zip(sc.parallelize(features) \
                             .map(lambda line: nor.transform(SparseVector.parse(line))) \
                             .cache()).cache()

    # sbaronia - make a dataframe with rating, year and vector per review
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()

    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                                 .drop(features_df.index).cache()

    # sbaronia - create training and testing data based on year
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                            .select('rating', 'feature') \
                            .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                            .coalesce(1) \
                            .cache()

    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
                           .select('rating', 'feature') \
                           .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                           .coalesce(1) \
                           .cache()

    # sbaronia - find best step using validation and run
    # LinearRegressionWithSGD with that step and report final RMSE
    step_best_norm = validation(train_rdd)

    RMSE_norm = regression_and_error(train_rdd, test_rdd, step_best_norm)

    print("Final RMSE(Normalization) = " + str(RMSE_norm) + "  Best Step size = " + str(step_best_norm))
示例#18
0
# Script: load the "selldata/selldata" CSV with both pandas and Spark, then
# turn every Spark row into a LabeledPoint and print how many were produced.
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf = conf)
sqlc = SQLContext(sc)

# Debug output only.
print(StructField , "go go go")

#df = sqlc.read.csv("hdfs://hadoop1:9000/home/hadoop/test.csv",header=True)

# The pandas copy is used only to print the column names below.
sel = pd.read_csv("selldata/selldata")

df = sqlc.read.csv("selldata/selldata",header=True)
print(sel.columns)
print(df)

# Column 9 is used as the label and columns 2..8 as seven feature values,
# stored at indices 0..6 of a size-7 SparseVector.
tran = df.rdd.map(lambda x:LabeledPoint(list(x)[9], SparseVector(7, [i for i in range(7)],list(x)[2:9])))

print(len(tran.collect()))


#tran = data.rdd.map(lambda x:LabeledPoint(list(x)[17], SparseVector(16, [i for i in range(16)],list(x)[1:17])))

#model = GradientBoostedTrees.trainRegressor(tran, {}, numIterations=10)

#model.save(sc,"./gbdymodelonlionev1")
#
# a = StructType([
#     StructField("ID",StringType(),False),
#     StructField("cust_type", StringType(), True),
#     StructField("cust_level", IntegerType(), True),
#     StructField("ID",StringType(),False),
示例#19
0
    terms = tags.split()

    # filter words that not exist in the vocabulary
    terms = [x for x in list(set(terms)) if x in list(set(vocabulary))]

    indices = list(map(lambda x: vocabulary.index(x), list(set(terms))))
    indices.sort()
    occurrences = list(
        map(lambda x: float(terms.count(vocabulary[x])), indices))

    return [len(vocabulary), indices, occurrences]


# Script: classify the tags given on the command line with a saved
# NaiveBayes model and print the matching class value.
conf = SparkConf()
conf.setAppName("NaiveBaye")
conf.set('spark.driver.memory', '6g')
conf.set('spark.executor.memory', '6g')
conf.set('spark.cores.max', 156)

# load tags passed as parameter
tags = sys.argv[1]
# Bag of words for those tags: [vocabulary size, indices, occurrences].
# NOTE: this rebinds the name `bow` from the function to its result, so the
# function cannot be called again afterwards.
bow = bow(tags)

sc = SparkContext(conf=conf)  # SparkContext

model = NaiveBayesModel.load(sc, "model")

result = model.predict(SparseVector(bow[0], bow[1], bow[2]))

# NOTE(review): `result` is used directly as a key/index into classValues -
# confirm that its type matches what classValues expects.
# Function-call form prints the same text on Python 2 and is valid syntax on
# Python 3, unlike the `print ...` statement.
print(str(classValues[result]))
def main():
    """Train and evaluate a small batch-normalised feed-forward classifier.

    Lines are read from '../ps_data/part-r-01088' through Spark.  The second
    tab-separated field of each line is used: its first character is the
    label and the text from its third character on is a space-separated list
    of 'index:value' features, densified to `nbr_feature` columns.  The data
    is split with `fSplitTrainAndTest`.  The network applies input dropout,
    one hidden layer with batch normalisation, ReLU and dropout, and a linear
    output layer (a second hidden layer is used when `nbr_of_layers` is not
    2).  Training runs for 100 epochs of mini-batches with Adam and an
    exponentially decayed learning rate.  Every tenth epoch the cross
    entropy, accuracy and AUC on both splits are printed.

    Returns:
        int: Always 1.
    """
    st_time = time.time()

    train_percentage = 0.67
    conf = (SparkConf().setMaster('local[*]').set(
        'spark.executor.memory',
        '4G').set('spark.driver.memory',
                  '45G').set('spark.driver.maxResultSize', '10G'))
    sc = SparkContext(conf=conf)
    # Disabled alternative input path (comma-separated training set); kept so
    # it can be switched back on by hand.
    if False:
        cid = 000000  # representing ps data
        # filename = 'ps_train.svm'
        # filename = 'hdfs://jetblue-nn1.blue.ygrid.yahoo.com:8020/projects/predseg/models/2017-09-29/ps.51/training_set'
        filename = '../ps_data/ps_oct/training_set'

        data = sc.textFile(filename)
        # The label is the last comma-separated field of each line.
        labels_sca = data.map(lambda line: line.split(',')).map(
            lambda y: float(y[len(y) - 1]))
        nbr_samples = data.count()
        l_sca = np.array(labels_sca.take(nbr_samples))
        # Two-column labels: [label, 1 - label].
        l = np.column_stack([np.array(l_sca), 1 - np.array(l_sca)])

        # Every field except the last one is a feature.
        features = data.map(lambda line: line.split(',')).map(
            lambda y: [float(y[i]) for i in range(len(y) - 1)])
        X = np.array(features.take(nbr_samples))
        nbr_feature = len(X[0])
        print('nbr of features: ' + str(nbr_feature))

        data_train, data_test = fSplitTrainAndTest(X, l, l_sca,
                                                   train_percentage)

    # Active input path: sparse 'index:value' features from a tab-separated file.
    nbr_feature = 600
    # filename_test_new = 'hdfs://jetblue-nn1.blue.ygrid.yahoo.com:8020/projects/predseg/xg/test_data/2017-09-20/ps.51/part-r-01088'
    filename_test_new = '../ps_data/part-r-01088'
    new_data_test = sc.textFile(filename_test_new)
    nbr_samples_test = new_data_test.count()
    data2 = new_data_test.map(lambda line: line.split('\t')).map(
        lambda x: x[1])
    labels = data2.map(lambda x: float(x[0]))
    feature_str = data2.map(lambda x: x[2:])
    t2 = feature_str.map(lambda lines: lines.split(' '))
    features = t2.map(lambda x: DenseVector(
        SparseVector(nbr_feature,
                     {int(i.split(':')[0]): float(i.split(':')[1])
                      for i in x})))
    l_sca_test = np.array(labels.take(nbr_samples_test))
    # Two-column labels: [label, 1 - label].
    l_test = np.column_stack([np.array(l_sca_test), 1 - np.array(l_sca_test)])
    X_test = np.array(features.take(nbr_samples_test))

    data_train, data_test = fSplitTrainAndTest(X_test, l_test, l_sca_test,
                                               train_percentage)

    n = len(data_train.X)  # total number of training samples
    d = len(data_train.X[0])  # number of features
    ll = len(data_train.labels[0])  # output dimension

    # Create the model
    x = tf.placeholder(tf.float32, [None, d])
    keep_prob = tf.placeholder(tf.float32)

    nbr_of_layers = 2
    nbr_layer1 = 250
    nbr_layer2 = 350
    epsilon = 1e-3  # keeps the batch-norm denominator away from zero

    x_drop = tf.nn.dropout(x, keep_prob)  # adding dropout in the input layer
    W1 = weight_variable([d, nbr_layer1])
    b1 = bias_variable([nbr_layer1])
    z1 = tf.matmul(x_drop, W1) + b1
    # Batch normalisation of the first hidden layer's pre-activations.
    batch_mean1, batch_var1 = tf.nn.moments(z1, [0])
    z1_hat = (z1 - batch_mean1) / tf.sqrt(batch_var1 + epsilon)
    scale1 = tf.Variable(tf.ones([nbr_layer1]))
    beta1 = tf.Variable(tf.zeros([nbr_layer1]))
    h1 = tf.nn.relu(scale1 * z1_hat + beta1)
    h1_drop = tf.nn.dropout(h1, keep_prob)
    if nbr_of_layers == 2:
        W2 = weight_variable([nbr_layer1, ll])
        b2 = bias_variable([ll])
        y = tf.matmul(h1_drop, W2) + b2
    else:
        W2 = weight_variable([nbr_layer1, nbr_layer2])
        b2 = bias_variable([nbr_layer2])
        z2 = tf.matmul(h1_drop, W2) + b2
        batch_mean2, batch_var2 = tf.nn.moments(z2, [0])
        z2_hat = (z2 - batch_mean2) / tf.sqrt(batch_var2 + epsilon)
        scale2 = tf.Variable(tf.ones([nbr_layer2]))
        beta2 = tf.Variable(tf.zeros([nbr_layer2]))
        h2 = tf.nn.relu(scale2 * z2_hat + beta2)
        h2_drop = tf.nn.dropout(h2, keep_prob)

        W3 = weight_variable([nbr_layer2, ll])
        b3 = bias_variable([ll])

        y = tf.matmul(h2_drop, W3) + b3

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, ll])

    tf.summary.histogram('W1', W1)
    tf.summary.histogram('W2', W2)
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    starter_learning_rate = 0.01
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                               global_step,
                                               decay_steps=5000,
                                               decay_rate=0.95,
                                               staircase=True,
                                               name=None)

    train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        cross_entropy, global_step=global_step)
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    predicted_class = tf.argmax(y, 1)
    true_class = tf.argmax(y_, 1)
    correct_prediction = tf.equal(predicted_class, true_class)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    # tf.metrics.auc takes (labels, predictions); pass them by keyword so the
    # ground truth and the model output cannot be swapped.
    auc_ftrain = tf.metrics.auc(
        labels=tf.cast(true_class, tf.float32),
        predictions=tf.cast(predicted_class, tf.float32))
    auc_ftest = tf.metrics.auc(
        labels=tf.cast(true_class, tf.float32),
        predictions=tf.cast(predicted_class, tf.float32))
    softmaxed_logits = tf.nn.softmax(y)

    # The AUC metrics keep their counters in local variables.  One
    # initialisation is enough; the extra deprecated
    # tf.initialize_local_variables() call was redundant.
    tf.local_variables_initializer().run()
    tf.summary.scalar('cross_entropy', cross_entropy)
    tf.summary.scalar('accuracy', accuracy)
    tf.summary.scalar('auc_ftrain', auc_ftrain[0])
    tf.summary.scalar('auc_ftest', auc_ftest[0])

    train_writer = tf.summary.FileWriter("/tmp/histogram_example/train",
                                         sess.graph)
    test_writer = tf.summary.FileWriter("/tmp/histogram_example/test")

    summaries = tf.summary.merge_all()

    batch_size = 100

    for i in range(100):

        # train the whole epoch (first shuffle the data)
        idx = np.arange(0, n)
        np.random.shuffle(idx)
        X_shuffle = [data_train.X[k] for k in idx]
        labels_shuffle = [data_train.labels[k] for k in idx]

        for j in range(n // batch_size):
            # Slice ends are exclusive, so no "- 1": subtracting one silently
            # dropped the last sample of every batch.
            batch_start = j * batch_size
            batch_end = batch_start + batch_size
            batch_xs = X_shuffle[batch_start:batch_end]
            batch_ys = labels_shuffle[batch_start:batch_end]
            sess.run(train_step,
                     feed_dict={
                         x: batch_xs,
                         y_: batch_ys,
                         keep_prob: 0.5
                     })

        # finish training, try on testing data
        # `==` compares values; `is` compares identity and only happened to
        # work for small integers.
        if i % 10 == 0:
            print(i)

            soft_logits_train, summary_train, ca_train_i, ac_train_i, auc_train_i = sess.run(
                [
                    softmaxed_logits, summaries, cross_entropy, accuracy,
                    auc_ftrain
                ],
                feed_dict={
                    x: data_train.X,
                    y_: data_train.labels,
                    keep_prob: 1.0
                })

            soft_logits_test, summary_test, ca_test_i, ac_test_i, auc_test_i = sess.run(
                [
                    softmaxed_logits, summaries, cross_entropy, accuracy,
                    auc_ftest
                ],
                feed_dict={
                    x: data_test.X,
                    y_: data_test.labels,
                    keep_prob: 1.0
                })
            # train_writer.add_summary(summary_train, i)
            # test_writer.add_summary(summary_test, i)

            sk_auc_train = metrics.roc_auc_score(
                y_true=np.array(data_train.labels),
                y_score=np.array(soft_logits_train))
            sk_auc_test = metrics.roc_auc_score(
                y_true=np.array(data_test.labels),
                y_score=np.array(soft_logits_test))
            print('learning rate: ' + str(sess.run(learning_rate)))

            print('train cross entropy: ' + str(ca_train_i))
            print('test cross entropy: ' + str(ca_test_i))

            print('train accuracy: ' + str(ac_train_i))
            print('test accuracy: ' + str(ac_test_i))

            print('train auc: ' + str(auc_train_i[0]))
            print('test auc: ' + str(auc_test_i[0]))

            print('train sk auc: ' + str(sk_auc_train))
            print('test sk auc: ' + str(sk_auc_test))

    sess.close()
    sc.stop()

    end_time = time.time()
    print('run time: ' + str(round(end_time - st_time)) + ' seconds')
    print('tensorboard --logdir=/tmp/histogram_example')
    return 1
示例#21
0
import numpy as np
from scipy.sparse import csr_matrix
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# 3x3 matrix in SciPy compressed-sparse-row form.
M = csr_matrix([[4, 1, 0], [4, 0, 3], [0, 0, 1]])
label = 0.0
# Labeled point whose features are a size-3 sparse vector holding 1.0 at
# index 0 and 3.0 at index 2.
point = LabeledPoint(label, SparseVector(3, [0, 2], [1.0, 3.0]))

# `sc` is expected to be an existing SparkContext (e.g. the pyspark shell).
textRDD = sc.textFile("README.md")
# Function-call form prints the same text on Python 2 and is valid syntax on
# Python 3, unlike the `print ...` statement.
print(textRDD.count())
示例#22
0
def to_labeledpoint(line):
    """Parse a '<label> :: <sparse vector text>' line into a LabeledPoint."""
    parts = line.split(' :: ')
    label_text = parts[0]
    features = SparseVector.parse(parts[1])
    return LabeledPoint(label_text, features)
示例#23
0
 def change_labelPoint(rdd, total_feas):
     """Build a LabeledPoint from a record with 'label', 'pos' and 'val' entries.

     'pos' and 'val' supply the indices and values of a SparseVector of
     length `total_feas`.
     """
     features = SparseVector(total_feas, rdd['pos'], rdd['val'])
     return LabeledPoint(rdd['label'], features)
def vectorizeBi(tokens):
    """Return a binary presence SparseVector for `tokens`.

    Each token is looked up in `dictionaryBigrams` to obtain its index; that
    index is set to 1.  A token missing from the dictionary raises KeyError.
    """
    presence = {dictionaryBigrams[token]: 1 for token in tokens}
    return SparseVector(len(dictionaryBigrams), presence)
示例#25
0
# Script: index the string column 'type', one-hot encode the index, and show
# the mapping; then print a dense and a sparse vector for comparison.
indexer = StringIndexer(inputCol='type', outputCol='type_idx')

# Assign index values to strings
indexer = indexer.fit(cars)
# Create column with index values
cars = indexer.transform(cars)

pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)
#print(cars.toPandas().sample(12))

# Check column data types
print('\n', cars.dtypes, '\n')

# Keep only the columns relevant to the encoding demonstration.
kars = cars.select('name', 'type', 'type_idx')

# Print 12 randomly sampled rows via pandas.
print(kars.toPandas().sample(12))

# One-hot encode the index column into 'type_dummy'.
onehot = OneHotEncoderEstimator(inputCols=['type_idx'],
                                outputCols=['type_dummy'])
onehot = onehot.fit(kars)
kars = onehot.transform(kars)
# One row per distinct (type, index, encoding) combination, ordered by index.
kars.select('type', 'type_idx',
            'type_dummy').distinct().sort('type_idx').show()

# The same 8-element vector written densely and sparsely.
print("DenseVector:", DenseVector([1, 0, 0, 0, 0, 7, 0, 0]))
print("SparseVector:", SparseVector(8, {0: 1.0, 5: 7.0}))

spark.stop()
示例#26
0
def vectorize(ratings, numMovies):
    """Group rating triples by their first field into SparseVectors.

    Each element x of `ratings` contributes the pair (x[1], x[2]) under key
    x[0]; the pairs grouped under one key become a SparseVector of length
    `numMovies`.
    """
    keyed = ratings.map(lambda x: (x[0], (x[1], x[2])))
    grouped = keyed.groupByKey()
    return grouped.mapValues(lambda pairs: SparseVector(numMovies, pairs))
示例#27
0
 def test_serialize(self):
     """Run the serialization check on dense vectors from several sources and a sparse vector."""
     from_range = DenseVector(range(10))
     self._test_serialize(from_range)

     from_ndarray = DenseVector(array([1., 2., 3., 4.]))
     self._test_serialize(from_ndarray)

     from_typed_array = DenseVector(pyarray.array('d', range(10)))
     self._test_serialize(from_typed_array)

     sparse = SparseVector(4, {1: 1, 3: 2})
     self._test_serialize(sparse)
示例#28
0
dv1
#array([ 2.,  0.,  5.])

# Sparse vector uses integer indices and double values.

# The declared size must exceed the largest index, so a vector with an entry
# at index 3 needs a size of at least 4.
sv1 = Vectors.sparse(4, [0, 3], [5.0, 1.0])
sv1
#SparseVector(4, {0: 5.0, 3: 1.0})

# Labeled Point:  This can be dense or Sparse vector with a label used in supervised learning.
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Labeled point with a positive label and a dense feature vector
lp_pos = LabeledPoint(1.0, [4.0, 0.0, 2.0])
lp_pos
# LabeledPoint(1.0, [4.0,0.0,2.0])

# Labeled point with a negative label and a sparse feature vector
lp_neg = LabeledPoint(0.0, SparseVector(5, [1, 2], [3.0, 5.0]))
lp_neg
#LabeledPoint(0.0, (5,[1,2],[3.0,5.0]))

# Local Matrix:  This is a matrix with integer type indices and double type values.  This is also stored on single machine.
from pyspark.mllib.linalg import Matrix, Matrices

# Dense matrix ((1.0, 2.0, 3.0), (4.0, 5.0, 6.0))
dMatrix = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])
# Sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
sMatrix = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
示例#29
0
    LabeledPoint(6.0, [3.0, 4.0])
]  # 训练集
# Train a linear regression model with SGD, starting from weights [1.0, 1.0].
lrm = LinearRegressionWithSGD.train(sc.parallelize(data),
                                    iterations=100,
                                    initialWeights=np.array([1.0, 1.0]))
print(lrm.predict(np.array([2.0, 1.0])))  # predict with the trained regression model

import os, tempfile
from pyspark.mllib.regression import LinearRegressionModel
from pyspark.mllib.linalg import SparseVector

# Round-trip the model through a fresh temporary directory.
path = tempfile.mkdtemp()
lrm.save(sc, path)  # save the model to external storage
sameModel = LinearRegressionModel.load(sc, path)  # load the model back
print(sameModel.predict(SparseVector(2, {
    0: 100.0,
    1: 150
})))  # a sparse vector as input; returns a single prediction
# Build a 100 x 100 grid of two-feature sparse vectors.
test_set = []
for i in range(100):
    for j in range(100):
        test_set.append(SparseVector(2, {0: i, 1: j}))
print(sameModel.predict(sc.parallelize(test_set)).collect())  # predict many values; predict returns an RDD
print(sameModel.weights)  # the model parameters

# -----------------Ridge regression------------------

from pyspark.mllib.regression import RidgeRegressionWithSGD

data = [
    LabeledPoint(1.0, [1.0, 1.0]),
    LabeledPoint(4.0, [1.0, 3.0]),
示例#30
0
sampleOHEDictManual[(2, 'salmon')] = 6

# COMMAND ----------

# MAGIC %md #### ** (1b) Sparse vectors **
# MAGIC #### Data points can typically be represented with a small number of non-zero OHE features relative to the total number of features that occur in the dataset.  By leveraging this sparsity and using sparse vector representations of OHE data, we can reduce storage and computational burdens.  Below are a few sample vectors represented as dense numpy arrays.

# COMMAND ----------

import numpy as np
from pyspark.mllib.linalg import SparseVector

# COMMAND ----------

# Each dense array is paired with a SparseVector holding the same values.
aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(4, [1, 3], [3., 4.])

bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(4, [3], [1.])

# The dense and sparse forms should give the same dot product with w.
# Function-call form prints the same text on Python 2 and is valid syntax on
# Python 3, unlike the `print ...` statement.
w = np.array([0.4, 3.1, -1.4, -.5])
print(aDense.dot(w))
print(aSparse.dot(w))
print(bDense.dot(w))
print(bSparse.dot(w))

# COMMAND ----------

# MAGIC %md #### **(1c) OHE features as sparse vectors **Any feature that occurs in a point should have the value 1.0.  For example, the `DenseVector` for a point with features 2 and 4 would be `[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0]`.

# COMMAND ----------
示例#31
0
def vectorizeBi(row,dico):
    """Encode the bigrams of ``row`` as a binary SparseVector.

    Args:
        row: Object exposing ``bigrams`` (an iterable) and ``label``.
        dico: Mapping from bigram to its vector index; bigrams that are not
            keys of ``dico`` are ignored.

    Returns:
        tuple: ``(row.label, SparseVector)`` where the vector has size
        ``len(dico)`` and holds 1 at the index of every known bigram.
    """
    active = {dico[bigram]: 1 for bigram in row.bigrams if bigram in dico}
    return (row.label, SparseVector(len(dico), active))
        # print (Vectors.sparse(row[0][0],row[0][1],row[0][2]) )
        print(row)
        break

    exit()
    """

    # zipping the word with its probability distribution for that topic
    termsRDD = topicsRDD.map(lambda topic:
                             (zip(itemgetter(*topic[0])(vocablist), topic[1])))

    zippedRDD = topicsRDD.map(lambda topic: (zip(topic[0], topic[1])))

    # for Every topic, sparse vector of distribution over words
    docCalcs = zippedRDD.map(lambda topic: DenseVector(
        (SparseVector(vocabSize, topic)).toArray()))

    #schema = StructType([StructField("topicwordDistribution", Vector(), False)])

    #df = sqlContext.applySchema(docCalcs, schema)

    #docCalcs = docCalcs.map(lambda l: Row(l))

    docCalcs = docCalcs.zipWithIndex()

    #docCalcs = docCalcs.collect()

    docCalcs = sqlContext.createDataFrame(docCalcs,
                                          ['topicwordDistribution', 'topicId'])

    #print(type(docCalcs))
示例#33
0
def get_training_vector(classification, term_list, classifications,
                        number_of_terms):
    """Build a binary-labelled training point from a term list.

    Args:
        classification: Class of this sample.
        term_list: Entries handed straight to ``SparseVector`` as the
            feature content.
        classifications: Container of classes treated as positive.
        number_of_terms: Size of the feature vector.

    Returns:
        LabeledPoint: Label 1 when ``classification`` is in
        ``classifications`` and 0 otherwise, with a ``SparseVector`` of size
        ``number_of_terms`` as features.
    """
    label = int(classification in classifications)
    features = SparseVector(number_of_terms, term_list)
    return LabeledPoint(label, features)
示例#34
0
# Use a single-column SciPy csc_matrix as a sparse vector.
sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape=(3, 1))





# Example 11-3
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Positive example (label 1.0): features given as a plain list, i.e. a dense vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])

# Negative example (label 0.0): features given as a size-3 sparse vector
# described by an {index: value} mapping (1.0 at index 0, 3.0 at index 2).
neg = LabeledPoint(0.0, SparseVector(3, {0: 1.0, 2: 3.0}))






# Example 11-5
from numpy import array
from pyspark.mllib.linalg import Vectors

# Create the dense vector <1.0, 2.0, 3.0> 
denseVec1 = array([1.0, 2.0, 3.0]) # NumPy arrays can be passed directly to MLlib
denseVec2 = Vectors.dense([1.0, 2.0, 3.0]) # .. or you can use the Vectors class

# Create the sparse vector <1.0, 0.0, 2.0, 0.0>; the methods for this take only
示例#35
0
 def test_sparse_vector_iteration(self):
     """Iterating a SparseVector yields every position, 0.0 where nothing is stored."""
     cases = (
         (SparseVector(3, [], []), [0.0, 0.0, 0.0]),
         (SparseVector(5, [0, 3], [1.0, 2.0]), [1.0, 0.0, 0.0, 2.0, 0.0]),
     )
     for vector, expected in cases:
         self.assertListEqual(list(vector), expected)
示例#36
0
    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4])

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print('model accuracy {}'.format(accuracy))

    # Save and load model
    output_dir = 'output/'
    shutil.rmtree(output_dir, ignore_errors=True)
    model.save(sc, output_dir)
    sameModel = NaiveBayesModel.load(sc, output_dir)
    predictionAndLabel = test.map(lambda p:
                                  (sameModel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print('sameModel accuracy {}'.format(accuracy))

    from pyspark.mllib.linalg import SparseVector
    testsparsevector = SparseVector(692, [5, 6], [5.0, 6.0])

    print(sameModel.predict(testsparsevector))

    # $example off$
    sc.stop()
# Check two entries of the hand-built OHE dictionary against their expected
# hashes, then check the total number of entries.
Test.assertEqualsHashed(
    sampleOHEDictManual[(2, 'mouse')],
    'ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4',
    "incorrect value for sampleOHEDictManual[(2,'mouse')]"
)
Test.assertEqualsHashed(
    sampleOHEDictManual[(2, 'salmon')],
    'c1dfd96eea8cc2b62785275bca38ac261256e278',
    "incorrect value for sampleOHEDictManual[(2,'salmon')]"
)
Test.assertEquals(
    len(sampleOHEDictManual),
    7,
    'incorrect number of keys in sampleOHEDictManual'
)


# ** Sparse vectors **
import numpy as np
from pyspark.mllib.linalg import SparseVector

# Dense and sparse encodings of the same two 4-element vectors.
# The sparse form lists only the non-zero (index, value) pairs; storing
# explicit 0.0 entries would make the "sparse" vector as large as the dense
# one and would not be the canonical representation of the vector.
aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(4, [(1, 3.), (3, 4.)])

bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(4, [(3, 1.)])

w = np.array([0.4, 3.1, -1.4, -.5])

# Each dense/sparse pair should print the same dot product with w.
# A parenthesised single-argument print works on both Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(aDense.dot(w))
print(aSparse.dot(w))
print(bDense.dot(w))
print(bSparse.dot(w))


# TEST Sparse Vectors
# Both exercise answers must be SparseVector instances; each failure message
# names the variable that was actually checked.
Test.assertTrue(isinstance(aSparse, SparseVector), 'aSparse needs to be an instance of SparseVector')
Test.assertTrue(isinstance(bSparse, SparseVector), 'bSparse needs to be an instance of SparseVector')
Test.assertTrue(aDense.dot(w) == aSparse.dot(w),
示例#38
0
class VectorUDTTests(MLlibTestCase):
    """Tests for VectorUDT round trips, schema inference and matrix construction."""

    # Shared fixtures: an empty and a populated vector of each kind, plus the UDT.
    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        """The UDT equals itself after a jsonValue()/fromJson() round trip."""
        restored = VectorUDT.fromJson(self.udt.jsonValue())
        self.assertEqual(restored, self.udt)

    def test_serialization(self):
        """Every fixture vector equals its serialize()/deserialize() round trip."""
        for vector in (self.dv0, self.dv1, self.sv0, self.sv1):
            serialized = self.udt.serialize(vector)
            self.assertEqual(vector, self.udt.deserialize(serialized))

    def test_infer_schema(self):
        """toDF() gives the "features" column the VectorUDT type and keeps the vectors."""
        points = [
            LabeledPoint(1.0, self.dv1),
            LabeledPoint(0.0, self.sv1),
        ]
        df = self.sc.parallelize(points).toDF()

        feature_fields = [f for f in df.schema.fields if f.name == "features"]
        self.assertEqual(feature_fields[0].dataType, self.udt)

        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for vector in vectors:
            # Choose the fixture that matches the concrete vector type.
            if isinstance(vector, SparseVector):
                expected = self.sv1
            elif isinstance(vector, DenseVector):
                expected = self.dv1
            else:
                raise TypeError("expecting a vector but got %r of type %r" %
                                (vector, type(vector)))
            self.assertEqual(vector, expected)

    def test_row_matrix_from_dataframe(self):
        """RowMatrix accepts a one-vector DataFrame and rejects a string column."""
        from pyspark.sql.utils import IllegalArgumentException

        df = self.spark.createDataFrame([Row(Vectors.dense(1))])
        built = RowMatrix(df)
        self.assertEqual(built.numRows(), 1)
        self.assertEqual(built.numCols(), 1)

        with self.assertRaises(IllegalArgumentException):
            RowMatrix(df.selectExpr("'monkey'"))

    def test_indexed_row_matrix_from_dataframe(self):
        """IndexedRowMatrix accepts (index, vector) rows and rejects a frame without "_1"."""
        from pyspark.sql.utils import IllegalArgumentException

        df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
        built = IndexedRowMatrix(df)
        self.assertEqual(built.numRows(), 1)
        self.assertEqual(built.numCols(), 1)

        with self.assertRaises(IllegalArgumentException):
            IndexedRowMatrix(df.drop("_1"))

    def test_row_matrix_invalid_type(self):
        """multiply() raises TypeError for a string operand on both matrix kinds."""
        invalid_type = ""

        rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
        matrix = RowMatrix(rows)
        with self.assertRaises(TypeError):
            matrix.multiply(invalid_type)

        irows = self.sc.parallelize([
            IndexedRow(0, [1, 2, 3]),
            IndexedRow(1, [4, 5, 6]),
        ])
        imatrix = IndexedRowMatrix(irows)
        with self.assertRaises(TypeError):
            imatrix.multiply(invalid_type)
def normalized_labeledpoint(line,nor):
    """Turn a ``label :: vector`` text line into a LabeledPoint with transformed features.

    Args:
        line: Text whose first two ``' :: '``-separated fields are the label
            and a string accepted by ``SparseVector.parse``.
        nor: Object whose ``transform`` method is applied to the parsed
            vector (presumably a fitted normalizer -- confirm at the call site).

    Returns:
        LabeledPoint: The label field paired with the transformed vector.
    """
    fields = line.split(' :: ')
    label_text = fields[0]
    parsed = SparseVector.parse(fields[1])
    return LabeledPoint(label_text, nor.transform(parsed))
示例#40
0
def parseVector(line):
    """Build a SparseVector of size TOTAL_DOCS from one tab-separated line.

    The line is split on tab characters into exactly two fields. The first
    field is discarded; the second is evaluated as Python source and the
    result is passed to ``SparseVector`` as the vector's entries.

    Args:
        line: Text with exactly one tab. The part after the tab is presumably
            the repr of a list of (index, value) tuples -- confirm against
            whatever produces these lines.

    Returns:
        SparseVector: Vector of size ``TOTAL_DOCS`` built from the evaluated
        second field.

    Raises:
        ValueError: If the line does not split into exactly two fields.
    """
    # Exactly two tab-separated fields are expected; the first one is unused.
    _,indices_tuple_ls = line.split('\t')
    # SECURITY NOTE(review): eval() executes whatever expression the input
    # contains. This is only acceptable for trusted input; if the field is
    # always a plain literal, ast.literal_eval would be a safer replacement.
    indices_tuple_ls = eval(indices_tuple_ls) # Turn the text into the Python object it spells.
    return SparseVector(TOTAL_DOCS,indices_tuple_ls)
示例#41
0
# Created by Raju Kumar Mishra
# Book PySpark Recipes
# Chapter 9
# Recipe 9-2.  Create a Sparse Vector.
# Run following PySpark code lines, line by line in PySpark shell

from pyspark.mllib.linalg import SparseVector
# Values to store; they pair up positionally with the indices [0, 7] below.
sparseDataList = [1.0, 3.2]
# Size-8 sparse vector holding 1.0 at index 0 and 3.2 at index 7.
sparseDataVector = SparseVector(8, [0, 7], sparseDataList)
# The bare expressions below are meant for the interactive shell, which
# echoes each value; run as a script they have no visible effect.
sparseDataVector
sparseDataVector[1]  # index 1 is not among the stored indices [0, 7]
sparseDataVector[7]  # index 7 is stored and paired with 3.2
sparseDataVector.numNonzeros()
# A second size-8 vector with four stored entries at indices 0, 3, 4 and 6.
sparseDataList1 = [3.0, 1.4, 2.5, 1.2]
sparseDataVector1 = SparseVector(8, [0, 3, 4, 6], sparseDataList1)
# Squared distance between the two sparse vectors.
squaredDistance = sparseDataVector.squared_distance(sparseDataVector1)
squaredDistance
def add_sparse_vector(vec1, vec2):
    """Add two vectors element-wise and return the sum as a SparseVector.

    Both inputs are expanded with ``toArray()``, added, and the dense sum is
    passed to ``get_sparse_index``, which supplies the indices and values for
    the result (presumably the non-zero positions -- confirm in that helper).

    Args:
        vec1: First vector; must provide ``toArray()``.
        vec2: Second vector; must provide ``toArray()``.

    Returns:
        SparseVector: Vector of size ``NUMBER_OF_CATEGORY`` built from the
        indices and values returned by ``get_sparse_index``.
    """
    dense_sum = vec1.toArray() + vec2.toArray()
    indices, values = get_sparse_index(dense_sum)
    return SparseVector(NUMBER_OF_CATEGORY, indices, values)