Example #1
def gradientSummand(weights, lp):
    """Calculates the gradient summand for a single observation.

    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray` and they can be used interchangably
        within this function.  For example, they both implement the `dot` method.

    Args:
        weights (DenseVector): An array of model weights (betas).
        lp (LabeledPoint): The `LabeledPoint` for a single observation.

    Returns:
        DenseVector: An array of values the same length as `weights`.  The gradient summand.
    """
    return (weights.dot(lp.features) - lp.label) * lp.features


exampleW = DenseVector([1, 1, 1])
exampleLP = LabeledPoint(2.0, [3, 1, 4])
# gradientSummand = (dot([1 1 1], [3 1 4]) - 2) * [3 1 4] = (8 - 2) * [3 1 4] = [18 6 24]
summandOne = gradientSummand(exampleW, exampleLP)
print summandOne

exampleW = DenseVector([.24, 1.2, -1.4])
exampleLP = LabeledPoint(3.0, [-1.4, 4.2, 2.1])
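# gradientSummand = (dot([.24 1.2 -1.4], [-1.4 4.2 2.1]) - 3) * [-1.4 4.2 2.1] = (1.764 - 3) * [-1.4 4.2 2.1] = [1.7304 -5.1912 -2.5956]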
summandTwo = gradientSummand(exampleW, exampleLP)
print summandTwo

# In[72]:

# TEST Gradient summand (3a)
Test.assertTrue(np.allclose(summandOne, [18., 6., 24.]),
                'incorrect value for summandOne')
Example #2
def test_vector(self):
    ewp = ElementwiseProduct(scalingVec=[1, 3])
    self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0]))
    ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4]))
    self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4]))
    self.assertRaises(TypeError, lambda: ElementwiseProduct(scalingVec=["a", "b"]))
Example #3
# MAGIC Note that `DenseVector` stores all values as `np.float64`, so even if you pass in a NumPy array of integers, the resulting `DenseVector` will contain floating-point numbers. Also, `DenseVector` objects exist locally and are not inherently distributed. `DenseVector` objects can be used in the distributed setting either by passing functions that contain them to resilient distributed dataset (RDD) transformations or by distributing them directly as RDDs.
# MAGIC
# MAGIC For this exercise, create a `DenseVector` consisting of the values `[3.0, 4.0, 5.0]` and compute the dot product of this vector with `numpyVector`.

# COMMAND ----------

from pyspark.mllib.linalg import DenseVector
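
# A quick illustration of the note above (added sketch, assuming the notebook's
# pre-defined SparkContext `sc`): DenseVector coerces integer input to
# np.float64, and locally created DenseVectors can be used inside RDD
# transformations.
intVector = DenseVector([1, 2, 3])
print intVector.toArray().dtype  # float64, even though the inputs were Python ints
print sc.parallelize([[1, 2], [3, 4]]).map(lambda r: DenseVector(r)).collect()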

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3, 4, 5])
# Calculate the dot product between the two vectors.
denseDotProduct = numpyVector.dot(myDenseVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)

# COMMAND ----------

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector),
                'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0),
                'incorrect value for denseDotProduct')
Example #4
pipeline = Pipeline(stages=indexers)
df_indexed = pipeline.fit(df).transform(df)

#S=>0 C=>1 Q=>2
df_indexed.select('Embarked', 'Embarked_indexed').show(3)

# convert features to vectors
enumVarsIndexed = [i + '_indexed' for i in enumVars]
featuresCol = numVars + enumVarsIndexed
featuresCol.remove('Survived')
labelCol = ['Mark', 'Survived']
row = Row('mark', 'label', 'features')

# map row fields: r[0] -> mark, r[1] -> label, r[2:] -> features
df_indexed = df_indexed[labelCol + featuresCol]
lf = df_indexed.map(lambda r: (row(r[0], r[1], DenseVector(r[2:])))).toDF()

lf = StringIndexer(inputCol='label', outputCol='index').fit(lf).transform(lf)

lf.show(3)

# separate train/test data
train = lf.where(lf.mark == 'train')
test = lf.where(lf.mark == 'test')

train, validation = train.randomSplit([0.8, 0.2], seed=110)

print 'Training Data Number: ' + str(train.count())
print 'Validation Data Number: ' + str(validation.count())
print 'Test Data Number: ' + str(test.count())
Example #5
# MAGIC #### `DenseVector` implements several functions.  The only function needed for this course is `DenseVector.dot()`, which operates just like `np.ndarray.dot()`.
# MAGIC #### Note that `DenseVector` stores all values as `np.float64`, so even if you pass in a NumPy array of integers, the resulting `DenseVector` will contain floating-point numbers. Also, `DenseVector` objects exist locally and are not inherently distributed.  `DenseVector` objects can be used in the distributed setting either by passing functions that contain them to resilient distributed dataset (RDD) transformations or by distributing them directly as RDDs.  You'll learn more about RDDs in the Spark tutorial.
# MAGIC #### For this exercise, create a `DenseVector` consisting of the values `[3.0, 4.0, 5.0]` and compute the dot product of this vector with `numpyVector`.

# COMMAND ----------

from pyspark.mllib.linalg import DenseVector
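
# A small added check (not part of the exercise; assumes `np` was imported in an
# earlier cell): DenseVector.dot() and np.ndarray.dot() return the same scalar.
checkDV = DenseVector([1.0, 2.0, 3.0])
checkArr = np.array([1.0, 2.0, 3.0])
print checkDV.dot(checkArr)  # 14.0 via DenseVector.dot()
print checkArr.dot(checkDV)  # 14.0 via np.ndarray.dot()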

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
denseDotProduct = np.dot(numpyVector, myDenseVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)

# COMMAND ----------

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')

# COMMAND ----------
Example #6
  bin/spark-submit examples/src/main/python/ml/simple_params_example.py
"""

if __name__ == "__main__":
    if len(sys.argv) > 1:
        print("Usage: simple_params_example", file=sys.stderr)
        exit(1)
    sc = SparkContext(appName="PythonSimpleParamsExample")
    sqlContext = SQLContext(sc)

    # prepare training data.
    # We create an RDD of LabeledPoints and convert them into a DataFrame.
    # Spark DataFrames can automatically infer the schema from named tuples
    # and LabeledPoint implements __reduce__ to behave like a named tuple.
    training = sc.parallelize([
        LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
        LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])),
        LabeledPoint(0.0, DenseVector([2.0, 1.3, 1.0])),
        LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))
    ]).toDF()
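
    # Added illustration: the schema inferred from LabeledPoint's named-tuple
    # form should expose `label` and `features` columns; printSchema() shows it.
    training.printSchema()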

    # Create a LogisticRegression instance with maxIter = 10.
    # This instance is an Estimator.
    lr = LogisticRegression(maxIter=10)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # We may also set parameters using setter methods.
    lr.setRegParam(0.01)

    # Learn a LogisticRegression model.  This uses the parameters stored in lr.
Example #7
def test_model_transform(self):
    data = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
    model = StandardScaler().fit(self.sc.parallelize(data))
    self.assertEqual(model.transform([1.0, 2.0, 3.0]),
                     DenseVector([1.0, 2.0, 3.0]))
Example #8
# #### `DenseVector` implements several functions.  The only function needed for this course is `DenseVector.dot()`, which operates just like `np.ndarray.dot()`.
# #### Note that `DenseVector` stores all values as `np.float64`, so even if you pass in a NumPy array of integers, the resulting `DenseVector` will contain floating-point numbers. Also, `DenseVector` objects exist locally and are not inherently distributed.  `DenseVector` objects can be used in the distributed setting either by passing functions that contain them to resilient distributed dataset (RDD) transformations or by distributing them directly as RDDs.  You'll learn more about RDDs in the Spark tutorial.
# #### For this exercise, create a `DenseVector` consisting of the values `[3.0, 4.0, 5.0]` and compute the dot product of this vector with `numpyVector`.

# In[31]:

from pyspark.mllib.linalg import DenseVector

# In[33]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector(np.array([3, 4, 5]))
# Calculate the dot product between the two vectors.
denseDotProduct = DenseVector.dot(DenseVector(numpyVector), myDenseVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)

# In[34]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector),
                'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0),
                'incorrect value for denseDotProduct')
Example #9
def test_serialize(self):
    self._test_serialize(DenseVector(range(10)))
    self._test_serialize(DenseVector(array([1., 2., 3., 4.])))
    self._test_serialize(DenseVector(pyarray.array('d', range(10))))
    self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
Example #10
if __name__ == '__main__':

    data, out = sys.argv[1:]
    conf = SparkConf().setAppName('ResNETBroadCast')
    conf.set('spark.sql.execution.arrow.enabled', 'true')
    # smaller batches for nodes with small memory
    conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1024')
    # allow overwrite s3 files
    conf.set('spark.hadoop.orc.overwrite.output.file', 'true')

    sc = SparkContext.getOrCreate(conf)
    spark = SQLContext(sc)
    data = spark.read.csv(data, inferSchema=True, header=True)
    # filter documents longer than MAX_REVIEW_LENGTH words
    data = data.withColumn('review_length', F.size(F.split(F.col('Text'),
                                                           ' ')))

    data = data.where(F.col('review_length') <= MAX_REVIEW_LENGTH)
    # repartition and embed
    res = data.repartition(PARTITIONS)\
            .rdd.mapPartitions(simple_embed)\
            .map(lambda x: (x.id, DenseVector(x.value)))

    frame = spark.createDataFrame(res, schema=schema)
    # out is an s3 bucket
    frame.repartition(WRITE_PARTITIONS)\
        .write.mode('overwrite')\
        .format('parquet')\
        .save(out)
Example #11
def transform(self, df):
    transformed = super(ALSBinaryModel, self).transform(df)
    as_vector = udf(lambda x: DenseVector([1 - x, x]), VectorUDT())
    return transformed.withColumn("rawPrediction",
                                  as_vector(col("prediction")))