Example #1
 def test_dot(self):
     from scipy.sparse import lil_matrix
     lil = lil_matrix((4, 1))
     lil[1, 0] = 1
     lil[3, 0] = 2
     dv = DenseVector(array([1., 2., 3., 4.]))
     self.assertEqual(10.0, dv.dot(lil))
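For a quick sanity check (not part of the test), the expected value can be reproduced with plain NumPy, since the lil_matrix above densifies to [0., 1., 0., 2.]:

import numpy as np

dense = np.array([1., 2., 3., 4.])
lil_as_dense = np.array([0., 1., 0., 2.])  # lil[1, 0] = 1, lil[3, 0] = 2
print(dense.dot(lil_as_dense))  # 10.0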
Example #2
    def get_ratings(self, res_id, ratings, top):
        if res_id not in self.models:
            logger.info("Keys: " + str(self.models.keys()))
            logger.info("Key Type: " + str(type(self.models.keys()[0])))
            logger.info("res_id: " + str(res_id))
            logger.info("res_id type:" + str(type(res_id)))
            logger.info("res_id not known")
            return []
        
        pf = self.models[res_id].productFeatures()
         
        user_pf = pf.filter(lambda x: x[0] in ratings)
        user_f = user_pf.collect()
        if len(user_f) == 0:
            logger.info("No product matches")
            return []
        tmp = DenseVector(user_f[0][1])
        for i in xrange(1, len(user_f)):
            tmp = tmp + user_f[i][1]
        #user_f = user_pf.reduce(lambda x, y : DenseVector(x[1]) + DenseVector(y[1]))
        estimate_score = pf.map(lambda x: (x[0], tmp.dot(DenseVector(x[1])))) \
                           .filter(lambda x: x[0] not in ratings) \
                           .takeOrdered(top, lambda (k, v): -v)
 
        #estimate_score = pf.map(lambda x: (x[0], DenseVector(user_f).dot(DenseVector(x[1])))).filter(lambda x: x[0] not in ratings).takeOrdered(top, lambda (k,v): -v)
        estimate_pid = map(lambda x: x[0], estimate_score)
        
        return estimate_pid
Example #3
 def test_squared_distance(self):
     from scipy.sparse import lil_matrix
     lil = lil_matrix((4, 1))
     lil[1, 0] = 3
     lil[3, 0] = 2
     dv = DenseVector(array([1., 2., 3., 4.]))
     sv = SparseVector(4, {0: 1, 1: 2, 2: 3, 3: 4})
     self.assertEqual(15.0, dv.squared_distance(lil))
     self.assertEqual(15.0, sv.squared_distance(lil))
Example #4
    def test_norms(self):
        a = DenseVector([0, 2, 3, -1])
        self.assertAlmostEqual(a.norm(2), 3.742, 3)
        self.assertEqual(a.norm(1), 6)
        self.assertEqual(a.norm(inf), 3)
        a = SparseVector(4, [0, 2], [3, -4])
        self.assertAlmostEqual(a.norm(2), 5)
        self.assertEqual(a.norm(1), 7)
        self.assertEqual(a.norm(inf), 4)

        tmp = SparseVector(4, [0, 2], [3, 0])
        self.assertEqual(tmp.numNonzeros(), 1)
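The expected norms can be verified independently with NumPy; a minimal sketch:

import numpy as np

a = np.array([0., 2., 3., -1.])
print(np.linalg.norm(a, 2))       # sqrt(14) ~ 3.7417
print(np.linalg.norm(a, 1))       # 6.0
print(np.linalg.norm(a, np.inf))  # 3.0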
Example #5
class VectorUDTTests(MLlibTestCase):

    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for v in [self.dv0, self.dv1, self.sv0, self.sv1]:
            self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))

    def test_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException
        df = self.spark.createDataFrame([Row(Vectors.dense(1))])
        row_matrix = RowMatrix(df)
        self.assertEqual(row_matrix.numRows(), 1)
        self.assertEqual(row_matrix.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            RowMatrix(df.selectExpr("'monkey'"))

    def test_indexed_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException
        df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
        matrix = IndexedRowMatrix(df)
        self.assertEqual(matrix.numRows(), 1)
        self.assertEqual(matrix.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            IndexedRowMatrix(df.drop("_1"))
Example #6
    def DGEMV(alpha, A, x, beta, y, jsc):

        # First form y := beta * y.
        if beta != 1.0:
            if beta == 0.0:
                y = Vectors.zeros(y.size)
            else:
                y = beta * y

        if alpha == 0.0:
            return y

        broadcastVector = jsc.broadcast(x)
        broadcastAlpha = jsc.broadcast(alpha)

        result = A.rows.map(lambda currentRow: L2.MultiplyRows(currentRow.index,
                                                                 broadcastAlpha.value,
                                                                 currentRow.vector,
                                                                 broadcastVector.value))\
            .sortByKey()\
            .values()\
            .collect()

        resultVector = DenseVector(result)

        y = y + resultVector

        return y
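The routine above follows the BLAS DGEMV contract y := alpha * A * x + beta * y, distributing the per-row products over the cluster. A local NumPy sketch of the same contract, with toy values chosen for illustration only:

import numpy as np

A = np.array([[1., 2.], [3., 4.]])
x = np.array([1., 1.])
y = np.array([10., 10.])
alpha, beta = 2.0, 0.5
print(alpha * A.dot(x) + beta * y)  # [11. 19.]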
Example #7
 def test_list(self):
     l = [0, 1]
     for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), range(len(l)), l),
                      array.array('l', l), xrange(2), tuple(l)]:
         converted = TypeConverters.toList(lst_like)
         self.assertEqual(type(converted), list)
         self.assertListEqual(converted, l)
Example #8
def train_transform_func(vector):
    # Remap NaN entries to 0 to eliminate NaN results downstream
    new_vec = DenseVector([0.0 if math.isnan(x) else x for x in vector.toArray()])
    arimaModel = ARIMA.fit_model(1, 0, 0, new_vec)
    forecasted = arimaModel.forecast(new_vec, 5)  # 5 days for predict
    print(type(forecasted))
    exit(0)
Example #9
    def _transform(self, row):
        """Transforms the sparse vector to a dense vector while putting it in a new column."""
        sparse_vector = row[self.input_column]
        dense_vector = DenseVector(sparse_vector.toArray())
        new_row = new_dataframe_row(row, self.output_column, dense_vector)

        return new_row
Example #10
    def cat2Num(self, df, indices):
        '''sbaronia - extract the categorical data and make df out of it
        so oneHotEncoding can be run on them'''
        protocol_ind0 = df.select(df.id,df.rawFeatures[indices[0]].alias("features0")).cache()
        protocol_ind1 = df.select(df.id,df.rawFeatures[indices[1]].alias("features1")).cache()

        ind0_enc = self.oneHotEncoding(protocol_ind0,"features0").cache()
        ind1_enc = self.oneHotEncoding(protocol_ind1,"features1").cache()
        
        '''sbaronia - add those hot encoded features columns to original df'''
        int_join_1 = df.join(ind0_enc, ind0_enc.id == df.id, 'inner').drop(ind0_enc.id).cache()
        int_join_2 = int_join_1.join(ind1_enc, int_join_1.id == ind1_enc.id, 'inner').drop(int_join_1.id).cache()

        '''sbaronia - now create a new column features which has 
        converted vector form and drop rest columns'''
        comb_udf = udf(replaceCat2Num,StringType())
        int_join_2 = int_join_2.select(int_join_2.id,int_join_2.rawFeatures, \
                                       comb_udf(int_join_2.rawFeatures, \
                                       int_join_2.num_features0, \
                                       int_join_2.num_features1).alias("features")).cache()
        
        '''sbaronia - convert list of numerical features to DenseVector
        so they can be used in KMeans'''
        dense_udf = udf(lambda line: DenseVector.parse(line), VectorUDT())
        feat = int_join_2.select(int_join_2.id,int_join_2.rawFeatures,dense_udf(int_join_2.features).alias("features")).cache()
      
        return feat
Example #11
 def test_load_vectors(self):
     import shutil
     data = [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]]
     temp_dir = tempfile.mkdtemp()
     load_vectors_path = os.path.join(temp_dir, "test_load_vectors")
     try:
         self.sc.parallelize(data).saveAsTextFile(load_vectors_path)
         ret_rdd = MLUtils.loadVectors(self.sc, load_vectors_path)
         ret = ret_rdd.collect()
         self.assertEqual(len(ret), 2)
         self.assertEqual(ret[0], DenseVector([1.0, 2.0, 3.0]))
         self.assertEqual(ret[1], DenseVector([1.0, 2.0, 3.0]))
     except:
         self.fail()
     finally:
         shutil.rmtree(load_vectors_path)
Example #12
def gradientSummand(weights, lp):
    """Calculates the gradient summand for a given weight and `LabeledPoint`.

    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray` and they can be used interchangeably
        within this function.  For example, they both implement the `dot` method.

    Args:
        weights (DenseVector): An array of model weights (betas).
        lp (LabeledPoint): The `LabeledPoint` for a single observation.

    Returns:
        DenseVector: An array of values the same length as `weights`.  The gradient summand.
    """
    return (weights.dot(DenseVector(lp.features)) - lp.label) * DenseVector(
        lp.features)
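A worked example with illustrative values: for w = [1, 1, 1], x = [3, 1, 4] and y = 2.0, the summand is (w . x - y) * x = (8.0 - 2.0) * [3, 1, 4] = [18.0, 6.0, 24.0]:

from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LabeledPoint

example_w = DenseVector([1., 1., 1.])
example_lp = LabeledPoint(2.0, [3., 1., 4.])
print(gradientSummand(example_w, example_lp))  # [18.0,6.0,24.0]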
Example #13
 def test_model_setters(self):
     data = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertIsNotNone(model.setWithMean(True))
     self.assertIsNotNone(model.setWithStd(True))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]),
                      DenseVector([-1.0, -1.0, -1.0]))
Example #14
def cosineSimilarity(candidateTfIdf):
    frequencyDenseVectors_1 = candidateTfIdf.map(
        lambda vector: DenseVector(vector.toArray()))
    y1 = frequencyDenseVectors_1.collect()
    re = frequencyDenseVectors_0.map(lambda x: (x.dot(y1[0])) /
                                     (x.norm(2) * y1[0].norm(2)))
    return re
Example #15
 def mapFn(row):
     pvals = []
     for predictor in predictors:
         predictor_index = lookup[predictor]
         if isinstance(dm[predictor], list):
             try:
                 encoded_val = dm[predictor].index(row[predictor_index])
                 if setToFlag is None:
                     pvals.append(encoded_val)
                 else:
                     flags = [0.0] * len(dm[predictor])
                     flags[encoded_val] = setToFlag
                     pvals += flags
             except ValueError:
                 if setToFlag is None:
                     pvals.append(None)
                 else:
                     pvals += [0.0] * len(dm[predictor])
         else:
             pval = row[predictor_index]
             # if pval == None:
             #    pval_min = dm[predictor]["min"]
             #    pval_max = dm[predictor]["max"]
             #    pval=pval_min+(pval_max - pval_min)*0.5
             pvals.append(pval)
     dv = DenseVector(pvals)
     if target_index == -1:
         return (row, dv)
     tval = row[target_index]
     if isinstance(dm[target], list):  # target is categorical
         try:
             tval = dm[target].index(tval)
         except ValueError:
             tval = None
     return (row, LabeledPoint(tval, dv))
Example #16
def parseLine(line):
    # Get Values
    label = line.severity
    # print('label: ', label)
    features = DenseVector(line.result)
    # print(features)
    return LabeledPoint(get_label_point(label), features)
Example #17
 def test_model_transform(self):
     data = [
         [1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0],
         [3.0, 4.0, 5.0]
     ]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
Example #18
    def test_model_transform(self):
        weight = Vectors.dense([3, 2, 1])

        densevec = Vectors.dense([4, 5, 6])
        sparsevec = Vectors.sparse(3, [0], [1])
        eprod = ElementwiseProduct(weight)
        self.assertEqual(eprod.transform(densevec), DenseVector([12, 10, 6]))
        self.assertEqual(eprod.transform(sparsevec), SparseVector(3, [0], [3]))
Example #19
 def _predict(self, iterator):
     model = deserialize_keras_model(self.model)
     for row in iterator:
         X = np.asarray([row[self.features_column]])
         Y = model.predict(X)
         v = DenseVector(Y[0])
         new_row = new_dataframe_row(row, self.output_column, v)
         yield new_row
Example #20
 def test_list_int(self):
     for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]),
                     SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0),
                     array.array('d', [1.0, 2.0])]:
         vs = VectorSlicer(indices=indices)
         self.assertListEqual(vs.getIndices(), [1, 2])
         self.assertTrue(all([type(v) == int for v in vs.getIndices()]))
     self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"]))
Example #21
def parse_imr_w2v_vector(v_str):
    """
    Creates a Spark DenseVector from a semicolon-delimited string line.
    :param v_str:
    :return:
    """
    from pyspark.mllib.linalg import DenseVector
    num_vec = map(float, v_str.split(';')[1:])  # TODO:? discard labels
    return DenseVector(num_vec)
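A usage sketch with a hypothetical line in the expected "label;v1;v2;..." format (Python 2, matching the map usage above):

print(parse_imr_w2v_vector("token;0.5;-0.25;1.0"))  # [0.5,-0.25,1.0]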
Example #22
 def test_eq(self):
     v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
     v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
     v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
     v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
     v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
     v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
     dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
     sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
     self.assertEqual(v1, v2)
     self.assertEqual(v1, v3)
     self.assertFalse(v2 == v4)
     self.assertFalse(v1 == v5)
     self.assertFalse(v1 == v6)
     # this is done as Dense and Sparse matrices can be semantically
     # equal while still implementing a different __eq__ method
     self.assertEqual(dm1, sm1)
     self.assertEqual(sm1, dm1)
Example #23
 def test_parse_vector(self):
     a = DenseVector([3, 4, 6, 7])
     self.assertEqual(str(a), '[3.0,4.0,6.0,7.0]')
     self.assertEqual(Vectors.parse(str(a)), a)
     a = SparseVector(4, [0, 2], [3, 4])
     self.assertEqual(str(a), '(4,[0,2],[3.0,4.0])')
     self.assertEqual(Vectors.parse(str(a)), a)
     a = SparseVector(10, [0, 1], [4, 5])
     self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a)
Example #25
 def test_squared_distance(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([4, 3, 2, 1])
     lst1 = [4, 3, 2, 1]
     arr = pyarray.array('d', [0, 2, 1, 3])
     narr = array([0, 2, 1, 3])
     self.assertEqual(15.0, _squared_distance(sv, dv))
     self.assertEqual(25.0, _squared_distance(sv, lst))
     self.assertEqual(20.0, _squared_distance(dv, lst))
     self.assertEqual(15.0, _squared_distance(dv, sv))
     self.assertEqual(25.0, _squared_distance(lst, sv))
     self.assertEqual(20.0, _squared_distance(lst, dv))
     self.assertEqual(0.0, _squared_distance(sv, sv))
     self.assertEqual(0.0, _squared_distance(dv, dv))
     self.assertEqual(0.0, _squared_distance(lst, lst))
     self.assertEqual(25.0, _squared_distance(sv, lst1))
     self.assertEqual(3.0, _squared_distance(sv, arr))
     self.assertEqual(3.0, _squared_distance(sv, narr))
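One of the expected values above, spot-checked with plain NumPy (a sketch, not part of the test):

import numpy as np

sv_dense = np.array([0., 1., 0., 2.])  # SparseVector(4, {1: 1, 3: 2}) densified
dv_arr = np.array([1., 2., 3., 4.])
print(((sv_dense - dv_arr) ** 2).sum())  # 15.0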
Example #26
def computeCost(featuresAndPrediction, model):
    allClusterCenters = [DenseVector(c) for c in model.clusterCenters()]
    arrayCollection   = featuresAndPrediction.rdd.map(array)

    def error(point, predictedCluster):
        center = allClusterCenters[predictedCluster]
        z      = point - center
        return sqrt((z*z).sum())
    
    return arrayCollection.map(lambda row: error(row[0], row[1])).reduce(lambda x, y: x + y)
Example #27
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1., 2., 3., 4.], [1., 2., 3., 4.], [1., 2., 3., 4.],
                  [1., 2., 3., 4.]])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
Example #28
def processDf(df):
    mlSourceDF = df
    mlSourceDF.printSchema()
    mlSourceDF = mlSourceDF.fillna(
        0, subset=[x for x in mlSourceDF.columns if 'Lag' in x])
    mlSourceDF = mlSourceDF.na.drop(
        subset=["ServerIP", "SessionStartHourTime"])
    columnsForIndex = [
        'dayofweek', 'ServerIP', 'year', 'month', 'weekofyear', 'dayofmonth',
        'hourofday', 'Holiday', 'BusinessHour', 'Morning'
    ]
    mlSourceDF = mlSourceDF.fillna(0, subset=[x for x in columnsForIndex])
    scoreDF = mlSourceDF

    # indexing
    scoreDF = indexModel.transform(scoreDF)
    # encoding
    scoreDFCat = ohPipelineModel.transform(scoreDF)

    # feature scaling
    featuresForScale = [x for x in scoreDFCat.columns if 'Lag' in x]
    assembler = VectorAssembler(inputCols=featuresForScale,
                                outputCol="features")
    assembled = assembler.transform(scoreDFCat).select('key', 'features')
    scaledData = scaler.transform(assembled).select('key', 'scaledFeatures')

    def extract(row):
        return (row.key, ) + tuple(float(x) for x in row.scaledFeatures.values)

    from pyspark.sql.types import Row
    from pyspark.mllib.linalg import DenseVector
    rdd = scaledData.rdd.map(
        lambda x: Row(key=x[0], scaledFeatures=DenseVector(x[1].toArray())))
    scaledDf = rdd.map(extract).toDF(["key"])
    # rename columns
    oldColumns = scaledDf.columns
    scaledColumns = ['scaledKey']
    scaledColumns.extend(['scaled' + str(i) for i in featuresForScale])
    scaledOutcome = scaledDf.select([
        col(oldColumns[index]).alias(scaledColumns[index])
        for index in range(0, len(oldColumns))
    ])
    scaledOutcome.show(1)
    scaledOutcome.cache()
    noScaledMLSourceDF = scoreDFCat.select([
        column for column in scoreDFCat.columns
        if column not in featuresForScale
    ])
    noScaledMLSourceDF.cache()
    noScaledMLSourceDF.printSchema()
    scaledOutcome.printSchema()
    newDF = noScaledMLSourceDF.join(
        scaledOutcome, (noScaledMLSourceDF.key == scaledOutcome.scaledKey),
        'outer')
    return newDF
Example #29
def to_dense_vector(value, n_dim=2):
    """Converts the value to a one-hot encoded vector.

    # Arguments
        value: int. Index of the single "hot" entry.
        n_dim: int. Dimension of the output vector.
    """
    vector = np.zeros(n_dim)
    vector[int(value)] = 1.0

    return DenseVector(vector)
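A usage sketch (assumes np and DenseVector are imported, as the function itself requires): placing the "hot" value at index 1 of a 3-dimensional vector:

print(to_dense_vector(1, n_dim=3))  # [0.0,1.0,0.0]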
Example #30
 def toVector(value):
     """
     Convert a value to a MLlib Vector, if possible.
     """
     if isinstance(value, Vector):
         return value
     elif TypeConverters._can_convert_to_list(value):
         value = TypeConverters.toList(value)
         if all(map(lambda v: TypeConverters._is_numeric(v), value)):
             return DenseVector(value)
     raise TypeError("Could not convert %s to vector" % value)
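A usage sketch, assuming this is the converter from pyspark.ml.param.TypeConverters:

from pyspark.ml.linalg import DenseVector
from pyspark.ml.param import TypeConverters

print(TypeConverters.toVector([1.0, 2.0, 3.0]))      # [1.0,2.0,3.0]
print(TypeConverters.toVector(DenseVector([1, 2])))  # returned unchanged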
Example #31
    def _transform(self, iterator):
        rows = []
        try:
            for row in iterator:
                label = row[self.input_column]
                v = DenseVector(to_vector(label, self.output_dim).tolist())
                new_row = new_dataframe_row(row, self.output_column, v)
                rows.append(new_row)
        except TypeError:
            pass

        return iter(rows)
Example #32
def getLabeledPrediction(weights, observation):
    """Calculates predictions and returns a (label, prediction) tuple.

    Note:
        The labels should remain unchanged as we'll use this information to calculate prediction
        error later.

    Args:
        weights (np.ndarray): An array with one weight for each feature in `trainData`.
        observation (LabeledPoint): A `LabeledPoint` that contains the correct label and the
            features for the data point.

    Returns:
        tuple: A (label, prediction) tuple.
    """
    label = observation.label
    features = DenseVector(observation.features)
    weights = DenseVector(weights)
    prediction = weights.dot(features)
    result = (label, prediction)
    return result
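A worked example with illustrative values: for weights [1.0, 1.5] and LabeledPoint(2.0, [1.0, 0.5]), the prediction is 1.0 * 1.0 + 1.5 * 0.5 = 1.75, so the result is (2.0, 1.75):

import numpy as np
from pyspark.mllib.regression import LabeledPoint

print(getLabeledPrediction(np.array([1.0, 1.5]), LabeledPoint(2.0, [1.0, 0.5])))
# (2.0, 1.75)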
Example #33
def linregGradientDescent(trainData, numIters):
    """Calculates the weights and error for a linear regression model trained with gradient descent.
    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray` and they can be used interchangeably
        within this function.  For example, they both implement the `dot` method.
    Args:
        trainData (RDD of LabeledPoint): The labeled data for use in training the model.
        numIters (int): The number of iterations of gradient descent to perform.
    Returns:
        (np.ndarray, np.ndarray): A tuple of (weights, training errors).  Weights will be the
            final weights (one weight per feature) for the model, and training errors will contain
            an error (RMSE) for each iteration of the algorithm.
    """
    # The length of the training data
    n = trainData.count()
    # The number of features in the training data
    d = len(trainData.take(1)[0].features)
    w = np.zeros(d)
    w = DenseVector(w)
    alpha = 1.0
    # We will compute and store the training error after each iteration
    errorTrain = np.zeros(numIters)
    for i in range(numIters):
        # Use getLabeledPrediction from (3b) with trainData to obtain an RDD of (label, prediction)
        # tuples.  Note that the weights all equal 0 for the first iteration, so the predictions will
        # have large errors to start.
        labelsAndPredsTrain = trainData.map(
            lambda lp: getLabeledPrediction(w, lp))
        errorTrain[i] = calcRMSE(labelsAndPredsTrain)

        # Calculate the `gradient`.  Make use of the `gradientSummand` function you wrote in (3a).
        # Note that `gradient` should be a `DenseVector` of length `d`.
        gradient = trainData.map(lambda lp: gradientSummand(w, lp)).reduce(
            lambda a, b: a + b)
        gradient = DenseVector(gradient)
        #gradient = trainData.map(lambda lp: gradientSummand(w,lp)).reduce(lambda one, two: one+two);
        # Update the weights
        alpha_i = alpha / (n * np.sqrt(i + 1))
        w -= alpha_i * gradient
    return w, errorTrain
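A hypothetical usage sketch on a tiny RDD (assumes a SparkContext sc plus the calcRMSE, getLabeledPrediction and gradientSummand helpers shown in the other examples):

from pyspark.mllib.regression import LabeledPoint

exampleData = sc.parallelize([LabeledPoint(2.0, [1.0, 0.5]),
                              LabeledPoint(0.5, [0.5, 1.0])])
exampleWeights, exampleErrors = linregGradientDescent(exampleData, 5)
print(exampleWeights)
print(exampleErrors)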
Example #34
    def fonction(line):

        # Parsing csv line
        values = list(csv.reader(StringIO(line)))[0]
        if is_train:
            PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked = values
        else:
            PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked = values

        # Sex
        sex_code = 0 if Sex == "male" else 1

        # Age
        try:
            safe_age = float(Age)  # handles fractional ages like "0.42"
        except ValueError:
            safe_age = mean_age
        try:
            safe_fare = float(Fare)
        except ValueError:
            safe_fare = mean_fare

        # Embarked
        safe_embarked = Embarked if Embarked else most_frequent_embarked
        code_embarked = 0 if safe_embarked == 'S' else 1 if safe_embarked == 'C' else 2  # == 'Q'

        # Title
        title = Name.split(",")[1].split(".")[0]
        code_title = 0
        if title == "Mr":
            code_title = 1
        elif title == "Miss":
            code_title = 2
        elif title == "Mrs":
            code_title = 3
        elif title == "Master":
            code_title = 4

        # Pclass code
        code_pclass = int(Pclass) - 1

        # child
        child_flag = 1 if safe_age <= 6 else 0

        features_vector = DenseVector([
            sex_code, safe_age, code_pclass,
            int(Parch),
            int(SibSp), safe_fare, code_embarked, code_title, child_flag,
            int(Parch)
        ])

        if is_train:
            return LabeledPoint(float(Survived), features_vector)
        else:
            return features_vector
Example #35
    def _transform(self, row):
        """Appends the desired binary label column."""
        value = row[self.input_column]
        vector = np.zeros(2)
        # Check if the name matches.
        if value == self.label:
            vector[0] = 1.0
        else:
            vector[1] = 1.0
        # Convert to a Spark DenseVector
        vector = DenseVector(vector)

        return new_dataframe_row(row, self.output_column, vector)
Example #36
def gradientSummand(weights, lp):
    """Calculates the gradient summand for a given weight and `LabeledPoint`.

    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray` and they can be used interchangeably
        within this function.  For example, they both implement the `dot` method.

    Args:
        weights (DenseVector): An array of model weights (betas).
        lp (LabeledPoint): The `LabeledPoint` for a single observation.

    Returns:
        DenseVector: An array of values the same length as `weights`.  The gradient summand.
    """
    return (DenseVector.dot(lp.features, weights) - lp.label) * lp.features
Example #37
def getLabeledPrediction(weights, observation):
    """Calculates predictions and returns a (label, prediction) tuple.

    Note:
        The labels should remain unchanged as we'll use this information to calculate prediction
        error later.

    Args:
        weights (np.ndarray): An array with one weight for each feature in `trainData`.
        observation (LabeledPoint): A `LabeledPoint` that contains the correct label and the
            features for the data point.

    Returns:
        tuple: A (label, prediction) tuple.
    """
    return (observation.label, DenseVector.dot(observation.features, weights))
Example #38
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))
Example #39
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1., 2., 3., 4.],
                  [1., 2., 3., 4.],
                  [1., 2., 3., 4.],
                  [1., 2., 3., 4.]])
     arr = pyarray.array('d', [0, 1, 2, 3])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
     self.assertEqual(7.0, sv.dot(arr))
Example #40
# #### Note that `DenseVector` stores all values as `np.float64`, so even if you pass in a NumPy array of integers, the resulting `DenseVector` will contain floating-point numbers. Also, `DenseVector` objects exist locally and are not inherently distributed. `DenseVector` objects can be used in the distributed setting by either passing functions that contain them to resilient distributed dataset (RDD) transformations or by distributing them directly as RDDs. You'll learn more about RDDs in the Spark tutorial.
# #### For this exercise, create a `DenseVector` consisting of the values `[3.0, 4.0, 5.0]` and compute the dot product of this vector with `numpyVector`.
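# #### A one-liner confirms the `np.float64` coercion described above (a minimal sketch, separate from the exercise):

from pyspark.mllib.linalg import DenseVector
import numpy as np

v = DenseVector(np.array([1, 2, 3]))  # integer input
print(v)                  # [1.0,2.0,3.0]
print(v.toArray().dtype)  # float64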

# In[28]:

from pyspark.mllib.linalg import DenseVector


# In[31]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print "\nnumpyVector:\n{0}".format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])  # <FILL IN>
# Calculate the dot product between the two vectors.
denseDotProduct = myDenseVector.dot(DenseVector(numpyVector))  # <FILL IN>

print "myDenseVector:\n{0}".format(myDenseVector)
print "\ndenseDotProduct:\n{0}".format(denseDotProduct)


# In[32]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), "myDenseVector is not a DenseVector")
Test.assertTrue(np.allclose(myDenseVector, np.array([3.0, 4.0, 5.0])), "incorrect value for myDenseVector")
Test.assertTrue(np.allclose(denseDotProduct, 0.0), "incorrect value for denseDotProduct")

Example #41
# #### Note that `DenseVector` stores all values as `np.float64`, so even if you pass in a NumPy array of integers, the resulting `DenseVector` will contain floating-point numbers. Also, `DenseVector` objects exist locally and are not inherently distributed. `DenseVector` objects can be used in the distributed setting by either passing functions that contain them to resilient distributed dataset (RDD) transformations or by distributing them directly as RDDs. You'll learn more about RDDs in the Spark tutorial.
# #### For this exercise, create a `DenseVector` consisting of the values `[3.0, 4.0, 5.0]` and compute the dot product of this vector with `numpyVector`.

# In[22]:

from pyspark.mllib.linalg import DenseVector


# In[25]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
denseDotProduct = myDenseVector.dot(numpyVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)


# In[26]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')
Example #42
# In[97]:

from pyspark.mllib.linalg import DenseVector


# In[98]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector(np.array([3.0,4.0,5.0]))
# Calculate the dot product between the two vectors.
denseDotProduct = DenseVector.dot(DenseVector(numpyVector),myDenseVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)


# In[99]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
Example #43
expectedError = [79.72013547, 30.27835699,  9.27842641,  9.20967856,  9.19446483]
Test.assertTrue(np.allclose(exampleErrorTrain, expectedError),
                'value of exampleErrorTrain is incorrect')


# #### ** (3d) Train the model **
# #### Now let's train a linear regression model on all of our training data and evaluate its accuracy on the validation set.  Note that the test set will not be used here.  If we evaluated the model on the test set, we would bias our final results.
# #### We've already done much of the required work: we computed the number of features in Part (1b); we created the training and validation datasets and computed their sizes in Part (1e); and, we wrote a function to compute RMSE in Part (2b).

# In[44]:

# TODO: Replace <FILL IN> with appropriate code
numIters = 50
weightsLR0, errorTrainLR0 = linregGradientDescent(parsedTrainData, numIters)

labelsAndPreds = parsedValData.map(lambda lp: (lp.label, DenseVector.dot(weightsLR0, lp.features)))
rmseValLR0 = calcRMSE(labelsAndPreds)

print 'Validation RMSE:\n\tBaseline = {0:.3f}\n\tLR0 = {1:.3f}'.format(rmseValBase,
                                                                       rmseValLR0)


# In[45]:

# TEST Train the model (3d)
expectedOutput = [22.64535883, 20.064699, -0.05341901, 8.2931319, 5.79155768, -4.51008084,
                  15.23075467, 3.8465554, 9.91992022, 5.97465933, 11.36849033, 3.86452361]
Test.assertTrue(np.allclose(weightsLR0, expectedOutput), 'incorrect value for weightsLR0')


# #### ** Visualization 4: Training error **
Example #44
# In[29]:

from pyspark.mllib.linalg import DenseVector


# In[31]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector(np.array([3.0,4.0,5.0]))
# Calculate the dot product between the two vectors.
denseDotProduct = DenseVector.dot(myDenseVector, numpyVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)


# In[32]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
Example #45
zeros = np.zeros(8) # returns an array of 8 0s [ 0.  0.  0.  0.  0.  0.  0.  0.]
ones = np.ones(8) # returns an array of 8 1s [ 1.  1.  1.  1.  1.  1.  1.  1.]
print 'zeros:\n{0}'.format(zeros)
print '\nones:\n{0}'.format(ones)

zerosThenOnes = np.hstack((zeros,ones))   #notice the "(("
# hstack will return [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.]
zerosAboveOnes = np.vstack((zeros,ones))   # A 2 by 8 array 
# vstack in the above example will return [[ 0.  0.  0.  0.  0.  0.  0.  0.]
#                                          [ 1.  1.  1.  1.  1.  1.  1.  1.]]

print '\nzerosThenOnes:\n{0}'.format(zerosThenOnes)
print '\nzerosAboveOnes:\n{0}'.format(zerosAboveOnes)

# When using PySpark, we use DenseVector instead of numpy vector. Example below:

from pyspark.mllib.linalg import DenseVector

numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
denseDotProduct = DenseVector.dot(myDenseVector, numpyVector) # DenseVector.dot() does the dot product

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)