Example #1
 def test_idf(self):
     dataset = self.spark.createDataFrame([(DenseVector([1.0, 2.0]), ),
                                           (DenseVector([0.0, 1.0]), ),
                                           (DenseVector([3.0, 0.2]), )],
                                          ["tf"])
     idf0 = IDF(inputCol="tf")
     self.assertListEqual(idf0.params,
                          [idf0.inputCol, idf0.minDocFreq, idf0.outputCol])
     idf0m = idf0.fit(dataset, {idf0.outputCol: "idf"})
     self.assertEqual(
         idf0m.uid, idf0.uid,
         "Model should inherit the UID from its parent estimator.")
     output = idf0m.transform(dataset)
     self.assertIsNotNone(output.head().idf)
     # Test that parameters transferred to Python Model
     check_params(self, idf0m)
Example #2

    def test_dataframe_with_empty_partition(self):
        from bigdl.orca import OrcaContext
        sc = OrcaContext.get_spark_context()
        rdd = sc.range(0, 10)

        rdd_with_empty = rdd.repartition(4).\
            mapPartitionsWithIndex(lambda idx, part: [] if idx == 0 else part)

        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd_with_empty.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float64)),
                                           int(np.random.randint(0, 1, size=()))))\
            .toDF(["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()
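The Orca Estimator snippets in this collection pass a model_creator that is not shown; a minimal sketch, assuming a single-input tf.keras regression model that reads the learning rate from config (layer sizes and names are illustrative only):

import tensorflow as tf

def model_creator(config):
    # one dense unit matching the 1-dimensional "feature" column used above
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(1, input_shape=(1,))
    ])
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=config["lr"]),
                  loss="mse",
                  metrics=["mse"])
    return model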
Example #3
 def test_list(self):
     l = [0, 1]
     for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), range(len(l)), l),
                      pyarray.array('l', l), range(2), tuple(l)]:
         converted = TypeConverters.toList(lst_like)
         self.assertEqual(type(converted), list)
         self.assertListEqual(converted, l)
Example #4

    def test_dataframe_shard_size(self):
        from bigdl.orca import OrcaContext
        OrcaContext._shard_size = 3
        sc = init_nncontext()
        rdd = sc.range(0, 10)
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(np.float64)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()
        OrcaContext._shard_size = None
Example #5
def tfIdfAsNewFeaturesBis(row):
    vector = row['tf_idf']
    data = row.asDict()    
    data['features'] = DenseVector(vector.toArray())
    newRow = Row(*data.keys())
    newRow = newRow(*data.values())
    return newRow
Example #6
def tfIdfAsNewFeatures(row):
    vector = row['tf_idf']
    data = row.asDict()    
    data['features'] = DenseVector([len(vector.indices), vector.values.min(), vector.values.max(), vector.values.mean()])
    newRow = Row(*data.keys())
    newRow = newRow(*data.values())
    return newRow
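Both tf-idf helpers above are meant to be mapped over the rows of a DataFrame that carries a tf_idf vector column (a SparseVector for tfIdfAsNewFeatures, since it reads .indices and .values); a hedged usage sketch with an assumed tfidf_df DataFrame:

summary_df = tfidf_df.rdd.map(tfIdfAsNewFeatures).toDF()     # 4 summary stats per vector
dense_df = tfidf_df.rdd.map(tfIdfAsNewFeaturesBis).toDF()    # full dense copy of the vector
summary_df.select("features").show(5, truncate=False)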
Example #7
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0],
                  [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]])
     arr = pyarray.array("d", [0, 1, 2, 3])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(
         array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(
         array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))
     self.assertEqual(7.0, sv.dot(arr))
Example #8
def traverse(obj, path=None, callback=None):
    """
    Traverse a deep nested python structure
    :param obj: object to traverse
    :param path:
    :param callback: Function used to transform a value
    :return:
    """
    if path is None:
        path = []

    if is_(obj, dict):
        value = {k: traverse(v, path + [k], callback) for k, v in obj.items()}

    elif is_(obj, list):
        value = [traverse(elem, path + [[]], callback) for elem in obj]

    elif is_(obj, tuple):
        value = tuple(traverse(elem, path + [[]], callback) for elem in obj)
    elif is_(obj, DenseVector):
        value = DenseVector(
            [traverse(elem, path + [[]], callback) for elem in obj])
    else:
        value = obj

    if callback is None:
        return value
    else:
        # a callback was provided: call it to get the (possibly new) value
        return callback(path, value)
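A hedged usage sketch for traverse, assuming is_ is an isinstance-style helper as the code implies: the callback below multiplies numeric leaves by 10 and passes containers through unchanged.

def times_ten(path, value):
    # scale numeric leaves; dicts, lists, tuples and vectors pass through as-is
    return value * 10 if isinstance(value, (int, float)) else value

nested = {"a": [1, 2, (3, 4)], "b": DenseVector([0.5, 1.5])}
traverse(nested, callback=times_ten)
# -> {'a': [10, 20, (30, 40)], 'b': DenseVector([5.0, 15.0])}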
Example #9
        def predict(rows):
            from pyspark import Row
            from pyspark.ml.linalg import DenseVector, SparseVector

            model = deserialize(serialized_model)
            # Perform predictions.
            for row in rows:
                fields = row.asDict().copy()

                # Note: if the col is SparseVector, torch.tensor(col) correctly converts it to a
                # dense torch tensor.
                data = [torch.tensor([row[col]]).reshape(shape) for
                        col, shape in zip(feature_cols, input_shapes)]

                with torch.no_grad():
                    preds = model(*data)

                if not isinstance(preds, list) and not isinstance(preds, tuple):
                    preds = [preds]

                for label_col, output_col, pred in zip(label_cols, output_cols, preds):
                    meta = metadata[label_col]
                    col_type = meta['spark_data_type']
                    # dtype for dense and sparse tensors is always np.float64
                    if col_type == DenseVector:
                        shape = np.prod(pred.shape)
                        flattened_pred = pred.reshape(shape, )
                        field = DenseVector(flattened_pred)
                    elif col_type == SparseVector:
                        shape = meta['shape']
                        flattened_pred = pred.reshape(shape, )
                        nonzero_indices = flattened_pred.nonzero()[0]
                        field = SparseVector(shape, nonzero_indices,
                                             flattened_pred[nonzero_indices])
                    elif pred.shape.numel() == 1:
                        # If the column is scalar type, int, float, etc.
                        value = pred.item()
                        python_type = util.spark_scalar_to_python_type(col_type)
                        if issubclass(python_type, numbers.Integral):
                            value = round(value)
                        field = python_type(value)
                    else:
                        field = DenseVector(pred.reshape(-1))

                    fields[output_col] = field

                yield Row(**fields)
Example #10
        def toDense(v):
            print(v)
            print(Vectors.dense(v).toArray())
            v = DenseVector(v)

            new_array = list([int(x) for x in v])

            return new_array
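If lifted to module level (with Vectors and DenseVector imported from pyspark.ml.linalg), the helper above can be exercised directly; a hedged example with an assumed sparse input:

from pyspark.ml.linalg import SparseVector

toDense(SparseVector(4, {1: 1.0, 3: 2.0}))
# prints the vector and its dense array, then returns [0, 1, 0, 2]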
Example #11
 def test_list_int(self):
     for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]),
                     SparseVector(2, {0: 1.0, 1: 2.0}), range(1, 3), (1.0, 2.0),
                     pyarray.array('d', [1.0, 2.0])]:
         vs = VectorSlicer(indices=indices)
         self.assertListEqual(vs.getIndices(), [1, 2])
         self.assertTrue(all([type(v) == int for v in vs.getIndices()]))
     self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"]))
Example #12
def feature_selector_process(spark, ml_df, spark_artefacts_dir, run_mode, i,
                             feature_cols):

    # APPLY CHI-SQUARE SELECTOR
    name = f"ChiSquareSelectorModel_{i}"
    selector_model_path = Path(spark_artefacts_dir).joinpath(name)

    if run_mode == 'first':

        # ChiSq Test to obtain ChiSquare values (higher -> more dependence between feature and label -> better)
        r = ChiSquareTest.test(ml_df, "features", "label")
        pValues = r.select("pvalues").collect()[0][0].tolist()
        stats = r.select("statistics").collect()[0][0].tolist()
        dof = r.select("degreesOfFreedom").collect()[0][0]

        # ChiSq Selector
        selector = ChiSqSelector(numTopFeatures=10,
                                 featuresCol="features",
                                 outputCol="selected_features",
                                 labelCol="label")
        selector_model = selector.fit(ml_df)
        selector_model.write().overwrite().save(
            str(selector_model_path.absolute()))

        top_10_features_importance = []
        top_10_features = []
        for j in selector_model.selectedFeatures:
            top_10_features_importance.append(feature_cols[j])
            top_10_features.append(feature_cols[j])
            top_10_features_importance.append(stats[j])

        model_info = [
            name,
            ml_df.count(), None, None, None, None, None, None, None
        ] + top_10_features_importance
        model_info_df = spark.createDataFrame(data=[model_info],
                                              schema=MODEL_INFO_SCHEMA)
        model_info_df.write.jdbc(CONNECTION_STR,
                                 'model_info',
                                 mode='append',
                                 properties=CONNECTION_PROPERTIES)

    elif run_mode == 'incremental':
        selector_model = ChiSqSelectorModel.load(
            str(selector_model_path.absolute()))
        top_10_features = []
        for j in selector_model.selectedFeatures:
            top_10_features.append(feature_cols[j])

    ml_df_10 = selector_model.transform(ml_df)
    ml_df_10 = ml_df_10.drop("features")

    #Solve a problem with ChiSqSelector and Tree-based algorithm
    ml_rdd_10 = ml_df_10.rdd.map(
        lambda row: Row(label=row[0], features=DenseVector(row[1].toArray())))
    ml_df_10 = spark.createDataFrame(ml_rdd_10)

    return ml_df_10, top_10_features
Example #13
        def predict(rows):
            from pyspark import Row
            from pyspark.ml.linalg import DenseVector, SparseVector

            model = deserialize(serialized_model)
            # Perform predictions.
            for row in rows:
                fields = row.asDict().copy()
                preds = prediction_fn(model, row)

                if not isinstance(preds, list) and not isinstance(
                        preds, tuple):
                    preds = [preds]

                for label_col, output_col, pred in zip(label_cols, output_cols,
                                                       preds):
                    meta = metadata[label_col]
                    col_type = meta['spark_data_type']
                    # dtype for dense and sparse tensors is always np.float64
                    if col_type == DenseVector:
                        shape = np.prod(pred.shape)
                        flattened_pred = pred.reshape(shape, )
                        field = DenseVector(flattened_pred)
                    elif col_type == SparseVector:
                        shape = meta['shape']
                        flattened_pred = pred.reshape(shape, )
                        nonzero_indices = flattened_pred.nonzero()[0]
                        field = SparseVector(shape, nonzero_indices,
                                             flattened_pred[nonzero_indices])
                    elif pred.shape.numel() == 1:
                        # If the column is scalar type, int, float, etc.
                        value = pred.item()
                        python_type = util.spark_scalar_to_python_type(
                            col_type)
                        if issubclass(python_type, numbers.Integral):
                            value = round(value)
                        field = python_type(value)
                    else:
                        field = DenseVector(pred.reshape(-1))

                    fields[output_col] = field

                values = [fields[col] for col in final_output_cols]

                yield Row(*values)
Example #14
 def test_eq(self):
     v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
     v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
     v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
     v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
     v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
     v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
     dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
     sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
     self.assertEqual(v1, v2)
     self.assertEqual(v1, v3)
     self.assertFalse(v2 == v4)
     self.assertFalse(v1 == v5)
     self.assertFalse(v1 == v6)
     # this is done as Dense and Sparse matrices can be semantically
     # equal while still implementing a different __eq__ method
     self.assertEqual(dm1, sm1)
     self.assertEqual(sm1, dm1)
Example #15
 def test_new_java_array(self):
     # test array of strings
     str_list = ["a", "b", "c"]
     java_class = self.sc._gateway.jvm.java.lang.String
     java_array = JavaWrapper._new_java_array(str_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), str_list)
     # test array of integers
     int_list = [1, 2, 3]
     java_class = self.sc._gateway.jvm.java.lang.Integer
     java_array = JavaWrapper._new_java_array(int_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), int_list)
     # test array of floats
     float_list = [0.1, 0.2, 0.3]
     java_class = self.sc._gateway.jvm.java.lang.Double
     java_array = JavaWrapper._new_java_array(float_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), float_list)
     # test array of bools
     bool_list = [False, True, True]
     java_class = self.sc._gateway.jvm.java.lang.Boolean
     java_array = JavaWrapper._new_java_array(bool_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), bool_list)
     # test array of Java DenseVectors
     v1 = DenseVector([0.0, 1.0])
     v2 = DenseVector([1.0, 0.0])
     vec_java_list = [_py2java(self.sc, v1), _py2java(self.sc, v2)]
     java_class = self.sc._gateway.jvm.org.apache.spark.ml.linalg.DenseVector
     java_array = JavaWrapper._new_java_array(vec_java_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), [v1, v2])
     # test empty array
     java_class = self.sc._gateway.jvm.java.lang.Integer
     java_array = JavaWrapper._new_java_array([], java_class)
     self.assertEqual(_java2py(self.sc, java_array), [])
     # test array of array of strings
     str_list = [["a", "b", "c"], ["d", "e"], ["f", "g", "h", "i"], []]
     expected_str_list = [
         ("a", "b", "c", None),
         ("d", "e", None, None),
         ("f", "g", "h", "i"),
         (None, None, None, None),
     ]
     java_class = self.sc._gateway.jvm.java.lang.String
     java_array = JavaWrapper._new_java_array(str_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), expected_str_list)
Example #16
 def test_squared_distance(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([4, 3, 2, 1])
     lst1 = [4, 3, 2, 1]
     arr = pyarray.array('d', [0, 2, 1, 3])
     narr = array([0, 2, 1, 3])
     self.assertEqual(15.0, _squared_distance(sv, dv))
     self.assertEqual(25.0, _squared_distance(sv, lst))
     self.assertEqual(20.0, _squared_distance(dv, lst))
     self.assertEqual(15.0, _squared_distance(dv, sv))
     self.assertEqual(25.0, _squared_distance(lst, sv))
     self.assertEqual(20.0, _squared_distance(lst, dv))
     self.assertEqual(0.0, _squared_distance(sv, sv))
     self.assertEqual(0.0, _squared_distance(dv, dv))
     self.assertEqual(0.0, _squared_distance(lst, lst))
     self.assertEqual(25.0, _squared_distance(sv, lst1))
     self.assertEqual(3.0, _squared_distance(sv, arr))
     self.assertEqual(3.0, _squared_distance(sv, narr))
Example #17
    def formatFeaturesDF(self, featuresRawRDD, outputColName):

        #Convert RDD Data into DataFrame
        dataframe_features = featuresRawRDD.map(
            lambda x: (x[0], x[1], x[2], DenseVector((x[3]).split(',')))).toDF(
                ["index", "url", "productId", "features"])

        trainNormalizeFeatures = self.getNormalizer(dataframe_features,
                                                    outputColName)
        return trainNormalizeFeatures
Example #18
def regression_on_player(joined_records_ratings,player_id,birthDate,match_date):
	joined_records_ratings = joined_records_ratings.filter(joined_records_ratings.Id == player_id).select(["Id","playerid_date","rating","birthDate"])
	joined_records_ratings = joined_records_ratings.filter(joined_records_ratings.playerid_date.date != "0000-00-00")

	l1 = []
	dateArrybirth = birthDate.split('-')
	date_birth = datetime.datetime(int(dateArrybirth[0]),int(dateArrybirth[1]),int(dateArrybirth[2]))

	for j in joined_records_ratings.rdd.collect():
		dateArraycur = j.playerid_date.date.split('-')
		date_current = datetime.datetime(int(dateArraycur[0]),int(dateArraycur[1]),int(dateArraycur[2]))
		dt_age = date_current - date_birth
		age = (dt_age.days)
		squareAge = age*age
		b=(j.Id,DenseVector([float(age),float(squareAge)]),j.rating)
		l1.append(b)
	
	df2_normal_features = spark.sparkContext.parallelize(l1).toDF(["Id","features","label"])
	df_train_reg = df2_normal_features.select(["features","label"])

	#Fit the model
	lr = LinearRegression(featuresCol = 'features', labelCol='label', maxIter=10, regParam=0.0, elasticNetParam=0.0)
	lrModel = lr.fit(df_train_reg)

	df_train_reg.show()

	l2 = []
	dateArraycur = match_date.split('-')
	date_current = datetime.datetime(int(dateArraycur[0]),int(dateArraycur[1]),int(dateArraycur[2]))
	dt_age = date_current - date_birth
	age = (dt_age.days)
	squareAge = age*age
	b=(DenseVector([float(age),float(squareAge)]),1)
	l2.append(b)
	df_test = spark.sparkContext.parallelize(l2).toDF(["features","label"])
	df_test = df_test.select("features")


	#transform
	lr_predictions = lrModel.transform(df_test)
	predicted_rating = lr_predictions.collect()[0].prediction
	
	return predicted_rating
Example #19

    def test_distance_mesaure(self):

        # Dummy testing!
        x = np.array([0., 0., 0.])
        y = np.array([0.9, 0.9, 0.9])
        z = np.array([0.1, 0.1, 0.1])
        v = np.array([0.85, 0.85, 0.85])
        data = [x, y, z, v]
        sigma = self.label_context.constants['sigma'].value

        for i, j in product(range(4), range(4)):
            computed_weight = _compute_weights(data[i], data[j], sigma)
            self.assertAlmostEqual(self.results[i][j], computed_weight, 5)

        # Check for sparse data
        sparse_data = [
            SparseVector(3, [], []),
            DenseVector(y),
            DenseVector(z),
            DenseVector(v)
        ]

        for i, j in product(range(4), range(4)):
            computed_weight = _compute_weights(sparse_data[i], sparse_data[j],
                                               sigma)
            self.assertAlmostEqual(self.results[i][j], computed_weight, 5)
Example #20
 def toVector(value):
     """
     Convert a value to a MLlib Vector, if possible.
     """
     if isinstance(value, Vector):
         return value
     elif TypeConverters._can_convert_to_list(value):
         value = TypeConverters.toList(value)
         if all(map(lambda v: TypeConverters._is_numeric(v), value)):
             return DenseVector(value)
     raise TypeError("Could not convert %s to vector" % value)
Example #21
    def test_tfdataset_with_dataframe(self):

        rdd = self.sc.range(0, 1000)
        df = rdd.map(lambda x: (DenseVector(
            np.random.rand(20).astype(np.float64)), x % 10)).toDF(
                ["feature", "label"])
        train_df, val_df = df.randomSplit([0.7, 0.3])

        create_ds = self.make_create_ds_fn(train_df, val_df)

        self.check_dataset(create_ds)
Example #22

    def formatFeaturesDF(self, featuresRawRDD, outputColName):
        x = featuresRawRDD.strip()
        dataFrameFeatures = self.sc.parallelize([x])

        dataframe_features1 = dataFrameFeatures.map(lambda line : line.split("\r\n"))\
        .flatMap(lambda words : (word.split(",") for word in words)).map(lambda x : [elem.strip('"') for elem in x])\
        .map(lambda x: (x[0], x[1], x[2],DenseVector(x[3:]))).toDF(["index", "url", "productId", "features"])

        trainNormalizeFeatures = self.getNormalizer(dataframe_features1,
                                                    outputColName)
        return trainNormalizeFeatures
Example #23
def PCA_transform(sc, samples_df, feature_count, threshold, k):
    # check input
    if threshold and ((threshold > 1) or (threshold < 0)):
        print("ERROR: PCA_transform: Input threshold should be within 0 to 1")
        return (None, None, None)
    if k and k < 0:
        print("ERROR: transform: Input k should be greater than 0")
        return (None, None, None)
    #print "df.shape=",df.shape

    #print "in ml_sklearn_PCA_transform()"
    df_reduced = None
    pca = None
    if not threshold is None:  # by threshold ===============
        if feature_count > 200:
            fk = 200
            print "INFO: force k to " + str(fk) + " for PCA."
        else:
            fk = feature_count

        pca = PCA(k=fk, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(samples_df)
        sum_ratio = 0
        # get ratio array and find n_components
        var_arr = pca_model.explainedVariance
        print "RESULT: PCA ratio_vec=", var_arr

        n_components = ml_util.ml_get_n_components(var_arr, threshold)
        '''
        for n_components,val in enumerate(var_arr):
            sum_ratio=sum_ratio+val
            if sum_ratio >= threshold:
                break
        '''
        k = n_components
        #print sum_ratio, n_components

        df_pcaed_all = pca_model.transform(samples_df).select(
            "hash", "label", "pcaFeatures")
        # get k column only
        sqlCtx = SQLContext(sc)
        df_pcaed = sqlCtx.createDataFrame(
            df_pcaed_all.rdd.map(lambda p: (p["hash"], p["label"], p[
                "pcaFeatures"].toArray()[:k])).map(lambda p: Row(
                    hash=p[0], label=p[1], pcaFeatures=DenseVector(p[2]))))
        print "INFO: PCA_transform: n_components =", n_components, ", threshold=", threshold
    elif k > 0:  # by n_components  ===============
        pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(samples_df)
        df_pcaed = pca_model.transform(samples_df).select(
            "hash", "label", "pcaFeatures")
        print "INFO: PCA_transform: n_components =", k

    return (df_pcaed, k, pca_model)
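The threshold branch above amounts to: fit PCA with a generous k, accumulate explainedVariance until the threshold is reached, then keep that many components. A compact Python 3 sketch of the same idea that simply refits with the chosen k instead of slicing the transformed vectors (it reuses the function's samples_df, feature_count and threshold names, so it is purely illustrative):

import numpy as np
from pyspark.ml.feature import PCA

# fit once with an upper bound on k (capped at 200 as in the function above)
probe = PCA(k=min(feature_count, 200), inputCol="features",
            outputCol="pcaFeatures").fit(samples_df)
ratios = probe.explainedVariance.toArray()
# smallest k whose cumulative explained variance reaches the threshold
k = int(np.searchsorted(np.cumsum(ratios), threshold) + 1)
df_pcaed = (PCA(k=k, inputCol="features", outputCol="pcaFeatures")
            .fit(samples_df)
            .transform(samples_df)
            .select("hash", "label", "pcaFeatures"))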
Example #24
 def __numpy_to_vector_assembler(self, np_object, label_t=1):
     """
     Numpy to spark vector converter from a numpy object
     :param np_object: numpy array with features
     :param label_t: label type column, 1 as default
     :return: build from np.array to spark DataFrame
     """
     data_set = _sc.parallelize(np_object)
     data_rdd = data_set.map(lambda x:
                             (Row(features=DenseVector(x), label=label_t)))
     self.__logger.info("Numpy to Spark Converter")
     return data_rdd.toDF()
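The converter above depends on a module-level _sc; an equivalent standalone sketch with an explicit SparkSession (all names are illustrative):

import numpy as np
from pyspark.sql import SparkSession, Row
from pyspark.ml.linalg import DenseVector

spark = SparkSession.builder.master("local[2]").getOrCreate()
np_object = np.random.rand(5, 3)  # 5 samples, 3 features
df = (spark.sparkContext
      .parallelize(np_object.tolist())
      .map(lambda x: Row(features=DenseVector(x), label=1))
      .toDF())
df.show(truncate=False)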
Example #25
def main():
    spark = SparkSession \
        .builder \
        .appName("Reddit Site:Get Data") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
 
    file="file:////l2/corpora/reddit/submissions/RS_2015-12.bz2"
    output=file[-14:-3]

    sc = spark.sparkContext
    print('\n\n\n starting read and filter')
    df = filterPosts(file, sc, spark)

    df = convertToVec(df, sc, spark, output, inputCol='tokens')

    num_topics=10
    
    print('\n\n\n LDA... \n\n\n')
    newLDA=False
    if newLDA:
        lda=LDA(featuresCol='vectors', k=num_topics, maxIter=50)
        lda_model=lda.fit(df.select('id','vectors'))
        lda_model.save(output+'_ldamodel')
    else:
        lda_model=LocalLDAModel.load(output+'_ldamodel')

    print('\n\n\n Describe Topics... \n\n\n')
    topic_indices=lda_model.describeTopics(maxTermsPerTopic=50)
    topic_indices.write.json(output+'_topics.json', mode='overwrite')
    

    print('\n\n\n reduce to subs\n\n\n')
    #subDF=df.select('subreddit','vectors').groupBy(df.subreddit).sum('vectors')
    subDF=df.select('subreddit','vectors').rdd.mapValues(lambda v: v.toArray()) \
        .reduceByKey(lambda x, y: x + y) \
        .mapValues(lambda x: DenseVector(x)) \
        .toDF(["subreddit", "vectors"])
        
    '''
    print('\n\n\n LDA... \n\n\n')

    lda=LDA(featuresCol='vectors', k=num_topics, maxIter=50)
    lda_model=lda.fit(subDF.select('subreddit','vectors'))
    
    print('\n\n\n Describe Topics... \n\n\n')
    topic_indices=lda_model.describeTopics(maxTermsPerTopic=50)
    topic_indices.write.json(output+'_topics.json', mode='overwrite')
    '''
    print('\n\n\n Transform DataSet \n\n\n')
    subDF=lda_model.transform(subDF).drop('vectors')
    #topicDF=lda_model.transform(vecDF)
    subDF.write.json(output+'_transformed.json', mode='overwrite')
Example #26
    def test_norms(self):
        a = DenseVector([0, 2, 3, -1])
        self.assertAlmostEqual(a.norm(2), 3.742, 3)
        self.assertEqual(a.norm(1), 6)
        self.assertEqual(a.norm(inf), 3)
        a = SparseVector(4, [0, 2], [3, -4])
        self.assertAlmostEqual(a.norm(2), 5)
        self.assertEqual(a.norm(1), 7)
        self.assertEqual(a.norm(inf), 4)

        tmp = SparseVector(4, [0, 2], [3, 0])
        self.assertEqual(tmp.numNonzeros(), 1)
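As a sanity check worth verifying in your own environment, Vector.norm should agree with numpy's norm for the dense vector used above:

import numpy as np
from pyspark.ml.linalg import DenseVector

a = DenseVector([0, 2, 3, -1])
assert a.norm(1) == np.linalg.norm([0, 2, 3, -1], 1) == 6.0
assert a.norm(np.inf) == np.linalg.norm([0, 2, 3, -1], np.inf) == 3.0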
Example #27
    def test_get_col_info(self):
        with spark_session('test_get_col_info') as spark:
            data = [
                [0, 0.0, None, [1, 1],
                 DenseVector([1.0, 1.0]),
                 SparseVector(2, {1: 1.0}),
                 DenseVector([1.0, 1.0])],
                [1, None, None, [1, 1],
                 DenseVector([1.0, 1.0]),
                 SparseVector(2, {1: 1.0}),
                 SparseVector(2, {1: 1.0})],
            ]

            schema = StructType([
                StructField('int', IntegerType()),
                StructField('float', FloatType()),
                StructField('null', NullType()),
                StructField('array', ArrayType(IntegerType())),
                StructField('dense', VectorUDT()),
                StructField('sparse', VectorUDT()),
                StructField('mixed', VectorUDT())
            ])

            df = create_test_data_from_schema(spark, data, schema)
            all_col_types, col_shapes, col_max_sizes = util._get_col_info(df)

            expected = [('int', {int}, 1, 1), ('float', {float,
                                                         NullType}, 1, 1),
                        ('null', {NullType}, 1, 1), ('array', {list}, 2, 2),
                        ('dense', {DenseVector}, 2, 2),
                        ('sparse', {SparseVector}, 2, 1),
                        ('mixed', {DenseVector, SparseVector}, 2, 2)]

            for expected_col_info in expected:
                col_name, col_types, col_shape, col_size = expected_col_info
                assert all_col_types[col_name] == col_types, col_name
                assert col_shapes[col_name] == col_shape, col_name
                assert col_max_sizes[col_name] == col_size, col_name
Example #28
    def test_check_shape_compatibility(self):
        feature_columns = ['x1', 'x2', 'features']
        label_columns = ['y1', 'y_embedding']

        schema = StructType([StructField('x1', DoubleType()),
                             StructField('x2', IntegerType()),
                             StructField('features', VectorUDT()),
                             StructField('y1', FloatType()),
                             StructField('y_embedding', VectorUDT())])
        data = [[1.0, 1, DenseVector([1.0] * 12), 1.0, DenseVector([1.0] * 12)]] * 10

        with spark_session('test_df_cache') as spark:
                df = create_test_data_from_schema(spark, data, schema)
                metadata = util._get_metadata(df)

                input_shapes = [[1], [1], [-1, 3, 4]]
                output_shapes = [[1], [-1, 3, 4]]
                util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                               input_shapes, output_shapes)

                input_shapes = [[1], [1], [3, 2, 2]]
                output_shapes = [[1, 1], [-1, 2, 3, 2]]
                util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                               input_shapes, output_shapes)

                bad_input_shapes = [[1], [1], [-1, 3, 5]]
                with pytest.raises(ValueError):
                    util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                                   bad_input_shapes, output_shapes)

                bad_input_shapes = [[2], [1], [-1, 3, 4]]
                with pytest.raises(ValueError):
                    util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                                   bad_input_shapes, output_shapes)

                bad_output_shapes = [[7], [-1, 3, 4]]
                with pytest.raises(ValueError):
                    util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                                   input_shapes, bad_output_shapes)
Example #29
    def test_vector_size_hint(self):
        df = self.spark.createDataFrame(
            [(0, Vectors.dense([0.0, 10.0, 0.5])),
             (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
             (2, Vectors.dense([2.0, 12.0]))], ["id", "vector"])

        sizeHint = VectorSizeHint(inputCol="vector", handleInvalid="skip")
        sizeHint.setSize(3)
        self.assertEqual(sizeHint.getSize(), 3)

        output = sizeHint.transform(df).head().vector
        expected = DenseVector([0.0, 10.0, 0.5])
        self.assertEqual(output, expected)
Example #30
def newFeatures(row):
    vector1 = row['tf_idf']
    vector2 = row['tf_idfs']
    cos = 0
    try:
        # cosine similarity between the two tf-idf vectors (plain Python math,
        # not pyspark.sql.functions, since this runs per row)
        cos = vector1.dot(vector2) / (
            (vector1.dot(vector1) * vector2.dot(vector2)) ** 0.5)
    except ZeroDivisionError:
        # leave cos at 0 when either vector has zero norm
        pass
    data = row.asDict()
    data['features'] = DenseVector([cos])
    newRow = Row(*data.keys())
    newRow = newRow(*data.values())
    return newRow
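A hedged sketch of applying newFeatures, assuming a DataFrame pairs_df whose rows carry the two tf-idf vector columns tf_idf and tf_idfs:

scored_df = pairs_df.rdd.map(newFeatures).toDF()
scored_df.select("features").show(5, truncate=False)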