def test_idf(self):
    dataset = self.spark.createDataFrame([
        (DenseVector([1.0, 2.0]),),
        (DenseVector([0.0, 1.0]),),
        (DenseVector([3.0, 0.2]),)], ["tf"])
    idf0 = IDF(inputCol="tf")
    self.assertListEqual(idf0.params, [idf0.inputCol, idf0.minDocFreq, idf0.outputCol])
    idf0m = idf0.fit(dataset, {idf0.outputCol: "idf"})
    self.assertEqual(
        idf0m.uid, idf0.uid,
        "Model should inherit the UID from its parent estimator.")
    output = idf0m.transform(dataset)
    self.assertIsNotNone(output.head().idf)
    # Test that parameters transferred to Python Model
    check_params(self, idf0m)
def test_dataframe_with_empty_partition(self):
    from bigdl.orca import OrcaContext
    sc = OrcaContext.get_spark_context()

    rdd = sc.range(0, 10)
    rdd_with_empty = rdd.repartition(4).\
        mapPartitionsWithIndex(lambda idx, part: [] if idx == 0 else part)

    from pyspark.sql import SparkSession
    spark = SparkSession(sc)
    from pyspark.ml.linalg import DenseVector
    # np.float was removed in recent NumPy releases; np.float64 is the equivalent dtype.
    df = rdd_with_empty.map(lambda x: (DenseVector(np.random.randn(1).astype(np.float64)),
                                       int(np.random.randint(0, 1, size=()))))\
        .toDF(["feature", "label"])

    config = {"lr": 0.8}
    trainer = Estimator.from_keras(
        model_creator=model_creator,
        verbose=True,
        config=config,
        workers_per_node=2)

    trainer.fit(df, epochs=1, batch_size=4, steps_per_epoch=25,
                feature_cols=["feature"],
                label_cols=["label"])
    trainer.evaluate(df, batch_size=4, num_steps=25,
                     feature_cols=["feature"],
                     label_cols=["label"])
    trainer.predict(df, feature_cols=["feature"]).collect()
def test_list(self):
    l = [0, 1]
    for lst_like in [l, np.array(l), DenseVector(l),
                     SparseVector(len(l), range(len(l)), l),
                     pyarray.array('l', l), range(2), tuple(l)]:
        converted = TypeConverters.toList(lst_like)
        self.assertEqual(type(converted), list)
        self.assertListEqual(converted, l)
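# Illustrative sketch (not from the original source): TypeConverters.toList from
# pyspark.ml.param, exercised by the test above, normalizes any numeric list-like
# input into a plain Python list.
from pyspark.ml.param import TypeConverters
from pyspark.ml.linalg import DenseVector

print(TypeConverters.toList(DenseVector([0, 1])))  # [0.0, 1.0]
print(TypeConverters.toList((0, 1)))               # [0, 1]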
def test_dataframe_shard_size(self):
    from bigdl.orca import OrcaContext
    OrcaContext._shard_size = 3
    sc = init_nncontext()
    rdd = sc.range(0, 10)
    from pyspark.sql import SparkSession
    spark = SparkSession(sc)
    from pyspark.ml.linalg import DenseVector
    # np.float was removed in recent NumPy releases; np.float64 is the equivalent dtype.
    df = rdd.map(lambda x: (DenseVector(np.random.randn(1).astype(np.float64)),
                            int(np.random.randint(0, 1, size=())))).toDF(["feature", "label"])

    config = {"lr": 0.8}
    trainer = Estimator.from_keras(
        model_creator=model_creator,
        verbose=True,
        config=config,
        workers_per_node=2)

    trainer.fit(df, epochs=1, batch_size=4, steps_per_epoch=25,
                feature_cols=["feature"],
                label_cols=["label"])
    trainer.evaluate(df, batch_size=4, num_steps=25,
                     feature_cols=["feature"],
                     label_cols=["label"])
    trainer.predict(df, feature_cols=["feature"]).collect()
    OrcaContext._shard_size = None
def tfIdfAsNewFeaturesBis(row):
    # Copy the full tf-idf vector into a dense "features" column on a new Row.
    vector = row['tf_idf']
    data = row.asDict()
    data['features'] = DenseVector(vector.toArray())
    newRow = Row(*data.keys())
    newRow = newRow(*data.values())
    return newRow
def tfIdfAsNewFeatures(row):
    # Summarize the tf-idf vector into four dense features: the number of non-zero
    # terms, and the min, max, and mean of the non-zero values.
    vector = row['tf_idf']
    data = row.asDict()
    data['features'] = DenseVector([len(vector.indices), vector.values.min(),
                                    vector.values.max(), vector.values.mean()])
    newRow = Row(*data.keys())
    newRow = newRow(*data.values())
    return newRow
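# Illustrative sketch (not from the original source): the two row-mapping helpers above
# are typically applied over a DataFrame's RDD, e.g.
# spark.createDataFrame(df.rdd.map(tfIdfAsNewFeatures)), assuming a sparse "tf_idf"
# column. A single-Row call works without a SparkContext:
from pyspark.sql import Row
from pyspark.ml.linalg import SparseVector

row = Row(doc_id=0, tf_idf=SparseVector(4, {0: 0.5, 2: 1.5}))
# Two non-zero terms with min 0.5, max 1.5, mean 1.0:
print(tfIdfAsNewFeatures(row))     # ... features=DenseVector([2.0, 0.5, 1.5, 1.0])
print(tfIdfAsNewFeaturesBis(row))  # ... features=DenseVector([0.5, 0.0, 1.5, 0.0])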
def test_dot(self):
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
    lst = DenseVector([1, 2, 3, 4])
    mat = array([[1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0]])
    arr = pyarray.array("d", [0, 1, 2, 3])
    self.assertEqual(10.0, sv.dot(dv))
    self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
    self.assertEqual(30.0, dv.dot(dv))
    self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
    self.assertEqual(30.0, lst.dot(dv))
    self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))
    self.assertEqual(7.0, sv.dot(arr))
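# Worked check (not from the original source) of the dot products asserted above:
# SparseVector(4, {1: 1, 3: 2}) is [0, 1, 0, 2], so its dot product with [1, 2, 3, 4]
# is 1*2 + 2*4 = 10, and [1, 2, 3, 4] . [1, 2, 3, 4] = 1 + 4 + 9 + 16 = 30.
from pyspark.ml.linalg import DenseVector, SparseVector

sv = SparseVector(4, {1: 1, 3: 2})
dv = DenseVector([1.0, 2.0, 3.0, 4.0])
print(sv.dot(dv))  # 10.0
print(dv.dot(dv))  # 30.0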
def traverse(obj, path=None, callback=None):
    """
    Traverse a deeply nested Python structure.

    :param obj: object to traverse
    :param path: list of keys/markers describing the current position (used internally)
    :param callback: function used to transform a value; called as callback(path, value)
    :return: the (possibly transformed) structure
    """
    if path is None:
        path = []

    if is_(obj, dict):
        value = {k: traverse(v, path + [k], callback) for k, v in obj.items()}
    elif is_(obj, list):
        value = [traverse(elem, path + [[]], callback) for elem in obj]
    elif is_(obj, tuple):
        value = tuple(traverse(elem, path + [[]], callback) for elem in obj)
    elif is_(obj, DenseVector):
        value = DenseVector([traverse(elem, path + [[]], callback) for elem in obj])
    else:
        value = obj

    if callback is None:
        return value
    else:
        # If a callback is provided, call it to get the new value.
        return callback(path, value)
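# Illustrative sketch (not from the original source): traverse() with a callback that
# doubles every numeric leaf while leaving containers and DenseVector wrappers intact.
# It assumes the undefined `is_` helper above behaves like isinstance in the module
# that defines traverse().
from pyspark.ml.linalg import DenseVector

def double_numeric_leaves(path, value):
    # `path` records where the value sits in the structure; only the value is used here.
    return value * 2 if isinstance(value, (int, float)) else value

nested = {"weights": DenseVector([1.0, 2.0]), "meta": [1, (2, 3)]}
# Expected: {'weights': DenseVector([2.0, 4.0]), 'meta': [2, (4, 6)]}
print(traverse(nested, callback=double_numeric_leaves))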
def predict(rows):
    from pyspark import Row
    from pyspark.ml.linalg import DenseVector, SparseVector

    model = deserialize(serialized_model)

    # Perform predictions.
    for row in rows:
        fields = row.asDict().copy()

        # Note: if the col is SparseVector, torch.tensor(col) correctly converts it to a
        # dense torch tensor.
        data = [torch.tensor([row[col]]).reshape(shape)
                for col, shape in zip(feature_cols, input_shapes)]

        with torch.no_grad():
            preds = model(*data)

        if not isinstance(preds, list) and not isinstance(preds, tuple):
            preds = [preds]

        for label_col, output_col, pred in zip(label_cols, output_cols, preds):
            meta = metadata[label_col]
            col_type = meta['spark_data_type']
            # dtype for dense and spark tensor is always np.float64
            if col_type == DenseVector:
                shape = np.prod(pred.shape)
                flattened_pred = pred.reshape(shape,)
                field = DenseVector(flattened_pred)
            elif col_type == SparseVector:
                shape = meta['shape']
                flattened_pred = pred.reshape(shape,)
                nonzero_indices = flattened_pred.nonzero()[0]
                field = SparseVector(shape, nonzero_indices,
                                     flattened_pred[nonzero_indices])
            elif pred.shape.numel() == 1:
                # If the column is scalar type, int, float, etc.
                value = pred.item()
                python_type = util.spark_scalar_to_python_type(col_type)
                if issubclass(python_type, numbers.Integral):
                    value = round(value)
                field = python_type(value)
            else:
                field = DenseVector(pred.reshape(-1))

            fields[output_col] = field

        yield Row(**fields)
def toDense(v):
    print(v)
    print(Vectors.dense(v).toArray())
    v = DenseVector(v)
    new_array = [int(x) for x in v]
    return new_array
def test_list_int(self):
    for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]),
                    SparseVector(2, {0: 1.0, 1: 2.0}), range(1, 3), (1.0, 2.0),
                    pyarray.array('d', [1.0, 2.0])]:
        vs = VectorSlicer(indices=indices)
        self.assertListEqual(vs.getIndices(), [1, 2])
        self.assertTrue(all([type(v) == int for v in vs.getIndices()]))
    self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"]))
def feature_selector_process(spark, ml_df, spark_artefacts_dir, run_mode, i, feature_cols):
    # APPLY CHI-SQUARE SELECTOR
    name = f"ChiSquareSelectorModel_{i}"
    selector_model_path = Path(spark_artefacts_dir).joinpath(name)

    if run_mode == 'first':
        # ChiSq test to obtain chi-square statistics (higher -> more dependence
        # between feature and label -> better).
        r = ChiSquareTest.test(ml_df, "features", "label")
        pValues = r.select("pvalues").collect()[0][0].tolist()
        stats = r.select("statistics").collect()[0][0].tolist()
        dof = r.select("degreesOfFreedom").collect()[0][0]

        # ChiSq selector
        selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                                 outputCol="selected_features", labelCol="label")
        selector_model = selector.fit(ml_df)
        selector_model.write().overwrite().save(str(selector_model_path.absolute()))

        top_10_features_importance = []
        top_10_features = []
        for j in selector_model.selectedFeatures:
            top_10_features_importance.append(feature_cols[j])
            top_10_features.append(feature_cols[j])
            top_10_features_importance.append(stats[j])

        model_info = [name, ml_df.count(), None, None, None, None, None, None,
                      None] + top_10_features_importance
        model_info_df = spark.createDataFrame(data=[model_info], schema=MODEL_INFO_SCHEMA)
        model_info_df.write.jdbc(CONNECTION_STR, 'model_info', mode='append',
                                 properties=CONNECTION_PROPERTIES)

    elif run_mode == 'incremental':
        selector_model = ChiSqSelectorModel.load(str(selector_model_path.absolute()))
        top_10_features = []
        for j in selector_model.selectedFeatures:
            top_10_features.append(feature_cols[j])

    ml_df_10 = selector_model.transform(ml_df)
    ml_df_10 = ml_df_10.drop("features")

    # Rebuild the features column as a DenseVector to solve a problem with
    # ChiSqSelector output and tree-based algorithms.
    ml_rdd_10 = ml_df_10.rdd.map(
        lambda row: Row(label=row[0], features=DenseVector(row[1].toArray())))
    ml_df_10 = spark.createDataFrame(ml_rdd_10)

    return ml_df_10, top_10_features
def predict(rows):
    from pyspark import Row
    from pyspark.ml.linalg import DenseVector, SparseVector

    model = deserialize(serialized_model)

    # Perform predictions.
    for row in rows:
        fields = row.asDict().copy()

        preds = prediction_fn(model, row)

        if not isinstance(preds, list) and not isinstance(preds, tuple):
            preds = [preds]

        for label_col, output_col, pred in zip(label_cols, output_cols, preds):
            meta = metadata[label_col]
            col_type = meta['spark_data_type']
            # dtype for dense and spark tensor is always np.float64
            if col_type == DenseVector:
                shape = np.prod(pred.shape)
                flattened_pred = pred.reshape(shape,)
                field = DenseVector(flattened_pred)
            elif col_type == SparseVector:
                shape = meta['shape']
                flattened_pred = pred.reshape(shape,)
                nonzero_indices = flattened_pred.nonzero()[0]
                field = SparseVector(shape, nonzero_indices,
                                     flattened_pred[nonzero_indices])
            elif pred.shape.numel() == 1:
                # If the column is scalar type, int, float, etc.
                value = pred.item()
                python_type = util.spark_scalar_to_python_type(col_type)
                if issubclass(python_type, numbers.Integral):
                    value = round(value)
                field = python_type(value)
            else:
                field = DenseVector(pred.reshape(-1))

            fields[output_col] = field

        values = [fields[col] for col in final_output_cols]
        yield Row(*values)
def test_eq(self):
    v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
    v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
    v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
    v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
    v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
    v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
    dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
    sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
    self.assertEqual(v1, v2)
    self.assertEqual(v1, v3)
    self.assertFalse(v2 == v4)
    self.assertFalse(v1 == v5)
    self.assertFalse(v1 == v6)
    # this is done as Dense and Sparse matrices can be semantically
    # equal while still implementing a different __eq__ method
    self.assertEqual(dm1, sm1)
    self.assertEqual(sm1, dm1)
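# Standalone illustration (not from the original source) of the semantic equality the
# test above relies on: dense and sparse vectors compare equal when they describe the
# same size and values, and unequal otherwise.
from pyspark.ml.linalg import DenseVector, SparseVector

print(DenseVector([0.0, 1.0, 0.0, 5.5]) == SparseVector(4, [(1, 1.0), (3, 5.5)]))  # True
print(DenseVector([0.0, 1.0, 0.0, 5.5]) == SparseVector(6, [(1, 1.0), (3, 5.5)]))  # False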
def test_new_java_array(self):
    # test array of strings
    str_list = ["a", "b", "c"]
    java_class = self.sc._gateway.jvm.java.lang.String
    java_array = JavaWrapper._new_java_array(str_list, java_class)
    self.assertEqual(_java2py(self.sc, java_array), str_list)
    # test array of integers
    int_list = [1, 2, 3]
    java_class = self.sc._gateway.jvm.java.lang.Integer
    java_array = JavaWrapper._new_java_array(int_list, java_class)
    self.assertEqual(_java2py(self.sc, java_array), int_list)
    # test array of floats
    float_list = [0.1, 0.2, 0.3]
    java_class = self.sc._gateway.jvm.java.lang.Double
    java_array = JavaWrapper._new_java_array(float_list, java_class)
    self.assertEqual(_java2py(self.sc, java_array), float_list)
    # test array of bools
    bool_list = [False, True, True]
    java_class = self.sc._gateway.jvm.java.lang.Boolean
    java_array = JavaWrapper._new_java_array(bool_list, java_class)
    self.assertEqual(_java2py(self.sc, java_array), bool_list)
    # test array of Java DenseVectors
    v1 = DenseVector([0.0, 1.0])
    v2 = DenseVector([1.0, 0.0])
    vec_java_list = [_py2java(self.sc, v1), _py2java(self.sc, v2)]
    java_class = self.sc._gateway.jvm.org.apache.spark.ml.linalg.DenseVector
    java_array = JavaWrapper._new_java_array(vec_java_list, java_class)
    self.assertEqual(_java2py(self.sc, java_array), [v1, v2])
    # test empty array
    java_class = self.sc._gateway.jvm.java.lang.Integer
    java_array = JavaWrapper._new_java_array([], java_class)
    self.assertEqual(_java2py(self.sc, java_array), [])
    # test array of array of strings
    str_list = [["a", "b", "c"], ["d", "e"], ["f", "g", "h", "i"], []]
    expected_str_list = [
        ("a", "b", "c", None),
        ("d", "e", None, None),
        ("f", "g", "h", "i"),
        (None, None, None, None),
    ]
    java_class = self.sc._gateway.jvm.java.lang.String
    java_array = JavaWrapper._new_java_array(str_list, java_class)
    self.assertEqual(_java2py(self.sc, java_array), expected_str_list)
def test_squared_distance(self):
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = DenseVector(array([1., 2., 3., 4.]))
    lst = DenseVector([4, 3, 2, 1])
    lst1 = [4, 3, 2, 1]
    arr = pyarray.array('d', [0, 2, 1, 3])
    narr = array([0, 2, 1, 3])
    self.assertEqual(15.0, _squared_distance(sv, dv))
    self.assertEqual(25.0, _squared_distance(sv, lst))
    self.assertEqual(20.0, _squared_distance(dv, lst))
    self.assertEqual(15.0, _squared_distance(dv, sv))
    self.assertEqual(25.0, _squared_distance(lst, sv))
    self.assertEqual(20.0, _squared_distance(lst, dv))
    self.assertEqual(0.0, _squared_distance(sv, sv))
    self.assertEqual(0.0, _squared_distance(dv, dv))
    self.assertEqual(0.0, _squared_distance(lst, lst))
    self.assertEqual(25.0, _squared_distance(sv, lst1))
    self.assertEqual(3.0, _squared_distance(sv, arr))
    self.assertEqual(3.0, _squared_distance(sv, narr))
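# Worked check (not from the original source): SparseVector(4, {1: 1, 3: 2}) is
# [0, 1, 0, 2], so its squared distance to [1, 2, 3, 4] is 1 + 1 + 9 + 4 = 15. The
# public Vectors.squared_distance helper returns the same value the private
# _squared_distance above is tested against.
from pyspark.ml.linalg import DenseVector, SparseVector, Vectors

sv = SparseVector(4, {1: 1, 3: 2})
dv = DenseVector([1.0, 2.0, 3.0, 4.0])
print(Vectors.squared_distance(sv, dv))  # 15.0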
def formatFeaturesDF(self, featuresRawRDD, outputColName):
    # Convert RDD data into a DataFrame
    dataframe_features = featuresRawRDD.map(
        lambda x: (x[0], x[1], x[2], DenseVector(x[3].split(',')))).toDF(
            ["index", "url", "productId", "features"])
    trainNormalizeFeatures = self.getNormalizer(dataframe_features, outputColName)
    return trainNormalizeFeatures
def regression_on_player(joined_records_ratings, player_id, birthDate, match_date):
    joined_records_ratings = joined_records_ratings.filter(
        joined_records_ratings.Id == player_id).select(
        ["Id", "playerid_date", "rating", "birthDate"])
    joined_records_ratings = joined_records_ratings.filter(
        joined_records_ratings.playerid_date.date != "0000-00-00")

    # Build (Id, [age, age^2], rating) training rows from the player's match history.
    l1 = []
    dateArrybirth = birthDate.split('-')
    date_birth = datetime.datetime(int(dateArrybirth[0]), int(dateArrybirth[1]),
                                   int(dateArrybirth[2]))
    for j in joined_records_ratings.rdd.collect():
        dateArraycur = j.playerid_date.date.split('-')
        date_current = datetime.datetime(int(dateArraycur[0]), int(dateArraycur[1]),
                                         int(dateArraycur[2]))
        dt_age = date_current - date_birth
        age = dt_age.days
        squareAge = age * age
        b = (j.Id, DenseVector([float(age), float(squareAge)]), j.rating)
        l1.append(b)

    df2_normal_features = spark.sparkContext.parallelize(l1).toDF(["Id", "features", "label"])
    df_train_reg = df2_normal_features.select(["features", "label"])

    # Fit the model
    lr = LinearRegression(featuresCol='features', labelCol='label', maxIter=10,
                          regParam=0.0, elasticNetParam=0.0)
    lrModel = lr.fit(df_train_reg)
    df_train_reg.show()

    # Build the single test row for the requested match date.
    l2 = []
    dateArraycur = match_date.split('-')
    date_current = datetime.datetime(int(dateArraycur[0]), int(dateArraycur[1]),
                                     int(dateArraycur[2]))
    dt_age = date_current - date_birth
    age = dt_age.days
    squareAge = age * age
    b = (DenseVector([float(age), float(squareAge)]), 1)
    l2.append(b)
    df_test = spark.sparkContext.parallelize(l2).toDF(["features", "label"])
    df_test = df_test.select("features")

    # Transform
    lr_predictions = lrModel.transform(df_test)
    predicted_rating = lr_predictions.collect()[0].prediction
    return predicted_rating
def test_distance_measure(self):
    # Dummy testing!
    x = np.array([0., 0., 0.])
    y = np.array([0.9, 0.9, 0.9])
    z = np.array([0.1, 0.1, 0.1])
    v = np.array([0.85, 0.85, 0.85])
    data = [x, y, z, v]

    sigma = self.label_context.constants['sigma'].value
    for i, j in product(range(4), range(4)):
        computed_weight = _compute_weights(data[i], data[j], sigma)
        self.assertAlmostEqual(self.results[i][j], computed_weight, 5)

    # Check for sparse data
    sparse_data = [
        SparseVector(3, [], []),
        DenseVector(y),
        DenseVector(z),
        DenseVector(v)
    ]
    for i, j in product(range(4), range(4)):
        computed_weight = _compute_weights(sparse_data[i], sparse_data[j], sigma)
        self.assertAlmostEqual(self.results[i][j], computed_weight, 5)
def toVector(value):
    """
    Convert a value to a MLlib Vector, if possible.
    """
    if isinstance(value, Vector):
        return value
    elif TypeConverters._can_convert_to_list(value):
        value = TypeConverters.toList(value)
        if all(map(lambda v: TypeConverters._is_numeric(v), value)):
            return DenseVector(value)
    raise TypeError("Could not convert %s to vector" % value)
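# Hedged usage sketch (not from the original source) of the converter above, which
# mirrors TypeConverters.toVector in pyspark.ml.param: numeric list-likes become a
# DenseVector, existing Vectors pass through, and anything else raises TypeError.
import numpy as np
from pyspark.ml.param import TypeConverters
from pyspark.ml.linalg import SparseVector

print(TypeConverters.toVector([1.0, 2.0, 3.0]))            # DenseVector([1.0, 2.0, 3.0])
print(TypeConverters.toVector(np.array([0.5, 0.5])))       # DenseVector([0.5, 0.5])
print(TypeConverters.toVector(SparseVector(2, {1: 1.0})))  # passes through unchanged
try:
    TypeConverters.toVector(["a", "b"])
except TypeError as err:
    print(err)  # Could not convert ['a', 'b'] to vector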
def test_tfdataset_with_dataframe(self):
    rdd = self.sc.range(0, 1000)
    # np.float was removed in recent NumPy releases; np.float64 is the equivalent dtype.
    df = rdd.map(lambda x: (DenseVector(np.random.rand(20).astype(np.float64)),
                            x % 10)).toDF(["feature", "label"])
    train_df, val_df = df.randomSplit([0.7, 0.3])
    create_ds = self.make_create_ds_fn(train_df, val_df)
    self.check_dataset(create_ds)
def formatFeaturesDF(self, featuresRawRDD, outputColName):
    x = featuresRawRDD.strip()
    dataFrameFeatures = self.sc.parallelize([x])
    dataframe_features1 = dataFrameFeatures.map(lambda line: line.split("\r\n"))\
        .flatMap(lambda words: (word.split(",") for word in words))\
        .map(lambda x: [elem.strip('"') for elem in x])\
        .map(lambda x: (x[0], x[1], x[2], DenseVector(x[3:])))\
        .toDF(["index", "url", "productId", "features"])
    trainNormalizeFeatures = self.getNormalizer(dataframe_features1, outputColName)
    return trainNormalizeFeatures
def PCA_transform(sc, samples_df, feature_count, threshold, k):
    # check input
    if threshold and ((threshold > 1) or (threshold < 0)):
        print("ERROR: PCA_transform: Input threshold should be within 0 to 1")
        return (None, None, None)
    if k and k < 0:
        print("ERROR: PCA_transform: Input k should be greater than 0")
        return (None, None, None)

    df_reduced = None
    pca = None
    if threshold is not None:
        # by threshold ===============
        if feature_count > 200:
            fk = 200
            print("INFO: force k to " + str(fk) + " for PCA.")
        else:
            fk = feature_count

        pca = PCA(k=fk, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(samples_df)

        # get explained-variance ratio array and find n_components for the threshold
        var_arr = pca_model.explainedVariance
        print("RESULT: PCA ratio_vec=", var_arr)
        n_components = ml_util.ml_get_n_components(var_arr, threshold)
        k = n_components

        df_pcaed_all = pca_model.transform(samples_df).select("hash", "label", "pcaFeatures")
        # keep only the first k components of the PCA output
        sqlCtx = SQLContext(sc)
        df_pcaed = sqlCtx.createDataFrame(
            df_pcaed_all.rdd
            .map(lambda p: (p["hash"], p["label"], p["pcaFeatures"].toArray()[:k]))
            .map(lambda p: Row(hash=p[0], label=p[1], pcaFeatures=DenseVector(p[2]))))
        print("INFO: PCA_transform: n_components =", n_components, ", threshold=", threshold)
    elif k > 0:
        # by n_components ===============
        pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
        pca_model = pca.fit(samples_df)
        df_pcaed = pca_model.transform(samples_df).select("hash", "label", "pcaFeatures")
        print("INFO: PCA_transform: n_components =", k)

    return (df_pcaed, k, pca_model)
def __numpy_to_vector_assembler(self, np_object, label_t=1):
    """
    Numpy to Spark vector converter.

    :param np_object: numpy array with features
    :param label_t: label type column, 1 as default
    :return: Spark DataFrame built from the numpy array
    """
    data_set = _sc.parallelize(np_object)
    data_rdd = data_set.map(lambda x: Row(features=DenseVector(x), label=label_t))
    self.__logger.info("Numpy to Spark Converter")
    return data_rdd.toDF()
def main():
    spark = SparkSession \
        .builder \
        .appName("Reddit Site:Get Data") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    file = "file:////l2/corpora/reddit/submissions/RS_2015-12.bz2"
    output = file[-14:-3]
    sc = spark.sparkContext

    print('\n\n\n starting read and filter')
    df = filterPosts(file, sc, spark)
    df = convertToVec(df, sc, spark, output, inputCol='tokens')

    num_topics = 10
    print('\n\n\n LDA... \n\n\n')
    newLDA = False
    if newLDA:
        lda = LDA(featuresCol='vectors', k=num_topics, maxIter=50)
        lda_model = lda.fit(df.select('id', 'vectors'))
        lda_model.save(output + '_ldamodel')
    else:
        lda_model = LocalLDAModel.load(output + '_ldamodel')

    print('\n\n\n Describe Topics... \n\n\n')
    topic_indices = lda_model.describeTopics(maxTermsPerTopic=50)
    topic_indices.write.json(output + '_topics.json', mode='overwrite')

    print('\n\n\n reduce to subs\n\n\n')
    # subDF=df.select('subreddit','vectors').groupBy(df.subreddit).sum('vectors')
    subDF = df.select('subreddit', 'vectors').rdd.mapValues(lambda v: v.toArray()) \
        .reduceByKey(lambda x, y: x + y) \
        .mapValues(lambda x: DenseVector(x)) \
        .toDF(["subreddit", "vectors"])

    '''
    print('\n\n\n LDA... \n\n\n')
    lda=LDA(featuresCol='vectors', k=num_topics, maxIter=50)
    lda_model=lda.fit(subDF.select('subreddit','vectors'))

    print('\n\n\n Describe Topics... \n\n\n')
    topic_indices=lda_model.describeTopics(maxTermsPerTopic=50)
    topic_indices.write.json(output+'_topics.json', mode='overwrite')
    '''

    print('\n\n\n Transform DataSet \n\n\n')
    subDF = lda_model.transform(subDF).drop('vectors')
    # topicDF=lda_model.transform(vecDF)
    subDF.write.json(output + '_transformed.json', mode='overwrite')
def test_norms(self):
    a = DenseVector([0, 2, 3, -1])
    self.assertAlmostEqual(a.norm(2), 3.742, 3)
    # Use assertEqual so the values are actually checked; assertTrue would treat the
    # second argument as a message and always pass.
    self.assertEqual(a.norm(1), 6)
    self.assertEqual(a.norm(inf), 3)
    a = SparseVector(4, [0, 2], [3, -4])
    self.assertAlmostEqual(a.norm(2), 5)
    self.assertEqual(a.norm(1), 7)
    self.assertEqual(a.norm(inf), 4)

    tmp = SparseVector(4, [0, 2], [3, 0])
    self.assertEqual(tmp.numNonzeros(), 1)
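# Worked check (not from the original source) of the norm values asserted above:
# for DenseVector([0, 2, 3, -1]) the L2 norm is sqrt(0 + 4 + 9 + 1) ≈ 3.742, the L1
# norm is 6, and the inf-norm is 3.
import numpy as np
from pyspark.ml.linalg import DenseVector, SparseVector

a = DenseVector([0, 2, 3, -1])
print(a.norm(2), a.norm(1), a.norm(np.inf))  # 3.7416... 6.0 3.0

s = SparseVector(4, [0, 2], [3, -4])
print(s.norm(2), s.norm(1), s.norm(np.inf))  # 5.0 7.0 4.0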
def test_get_col_info(self):
    with spark_session('test_get_col_info') as spark:
        data = [
            [0, 0.0, None, [1, 1], DenseVector([1.0, 1.0]),
             SparseVector(2, {1: 1.0}), DenseVector([1.0, 1.0])],
            [1, None, None, [1, 1], DenseVector([1.0, 1.0]),
             SparseVector(2, {1: 1.0}), SparseVector(2, {1: 1.0})]
        ]

        schema = StructType([
            StructField('int', IntegerType()),
            StructField('float', FloatType()),
            StructField('null', NullType()),
            StructField('array', ArrayType(IntegerType())),
            StructField('dense', VectorUDT()),
            StructField('sparse', VectorUDT()),
            StructField('mixed', VectorUDT())
        ])

        df = create_test_data_from_schema(spark, data, schema)
        all_col_types, col_shapes, col_max_sizes = util._get_col_info(df)

        expected = [
            ('int', {int}, 1, 1),
            ('float', {float, NullType}, 1, 1),
            ('null', {NullType}, 1, 1),
            ('array', {list}, 2, 2),
            ('dense', {DenseVector}, 2, 2),
            ('sparse', {SparseVector}, 2, 1),
            ('mixed', {DenseVector, SparseVector}, 2, 2)
        ]

        for expected_col_info in expected:
            col_name, col_types, col_shape, col_size = expected_col_info
            assert all_col_types[col_name] == col_types, col_name
            assert col_shapes[col_name] == col_shape, col_name
            assert col_max_sizes[col_name] == col_size, col_name
def test_check_shape_compatibility(self):
    feature_columns = ['x1', 'x2', 'features']
    label_columns = ['y1', 'y_embedding']

    schema = StructType([
        StructField('x1', DoubleType()),
        StructField('x2', IntegerType()),
        StructField('features', VectorUDT()),
        StructField('y1', FloatType()),
        StructField('y_embedding', VectorUDT())
    ])

    data = [[1.0, 1, DenseVector([1.0] * 12), 1.0, DenseVector([1.0] * 12)]] * 10

    with spark_session('test_df_cache') as spark:
        df = create_test_data_from_schema(spark, data, schema)
        metadata = util._get_metadata(df)

        input_shapes = [[1], [1], [-1, 3, 4]]
        output_shapes = [[1], [-1, 3, 4]]
        util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                       input_shapes, output_shapes)

        input_shapes = [[1], [1], [3, 2, 2]]
        output_shapes = [[1, 1], [-1, 2, 3, 2]]
        util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                       input_shapes, output_shapes)

        bad_input_shapes = [[1], [1], [-1, 3, 5]]
        with pytest.raises(ValueError):
            util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                           bad_input_shapes, output_shapes)

        bad_input_shapes = [[2], [1], [-1, 3, 4]]
        with pytest.raises(ValueError):
            util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                           bad_input_shapes, output_shapes)

        bad_output_shapes = [[7], [-1, 3, 4]]
        with pytest.raises(ValueError):
            util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                           input_shapes, bad_output_shapes)
def test_vector_size_hint(self):
    df = self.spark.createDataFrame(
        [(0, Vectors.dense([0.0, 10.0, 0.5])),
         (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
         (2, Vectors.dense([2.0, 12.0]))],
        ["id", "vector"])

    sizeHint = VectorSizeHint(inputCol="vector", handleInvalid="skip")
    sizeHint.setSize(3)
    self.assertEqual(sizeHint.getSize(), 3)

    output = sizeHint.transform(df).head().vector
    expected = DenseVector([0.0, 10.0, 0.5])
    self.assertEqual(output, expected)
def newFeatures(row):
    vector1 = row['tf_idf']
    vector2 = row['tf_idfs']
    cos = 0
    try:
        # Cosine similarity between the two tf-idf vectors; falls back to 0 on any
        # failure (e.g. a zero-norm vector).
        cos = vector1.dot(vector2) / (sf.sqrt(
            vector1.dot(vector1) * vector2.dot(vector2)))
    except:
        pass
    data = row.asDict()
    data['features'] = DenseVector([cos])
    newRow = Row(*data.keys())
    newRow = newRow(*data.values())
    return newRow