def test_recursive_struct_validation(self, spark: SQLContext):
    nested_nested_schema = StructType([
        StructField("num_2", IntegerType()),
        StructField("arr_2", ArrayType(StringType())),
    ])
    nested_schema = StructType([
        StructField("num_1", IntegerType()),
        StructField("arr_1", ArrayType(StringType())),
        StructField("col_c", nested_nested_schema),
    ])
    schema = StructType([
        StructField("col_a", nested_schema),
        StructField("col_b", nested_schema),
    ])
    # rows are represented as tuples, whereas arrays are lists
    a = [
        (
            (1, ["a1", "b1", "c1"], (11, ["x1", "y1", "z1"])),
            (2, ["a2", "b2", "c2"], (12, ["x2", "y2", "z2"])),
        ),
        (
            (3, ["a3", "b3", "c3"], (13, ["x3", "y3", "z3"])),
            (4, ["a4", "b4", "c4"], (14, ["x4", "y4", "z4"])),
        ),
    ]
    # same as a but with one wrong value
    b = [
        (
            (1, ["a1", "b1", "c1"], (11, ["x1", "y1", "z1"])),
            (2, ["a2", "b2", "c2"], (12, ["x2", "y2", "z2"])),
        ),
        (
            (3, ["a3", "b3", "c3"], (13, ["x3", "y3", "WRONG VAL"])),
            (4, ["a4", "b4", "c4"], (14, ["x4", "y4", "z4"])),
        ),
    ]
    df = spark.createDataFrame(a, schema)
    # should be the exact specification defined by the to_tuples helper
    assert to_tuples(df) == a
    # should pass the validation helper
    validators.validate_values(df, schema, a)
    # should fail with an incorrect nested value
    with pytest.raises(ValueError):
        validators.validate_values(df, schema, b)
def test_token_vectors_pipeline(self, spark: SQLContext):
    input_data = [
        ("foo bar baz biz",),
        ("foo baz bar",),
        ("bar baz ",),
        (" foo biz ",),
        ("",),
        (None,),
    ]
    raw = spark.createDataFrame(input_data, ["text"])
    res = tfidf.token_vectors_pipeline("text", "vectors", raw)
    actual = to_tuples(res.select("vectors"))
    row_0 = set(actual[0][0])
    row_1 = set(actual[1][0])
    row_2 = set(actual[2][0])
    row_3 = set(actual[3][0])
    row_4 = set(actual[4][0])
    row_5 = set(actual[5][0])
    assert len(row_0) == 4
    assert len(row_1) == 3
    assert len(row_2) == 2
    assert len(row_3) == 2
    assert len(row_4) == 0
    assert len(row_5) == 0
    assert row_1.issubset(row_0)
    assert row_2.issubset(row_0)
    assert row_3.issubset(row_0)
    assert len(row_1.intersection(row_2)) == 2
    assert len(row_2.intersection(row_3)) == 0
def test_tf_ngrams_pipeline(self, spark: SQLContext):
    input_data = [
        ("foo bar baz biz",),
        ("foo baz bar",),
        ("bar baz",),
        ("foo biz",),
        ("",),
        (None,),
    ]
    raw = spark.createDataFrame(input_data, ["text"])
    res = tfidf.tf_ngrams_pipeline("text", "vectors", raw)
    actual = [i[0] for i in to_tuples(res.select("vectors"))]
    for v in actual:
        assert isinstance(v, SparseVector)
    row_0 = set(actual[0].indices)
    row_1 = set(actual[1].indices)
    row_2 = set(actual[2].indices)
    row_3 = set(actual[3].indices)
    row_4 = set(actual[4].indices)
    row_5 = set(actual[5].indices)
    assert row_1.issubset(row_0)
    assert row_2.issubset(row_0)
    assert row_3.issubset(row_0)
    assert len(row_4) == 0
    assert len(row_5) == 0
def get_spacy_docs(
    document_id_col: str,
    document_text_col: str,
    df: DataFrame,
    spacy_model_version="en_core_web_lg",
):
    """Retrieve the spacy docs for each row of the dataframe. Note that this is done in the driver"""
    log.info("initiate spacy pipeline")
    # select both the document id (can be a row number for instance)
    # as well as the raw document text
    raw = to_tuples(df.select(F.col(document_id_col), F.col(document_text_col)))
    # load spacy
    nlp = get_spacy(spacy_model_version)
    # replace null / non-string text values with empty strings
    raw_texts = [i if isinstance(i, str) else "" for _, i in raw]
    # use the spacy pipe method to process all the docs at once
    docs = list(nlp.pipe(raw_texts))
    # set the id as an "extension attribute" on each doc object
    Doc.set_extension(DOCUMENT_ID, default=None)
    for i in range(len(raw_texts)):
        docs[i]._.document_id = raw[i][0]
    return docs
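# Hypothetical usage of get_spacy_docs (the column names and rows below are
# illustrative, not from the real codebase): each returned spacy Doc carries
# its originating row id via the document_id extension attribute.
#
#   df = spark.createDataFrame(
#       [(1, "Sydney is a city."), (2, "Apple is a company.")],
#       ["doc_id", "text"],
#   )
#   docs = get_spacy_docs("doc_id", "text", df)
#   for doc in docs:
#       print(doc._.document_id, [ent.text for ent in doc.ents])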
def test_to_tuples(self, spark):
    cols = ["col_a", "col_b", "col_c"]
    data = [("a", 2, "c"), ("d", 5, "f"), ("g", 8, "i")]
    df = spark.createDataFrame(data, cols)
    expected = [("a", 2, "c"), ("d", 5, "f"), ("g", 8, "i")]
    assert dataframe.to_tuples(df) == expected
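# For reference, one plausible implementation of the to_tuples helper under
# test (a sketch, not necessarily the real one): collect the DataFrame to the
# driver and convert each pyspark Row into a plain tuple.
#
#   def to_tuples(df):
#       return [tuple(row) for row in df.collect()]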
def test_remove_empty_strings(self, spark):
    data = [(["foo", "bar", "", None],), (["", None],), (None,)]
    df = spark.createDataFrame(data, ["input"])
    res = df.withColumn("output", arrays.remove_empty_strings(F.col("input")))
    expected = [a + b for a, b in zip(data, [(["foo", "bar"],), ([],), ([],)])]
    assert to_tuples(res) == expected
def test_utc_timestamps(self, spark):
    t = timestamp.utcnow()
    e = timestamp.format_timestamp(t)
    data = [("a",), ("b",), ("c",)]
    raw = spark.createDataFrame(data, ["key"])
    df = timestamp.with_timestamp("val", t, raw)
    for row in dataframe.to_tuples(df):
        assert row[1] == e
def test_snowball_stemmer(self, spark: SQLContext):
    input_col = "tokens"
    output_col = "res"
    raw = self._get_stemmer_input(spark, input_col)
    df = tokens.snowball_tokens(input_col, output_col, raw)
    expected = [
        (["i", "may", "be", "use"],),
        (["a", "simplist", "stem", "algorithm"],),
        (["but", "the", "result", "are", "great"],),
    ]
    assert to_tuples(df.select(output_col)) == expected
def test_non_utc_timestamps(self, spark):
    au = pytz.timezone("Australia/Sydney")
    t1 = timestamp.utcnow()
    e = timestamp.format_timestamp(t1)
    t2 = t1.astimezone(au)
    data = [("a",), ("b",), ("c",)]
    raw = spark.createDataFrame(data, ["key"])
    df = timestamp.with_timestamp("val", t2, raw)
    for row in dataframe.to_tuples(df):
        assert row[1] == e
def test_normalize_dense_vectors(self, spark: SQLContext):
    input_data = [(Vectors.dense([1, 4, 16]),), (Vectors.dense([1, 0, 9]),)]
    df = spark.createDataFrame(input_data, ["vectors"])
    res = vectors.normalize_vectors("vectors", "normalized", df).select("normalized")
    vals = [i[0].toArray() for i in to_tuples(res)]
    # after being normalized the magnitude of each vector should be 1
    magnitudes = [np.linalg.norm(v) for v in vals]
    expected = [1.0 for _ in range(len(magnitudes))]
    # some magnitudes might come out as 0.999999 etc
    self.validate_to_decimal_places(magnitudes, expected)
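# A minimal sketch of what a normalize_vectors helper like the one under test
# could look like (an assumption, not the actual implementation), using the
# built-in pyspark.ml.feature.Normalizer with the Euclidean (p=2) norm:
#
#   from pyspark.ml.feature import Normalizer
#
#   def normalize_vectors(input_col, output_col, df):
#       # p=2.0 scales each vector to unit L2 magnitude
#       return Normalizer(inputCol=input_col, outputCol=output_col, p=2.0).transform(df)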
def test_cosine_similarity(self, spark: SQLContext):
    input_data = [
        # two_dim_normals
        (Vectors.dense([1, 0]), Vectors.dense([0, 1])),
        # three_dim_normals
        (Vectors.dense([0, 1, 0]), Vectors.dense([0, 0, 1])),
        # two_dim_colinear
        (Vectors.dense([1, 0]), Vectors.dense([1, 0])),
        # three_dim_colinear
        (Vectors.dense([1, 1, 0]), Vectors.dense([1, 1, 0])),
    ]
    df = spark.createDataFrame(input_data, ["col_a", "col_b"])
    res = distance.cosine_similarity("col_a", "col_b", "col_c", df)
    actual = [i[0] for i in to_tuples(res.select("col_c"))]
    expected = [0.0, 0.0, 1.0, 1.0]
    self.validate_to_decimal_places(actual, expected, decimal_places=6)
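# A hedged sketch of a cosine_similarity helper consistent with this test
# (an assumption, not the real distance module): dot(u, v) / (|u| * |v|),
# computed per row via a UDF over the two vector columns.
#
#   import numpy as np
#   from pyspark.sql import functions as F
#   from pyspark.sql.types import DoubleType
#
#   def cosine_similarity(col_a, col_b, output_col, df):
#       @F.udf(returnType=DoubleType())
#       def _cosine(a, b):
#           u, v = a.toArray(), b.toArray()
#           denom = np.linalg.norm(u) * np.linalg.norm(v)
#           # guard against zero vectors to avoid division by zero
#           return float(np.dot(u, v) / denom) if denom else 0.0
#       return df.withColumn(output_col, _cosine(F.col(col_a), F.col(col_b)))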
def test_stemmed_token_vectors_pipeline(self, spark: SQLContext):
    # the exact same setup as the previous test; however, we expect the
    # stemmer to reduce these four words to: run, walk, jog, sprint
    input_data = [
        ("running walks jogged sprinted",),
        ("runs jogging walked",),
        ("walking jogs",),
        ("running sprinting",),
        ("",),
        (None,),
    ]
    raw = spark.createDataFrame(input_data, ["text"])
    res = tfidf.token_vectors_pipeline(
        "text", "vectors", raw, stemmer_func=tokens.porter_tokens
    )
    actual = to_tuples(res.select("vectors"))
    row_0 = set(actual[0][0])
    row_1 = set(actual[1][0])
    row_2 = set(actual[2][0])
    row_3 = set(actual[3][0])
    row_4 = set(actual[4][0])
    row_5 = set(actual[5][0])
    assert len(row_0) == 4
    assert len(row_1) == 3
    assert len(row_2) == 2
    assert len(row_3) == 2
    assert len(row_4) == 0
    assert len(row_5) == 0
    assert row_1.issubset(row_0)
    assert row_2.issubset(row_0)
    assert row_3.issubset(row_0)
    assert len(row_1.intersection(row_2)) == 2
    assert len(row_1.intersection(row_3)) == 1
    assert len(row_2.intersection(row_3)) == 0
def validate_values(
    df: DataFrame,
    expected_schema,
    expected_values: list,
    enforce_array_order=True,
    verbose=False,
):
    """Validate that the dataframe contains an exact list of rows and columns"""
    # validate the expected columns
    validate_schema(df, expected_schema, verbose=verbose)
    row_count = df.count()
    if row_count == 0:
        raise DataFrameException("DataFrame has 0 rows")
    if row_count != len(expected_values):
        raise DataFrameException(
            f"Incorrect number of rows: Received {row_count} - Expected: {len(expected_values)}"
        )
    res = to_tuples(df)
    col_count = len(res[0])
    for row_index, expected in enumerate(expected_values):
        actual = res[row_index]
        if verbose:
            print("Actual:")
            print(actual)
            print("Expected:")
            print(expected)
        # should have the same number of columns in each row
        if len(actual) != len(expected):
            raise DataFrameException(
                f"Incorrect number of columns: Received {len(actual)} - Expected: {len(expected)}"
            )
        for col_index in range(col_count):
            _recursive_validator(
                actual[col_index],
                expected[col_index],
                enforce_array_order=enforce_array_order,
            )
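# Hypothetical usage of validate_values (the schema and rows below are
# illustrative only): a DataFrame built from the expected values should pass,
# while any mismatch in shape or content raises a DataFrameException.
#
#   schema = StructType([
#       StructField("key", StringType()),
#       StructField("val", IntegerType()),
#   ])
#   expected = [("a", 1), ("b", 2)]
#   df = spark.createDataFrame(expected, schema)
#   validate_values(df, schema, expected)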
def test_select_longest_string(self, spark):
    # group the rows by the partition col
    partition_col = "partition_col"
    # rank the rows by the aggregation col
    agg_col = "agg_col"
    # more than 1 string might have the same length
    # alphabetical ordering should be used to break the tie
    data = [
        ("A", "longest_a"),
        ("A", "longest_a"),  # duplicate
        ("A", "longest_b"),
        ("A", "short"),
        ("A", None),
        ("B", "longest_x"),
        ("B", "longest_x"),  # duplicate
        ("B", "longest_y"),
        ("B", "longest_y"),
        ("B", "short"),
        ("B", None),
        ("B", None),
        ("C", None),
        ("C", None),
    ]
    raw = spark.createDataFrame(data, [partition_col, agg_col])
    window = Window.partitionBy(partition_col)
    df = strings.select_longest_string(agg_col, window, raw).orderBy(partition_col)
    assert df.count() == 3
    res = to_tuples(df)
    expected_rows = [("A", "longest_a"), ("B", "longest_x"), ("C", None)]
    for i, expected in enumerate(expected_rows):
        actual = res[i]
        assert actual[0] == expected[0]
        assert actual[1] == expected[1]
def test_normalize_sparse_vectors(self, spark: SQLContext):
    # based on the following imaginary tokens
    # [a, c, c]
    # [a, a, b]
    # [b, b, d]
    input_data = [
        (Vectors.sparse(4, [0, 2], [1.0, 2.0]),),
        (Vectors.sparse(4, [0, 1], [2.0, 1.0]),),
        (Vectors.sparse(4, [1, 3], [2.0, 1.0]),),
    ]
    df = spark.createDataFrame(input_data, ["vectors"])
    res = vectors.normalize_vectors("vectors", "normalized", df).select("normalized")
    vals = [i[0].toArray() for i in to_tuples(res)]
    # after being normalized the magnitude of each vector should be 1
    magnitudes = [np.linalg.norm(v) for v in vals]
    expected = [1.0 for _ in range(len(magnitudes))]
    # some magnitudes might come out as 0.999999 etc
    self.validate_to_decimal_places(magnitudes, expected)