class TestElastic6Fixture(SparklyTest):

    session = SparklyTestSessionWithES6

    class_fixtures = [
        ElasticFixture(
            'elastic6.docker',
            'sparkly_test_fixture',
            'test',
            absolute_path(__file__, 'resources', 'test_fixtures', 'mapping.json'),
            absolute_path(__file__, 'resources', 'test_fixtures', 'data.json'),
        )
    ]

    def test_elastic_fixture(self):
        df = self.spark.read_ext.by_url(
            'elastic://elastic6.docker/sparkly_test_fixture/test?es.read.metadata=false'
        )
        self.assertDataFrameEqual(df, [{'name': 'John', 'age': 56}])
class TestWriteMysql(SparklyGlobalSessionTest):

    session = SparklyTestSession

    fixtures = [
        MysqlFixture(
            'mysql.docker',
            'root',
            None,
            absolute_path(__file__, 'resources', 'test_write', 'mysql_setup.sql'),
            absolute_path(__file__, 'resources', 'test_write', 'mysql_teardown.sql'),
        )
    ]

    def test_write_mysql(self):
        df = self.spark.createDataFrame(TEST_DATA)

        df.write_ext.mysql(
            host='mysql.docker',
            port=3306,
            database='sparkly_test',
            table='test_writer',
            mode='overwrite',
            # Credentials match the MysqlFixture above: user 'root', empty password.
            options={'user': 'root', 'password': ''},
        )

        df = self.spark.read_ext.by_url(
            'mysql://mysql.docker/'
            'sparkly_test/test_writer'
            '?user=root&password='
        )

        self.assertDataFrameEqual(df, TEST_DATA)
class TestWriteCassandra(SparklyGlobalSessionTest):

    session = SparklyTestSession

    fixtures = [
        CassandraFixture(
            'cassandra.docker',
            absolute_path(__file__, 'resources', 'test_write', 'cassandra_setup.cql'),
            absolute_path(__file__, 'resources', 'test_write', 'cassandra_teardown.cql'),
        )
    ]

    def test_write_cassandra(self):
        df = self.spark.createDataFrame(TEST_DATA)

        df.write_ext.cassandra(
            host='cassandra.docker',
            port=9042,
            keyspace='sparkly_test',
            table='test_writer',
            consistency='ONE',
            mode='overwrite',
            # In 'overwrite' mode the connector truncates the table first, so
            # it requires explicit confirmation. Either switch the mode to
            # 'append' to modify data already in the table, or set
            # confirm.truncate to true.
            options={'confirm.truncate': True},
        )

        written_df = self.spark.read_ext.by_url(
            'cassandra://cassandra.docker/'
            'sparkly_test/test_writer'
            '?consistency=ONE'
        )

        self.assertDataFrameEqual(written_df, TEST_DATA)
def test_cassandra_fixture(self):
    data_in_cassandra = CassandraFixture(
        'cassandra.docker',
        absolute_path(__file__, 'resources', 'test_fixtures', 'cassandra_setup.cql'),
        absolute_path(__file__, 'resources', 'test_fixtures', 'cassandra_teardown.cql'),
    )

    # Fixtures can also be used as context managers: data is set up on enter
    # and torn down on exit.
    with data_in_cassandra:
        df = self.spark.read_ext.by_url('cassandra://cassandra.docker/sparkly_test/test')
        self.assertDataFrameEqual(df, [
            {
                'uid': '1',
                'countries': {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206},
            },
        ], fields=['uid', 'countries'])
class TestMysqlFixtures(SparklyGlobalSessionTest):

    session = SparklyTestSession

    fixtures = [
        MysqlFixture(
            'mysql.docker',
            'root',
            None,
            absolute_path(__file__, 'resources', 'test_fixtures', 'mysql_setup.sql'),
            absolute_path(__file__, 'resources', 'test_fixtures', 'mysql_teardown.sql'),
        )
    ]

    def test_mysql_fixture(self):
        df = self.spark.read_ext.by_url(
            'mysql://mysql.docker/sparkly_test/test?user=root&password='
        )
        self.assertDataFrameEqual(df, [
            {'id': 1, 'name': 'john', 'surname': 'sk', 'age': 111},
        ])
class SparklyReaderCassandraTest(SparklyGlobalSessionTest):

    session = SparklyTestSession

    fixtures = [
        CassandraFixture(
            'cassandra.docker',
            absolute_path(__file__, 'resources', 'test_read', 'cassandra_setup.cql'),
            absolute_path(__file__, 'resources', 'test_read', 'cassandra_teardown.cql'),
        )
    ]

    def test_read(self):
        df = self.spark.read_ext.cassandra(
            host='cassandra.docker',
            port=9042,
            keyspace='sparkly_test',
            table='test',
            consistency='ONE',
        )

        self.assertDataFrameEqual(df, [
            {
                'uid': '1',
                'created': '1234567894',
                'countries': {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206},
            },
            {
                'uid': '2',
                'created': '1234567893',
                'countries': {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206},
            },
            {
                'uid': '3',
                'created': '1234567891',
                'countries': {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206},
            },
        ])
class TestKafkaFixture(SparklyGlobalSessionTest):

    session = SparklyTestSession

    topic = 'sparkly.test.fixture.{}'.format(uuid.uuid4().hex[:10])

    fixtures = [
        KafkaFixture(
            'kafka.docker',
            topic=topic,
            key_serializer=lambda item: json.dumps(item).encode('utf-8'),
            value_serializer=lambda item: json.dumps(item).encode('utf-8'),
            data=absolute_path(__file__, 'resources', 'test_fixtures', 'kafka.json'),
        )
    ]

    def test_kafka_fixture(self):
        consumer = KafkaConsumer(
            self.topic,
            bootstrap_servers='kafka.docker:9092',
            key_deserializer=lambda item: json.loads(item.decode('utf-8')),
            value_deserializer=lambda item: json.loads(item.decode('utf-8')),
            auto_offset_reset='earliest',
        )

        # Read back as many messages as the fixture file contains (five here).
        actual_data = []
        for _ in range(5):
            message = next(consumer)
            actual_data.append({'key': message.key, 'value': message.value})

        expected_data = self.spark.read.json(
            absolute_path(__file__, 'resources', 'test_fixtures', 'kafka.json'),
        )

        self.assertDataFrameEqual(expected_data, actual_data)
class TestWriteElastic7(SparklyGlobalSessionTest):

    session = SparklyTestSession

    fixtures = [
        ElasticFixture(
            'elastic7.docker',
            'sparkly_test',
            None,
            None,
            absolute_path(__file__, 'resources', 'test_write', 'elastic7_setup.json'),
        ),
    ]

    def test_write_elastic(self):
        df = self.spark.createDataFrame(TEST_DATA)

        df.write_ext.elastic(
            host='elastic7.docker',
            port=9200,
            es_index='sparkly_test',
            es_type=None,
            mode='overwrite',
            options={'es.mapping.id': 'uid'},
        )

        df = self.spark.read_ext.by_url(
            'elastic://elastic7.docker/sparkly_test?es.read.metadata=false',
        )

        self.assertDataFrameEqual(df, TEST_DATA)
class SparklyReaderElastic7Test(SparklyGlobalSessionTest):

    session = SparklyTestSession

    fixtures = [
        ElasticFixture(
            'elastic7.docker',
            'sparkly_test',
            None,
            None,
            absolute_path(__file__, 'resources', 'test_read', 'elastic7_setup.json'),
        )
    ]

    def test_elastic(self):
        df = self.spark.read_ext.elastic(
            host='elastic7.docker',
            port=9200,
            es_index='sparkly_test',
            es_type=None,
            query='?q=name:*Smith*',
            options={
                'es.read.field.as.array.include': 'topics',
                'es.read.metadata': 'false',
            },
        )

        self.assertDataFrameEqual(df, ELASTIC_TEST_DATA)
class SparklyReaderMySQLTest(SparklyGlobalSessionTest):

    session = SparklyTestSession

    fixtures = [
        MysqlFixture(
            'mysql.docker',
            'root',
            None,
            absolute_path(__file__, 'resources', 'test_read', 'mysql_setup.sql'),
            absolute_path(__file__, 'resources', 'test_read', 'mysql_teardown.sql'),
        )
    ]

    def test_read_mysql(self):
        df = self.spark.read_ext.mysql(
            host='mysql.docker',
            database='sparkly_test',
            table='test',
            # Credentials match the MysqlFixture above: user 'root', empty password.
            options={
                'user': 'root',
                'password': '',
            },
        )

        self.assertDataFrameEqual(df, [
            {'id': 1, 'name': 'john', 'surname': 'sk', 'age': 111},
            {'id': 2, 'name': 'john', 'surname': 'po', 'age': 222},
            {'id': 3, 'name': 'john', 'surname': 'ku', 'age': 333},
        ])
class TestElastic7Fixture(SparklyGlobalSessionTest):

    session = SparklyTestSession

    class_fixtures = [
        ElasticFixture(
            'elastic7.docker',
            'sparkly_test_fixture',
            None,
            absolute_path(__file__, 'resources', 'test_fixtures', 'mapping.json'),
            absolute_path(__file__, 'resources', 'test_fixtures', 'data_for_es7.json'),
        )
    ]

    def test_elastic_fixture(self):
        df = self.spark.read_ext.by_url(
            'elastic://elastic7.docker/sparkly_test_fixture?es.read.metadata=false'
        )
        self.assertDataFrameEqual(df, [{'name': 'John', 'age': 56}])
def setUp(self):
    self.json_decoder = lambda item: json.loads(item.decode('utf-8'))
    self.json_encoder = lambda item: json.dumps(item).encode('utf-8')
    self.topic = 'test.topic.write.kafka.{}'.format(uuid.uuid4().hex[:10])
    self.fixture_path = absolute_path(
        __file__,
        '..',
        'integration',
        'resources',
        'test_write',
        'kafka_setup.json',
    )
    self.expected_data = self.spark.read.json(self.fixture_path)
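# A minimal companion test for the setUp above: write the fixture DataFrame
# to Kafka using the same JSON encoders. This is a sketch, assuming
# df.write_ext.kafka() accepts the host, topic, and key/value serializers in
# the same form KafkaFixture does elsewhere in this suite; check the writer
# API in your sparkly version before relying on the exact signature.
def test_write_kafka_dataframe(self):
    self.expected_data.write_ext.kafka(
        'kafka.docker',
        self.topic,
        key_serializer=self.json_encoder,
        value_serializer=self.json_encoder,
    )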
class SparklyReaderElasticTest(SparklyGlobalSessionTest):

    session = SparklyTestSession

    fixtures = [
        ElasticFixture(
            'elastic.docker',
            'sparkly_test',
            'test',
            None,
            absolute_path(__file__, 'resources', 'test_read', 'elastic_setup.json'),
        )
    ]

    def test_elastic(self):
        df = self.spark.read_ext.elastic(
            host='elastic.docker',
            port=9200,
            es_index='sparkly_test',
            es_type='test',
            query='?q=name:*Smith*',
            options={
                'es.read.field.as.array.include': 'topics',
                'es.read.metadata': 'false',
            },
        )

        self.assertDataFrameEqual(df, [
            {
                'name': 'Smith3',
                'topics': [1, 4, 5],
                'age': 31,
                'demo': {'age_10': 50, 'age_30': 110},
            },
            {
                'name': 'Smith4',
                'topics': [4, 5],
                'age': 12,
                'demo': {'age_10': 1, 'age_30': 20},
            },
        ])
def setUp(self):
    self.json_decoder = lambda item: json.loads(item.decode('utf-8'))
    self.json_encoder = lambda item: json.dumps(item).encode('utf-8')
    self.topic = 'test.topic.write.kafka.{}'.format(uuid.uuid4().hex[:10])
    self.fixture_path = absolute_path(__file__, 'resources', 'test_read', 'kafka_setup.json')
    self.fixture = KafkaFixture(
        'kafka.docker',
        topic=self.topic,
        key_serializer=self.json_encoder,
        value_serializer=self.json_encoder,
        data=self.fixture_path,
    )
    self.fixture.setup_data()
    self.expected_data_df = self.spark.read.json(self.fixture_path)
    self.expected_data = [
        item.asDict(recursive=True)
        for item in self.expected_data_df.collect()
    ]
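# A minimal companion test for the setUp above: read the topic back and
# compare against the fixture data. This is a sketch, assuming
# read_ext.kafka() mirrors KafkaFixture's deserializer arguments and takes a
# schema describing the key/value structure; verify the exact signature
# against the reader API in your sparkly version.
def test_read_kafka(self):
    df = self.spark.read_ext.kafka(
        'kafka.docker',
        topic=self.topic,
        key_deserializer=self.json_decoder,
        value_deserializer=self.json_decoder,
        schema=self.expected_data_df.schema,
    )
    self.assertDataFrameEqual(df, self.expected_data)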
class SparklyTestSession(SparklySession):
    packages = [
        'datastax:spark-cassandra-connector:2.0.0-M2-s_2.11',
        'org.elasticsearch:elasticsearch-spark-20_2.11:5.1.1',
        'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0',
        'mysql:mysql-connector-java:5.1.39',
        'io.confluent:kafka-avro-serializer:3.0.1',
    ]

    repositories = [
        'http://packages.confluent.io/maven/',
    ]

    jars = [
        absolute_path(__file__, 'resources', 'brickhouse-0.7.1.jar'),
    ]

    udfs = {
        # Java UDAF shipped in the brickhouse jar listed above.
        'collect_max': 'brickhouse.udf.collect.CollectMaxUDAF',
        # Python UDF: len() returns an int, so the declared return type
        # must be IntegerType, not StringType.
        'length_of_text': (lambda text: len(text), IntegerType()),
    }
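# Once the session is instantiated, the UDFs above are registered under their
# dict keys and callable from Spark SQL. A minimal sketch of how that looks
# (the literal input is arbitrary, chosen just to illustrate the call):
spark = SparklyTestSession()
row = spark.sql("SELECT length_of_text('john') AS n").collect()[0]
assert row['n'] == 4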
def get_test_data(self, filename):
    file_path = absolute_path(__file__, 'resources', 'test_testing', filename)
    df = self.spark.read.json(file_path)
    data = [item.asDict(recursive=True) for item in df.collect()]
    return df, data
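# A sketch of how the helper above is typically used inside a test method
# (the filename is hypothetical): it returns both the DataFrame and its
# plain-dict form, so assertDataFrameEqual can compare either representation.
def test_assert_dataframe_equal(self):
    df, data = self.get_test_data('test_data.json')
    self.assertDataFrameEqual(df, data)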