Example #1
    def test_esk608(self):
        """Test Esk-608: Execute Spark histogram filling macro"""

        # check if required Python and Java libraries are made available to worker nodes
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )
        self.assertRegexpMatches(
            sc.getConf().get('spark.jars.packages', ''),
            'org.diana-hep:histogrammar-sparksql_2.11:1.0.4',
            'org.diana-hep:histogrammar-sparksql_2.11:1.0.4 missing from spark.jars.packages, test_esk608 will fail'
        )
        if re.search('spark://', sc.getConf().get('spark.master', '')):
            py_mods = utils.get_file_path('py_mods')
            self.assertRegexpMatches(
                sc.getConf().get('spark.submit.pyFiles', ''), py_mods,
                'Eskapade modules missing from spark.submit.pyFiles, needed in Spark cluster mode'
            )
            self.assertRegexpMatches(
                sc.getConf().get('spark.files', ''), py_mods,
                'Eskapade modules missing from spark.files, needed in Spark cluster mode'
            )

        # run Eskapade
        self.run_eskapade('esk608_spark_histogrammar.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)
        settings = proc_mgr.service(ConfigObject)

        # check data frame
        self.assertIn('spark_df', ds,
                      'no object with key "spark_df" in data store')
        self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame,
                              '"spark_df" is not a Spark data frame')
        self.assertEqual(ds['spark_df'].count(), 12,
                         'unexpected number of rows in data frame')
        self.assertListEqual(sorted(ds['spark_df'].columns),
                             sorted(['date', 'loc', 'x', 'y']),
                             'unexpected columns in data frame')

        # data-generation checks
        self.assertIn('hist', ds)
        self.assertIsInstance(ds['hist'], dict)
        col_names = ['date', 'x', 'y', 'loc', 'x:y']
        self.assertListEqual(sorted(ds['hist'].keys()), sorted(col_names))

        # data-summary checks
        f_bases = ['date', 'x', 'y', 'loc', 'x_vs_y']
        file_names = ['report.tex'] + ['hist_{}.pdf'.format(col) for col in f_bases]
        for fname in file_names:
            path = persistence.io_path('results_data', settings.io_conf(),
                                       'report/{}'.format(fname))
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertTrue(statinfo.st_size > 0)
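
For comparison, here is a minimal plain-PySpark sketch of filling a simple histogram for one numeric column. It uses RDD.histogram instead of the histogrammar-sparksql package that the esk608 macro relies on, and the in-memory sample data and column name are illustrative assumptions.

from pyspark.sql import SparkSession

# illustrative stand-in data; the real test inspects the macro's Spark data frame
spark = SparkSession.builder.master('local[*]').getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,), (2.5,), (4.0,)], ['x'])

# RDD.histogram returns (bucket boundaries, counts) for equal-width buckets
boundaries, counts = (df.select('x').rdd
                      .map(lambda row: float(row['x']))
                      .histogram(4))
print(boundaries, counts)
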
Example #2
    def test_esk602(self):
        """Test Esk-602: Read CSV files into a Spark data frame"""

        # check if running in local mode
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.run_eskapade('esk602_read_csv_to_spark_df.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check data frame
        self.assertIn('spark_df', ds,
                      'no object with key "spark_df" in data store')
        self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame,
                              '"spark_df" is not a Spark data frame')
        self.assertEqual(ds['spark_df'].rdd.getNumPartitions(), 5,
                         'unexpected number of partitions in data frame')
        self.assertEqual(ds['spark_df'].count(), 12,
                         'unexpected number of rows in data frame')
        self.assertListEqual(ds['spark_df'].columns, ['date', 'loc', 'x', 'y'],
                             'unexpected columns in data frame')
        self.assertSetEqual(
            set((r['date'], r['loc']) for r in ds['spark_df'].collect()),
            set([(20090101, 'a'), (20090102, 'b'), (20090103, 'c'),
                 (20090104, 'd'), (20090104, 'e'), (20090106, 'a'),
                 (20090107, 'b'), (20090107, 'c'), (20090107, 'd'),
                 (20090108, 'e'), (20090109, 'e'), (20090109, 'f')]),
            'unexpected values in date/loc columns')
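
A minimal plain-PySpark sketch of the operation this test verifies: reading a CSV file into a data frame and controlling its partition count. The file name and reader options are illustrative assumptions, not the settings of the esk602 macro.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').getOrCreate()

# header=True takes column names from the first line,
# inferSchema=True lets Spark guess the column types
df = (spark.read.csv('input.csv', header=True, inferSchema=True)
      .repartition(5))  # match the expected number of partitions

print(df.rdd.getNumPartitions(), df.count(), df.columns)
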
Example #3
    def test_esk607(self):
        """Test Esk-607: Add column to Spark dataframe"""

        # check if running in local mode
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.run_eskapade('esk607_spark_with_column.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check data frame
        self.assertIn('new_spark_df', ds,
                      'no object with key "new_spark_df" in data store')
        self.assertIsInstance(ds['new_spark_df'], pyspark.sql.DataFrame,
                              '"new_spark_df" is not a Spark data frame')
        self.assertEqual(ds['new_spark_df'].count(), 5,
                         'unexpected number of rows in filtered data frame')
        self.assertListEqual(
            ds['new_spark_df'].columns,
            ['dummy', 'date', 'loc', 'x', 'y', 'pow_xy1', 'pow_xy2'],
            'unexpected columns in data frame')
        self.assertSetEqual(
            set(tuple(r) for r in ds['new_spark_df'].collect()),
            set([('bla', 20090103, 'c', 5, 7, 78125.0, 78125.0),
                 ('bal', 20090102, 'b', 3, 8, 6561.0, 6561.0),
                 ('flo', 20090104, 'e', 3, 5, 243.0, 243.0),
                 ('bar', 20090101, 'a', 1, 9, 1.0, 1.0),
                 ('foo', 20090104, 'd', 1, 6, 1.0, 1.0)]),
            'unexpected values in columns')
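
A minimal plain-PySpark sketch of adding a derived column with withColumn, in the spirit of the link this test checks; the sample rows and the single pow_xy column are illustrative assumptions, not the esk607 macro itself.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master('local[*]').getOrCreate()
df = spark.createDataFrame([('a', 1, 9), ('b', 3, 8)], ['loc', 'x', 'y'])

# pow_xy = x ** y, evaluated per row by Spark SQL
new_df = df.withColumn('pow_xy', F.pow(F.col('x'), F.col('y')))
new_df.show()
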
Example #4
    def test_esk604(self):
        """Test Esk-604: Execute Spark-SQL query"""

        # check if running in local mode
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.run_eskapade('esk604_spark_execute_query.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # check data frame
        self.assertIn('spark_df_sql', ds,
                      'no object with key "spark_df_sql" in data store')
        self.assertIsInstance(ds['spark_df_sql'], pyspark.sql.DataFrame,
                              '"spark_df_sql" is not a Spark data frame')
        self.assertEqual(ds['spark_df_sql'].count(), 4,
                         'unexpected number of rows in filtered data frame')
        self.assertListEqual(ds['spark_df_sql'].columns,
                             ['loc', 'sumx', 'sumy'],
                             'unexpected columns in data frame')
        self.assertEqual(
            ds['spark_df_sql'].schema,
            proc_mgr.get_chain('ApplySQL').get_link('SparkSQL').schema,
            'schema of data frame does not correspond to schema stored in link'
        )
        self.assertSetEqual(
            set(tuple(r) for r in ds['spark_df_sql'].collect()),
            set([('e', 10, 15), ('d', 2, 11), ('b', 6, 16), ('a', 2, 18)]),
            'unexpected values in loc/sumx/sumy columns')
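
A minimal plain-PySpark sketch of the aggregation this test expects: grouping by loc and summing x and y with a Spark-SQL query. The temporary-view name and sample rows are illustrative assumptions; the actual query is defined in the esk604 macro.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').getOrCreate()
df = spark.createDataFrame(
    [('a', 1, 9), ('a', 1, 9), ('b', 3, 8), ('d', 2, 11)], ['loc', 'x', 'y'])

df.createOrReplaceTempView('mytable')
df_sql = spark.sql(
    'SELECT loc, SUM(x) AS sumx, SUM(y) AS sumy FROM mytable GROUP BY loc')
df_sql.show()
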
Example #5
    def test_esk603(self):
        """Test Esk-603: Write Spark data to CSV"""

        # check if running in local mode
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # run Eskapade
        self.run_eskapade('esk603_write_spark_data_to_csv.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # read output data
        results_data_path = persistence.io_dir(
            'results_data',
            proc_mgr.service(ConfigObject).io_conf())
        names = []
        headers = []
        contents = []
        csv_dirs = glob.glob('{}/*'.format(results_data_path))
        self.assertEqual(len(csv_dirs), 3,
                         'expected to find three CSV output directories')
        for csv_dir in csv_dirs:
            names.append(os.path.basename(csv_dir))
            csv_files = glob.glob('{}/part*'.format(csv_dir))
            self.assertEqual(
                len(csv_files), 1,
                'expected to find only one CSV file in "{}"'.format(names[-1]))
            with open(csv_files[0]) as csv:
                contents.append([l.strip().split(',') for l in csv])
                self.assertEqual(
                    len(contents[-1]), 101,
                    'unexpected number of lines in "{}" CSV'.format(names[-1]))
                headers.append(contents[-1][0])
                contents[-1] = sorted(contents[-1][1:])

        # check output data
        self.assertListEqual(headers[0], ['index', 'foo', 'bar'],
                             'unexpected CSV header for "{}"'.format(names[0]))
        self.assertListEqual(
            contents[0],
            sorted([str(it), 'foo{:d}'.format(it),
                    str((it + 1) / 2.)] for it in range(100)),
            'unexpected CSV content for "{}"'.format(names[0]))
        for name, head, cont in zip(names[1:], headers[1:], contents[1:]):
            self.assertListEqual(
                head, headers[0],
                'CSV header of "{0:s}" differs from header of "{1:s}"'.format(
                    name, names[0]))
            self.assertListEqual(
                cont, contents[0],
                'CSV content of "{0:s}" differs from content of "{1:s}"'.
                format(name, names[0]))
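
A minimal plain-PySpark sketch of writing a data frame to CSV, producing the kind of part-files this test globs for; the output path is an illustrative assumption rather than the Eskapade results directory.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').getOrCreate()
df = spark.createDataFrame(
    [(it, 'foo{:d}'.format(it), (it + 1) / 2.) for it in range(100)],
    ['index', 'foo', 'bar'])

# coalesce(1) keeps the output to a single part-file, matching the test's
# expectation of one CSV file per output directory
df.coalesce(1).write.csv('/tmp/output_csv', mode='overwrite', header=True)
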
Example #6
    def test_esk610(self):
        """Test Esk-610: Spark Streaming word count"""

        # this test relies on linux shell scripts to create file stream
        if (sys.platform != 'linux') and (sys.platform != 'darwin'):
            print('skipping test_esk610 for non-unix {} platform'.format(
                sys.platform))
            return

        # check if running in local mode
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )

        # create test dir
        tmpdir = '/tmp/eskapade_stream_test'
        os.mkdir(tmpdir)

        # create a file stream
        tmpfile = ''.join(
            random.choice(string.ascii_lowercase) for x in range(8))
        cmd = ('for i in $(seq -f "%05g" 0 1000); '
               'do echo "Hello world" > "{0}/{1}_$i.dummy"; sleep 1; '
               'done').format(tmpdir, tmpfile)
        p = subprocess.Popen(cmd,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)

        # run eskapade
        self.run_eskapade('esk610_spark_streaming_wordcount.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # end file stream
        p.kill()

        # check if file stream was properly executed
        stdout, stderr = p.communicate()
        self.assertEqual(stdout, b'',
                         'unexpected stdout output {}'.format(stdout))
        self.assertEqual(stderr, b'',
                         'unexpected stderr output {}'.format(stderr))

        # check if stream was setup correctly (that's all we can do - the data itself is gone)
        self.assertIsInstance(ds['dstream'], pyspark.streaming.DStream)

        # read and check output data
        results_data_path = persistence.io_dir(
            'results_data',
            proc_mgr.service(ConfigObject).io_conf())
        names = []
        contents = []
        csv_dirs = glob.glob(
            '{}/dstream/wordcount-*.txt'.format(results_data_path))
        self.assertGreater(len(csv_dirs), 0,
                           'expected to find CSV output directories')
        for csv_dir in csv_dirs:
            names.append(os.path.basename(csv_dir))
            csv_files = glob.glob('{}/part*'.format(csv_dir))
            #self.assertEqual(len(csv_files), 1, 'expected to find exactly one CSV file in "{}"'.format(names[-1]))
            if len(csv_files) > 0:
                with open(csv_files[0]) as csv:
                    record = [l for l in csv]
                    if record != []:  # empty records are allowed (because of timing differences)
                        self.assertRegexpMatches(
                            record[0], 'Hello',
                            'Expected \'Hello\' as in \'Hello world\'')
                        self.assertRegexpMatches(
                            record[1], 'world',
                            'Expected \'world\' as in \'Hello world\'')
                    contents.append(record[:])
        self.assertGreater(
            len(contents), 0,
            'expected ~ten items (each second a streaming RDD) - depending on timing'
        )

        # clean up files
        shutil.rmtree(tmpdir)
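
A minimal plain-PySpark sketch of a file-stream word count analogous to what the esk610 macro sets up; the directories, batch interval, and timeout are illustrative assumptions.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# streaming in local mode needs at least two cores: one for the receiver,
# one for processing
sc = SparkContext('local[2]', 'wordcount_sketch')
ssc = StreamingContext(sc, 1)  # 1-second batch interval

lines = ssc.textFileStream('/tmp/eskapade_stream_test')
counts = (lines.flatMap(lambda line: line.split(' '))
               .map(lambda word: (word, 1))
               .reduceByKey(lambda a, b: a + b))

# one output directory per batch, named wordcount-<timestamp>.txt
counts.saveAsTextFiles('/tmp/results/dstream/wordcount', 'txt')

ssc.start()
ssc.awaitTerminationOrTimeout(10)
ssc.stop(stopSparkContext=True, stopGraceFully=False)
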