Example #1
    def _test_file_custom(self, parquet_file, csv_file):
        """ Given the parquet_file and csv_file representation, converts the
            parquet_file to json using the dump utility and then compares the
            result to the csv_file using column agnostic ordering.
        """
        expected_data = []
        with io.open(csv_file, 'r', encoding="utf-8") as f:
            expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

        actual_data = []
        with open(parquet_file, "rb") as parquet_fo:
            actual_data = list(parquet.DictReader(parquet_fo))

        self.tc.assertEquals(len(expected_data), len(actual_data))
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]

        for expected, actual in zip(expected_data, actual_data):
            self.tc.assertEquals(len(expected), len(actual))
            for i, c in enumerate([c for c in cols if c in actual]):
                # str() makes 0 == '0', since csv.reader yields every value as a string
                self.tc.assertEquals(
                    expected[i],
                    actual[c].decode('utf-8') if isinstance(actual[c], bytes)
                    else str(actual[c]))
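# Usage sketch for the helper above; TEST_DATA and the fixture file names are
# placeholders for whatever Parquet/CSV pair the test suite actually ships:
#
#     self._test_file_custom(os.path.join(TEST_DATA, "nation.impala.parquet"),
#                            os.path.join(TEST_DATA, "nation.csv"))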
Example #2
def validate_parquet_file(schema, input_file, verbose):
    """Validate multiple messages stored in input Parquet file."""
    processed = 0
    valid = 0
    invalid = 0
    error = 0

    try:
        with open(input_file, "rb") as fo:
            # iterate over all records in the Parquet file
            for row in parquet.DictReader(fo):
                processed += 1
                try:
                    try_to_validate_message_from_parquet(
                        schema, row, processed, verbose)
                    valid += 1
                except (ValueError, Invalid) as ve:
                    invalid += 1
                    print("Validation error: " + str(ve))
                    print(row)
                except Exception as e:
                    print("Other problem: " + str(e))
                    error += 1

    except IOError as e:
        print("File-related problem: " + str(e))
        error += 1

    return {
        "processed": processed,
        "valid": valid,
        "invalid": invalid,
        "error": error
    }
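# Minimal usage sketch for validate_parquet_file(); the schema object and file
# name are assumptions (the schema comes from whatever validation library
# defines Invalid):
#
#     results = validate_parquet_file(schema, "messages.parquet", verbose=False)
#     print(results)  # e.g. {'processed': 2, 'valid': 2, 'invalid': 0, 'error': 0}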
Example #3
    def test_converted_type_null(self):
        """Test reading a file that contains null records for a plain column that is converted to utf-8."""
        with open(os.path.join(TEST_DATA, "test-converted-type-null.parquet"),
                  "rb") as parquet_fo:
            actual_data = list(parquet.DictReader(parquet_fo))

        self.assertListEqual(
            # this is the contents of test-converted-type-null.parquet. 2 records.
            [{
                "foo": "bar"
            }, {
                "foo": None
            }],
            actual_data)
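# A sketch of how an equivalent fixture could be produced with pyarrow (an
# assumption; pyarrow is not part of this test suite):
#
#     import pyarrow as pa
#     import pyarrow.parquet as pq
#     pq.write_table(pa.table({"foo": ["bar", None]}),
#                    "test-converted-type-null.parquet")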
Example #4
    def test_null_int(self):
        with open(os.path.join(TEST_DATA, "test-null.parquet"),
                  "rb") as parquet_fo:
            actual_data = list(parquet.DictReader(parquet_fo))

        self.assertListEqual(
            # this is the contents of test-null.parquet. Two records, one that is null.
            [{
                "foo": 1,
                "bar": 2
            }, {
                "foo": 1,
                "bar": None
            }],
            actual_data)
Example #5
    def test_null_plain_dictionary(self):
        """Test reading a file that contains null records for a plain dictionary column."""
        with open(os.path.join(TEST_DATA, "test-null-dictionary.parquet"),
                  "rb") as parquet_fo:
            actual_data = list(parquet.DictReader(parquet_fo))

        self.assertListEqual(
            # this is the contents of test-null-dictionary.parquet. 7 records.
            # The first record is null, and the rest alternate between values of 'bar' and 'baz.'
            [{
                "foo": None
            }] + [{
                "foo": "bar"
            }, {
                "foo": "baz"
            }] * 3,
            actual_data)
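# A sketch of producing a similar dictionary-encoded fixture with pyarrow (an
# assumption; the original file was generated elsewhere):
#
#     import pyarrow as pa
#     import pyarrow.parquet as pq
#     table = pa.table({"foo": [None] + ["bar", "baz"] * 3})
#     pq.write_table(table, "test-null-dictionary.parquet", use_dictionary=True)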
Example #6
    def _test_file_custom(self, parquet_file, csv_file):
        """ Given the parquet_file and csv_file representation, converts the
            parquet_file to json using the dump utility and then compares the
            result to the csv_file using column agnostic ordering.
        """
        expected_data = []
        with open(csv_file, 'rb') as f:
            expected_data = list(csv.reader(f, delimiter='|'))

        actual_data = []
        with open(parquet_file, "rb") as parquet_fo:
            actual_data = list(parquet.DictReader(parquet_fo))

        self.tc.assertEquals(len(expected_data), len(actual_data))
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]
        for expected, actual in zip(expected_data, actual_data):
            self.tc.assertEquals(len(expected), len(actual))
            for i, c in enumerate([c for c in cols if c in actual]):
                self.tc.assertEquals(expected[i], str(actual[c]))
Example #7
import os
import json

import parquet
import pandas as pd

home = os.path.expanduser("~")
dir = "/media/sumeyer/SSD_2/ML_DATA/"
filename = "part-r-00000-67ebd6f0-bfb4-42e0-b516-d7aaa77cbcb8.snappy.parquet"
datafile = dir + filename

print("open file : ", datafile)

## assuming parquet file with two rows and three columns:
## foo bar baz
## 1   2   3
## 4   5   6

with open(datafile, "rb") as fo:
    # prints:
    # {"foo": 1, "bar": 2}
    # {"foo": 4, "bar": 5}
    for row in parquet.DictReader(fo, columns=['foo', 'bar']):
        print(json.dumps(row))

with open(datafile, "rb") as fo:
    # prints:
    # 1,2
    # 4,5
    for row in parquet.reader(fo, columns=['foo', 'bar']):
        print(",".join([str(r) for r in row]))

# load all rows into a pandas DataFrame for inspection
with open(datafile, "rb") as fo:
    df = pd.DataFrame(list(parquet.DictReader(fo)))

print(df.info())
print(df)
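# Alternative sketch: with pyarrow or fastparquet installed, pandas can read the
# file directly instead of going through the pure-python parquet module:
#
#     df = pd.read_parquet(datafile)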
Example #8
def test_mapreduce_executor_avro_to_parquet(sdc_builder, sdc_executor, cluster, use_el_expression):
    """Test MapReduce executor stage when converting avro to parquet. Parquet version dependencies must be set
     accordingly to avro version dependencies, else parquet files are created with no content, and exceptions are
     thrown in hadoop application syslog (test made specifically to test mapr clusters, as CDH clusters used to have
     these dependencies correctly set up before).
     Similar to test above.
     Input file is always an EL by default. To test it hardcoded, we have
     test_mapreduce_executor_avro_to_parquet_fail_if_tmp_file_exists
     After ingest the executor triggers MapReduce job which should convert the ingested HDFS Avro data to Parquet.
     The pipeline would look like:
        dev_raw_data_source >> (expression_evaluator >>) mapr_fs >= mapreduce
    """
    # Generate some data.
    product_data = [dict(name='iphone', price=649.99),
                    dict(name='pixel', price=649.89)]
    raw_data = ''.join([json.dumps(product) for product in product_data])
    avro_schema = ('{ "type" : "record", "name" : "STF", "fields" : '
                   '[ { "name" : "name", "type" : "string" }, { "name" : "price", "type" : "double" } ] }')

    mapr_fs_output_path = f'/tmp/out/{get_random_string()}'
    mapr_fs_output_path_config = "${record:attribute('/output')}" if use_el_expression else mapr_fs_output_path

    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_data,
                                                                                  stop_after_first_batch=True)
    mapr_fs = builder.add_stage('MapR FS', type='destination')
    mapr_fs.set_attributes(avro_schema=avro_schema, avro_schema_location='INLINE', data_format='AVRO',
                           directory_template=mapr_fs_output_path, files_prefix='avro', max_records_in_file=1)
    mapreduce = builder.add_stage(
        'MapReduce', type='executor').set_attributes(job_type='AVRO_PARQUET',
                                                     output_directory=mapr_fs_output_path_config,
                                                     mapreduce_configuration_directory='mapr')

    wiretap_hadoop = builder.add_wiretap()
    wiretap_mapreduce = builder.add_wiretap()

    if use_el_expression:
        expression_evaluator = builder.add_stage('Expression Evaluator')
        expression_evaluator.set_attributes(
            header_attribute_expressions=[{'attributeToSet': '/output',
                                           'headerAttributeExpression': mapr_fs_output_path}])
        dev_raw_data_source >> mapr_fs >= [expression_evaluator, wiretap_hadoop.destination]
        expression_evaluator >> mapreduce
    else:
        dev_raw_data_source >> mapr_fs >= [mapreduce, wiretap_hadoop.destination]

    mapreduce >= wiretap_mapreduce.destination

    pipeline = builder.build(title='MapReduce executor pipeline').configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)

    try:
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # First, assert mapr_fs files have been created with correct content
        mapr_fs_files = cluster.mapr_fs.client.list(str(mapr_fs_output_path))
        # one Avro file per ingested record, since max_records_in_file=1
        assert len(mapr_fs_files) == len(product_data)

        # make sure MapReduce job is done and is successful
        for event in wiretap_mapreduce.output_records:
            job_id = event.field['job-id'].value
            job_id = job_id.replace('job', 'application')
            assert cluster.yarn.wait_for_job_to_end(job_id) == 'FINISHED'

        # assert parquet data is same as what is ingested
        for event in wiretap_hadoop.output_records:
            file_path = event.field['filepath'].value
            maprfs_parquet_file_path = f'{file_path}.parquet'
            maprfs_output = io.BytesIO()
            with cluster.mapr_fs.client.read(maprfs_parquet_file_path) as reader:
                maprfs_output.write(reader.read())
            maprfs_data = [row for row in parquet.DictReader(maprfs_output)]
            assert maprfs_data[0] in product_data
    finally:
        cluster.mapr_fs.client.delete(mapr_fs_output_path, recursive=True)
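# The use_el_expression argument above is assumed to come from a pytest
# parametrization along these lines (the actual marker/fixture lives with the
# surrounding test suite):
#
#     @pytest.mark.parametrize('use_el_expression', [True, False])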
Example #9
def test_mapreduce_executor_avro_to_parquet_tmp_file_exists(sdc_builder, sdc_executor, cluster,
                                                            overwrite_temporary_file):
    """Very similar to other tests. We will just check the pipeline works with and without the overwrite_temporary_file.
    If set to True, it will work even if we create a file with the same name. Else, it will do nothing and we will
    have no parquet file.
     After ingest the executor triggers MapReduce job which should convert the ingested HDFS Avro data to Parquet.
     The pipeline would look like:
        dev_raw_data_source >> mapr_fs >= mapreduce
    """
    # Generate some data.
    product_data = [dict(name='iphone', price=649.99)]
    raw_data = ''.join([json.dumps(product) for product in product_data])
    avro_schema = ('{ "type" : "record", "name" : "STF", "fields" : '
                   '[ { "name" : "name", "type" : "string" }, { "name" : "price", "type" : "double" } ] }')

    mapr_fs_output_path = f'/tmp/out/{get_random_string()}'

    builder_1 = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder_1.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                    raw_data=raw_data,
                                                                                    stop_after_first_batch=True)
    mapr_fs = builder_1.add_stage('MapR FS', type='destination')
    mapr_fs.set_attributes(avro_schema=avro_schema, avro_schema_location='INLINE', data_format='AVRO',
                           directory_template=mapr_fs_output_path, files_prefix='avro', max_records_in_file=1)

    wiretap_hadoop = builder_1.add_wiretap()

    dev_raw_data_source >> mapr_fs >= wiretap_hadoop.destination

    pipeline_1 = builder_1.build().configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline_1)

    try:
        sdc_executor.start_pipeline(pipeline_1).wait_for_finished()

        # First, assert mapr_fs files have been created with correct content
        mapr_fs_files = cluster.mapr_fs.client.list(str(mapr_fs_output_path))
        # one Avro file per ingested record, since max_records_in_file=1
        assert len(mapr_fs_files) == len(product_data)
        input_file_path = ''
        for event in wiretap_hadoop.output_records:
            input_file_path = event.field['filepath'].value

        # We create the tmp file based on the input file and the tmp_prefix used by the stage
        tmp_input_file_array = input_file_path.split('/')
        tmp_input_file_array[4] = TMP_PREFIX + tmp_input_file_array[4]
        tmp_input_file = '/'.join(tmp_input_file_array)
        cluster.mapr_fs.client.write(tmp_input_file, 'FILE CONTENTS')

        builder_2 = sdc_builder.get_pipeline_builder()

        dev_raw_data_source = builder_2.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                        raw_data=raw_data,
                                                                                        stop_after_first_batch=True)
        mapreduce = builder_2.add_stage(
            'MapReduce', type='executor').set_attributes(job_type='AVRO_PARQUET',
                                                         output_directory=mapr_fs_output_path,
                                                         mapreduce_configuration_directory='mapr',
                                                         # input file is not an EL this way
                                                         input_avro_file=input_file_path,
                                                         overwrite_temporary_file=overwrite_temporary_file)

        wiretap_mapreduce = builder_2.add_wiretap()

        dev_raw_data_source >> mapreduce >= wiretap_mapreduce.destination

        pipeline_2 = builder_2.build().configure_for_environment(cluster)
        sdc_executor.add_pipeline(pipeline_2)

        sdc_executor.start_pipeline(pipeline_2).wait_for_finished()

        # make sure MapReduce job is done and is successful
        for event in wiretap_mapreduce.output_records:
            job_id = event.field['job-id'].value
            job_id = job_id.replace('job', 'application')
            assert cluster.yarn.wait_for_job_to_end(job_id) == 'FINISHED'

        # assert parquet data is same as what is ingested
        for event in wiretap_hadoop.output_records:
            file_path = event.field['filepath'].value
            maprfs_parquet_file_path = f'{file_path}.parquet'
            maprfs_output = io.BytesIO()
            # with overwrite_temporary_file set to False there is no parquet file,
            # so this read raises an HdfsError which is handled below
            with cluster.mapr_fs.client.read(maprfs_parquet_file_path) as reader:
                maprfs_output.write(reader.read())
            maprfs_data = [row for row in parquet.DictReader(maprfs_output)]
            assert maprfs_data[0] in product_data
    except hdfs.util.HdfsError as e:
        if not overwrite_temporary_file:
            # meaning there is no parquet file as pipeline did not do anything
            assert input_file_path in e.message
        else:
            raise e
    finally:
        cluster.mapr_fs.client.delete(mapr_fs_output_path, recursive=True)
Example #10
#!/usr/bin/env python
# encoding: utf-8

import json
import parquet
import glob

for filename in glob.glob("/tmp/tmp/*.parquet"):
    with open(filename, "rb") as fo:
        for row in parquet.DictReader(
                fo, columns=['px', 'py', 'x', 'y', 'distanceToLine']):
            print(json.dumps(row))
Example #11
def _parquet(ctx, files):
    if not parquet:
        raise SystemExit("parquet module not found, please install manually")
    lines = chain(*(parquet.DictReader(x) for x in files))
    log('info', 'Loading into ElasticSearch')
    load(lines, ctx.obj)
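# Sketch of how the `files` argument might be supplied (an assumption; the real
# CLI wiring lives elsewhere in the module). Each entry must be a binary file
# object, since parquet.DictReader reads raw Parquet bytes:
#
#     files = [open(path, "rb") for path in ("a.parquet", "b.parquet")]
#     _parquet(ctx, files)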
Example #12
#    content = fs.read()
#s = str(content, 'utf-8')
#file = open("D:\data.csv", "w")
#file.write(s)
#df = pd.read_csv("D:\data.csv", names=COLUMNNAMES)
#print df



import parquet
import json

## assuming parquet file with two rows and three columns:
## foo bar baz
## 1   2   3
## 4   5   6

with open("test.parquet") as fo:
   # prints:
   # {"foo": 1, "bar": 2}
   # {"foo": 4, "bar": 5}
   for row in parquet.DictReader(fo, columns=['foo', 'bar']):
       print(json.dumps(row))


with open("test.parquet") as fo:
   # prints:
   # 1,2
   # 4,5
   for row in parquet.reader(fo, columns=['foo', 'bar]):
       print(",".join([str(r) for r in row]))