def _test_file_custom(self, parquet_file, csv_file):
    """
    Given the parquet_file and csv_file representation, converts the
    parquet_file to json using the dump utility and then compares the
    result to the csv_file using column-agnostic ordering.
    """
    with io.open(csv_file, 'r', encoding="utf-8") as f:
        expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

    with open(parquet_file, "rb") as parquet_fo:
        actual_data = list(parquet.DictReader(parquet_fo))

    self.tc.assertEquals(len(expected_data), len(actual_data))

    footer = parquet.read_footer(parquet_file)
    cols = [s.name for s in footer.schema]

    for expected, actual in zip(expected_data, actual_data):
        self.tc.assertEquals(len(expected), len(actual))
        for i, c in enumerate([c for c in cols if c in actual]):
            self.tc.assertEquals(
                expected[i],
                actual[c].decode('utf-8') if type(actual[c]) is bytes
                # str() makes 0 compare as '0', since csv reads all values as strings.
                else str(actual[c]))
def validate_parquet_file(schema, input_file, verbose):
    """Validate multiple messages stored in input Parquet file."""
    processed = 0
    valid = 0
    invalid = 0
    error = 0

    try:
        with open(input_file, "rb") as fo:
            # iterate over all records in the Parquet file
            for row in parquet.DictReader(fo):
                processed += 1
                try:
                    try_to_validate_message_from_parquet(
                        schema, row, processed, verbose)
                    valid += 1
                except (ValueError, Invalid) as ve:
                    invalid += 1
                    print("Validation error: " + str(ve))
                    print(row)
                except Exception as e:
                    print("Other problem: " + str(e))
                    error += 1
    except IOError as e:
        print("File-related problem: " + str(e))
        error += 1

    return {
        "processed": processed,
        "valid": valid,
        "invalid": invalid,
        "error": error
    }
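# Hedged usage sketch for validate_parquet_file() above. The schema and the
# "messages.parquet" path are placeholders; the Invalid exception in the
# handler suggests a voluptuous schema, but that is an assumption about the
# surrounding project, as is try_to_validate_message_from_parquet() being
# importable alongside it.
from voluptuous import Required, Schema

message_schema = Schema({Required("name"): str, Required("price"): float})
report = validate_parquet_file(message_schema, "messages.parquet", verbose=False)
print(report)  # e.g. {'processed': 2, 'valid': 2, 'invalid': 0, 'error': 0}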
def test_converted_type_null(self):
    """Test reading a file that contains null records for a plain column
    that is converted to utf-8."""
    with open(os.path.join(TEST_DATA, "test-converted-type-null.parquet"),
              "rb") as parquet_fo:
        actual_data = list(parquet.DictReader(parquet_fo))

    self.assertListEqual(
        # this is the contents of test-converted-type-null.parquet. 2 records.
        [{"foo": "bar"}, {"foo": None}],
        actual_data)
def test_null_int(self):
    """Test reading a file that contains a null record for an int column."""
    with open(os.path.join(TEST_DATA, "test-null.parquet"),
              "rb") as parquet_fo:
        actual_data = list(parquet.DictReader(parquet_fo))

    self.assertListEqual(
        # this is the contents of test-null.parquet. Two records, one that is null.
        [{"foo": 1, "bar": 2}, {"foo": 1, "bar": None}],
        actual_data)
def test_null_plain_dictionary(self):
    """Test reading a file that contains null records for a plain dictionary
    column."""
    with open(os.path.join(TEST_DATA, "test-null-dictionary.parquet"),
              "rb") as parquet_fo:
        actual_data = list(parquet.DictReader(parquet_fo))

    self.assertListEqual(
        # this is the contents of test-null-dictionary.parquet. 7 records.
        # The first record is null, and the rest alternate between values of
        # 'bar' and 'baz'.
        [{"foo": None}] + [{"foo": "bar"}, {"foo": "baz"}] * 3,
        actual_data)
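# Hedged fixture sketch (not part of the test suite): one way Parquet files
# with null values like the ones read above could be regenerated, assuming
# pyarrow is installed and the output paths are adjusted to the TEST_DATA
# directory by hand.
import pyarrow as pa
import pyarrow.parquet as pq

# utf-8 column with a null record (test-converted-type-null.parquet)
pq.write_table(pa.table({"foo": ["bar", None]}),
               "test-converted-type-null.parquet")

# two int columns, one null value (test-null.parquet)
pq.write_table(pa.table({"foo": [1, 1], "bar": [2, None]}),
               "test-null.parquet")

# dictionary-encoded column: a leading null, then alternating 'bar'/'baz'
# (test-null-dictionary.parquet)
values = [None] + ["bar", "baz"] * 3
pq.write_table(pa.table({"foo": pa.array(values).dictionary_encode()}),
               "test-null-dictionary.parquet")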
def _test_file_custom(self, parquet_file, csv_file):
    """
    Given the parquet_file and csv_file representation, converts the
    parquet_file to json using the dump utility and then compares the
    result to the csv_file using column-agnostic ordering.
    """
    with open(csv_file, 'r') as f:
        expected_data = list(csv.reader(f, delimiter='|'))

    with open(parquet_file, "rb") as parquet_fo:
        actual_data = list(parquet.DictReader(parquet_fo))

    self.tc.assertEquals(len(expected_data), len(actual_data))

    footer = parquet.read_footer(parquet_file)
    cols = [s.name for s in footer.schema]

    for expected, actual in zip(expected_data, actual_data):
        self.tc.assertEquals(len(expected), len(actual))
        for i, c in enumerate([c for c in cols if c in actual]):
            self.tc.assertEquals(expected[i], str(actual[c]))
import json
import os

import pandas as pd
import parquet

home = os.path.expanduser("~")
dir = "/media/sumeyer/SSD_2/ML_DATA/"
filename = "part-r-00000-67ebd6f0-bfb4-42e0-b516-d7aaa77cbcb8.snappy.parquet"
datafile = dir + filename
print("open file : ", datafile)

## assuming parquet file with two rows and three columns:
## foo bar baz
## 1   2   3
## 4   5   6

with open(datafile, "rb") as fo:
    # prints:
    # {"foo": 1, "bar": 2}
    # {"foo": 4, "bar": 5}
    for row in parquet.DictReader(fo):
        print(json.dumps(row))

with open(datafile, "rb") as fo:
    # prints:
    # 1,2
    # 4,5
    for row in parquet.reader(fo):
        print(",".join([str(r) for r in row]))

# build a pandas DataFrame from the parquet rows so the summary below has data
with open(datafile, "rb") as fo:
    df = pd.DataFrame(list(parquet.DictReader(fo)))

print(df.info())
print(df)
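# The file read above is snappy-compressed. The pure-Python "parquet" module
# handles that codec through the optional python-snappy package, so a quick
# guard like this (a sketch; the error text is illustrative) can fail fast
# before attempting to read .snappy.parquet data:
try:
    import snappy  # noqa: F401  -- provided by the python-snappy package
except ImportError:
    raise SystemExit("install python-snappy to read .snappy.parquet files")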
def test_mapreduce_executor_avro_to_parquet(sdc_builder, sdc_executor, cluster, use_el_expression):
    """Test MapReduce executor stage when converting avro to parquet. Parquet version dependencies must be set
    accordingly to avro version dependencies, else parquet files are created with no content, and exceptions are
    thrown in hadoop application syslog (test made specifically to test mapr clusters, as CDH clusters used to have
    these dependencies correctly set up before). Similar to test above.

    Input file is always an EL by default. To test it hardcoded, we have
    test_mapreduce_executor_avro_to_parquet_fail_if_tmp_file_exists

    After ingest the executor triggers MapReduce job which should convert the ingested HDFS Avro data to Parquet.
    The pipeline would look like:
        dev_raw_data_source >> (expression_evaluator >>) mapr_fs >= mapreduce
    """
    # Generate some data.
    product_data = [dict(name='iphone', price=649.99),
                    dict(name='pixel', price=649.89)]
    raw_data = ''.join([json.dumps(product) for product in product_data])
    avro_schema = ('{ "type" : "record", "name" : "STF", "fields" : '
                   '[ { "name" : "name", "type" : "string" }, { "name" : "price", "type" : "double" } ] }')

    mapr_fs_output_path = f'/tmp/out/{get_random_string()}'
    mapr_fs_output_path_config = "${record:attribute('/output')}" if use_el_expression else mapr_fs_output_path

    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_data,
                                                                                  stop_after_first_batch=True)
    mapr_fs = builder.add_stage('MapR FS', type='destination')
    mapr_fs.set_attributes(avro_schema=avro_schema, avro_schema_location='INLINE', data_format='AVRO',
                           directory_template=mapr_fs_output_path, files_prefix='avro', max_records_in_file=1)
    mapreduce = builder.add_stage(
        'MapReduce', type='executor').set_attributes(job_type='AVRO_PARQUET',
                                                     output_directory=mapr_fs_output_path_config,
                                                     mapreduce_configuration_directory='mapr')

    wiretap_hadoop = builder.add_wiretap()
    wiretap_mapreduce = builder.add_wiretap()

    if use_el_expression:
        expression_evaluator = builder.add_stage('Expression Evaluator')
        expression_evaluator.set_attributes(
            header_attribute_expressions=[{'attributeToSet': '/output',
                                           'headerAttributeExpression': mapr_fs_output_path}])
        dev_raw_data_source >> mapr_fs >= [expression_evaluator, wiretap_hadoop.destination]
        expression_evaluator >> mapreduce
    else:
        dev_raw_data_source >> mapr_fs >= [mapreduce, wiretap_hadoop.destination]

    mapreduce >= wiretap_mapreduce.destination

    pipeline = builder.build(title='MapReduce executor pipeline').configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)

    try:
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # First, assert mapr_fs files have been created with correct content
        mapr_fs_files = cluster.mapr_fs.client.list(str(mapr_fs_output_path))
        # assert events (MapReduce) generated
        assert len(mapr_fs_files) == len(product_data)

        # make sure MapReduce job is done and is successful
        for event in wiretap_mapreduce.output_records:
            job_id = event.field['job-id'].value
            job_id = job_id.replace('job', 'application')
            assert cluster.yarn.wait_for_job_to_end(job_id) == 'FINISHED'

        # assert parquet data is same as what is ingested
        for event in wiretap_hadoop.output_records:
            file_path = event.field['filepath'].value
            maprfs_parquet_file_path = f'{file_path}.parquet'
            maprfs_output = io.BytesIO()
            with cluster.mapr_fs.client.read(maprfs_parquet_file_path) as reader:
                maprfs_output.write(reader.read())
            maprfs_data = [row for row in parquet.DictReader(maprfs_output)]
            assert maprfs_data[0] in product_data
    finally:
        cluster.mapr_fs.client.delete(mapr_fs_output_path, recursive=True)
def test_mapreduce_executor_avro_to_parquet_tmp_file_exists(sdc_builder, sdc_executor, cluster,
                                                            overwrite_temporary_file):
    """Very similar to other tests. We will just check the pipeline works with and without the
    overwrite_temporary_file option. If set to True, it will work even if we create a file with the same name.
    Else, it will do nothing and we will have no parquet file.

    After ingest the executor triggers MapReduce job which should convert the ingested HDFS Avro data to Parquet.
    The pipeline would look like:
        dev_raw_data_source >> mapr_fs >= mapreduce
    """
    # Generate some data.
    product_data = [dict(name='iphone', price=649.99)]
    raw_data = ''.join([json.dumps(product) for product in product_data])
    avro_schema = ('{ "type" : "record", "name" : "STF", "fields" : '
                   '[ { "name" : "name", "type" : "string" }, { "name" : "price", "type" : "double" } ] }')

    mapr_fs_output_path = f'/tmp/out/{get_random_string()}'

    builder_1 = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder_1.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                    raw_data=raw_data,
                                                                                    stop_after_first_batch=True)
    mapr_fs = builder_1.add_stage('MapR FS', type='destination')
    mapr_fs.set_attributes(avro_schema=avro_schema, avro_schema_location='INLINE', data_format='AVRO',
                           directory_template=mapr_fs_output_path, files_prefix='avro', max_records_in_file=1)

    wiretap_hadoop = builder_1.add_wiretap()

    dev_raw_data_source >> mapr_fs >= wiretap_hadoop.destination

    pipeline_1 = builder_1.build().configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline_1)

    try:
        sdc_executor.start_pipeline(pipeline_1).wait_for_finished()

        # First, assert mapr_fs files have been created with correct content
        mapr_fs_files = cluster.mapr_fs.client.list(str(mapr_fs_output_path))
        # assert events (MapReduce) generated
        assert len(mapr_fs_files) == len(product_data)

        input_file_path = ''
        for event in wiretap_hadoop.output_records:
            input_file_path = event.field['filepath'].value

        # We create the tmp file based on the input file and the tmp_prefix used by the stage
        tmp_input_file_array = input_file_path.split('/')
        tmp_input_file_array[4] = TMP_PREFIX + tmp_input_file_array[4]
        tmp_input_file = '/'.join(tmp_input_file_array)
        cluster.mapr_fs.client.write(tmp_input_file, 'FILE CONTENTS')

        builder_2 = sdc_builder.get_pipeline_builder()

        dev_raw_data_source = builder_2.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                        raw_data=raw_data,
                                                                                        stop_after_first_batch=True)
        mapreduce = builder_2.add_stage(
            'MapReduce', type='executor').set_attributes(job_type='AVRO_PARQUET',
                                                         output_directory=mapr_fs_output_path,
                                                         mapreduce_configuration_directory='mapr',
                                                         # input file is not an EL this way
                                                         input_avro_file=input_file_path,
                                                         overwrite_temporary_file=overwrite_temporary_file)

        wiretap_mapreduce = builder_2.add_wiretap()

        dev_raw_data_source >> mapreduce >= wiretap_mapreduce.destination

        pipeline_2 = builder_2.build().configure_for_environment(cluster)
        sdc_executor.add_pipeline(pipeline_2)
        sdc_executor.start_pipeline(pipeline_2).wait_for_finished()

        # make sure MapReduce job is done and is successful
        for event in wiretap_mapreduce.output_records:
            job_id = event.field['job-id'].value
            job_id = job_id.replace('job', 'application')
            assert cluster.yarn.wait_for_job_to_end(job_id) == 'FINISHED'

        # assert parquet data is same as what is ingested
        for event in wiretap_hadoop.output_records:
            file_path = event.field['filepath'].value
            maprfs_parquet_file_path = f'{file_path}.parquet'
            maprfs_output = io.BytesIO()
            # with overwrite_temporary_file set to False, there should be no parquet file
            with cluster.mapr_fs.client.read(maprfs_parquet_file_path) as reader:
                maprfs_output.write(reader.read())
            maprfs_data = [row for row in parquet.DictReader(maprfs_output)]
            assert maprfs_data[0] in product_data
    except hdfs.util.HdfsError as e:
        if not overwrite_temporary_file:
            # meaning there is no parquet file as pipeline did not do anything
            assert input_file_path in e.message
        else:
            raise e
    finally:
        cluster.mapr_fs.client.delete(mapr_fs_output_path, recursive=True)
#!/usr/bin/env python
# encoding: utf-8

import json
import parquet
import glob

for filename in glob.glob("/tmp/tmp/*.parquet"):
    with open(filename, "rb") as fo:
        for row in parquet.DictReader(
                fo, columns=['px', 'py', 'x', 'y', 'distanceToLine']):
            print(json.dumps(row))
def _parquet(ctx, files):
    if not parquet:
        raise SystemExit("parquet module not found, please install manually")
    # chain the rows of every parquet file into a single iterable of dicts
    lines = chain(*(parquet.DictReader(x) for x in files))
    log('info', 'Loading into ElasticSearch')
    load(lines, ctx.obj)
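# Hypothetical click wrapper that feeds open file handles into _parquet() above.
# The command name, argument handling, and the contents of ctx.obj are
# assumptions, since only the handler itself is shown here.
import click

@click.command()
@click.argument('paths', nargs=-1, type=click.Path(exists=True))
@click.pass_context
def parquet_cmd(ctx, paths):
    handles = [open(path, 'rb') for path in paths]
    try:
        _parquet(ctx, handles)
    finally:
        for handle in handles:
            handle.close()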
# content = fs.read()
# s = str(content, 'utf-8')
# file = open("D:\data.csv", "w")
# file.write(s)
# df = pd.read_csv("D:\data.csv", names=COLUMNNAMES)
# print df

import parquet
import json

## assuming parquet file with two rows and three columns:
## foo bar baz
## 1   2   3
## 4   5   6

with open("test.parquet", "rb") as fo:
    # prints:
    # {"foo": 1, "bar": 2}
    # {"foo": 4, "bar": 5}
    for row in parquet.DictReader(fo, columns=['foo', 'bar']):
        print(json.dumps(row))

with open("test.parquet", "rb") as fo:
    # prints:
    # 1,2
    # 4,5
    for row in parquet.reader(fo, columns=['foo', 'bar']):
        print(",".join([str(r) for r in row]))