def _test_file_json(self, parquet_file, csv_file):
    """
    Given the parquet_file and csv_file representation, converts the
    parquet_file to json using the dump utility and then compares the
    result to the csv_file using column agnostic ordering.
    """
    expected_data = []
    with open(csv_file, 'rb') as f:
        expected_data = list(csv.reader(f, delimiter='|'))

    actual_raw_data = StringIO.StringIO()
    parquet.dump(parquet_file, Options(format='json'), out=actual_raw_data)
    actual_raw_data.seek(0, 0)
    actual_data = [json.loads(x.rstrip()) for x in
                   actual_raw_data.read().split("\n") if len(x) > 0]

    assert len(expected_data) == len(actual_data)
    footer = parquet.read_footer(parquet_file)
    cols = [s.name for s in footer.schema]
    # Compare against the parsed json rows, not the raw stream (which has
    # already been exhausted by read() above).
    for expected, actual in zip(expected_data, actual_data):
        assert len(expected) == len(actual)
        for i, c in enumerate(cols):
            if c in actual:
                assert expected[i] == actual[c]
def _test_file_csv(self, parquet_file, csv_file):
    """
    Given the parquet_file and csv_file representation, converts the
    parquet_file to a csv using the dump utility and then compares the
    result to the csv_file.
    """
    expected_data = []
    with open(csv_file, 'rb') as f:
        expected_data = list(csv.reader(f, delimiter='|'))

    actual_raw_data = StringIO.StringIO()
    parquet.dump(parquet_file, Options(), out=actual_raw_data)
    actual_raw_data.seek(0, 0)
    actual_data = list(csv.reader(actual_raw_data, delimiter='\t'))

    assert expected_data == actual_data, "{0} != {1}".format(
        str(expected_data), str(actual_data))

    actual_raw_data = StringIO.StringIO()
    parquet.dump(parquet_file, Options(no_headers=False),
                 out=actual_raw_data)
    actual_raw_data.seek(0, 0)
    actual_data = list(csv.reader(actual_raw_data, delimiter='\t'))[1:]

    assert expected_data == actual_data, "{0} != {1}".format(
        str(expected_data), str(actual_data))
def _test_file_json(self, parquet_file, csv_file):
    """Test the dump function by outputting to a json file.

    Given the parquet_file and csv_file representation, converts the
    parquet_file to json using the dump utility and then compares the
    result to the csv_file using column agnostic ordering.
    """
    expected_data = []
    with io.open(csv_file, 'r', encoding='utf-8') as f:
        expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

    actual_raw_data = io.StringIO()
    parquet.dump(parquet_file, Options(format='json'), out=actual_raw_data)
    actual_raw_data.seek(0, 0)
    actual_data = [json.loads(x.rstrip()) for x in
                   actual_raw_data.read().split("\n") if len(x) > 0]

    assert len(expected_data) == len(actual_data)
    footer = parquet.read_footer(parquet_file)
    cols = [s.name for s in footer.schema]
    # Compare against the parsed json rows, not the exhausted raw stream.
    for expected, actual in zip(expected_data, actual_data):
        assert len(expected) == len(actual)
        for i, c in enumerate(cols):
            if c in actual:
                assert expected[i] == actual[c]
def read_parquet_data(filename):
    """Dump filename as csv and return the parsed rows."""
    actual_raw_data = StringIO.StringIO()
    parquet.dump(filename, Options(format='csv'), out=actual_raw_data)
    actual_raw_data.seek(0, 0)
    actual_data = list(csv.reader(actual_raw_data, delimiter='\t'))
    return actual_data
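# Usage sketch ('example.parquet' is a hypothetical path, not from the
# source): the helper returns the dumped rows as lists of strings; per the
# csv tests above, the default dump emits no header row, so every row is
# data.
if __name__ == '__main__':
    for row in read_parquet_data('example.parquet'):
        print(row)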
def test_limit(self):
    """Test the limit option."""
    limit = 2
    expected_data = []
    with io.open(CSV_FILE, 'r', encoding="utf-8") as fo:
        expected_data = list(csv.reader(fo, delimiter='|'))[:limit]

    actual_raw_data = io.StringIO()
    parquet.dump(TEST_FILE, Options(limit=limit), out=actual_raw_data)
    actual_raw_data.seek(0, 0)
    actual_data = list(csv.reader(actual_raw_data, delimiter='\t'))

    self.assertListEqual(expected_data, actual_data)
def main(argv=None):
    """Run parquet utility application."""
    argv = argv or sys.argv[1:]
    parser = argparse.ArgumentParser('parquet',
                                     description='Read parquet files')
    parser.add_argument('--metadata', action='store_true',
                        help='show metadata on file')
    parser.add_argument('--row-group-metadata', action='store_true',
                        help='show per row group metadata')
    parser.add_argument('--no-data', action='store_true',
                        help="don't dump any data from the file")
    parser.add_argument('--limit', action='store', type=int, default=-1,
                        help='max records to output')
    parser.add_argument('--col', action='append', type=str,
                        help='only include this column (can be '
                             'specified multiple times)')
    parser.add_argument('--no-headers', action='store_true',
                        help='skip headers in output (only applies if '
                             'format=csv)')
    parser.add_argument('--format', action='store', type=str, default='csv',
                        help='format for the output data. can be csv or json.')
    parser.add_argument('--debug', action='store_true',
                        help='log debug info to stderr')
    parser.add_argument('file', help='path to the file to parse')

    args = parser.parse_args(argv)
    setup_logging(args)

    # pylint: disable=import-outside-toplevel
    import parquet

    if args.metadata:
        parquet.dump_metadata(args.file, args.row_group_metadata)
    if not args.no_data:
        parquet.dump(args.file, args)
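# Usage sketch (not from the source; 'example.parquet' is a hypothetical
# path): main() accepts an argv list, so the CLI can be driven from Python
# as well as from the shell.
if __name__ == '__main__':
    # Roughly equivalent to: parquet --limit 2 --col name example.parquet
    main(['--limit', '2', '--col', 'name', 'example.parquet'])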
def _test_file_custom(self, parquet_file, csv_file):
    """
    Given the parquet_file and csv_file representation, converts the
    parquet_file via a custom output callable using the dump utility and
    then compares the result to the csv_file using column agnostic
    ordering.
    """
    expected_data = []
    with open(csv_file, "rb") as f:
        expected_data = list(csv.reader(f, delimiter="|"))

    def _custom_datatype(in_dict, keys):
        """Return rows like the csv outputter.

        Could convert to a dataframe like this:
            import pandas
            df = pandas.DataFrame(in_dict)
            return df
        """
        columns = [in_dict[key] for key in keys]
        rows = zip(*columns)
        return rows

    actual_data = parquet.dump(parquet_file, Options(format="custom"),
                               out=_custom_datatype)

    assert len(expected_data) == len(actual_data)
    footer = parquet.read_footer(parquet_file)
    cols = [s.name for s in footer.schema]
    for expected, actual in zip(expected_data, actual_data):
        assert len(expected) == len(actual)
        for i, c in enumerate(cols):
            if c in actual:
                assert expected[i] == actual[c]
def import_from_parquet(filename, encoding='utf-8', *args, **kwargs):
    """Import data from a Parquet file."""
    # TODO: should be able to use a fobj also
    data, field_names = parquet.dump(filename, OPTIONS, _callback)
    length = len(data[field_names[0]])
    table_rows = [[data[field_name][index] for field_name in field_names]
                  for index in range(length)]
    meta = {'imported_from': 'parquet', 'filename': filename}
    return create_table([field_names] + table_rows, meta=meta,
                        *args, **kwargs)
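# The snippet above depends on module-level OPTIONS and _callback that are
# not shown. A minimal sketch of what they could look like, assuming (as in
# the _test_file_custom example above) that dump() with format='custom'
# invokes the output callable with a column-name -> values dict plus the
# ordered key list and returns whatever that callable returns. Both names
# below are reconstructions, not the library's actual definitions.
OPTIONS = Options(format='custom')

def _callback(in_dict, keys):
    # Hand back the column data and the ordered field names so the caller
    # can unpack them as (data, field_names).
    return in_dict, keys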
def _test_file_csv(self, parquet_file, csv_file):
    """Test the dump function by outputting to a csv file.

    Given the parquet_file and csv_file representation, converts the
    parquet_file to a csv using the dump utility and then compares the
    result to the csv_file.
    """
    expected_data = []
    with io.open(csv_file, 'r', encoding="utf-8") as f:
        expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

    actual_raw_data = io.StringIO()
    parquet.dump(parquet_file, Options(), out=actual_raw_data)
    actual_raw_data.seek(0, 0)
    actual_data = list(csv.reader(actual_raw_data, delimiter=TAB_DELIM))
    self.tc.assertListEqual(expected_data, actual_data)

    actual_raw_data = io.StringIO()
    parquet.dump(parquet_file, Options(no_headers=False),
                 out=actual_raw_data)
    actual_raw_data.seek(0, 0)
    actual_data = list(csv.reader(actual_raw_data,
                                  delimiter=TAB_DELIM))[1:]
    self.tc.assertListEqual(expected_data, actual_data)