def test_read_footer(self):
    """Verify the footer schema exposes the expected column names."""
    footer = parquet.read_footer(self.f)
    # assertEqual replaces the deprecated assertEquals alias; set literal
    # replaces set([...]).
    self.assertEqual(
        {s.name for s in footer.schema},
        {"schema", "n_regionkey", "n_name", "n_nationkey", "n_comment"})
def _test_file_custom(self, parquet_file, csv_file):
    """Dump parquet_file with the 'custom' output format and compare the
    rows against the pipe-delimited csv_file, matching columns by name
    (column-agnostic ordering).
    """
    with open(csv_file, "rb") as fo:
        expected_data = list(csv.reader(fo, delimiter="|"))

    def _custom_datatype(in_dict, keys):
        """Convert the column-oriented dict into csv-style rows.

        A DataFrame would also work here:
        import pandas
        return pandas.DataFrame(in_dict)
        """
        return zip(*[in_dict[key] for key in keys])

    actual_data = parquet.dump(parquet_file, Options(format="custom"),
                               out=_custom_datatype)
    assert len(expected_data) == len(actual_data)

    schema_names = [s.name for s in parquet.read_footer(parquet_file).schema]
    for expected_row, actual_row in zip(expected_data, actual_data):
        assert len(expected_row) == len(actual_row)
        for idx, name in enumerate(schema_names):
            if name in actual_row:
                assert expected_row[idx] == actual_row[name]
def _test_file_json(self, parquet_file, csv_file):
    """Dump parquet_file as json and compare the result to the
    pipe-delimited csv_file using column agnostic ordering.
    """
    with open(csv_file, 'rb') as f:
        expected_data = list(csv.reader(f, delimiter='|'))

    actual_raw_data = StringIO.StringIO()
    parquet.dump(parquet_file, Options(format='json'), out=actual_raw_data)
    actual_raw_data.seek(0, 0)
    actual_data = [
        json.loads(x.rstrip()) for x in actual_raw_data.read().split("\n")
        if len(x) > 0
    ]

    assert len(expected_data) == len(actual_data)
    footer = parquet.read_footer(parquet_file)
    cols = [s.name for s in footer.schema]
    # BUG FIX: iterate the parsed json rows, not the StringIO object --
    # zipping against the already-exhausted actual_raw_data produced no
    # pairs, so the per-row comparison below silently never ran.
    for expected, actual in zip(expected_data, actual_data):
        assert len(expected) == len(actual)
        for i, c in enumerate(cols):
            if c in actual:
                assert expected[i] == actual[c]
def _test_file_custom(self, parquet_file, csv_file):
    """Test the DictReader function against csv data.

    Given the parquet_file and csv_file representation, reads the parquet
    file using DictReader and then compares the result to the csv_file
    using column agnostic ordering.
    """
    with io.open(csv_file, 'r', encoding="utf-8") as f:
        expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

    with open(parquet_file, "rb") as parquet_fo:
        actual_data = list(parquet.DictReader(parquet_fo))

    # assertEqual replaces the deprecated assertEquals alias.
    self.tc.assertEqual(len(expected_data), len(actual_data))
    footer = parquet.read_footer(parquet_file)
    cols = [s.name for s in footer.schema]
    for expected, actual in zip(expected_data, actual_data):
        self.tc.assertEqual(len(expected), len(actual))
        for i, c in enumerate([c for c in cols if c in actual]):
            # csv.reader yields only strings, so coerce the parquet value
            # to text before comparing (this makes 0 compare equal to '0').
            # isinstance replaces `type(...) is bytes`, and the explicit
            # if/else replaces a backslash-continued ternary that had a
            # comment jammed into the continuation.
            value = actual[c]
            if isinstance(value, bytes):
                value = value.decode('utf-8')
            else:
                value = str(value)
            self.tc.assertEqual(expected[i], value)
def test_read_footer(self):
    """Test reading the footer."""
    footer = parquet.read_footer(TEST_FILE)
    # assertEqual replaces the deprecated assertEquals alias; set literal
    # replaces set([...]).
    self.assertEqual(
        {s.name for s in footer.schema},
        {"schema", "n_regionkey", "n_name", "n_nationkey", "n_comment"})
def _test_file_json(self, parquet_file, csv_file):
    """Test the dump function by outputting to a json file.

    Given the parquet_file and csv_file representation, converts the
    parquet_file to json using the dump utility and then compares the
    result to the csv_file using column agnostic ordering.
    """
    with io.open(csv_file, 'r', encoding='utf-8') as f:
        expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

    actual_raw_data = io.StringIO()
    parquet.dump(parquet_file, Options(format='json'), out=actual_raw_data)
    actual_raw_data.seek(0, 0)
    actual_data = [json.loads(x.rstrip()) for x
                   in actual_raw_data.read().split("\n") if len(x) > 0]

    assert len(expected_data) == len(actual_data)
    footer = parquet.read_footer(parquet_file)
    cols = [s.name for s in footer.schema]
    # BUG FIX: iterate the parsed json rows, not the StringIO object --
    # zipping against the already-exhausted actual_raw_data produced no
    # pairs, so the per-row comparison below silently never ran.
    for expected, actual in zip(expected_data, actual_data):
        assert len(expected) == len(actual)
        for i, c in enumerate(cols):
            if c in actual:
                assert expected[i] == actual[c]
def _test_file_custom(self, parquet_file, csv_file):
    """Read the parquet file with DictReader and compare to csv data.

    Given the parquet_file and csv_file representation, reads the parquet
    file using DictReader and then compares the result to the csv_file
    using column agnostic ordering.
    """
    with io.open(csv_file, 'r', encoding="utf-8") as f:
        expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

    with open(parquet_file, "rb") as parquet_fo:
        actual_data = list(parquet.DictReader(parquet_fo))

    # assertEqual replaces the deprecated assertEquals alias.
    self.tc.assertEqual(len(expected_data), len(actual_data))
    footer = parquet.read_footer(parquet_file)
    cols = [s.name for s in footer.schema]
    for expected, actual in zip(expected_data, actual_data):
        self.tc.assertEqual(len(expected), len(actual))
        for i, c in enumerate([c for c in cols if c in actual]):
            # csv.reader yields only strings, so coerce the parquet value
            # to text before comparing (this makes 0 compare equal to '0').
            # isinstance replaces `type(...) is bytes`, and the explicit
            # if/else replaces a backslash-continued ternary that had a
            # comment jammed into the continuation.
            value = actual[c]
            if isinstance(value, bytes):
                value = value.decode('utf-8')
            else:
                value = str(value)
            self.tc.assertEqual(expected[i], value)
def _test_file_custom(self, parquet_file, csv_file):
    """Compare the 'custom' dump output of parquet_file against csv_file.

    The comparison is column agnostic: csv columns are matched to parquet
    columns by schema name, not by position.
    """
    with open(csv_file, 'rb') as csv_fo:
        expected_rows = list(csv.reader(csv_fo, delimiter='|'))

    def _custom_datatype(in_dict, keys):
        '''Produce csv-style rows from the column-oriented dict.

        Could convert to a dataframe instead:
        import pandas
        return pandas.DataFrame(in_dict)
        '''
        return zip(*[in_dict[key] for key in keys])

    actual_rows = parquet.dump(parquet_file, Options(format='custom'),
                               out=_custom_datatype)
    assert len(expected_rows) == len(actual_rows)

    footer = parquet.read_footer(parquet_file)
    column_names = [entry.name for entry in footer.schema]
    for expected, actual in zip(expected_rows, actual_rows):
        assert len(expected) == len(actual)
        for position, column in enumerate(column_names):
            if column not in actual:
                continue
            assert expected[position] == actual[column]
def test_read_footer(self):
    """Test reading the footer."""
    footer = parquet.read_footer(TEST_FILE)
    # assertEqual replaces the deprecated assertEquals alias; set literal
    # replaces set([...]).
    self.assertEqual(
        {s.name for s in footer.schema},
        {"schema", "n_regionkey", "n_name", "n_nationkey", "n_comment"})
def _test_file_custom(self, parquet_file, csv_file):
    """Read the parquet file with DictReader and compare to csv data
    using column agnostic ordering.
    """
    with open(csv_file, 'rb') as f:
        expected_data = list(csv.reader(f, delimiter='|'))

    # BUG FIX: open the parquet file in binary mode -- parquet is a binary
    # format, and text mode corrupts the stream on platforms that translate
    # line endings (matches the other DictReader-based tests).
    with open(parquet_file, 'rb') as parquet_fo:
        actual_data = list(parquet.DictReader(parquet_fo))

    # assertEqual replaces the deprecated assertEquals alias.
    self.tc.assertEqual(len(expected_data), len(actual_data))
    footer = parquet.read_footer(parquet_file)
    cols = [s.name for s in footer.schema]
    for expected, actual in zip(expected_data, actual_data):
        self.tc.assertEqual(len(expected), len(actual))
        for i, c in enumerate([c for c in cols if c in actual]):
            # csv.reader yields only strings, so compare against str(value).
            self.tc.assertEqual(expected[i], str(actual[c]))
def test_read_footer(self):
    """Verify the footer schema exposes the expected column names."""
    footer = parquet.read_footer(self.f)
    # assertEqual replaces the deprecated assertEquals alias; set literal
    # replaces set([...]).
    self.assertEqual(
        {s.name for s in footer.schema},
        {"schema", "n_regionkey", "n_name", "n_nationkey", "n_comment"}
    )