def test_merge(tempdir, dirs, row_groups):
    """Merge two single-column parquet files and verify the combined data-set.

    One file is written into each of ``dirs[0]`` and ``dirs[1]`` (created
    under ``tempdir``), both with ``row_group_offsets=row_groups``.  The
    merge is then checked twice: once given file names and once given
    ``ParquetFile`` instances.
    """
    root = str(tempdir)

    default_mkdirs(os.path.join(root, dirs[0]))
    path0 = os.sep.join([root, dirs[0], 'out0.parq'])
    write(path0, pd.DataFrame({'a': [1, 2, 3, 4]}),
          row_group_offsets=row_groups)

    default_mkdirs(os.path.join(root, dirs[1]))
    path1 = os.sep.join([root, dirs[1], 'out1.parq'])
    write(path1, pd.DataFrame({'a': [5, 6, 7, 8]}),
          row_group_offsets=row_groups)

    # First pass merges by file name, second pass by ParquetFile instance.
    # The instances are only created once the first merge has completed.
    for use_instances in (False, True):
        if use_instances:
            sources = [ParquetFile(path0), ParquetFile(path1)]
        else:
            sources = [path0, path1]

        merged = writer.merge(sources)

        # Each input contributes len(row_groups) row groups.
        assert len(merged.row_groups) == 2 * len(row_groups)
        values = merged.to_pandas().a.tolist()
        assert values == [1, 2, 3, 4, 5, 6, 7, 8]
        if "cat=1" in dirs:
            assert 'cat' in merged.cats
def test_merge(tempdir, dirs, row_groups):
    """Merge two single-column parquet files and verify the combined data-set.

    A file is written into each of ``dirs[0]`` and ``dirs[1]`` beneath
    ``tempdir`` using ``row_group_offsets=row_groups``; the merged result is
    validated for both accepted input forms (paths and ``ParquetFile``
    objects).
    """
    base = str(tempdir)

    os.makedirs(os.path.join(base, dirs[0]), exist_ok=True)
    first = os.sep.join([base, dirs[0], "out0.parq"])
    write(first, pd.DataFrame({"a": [1, 2, 3, 4]}),
          row_group_offsets=row_groups)

    os.makedirs(os.path.join(base, dirs[1]), exist_ok=True)
    second = os.sep.join([base, dirs[1], "out1.parq"])
    write(second, pd.DataFrame({"a": [5, 6, 7, 8]}),
          row_group_offsets=row_groups)

    def check(merged):
        # Both inputs contribute len(row_groups) row groups each.
        assert len(merged.row_groups) == 2 * len(row_groups)
        column = merged.to_pandas().a.tolist()
        assert column == [1, 2, 3, 4, 5, 6, 7, 8]
        if "cat=1" in dirs:
            assert "cat" in merged.cats

    # with file-names
    check(writer.merge([first, second]))

    # with instances
    check(writer.merge([ParquetFile(first), ParquetFile(second)]))
def test_merge_fail(tempdir):
    """Merging two files whose column ``a`` differs in type raises ValueError.

    The error message is expected to mention ``schemas``.
    """
    root = str(tempdir)

    # Column 'a' holds integers in the first file ...
    int_path = os.sep.join([root, 'out0.parq'])
    write(int_path, pd.DataFrame({'a': [1, 2, 3, 4]}))

    # ... and strings in the second.
    str_path = os.sep.join([root, 'out1.parq'])
    write(str_path, pd.DataFrame({'a': ['a', 'b', 'c']}))

    with pytest.raises(ValueError) as excinfo:
        writer.merge([int_path, str_path])

    message = str(excinfo.value)
    assert 'schemas' in message
def test_merge_fail(tempdir):
    """Check that ``writer.merge`` raises ``ValueError`` for bad inputs.

    Two cases are exercised:

    1. Two files whose column ``a`` holds integers in one and strings in the
       other; the message must mention ``schemas``.
    2. The second file is removed and rewritten at the same path with
       ``file_scheme="hive"``; the message must mention ``multi-file``.
    """
    fn = str(tempdir)

    df0 = pd.DataFrame({"a": [1, 2, 3, 4]})
    fn0 = os.sep.join([fn, "out0.parq"])
    write(fn0, df0)

    df1 = pd.DataFrame({"a": ["a", "b", "c"]})
    fn1 = os.sep.join([fn, "out1.parq"])
    write(fn1, df1)

    with pytest.raises(ValueError) as e:
        writer.merge([fn0, fn1])
    # ``e`` is pytest's ExceptionInfo wrapper; the raised exception (and so
    # its message) is ``e.value``.  str(e) is not the exception message.
    assert "schemas" in str(e.value)

    # Replace the second input with one written using the hive file scheme.
    os.remove(fn1)
    write(fn1, df0, file_scheme="hive")
    with pytest.raises(ValueError) as e:
        writer.merge([fn0, fn1])
    assert "multi-file" in str(e.value)
def test_merge_s3(tempdir, s3):
    """Merge two parquet files that are written and read through ``s3.open``.

    Both files live under ``TEST_DATA``.  The merged data-set must expose two
    row groups and the concatenated values of column ``a``.

    ``tempdir`` is still requested as a fixture to keep the signature
    unchanged, but its value is not needed here.
    """
    df0 = pd.DataFrame({'a': [1, 2, 3, 4]})
    fn0 = TEST_DATA + '/out0.parq'
    write(fn0, df0, open_with=s3.open)

    df1 = pd.DataFrame({'a': [5, 6, 7, 8]})
    fn1 = TEST_DATA + '/out1.parq'
    write(fn1, df1, open_with=s3.open)

    # with file-names
    pf = writer.merge([fn0, fn1], open_with=s3.open)
    assert len(pf.row_groups) == 2
    out = pf.to_pandas().a.tolist()
    assert out == [1, 2, 3, 4, 5, 6, 7, 8]
def test_merge_s3(tempdir, s3):
    """Merge two parquet files that are written and read through ``s3.open``.

    Both files are placed under ``TEST_DATA``.  After merging by file name,
    the result must have two row groups and column ``a`` must contain the
    values of both inputs in order.

    ``tempdir`` remains in the signature as a fixture request, but its value
    is not used in the body.
    """
    df0 = pd.DataFrame({"a": [1, 2, 3, 4]})
    fn0 = TEST_DATA + "/out0.parq"
    write(fn0, df0, open_with=s3.open)

    df1 = pd.DataFrame({"a": [5, 6, 7, 8]})
    fn1 = TEST_DATA + "/out1.parq"
    write(fn1, df1, open_with=s3.open)

    # with file-names
    pf = writer.merge([fn0, fn1], open_with=s3.open)
    assert len(pf.row_groups) == 2
    out = pf.to_pandas().a.tolist()
    assert out == [1, 2, 3, 4, 5, 6, 7, 8]