Example #1
def test_read_single_row_group():
    import pyarrow.parquet as pq

    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table,
                 buf,
                 row_group_size=N // K,
                 compression='snappy',
                 version='2.0')

    buf.seek(0)

    pf = pq.ParquetFile(buf)

    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df, result.to_pandas())
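A note on the pattern above: row_group_size caps the rows per row group, so N rows written with row_group_size=N // K come back as K groups that can be read individually. A minimal, self-contained sketch of the same API (not taken from the example; the sizes here are made up):

import io

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"x": range(10_000)})
buf = io.BytesIO()
# 10_000 rows at 2_500 rows per group should yield 4 row groups
pq.write_table(pa.Table.from_pandas(df), buf, row_group_size=2_500)

buf.seek(0)
pf = pq.ParquetFile(buf)
assert pf.num_row_groups == 4
assert pf.read_row_group(0).num_rows == 2_500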
Example #2
    def test_extract_parquet(self):
        file = BASE_DIR / 'amazon-reviews-1000.snappy.parquet'
        cell_value = '<td>TSD Airsoft/Paintball Full-Face Mask, Goggle Lens</td>'

        with patch('t4_lambda_shared.preview.get_available_memory') as mem_mock:
            mem_mock.return_value = 1
            with open(file, mode='rb') as parquet:
                body, info = extract_parquet(parquet)
                assert all(bracket in body for bracket in ('<', '>'))
                assert body.count('<') == body.count('>'), \
                    'expected matching HTML tags'
                assert cell_value not in body, 'only expected columns'
                assert 'skipped rows' in info['warnings']

        with open(file, mode='rb') as parquet:
            body, info = extract_parquet(parquet, as_html=True)
            assert cell_value in body, 'missing expected HTML cell'

        with open(file, mode='rb') as parquet:
            body, info = extract_parquet(parquet, skip_rows=True)
            assert 'skipped rows' in info['warnings']
            assert cell_value not in body, 'only expected columns'

        with open(file, mode='rb') as parquet:
            body, info = extract_parquet(parquet, as_html=False)
            assert all(bracket not in body for bracket in ('<', '>')), \
                'did not expect HTML'
            parquet_file = pq.ParquetFile(file)
            assert all(
                column in info['schema']['names']
                for column in parquet_file.schema.names
            )
            assert [
                parquet_file.metadata.num_rows, parquet_file.metadata.num_columns
            ] == info['shape'], 'Unexpected number of rows or columns'
Example #3
def test_read_common_metadata_files(tmpdir):
    import pyarrow.parquet as pq

    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    }, columns=['index', 'values'])

    base_path = str(tmpdir)
    data_path = pjoin(base_path, 'data.parquet')

    table = pa.Table.from_pandas(df)
    _write_table(table, data_path)

    metadata_path = pjoin(base_path, '_metadata')
    pq.write_metadata(table.schema, metadata_path)

    dataset = pq.ParquetDataset(base_path)
    assert dataset.metadata_path == metadata_path

    pf = pq.ParquetFile(data_path)
    assert dataset.schema.equals(pf.schema)

    # handle list of one directory
    dataset2 = pq.ParquetDataset([base_path])
    assert dataset2.schema.equals(dataset.schema)
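The _metadata sidecar that the dataset discovers is produced with pq.write_metadata. A minimal sketch of the same layout, assuming a reasonably recent pyarrow (dataset.metadata_path, asserted above, belongs to the legacy ParquetDataset implementation in older releases):

import os
import tempfile

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

base = tempfile.mkdtemp()
table = pa.Table.from_pandas(pd.DataFrame({"values": [1.0, 2.0, 3.0]}))
pq.write_table(table, os.path.join(base, "data.parquet"))
# write a schema-only footer as the "_metadata" sidecar
pq.write_metadata(table.schema, os.path.join(base, "_metadata"))

# reading the directory back picks up the sidecar alongside the data file
dataset = pq.ParquetDataset(base)
print(dataset.schema)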
Example #4
def temp_load(fname='df.gzip'):
    '''loads a Parquet file that was saved using temp_save;
    faster than re-loading and re-processing the csv'''
    _table = pq.ParquetFile(fname).read(use_pandas_metadata=True)
    # force strings to categoricals to save memory
    df = _table.to_pandas(strings_to_categorical=True)
    return df
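temp_save is not shown in this snippet; a hypothetical counterpart (an assumption, including the guess that the 'df.gzip' name implies gzip compression) would write the frame with its pandas metadata so that use_pandas_metadata=True can restore the index on load:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq


def temp_save(df: pd.DataFrame, fname: str = 'df.gzip') -> None:
    # hypothetical helper: from_pandas embeds the pandas metadata in the schema
    table = pa.Table.from_pandas(df)
    pq.write_table(table, fname, compression='gzip')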
Example #5
    def _generate_tables(self, files):
        schema = (pa.schema(self.config.features.type)
                  if self.config.features is not None else None)
        if self.config.features is not None and self.config.columns is not None:
            if sorted(field.name for field in schema) != sorted(self.config.columns):
                raise ValueError(
                    f"Tried to load parquet data with columns '{self.config.columns}' "
                    f"with mismatching features '{self.config.features}'")
        for file_idx, file in enumerate(files):
            with open(file, "rb") as f:
                parquet_file = pq.ParquetFile(f)
                try:
                    for batch_idx, record_batch in enumerate(
                            parquet_file.iter_batches(batch_size=self.config.batch_size,
                                                      columns=self.config.columns)):
                        pa_table = pa.Table.from_batches([record_batch])
                        if self.config.features is not None:
                            pa_table = pa.Table.from_arrays(
                                [pa_table[field.name] for field in schema],
                                schema=schema)
                        # Uncomment for debugging (will print the Arrow table size and elements)
                        # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
                        # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
                        yield f"{file_idx}_{batch_idx}", pa_table
                except ValueError as e:
                    logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
                    raise
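A stripped-down sketch of the same batched-read pattern, useful when a file does not fit in memory; the file name and column names here are made up for illustration:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.Table.from_pandas(pd.DataFrame({"a": range(100), "b": range(100)})),
               "example.parquet")

parquet_file = pq.ParquetFile("example.parquet")
# iter_batches streams one RecordBatch at a time with optional column projection
for batch in parquet_file.iter_batches(batch_size=32, columns=["a"]):
    table = pa.Table.from_batches([batch])
    print(table.num_rows, table.column_names)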
Example #6
def test_parquet_file_pass_directory_instead_of_file(tempdir):
    # ARROW-7208
    path = tempdir / 'directory'
    os.mkdir(str(path))

    with pytest.raises(IOError, match="Expected file path"):
        pq.ParquetFile(path)
Example #7
    def __init__(self,
                 input: InputFile,
                 expected_schema: Schema,
                 options,
                 filter_expr: Expression,
                 case_sensitive: bool,
                 start: int = None,
                 end: int = None):
        self._stats: typing.Dict[str, int] = dict()

        self._input = input
        self._input_fo = input.new_fo()

        self._arrow_file = pq.ParquetFile(self._input_fo)
        self._file_schema = convert_parquet_to_iceberg(self._arrow_file)
        self._expected_schema = expected_schema
        self._file_to_expected_name_map = ParquetReader.get_field_map(
            self._file_schema, self._expected_schema)
        self._options = options
        self._filter = get_dataset_filter(
            filter_expr,
            ParquetReader.get_reverse_field_map(self._file_schema,
                                                self._expected_schema))

        self._case_sensitive = case_sensitive
        if start is not None or end is not None:
            raise NotImplementedError(
                "Partial file reads are not yet supported")
            # self.start = start
            # self.end = end

        self.materialized_table = False
        self._table = None

        _logger.debug("Reader initialized for %s" % self._input.path)
Example #8
def main(inputFile, outputFile):
    """Temporary hackarounds from using old pyarrow"""
    parquet_file = pq.ParquetFile(inputFile)
    df = parquet_file.read().to_pandas()

    # Brutally fill nulls with 0
    # It can be interpreted as boolean
    df.fillna(value=0, inplace=True)

    typeMapping = {
        'FLOAT': 'float32',
        'DOUBLE': 'float64',
        'BOOLEAN': 'bool',
        'BIGINT': 'int64',
        'INTEGER': 'int32',
        'TEXT': 'str',
    }

    filepath = os.path.join(getPackageDir("sdm_schemas"), 'yml', 'hsc.yaml')
    with open(filepath, 'r') as f:
        hscSchema = yaml.safe_load(f)['tables']
    objectSchema = [table for table in hscSchema if table['name'] == 'Object']
    hackDict = dict()
    for column in objectSchema[0]['columns']:
        sqlType = column['mysql:datatype']
        # Hack the types for now
        if sqlType in typeMapping:
            sqlType = typeMapping[sqlType]
        hackDict[column['name']] = sqlType

    print(hackDict)
    df = df.astype(hackDict, copy=False)

    table = pyarrow.Table.from_pandas(df)
    pq.write_table(table, outputFile, compression='none')
Example #9
    def is_match(cls, file_path, options=None):
        """
        Test the given file to check if the file has valid Parquet format.

        :param file_path: path to the file to be examined
        :type file_path: str
        :param options: parquet read options
        :type options: dict
        :return: is file a parquet file or not
        :rtype: bool
        """
        if options is None:
            options = dict()

        # get current position of stream
        if data_utils.is_stream_buffer(file_path):
            starting_location = file_path.tell()

        try:
            pfile = pq.ParquetFile(file_path)  # NOQA
            is_valid_parquet = True
        except Exception:
            is_valid_parquet = False

        # return to original position in stream
        if data_utils.is_stream_buffer(file_path):
            file_path.seek(starting_location, 0)

        return is_valid_parquet
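A usage sketch of the same validation idea on an in-memory stream; pq.ParquetFile raises on anything it cannot parse, and restoring the saved position keeps the buffer reusable for later readers:

import io

import pyarrow.parquet as pq

buf = io.BytesIO(b"definitely not a parquet file")
pos = buf.tell()
try:
    pq.ParquetFile(buf)
    is_parquet = True
except Exception:
    is_parquet = False
buf.seek(pos)  # return to the original position
print(is_parquet)  # False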
Example #10
def parquet_reader(folder):
    data = []
    for file in os.listdir(folder):
        if os.path.splitext(file)[1] == '.parquet':
            file_path = os.path.join(folder, file)
            chunk = pq.ParquetFile(file_path).read().to_pandas()
            data.append(chunk)
    return pd.concat(data, axis=0)
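An alternative sketch, assuming the folder contains only parquet files with a consistent schema: recent pyarrow versions can read a whole directory in one call, avoiding the per-file concat:

import pyarrow.parquet as pq


def parquet_reader_arrow(folder):
    # pq.read_table treats a directory as a dataset and reads all files in it
    return pq.read_table(folder).to_pandas()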
Example #11
def dump(file_name):
    pqf = pq.ParquetFile(file_name)
    metadata = pqf.metadata
    dump_file(metadata)

    for row_group_index in range(metadata.num_row_groups):
        row_group = metadata.row_group(row_group_index)
        dump_row_group(row_group_index, row_group)
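dump_file and dump_row_group are not defined in this snippet; hypothetical versions might print a few of the real FileMetaData and RowGroupMetaData fields:

def dump_file(metadata):
    # FileMetaData exposes row, row-group, and column counts plus the writer string
    print(f"rows={metadata.num_rows} row_groups={metadata.num_row_groups} "
          f"columns={metadata.num_columns} created_by={metadata.created_by}")


def dump_row_group(index, row_group):
    # RowGroupMetaData exposes per-group row counts and byte sizes
    print(f"row group {index}: rows={row_group.num_rows} "
          f"bytes={row_group.total_byte_size}")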
Example #12
    def __init__(self, filename, label):
        self.parquet = pq.ParquetFile(filename)
        # self.cols = None  # read all columns
        # self.cols = ['X_jet.list.item.list.item.list.item', 'am', 'apt', 'iphi', 'ieta']
        self.cols = ['X_jet.list.item.list.item.list.item', 'am', 'iphi', 'ieta']
        self.label = label
Example #13
    def __init__(self, test_data_path, host):
        # read the test data (label, features) stored as a parquet file object
        self.parquet_file = pq.ParquetFile(test_data_path)
        print('-----------------------')
        # create the Kafka producer object
        self.producer = KafkaProducer(
            bootstrap_servers=[host],
            value_serializer=lambda x: dumps(x).encode('utf-8'))
Example #14
def test_it_generates_new_file_without_matches(mock_delete):
    # Arrange
    column = {"Column": "customer_id", "MatchIds": ["12345", "23456"]}
    data = [{'customer_id': '12345'}, {'customer_id': '23456'}]
    df = pd.DataFrame(data)
    buf = BytesIO()
    df.to_parquet(buf)
    br = pa.BufferReader(buf.getvalue())
    f = pq.ParquetFile(br, memory_map=False)
    mock_delete.return_value = pd.DataFrame([{'customer_id': '12345'}])
    # Act
    out, stats = delete_matches_from_file(f, [column])
    assert isinstance(out, pa.BufferOutputStream)
    assert {"ProcessedRows": 2, "DeletedRows": 1} == stats
    res = pa.BufferReader(out.getvalue())
    newf = pq.ParquetFile(res, memory_map=False)
    assert 1 == newf.read().num_rows
Example #15
    def __init__(self, filename, label):
        self.parquet = pq.ParquetFile(filename)
        # self.cols = None  # read all columns
        self.cols = ['Xtz_aod.list.item.list.item.list.item',
                     'm', 'pt', 'w', 'iphi', 'ieta']
        self.label = label
Example #16
def get_parquet_schema(infile):
    """Get the schema from a parquet file."""
    parquet_schema = parquet.ParquetFile(infile)
    return parquet_schema.schema
Example #17
def make_sample_file(df):
    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, compression='SNAPPY', version='2.0')

    buf.seek(0)
    return pq.ParquetFile(buf)
Example #18
def test_pass_separate_metadata():
    # ARROW-471
    df = alltypes_sample(size=10000)

    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, compression='snappy', version='2.0')

    buf.seek(0)
    metadata = pq.ParquetFile(buf).metadata

    buf.seek(0)

    fileh = pq.ParquetFile(buf, metadata=metadata)

    pdt.assert_frame_equal(df, fileh.read().to_pandas())
Example #19
def read_groups(path, groups):
    pf = pq.ParquetFile(path)
    # read row groups to pyarrow.Table and convert to pandas DataFrame
    df = pf.read_row_groups(row_groups=groups).to_pandas()

    # cleanup parquet file
    del pf
    return df
Example #20
def read_parquet():

    pf = pq.ParquetFile(TICKERS_DIR + '/ticker.parquet')

    for i in range(pf.metadata.num_row_groups):
        table = pf.read_row_group(i)
        columns = table.to_pydict()
        print(columns)
Example #21
def test_pyarrow_compression():
    table = pv.read_csv("./data/people/people1.csv")
    pq.write_table(table, "./tmp/pyarrow_out/people1.parquet")
    parquet_file = pq.ParquetFile("./tmp/pyarrow_out/people1.parquet")
    # print(parquet_file.metadata)
    # print(parquet_file.metadata.row_group(0))
    # print(parquet_file.metadata.row_group(0).column(0))
    # print(parquet_file.metadata.row_group(0).column(0).statistics)
    assert parquet_file.metadata.row_group(0).column(0).compression == "SNAPPY"
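A sketch of what the commented-out prints expose, assuming the file written by the test above exists; per-column statistics may be None if the writer did not record them:

import pyarrow.parquet as pq

parquet_file = pq.ParquetFile("./tmp/pyarrow_out/people1.parquet")
meta = parquet_file.metadata
for rg in range(meta.num_row_groups):
    row_group = meta.row_group(rg)
    for col in range(row_group.num_columns):
        chunk = row_group.column(col)
        stats = chunk.statistics
        # column path, codec, and min/max statistics per column chunk
        print(chunk.path_in_schema, chunk.compression,
              None if stats is None else (stats.min, stats.max))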
Example #22
    def __init__(self, filename=None, dataFrame=None):
        if filename is not None:
            self._pf = pq.ParquetFile(filename)
            self._df = None
        elif dataFrame is not None:
            self._df = dataFrame
            self._pf = None
        else:
            raise ValueError('Either filename or dataFrame must be passed.')
Example #23
def get_file_map(args):
    r = re.compile(args.parquet_genotype_pattern)
    files = os.listdir(args.parquet_genotype_folder)
    files = {int(r.search(f).groups()[0]): os.path.join(args.parquet_genotype_folder, f)
             for f in files if r.search(f)}
    p = {}
    for k, v in files.items():
        g = pq.ParquetFile(v)
        p[k] = g
    return p
Example #24
    def read_group_to_pandas(self, f, group_index, columns=None,
                             use_arrow_dtype=None, **kwargs):
        file = pq.ParquetFile(f)
        t = file.read_row_group(group_index, columns=columns, **kwargs)
        return self._table_to_pandas(t, use_arrow_dtype=use_arrow_dtype)
Example #25
    def _init_reader(self, file: Union[TextIO, BinaryIO]) -> ParquetFile:
        """Generate a new parquet reader.

        Doc: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetFile.html
        """
        options = self._select_options("buffer_size")  # type: ignore[arg-type]
        # Source is a file path, and enabling memory_map can improve performance in some environments.
        options["memory_map"] = True
        return pq.ParquetFile(file, **options)
Example #26
    def __next__(self) -> pd.DataFrame:
        pq_reader = pq.ParquetFile(self.path)

        if self._current_row_group == pq_reader.num_row_groups:
            raise StopIteration

        table = pq_reader.read_row_group(self._current_row_group)
        self._current_row_group += 1
        return table.to_pandas()
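This __next__ reopens the file on every call; a self-contained sketch of the same idea as a generator opens the file once and yields one DataFrame per row group:

import pandas as pd
import pyarrow.parquet as pq


def iter_row_groups(path):
    pq_reader = pq.ParquetFile(path)
    # one DataFrame per row group, without reopening the file each time
    for i in range(pq_reader.num_row_groups):
        yield pq_reader.read_row_group(i).to_pandas()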
Example #27
    def tile(cls, op):
        chunk_index = 0
        out_chunks = []
        out_df = op.outputs[0]

        dtypes = out_df.dtypes
        if op.use_arrow_dtype is None and not op.gpu and \
                options.dataframe.use_arrow_dtype:  # pragma: no cover
            # check if use_arrow_dtype set on the server side
            dtypes = to_arrow_dtypes(out_df.dtypes)

        shape = (np.nan, out_df.shape[1])

        paths = op.path if isinstance(op.path, (tuple, list)) else \
            glob(op.path, storage_options=op.storage_options)

        for pth in paths:
            if op.groups_as_chunks:
                for group_idx in range(pq.ParquetFile(pth).num_row_groups):
                    chunk_op = op.copy().reset_key()
                    chunk_op._path = pth
                    chunk_op._group_index = group_idx
                    new_chunk = chunk_op.new_chunk(
                        None,
                        shape=shape,
                        index=(chunk_index, 0),
                        index_value=out_df.index_value,
                        columns_value=out_df.columns_value,
                        dtypes=dtypes)
                    out_chunks.append(new_chunk)
                    chunk_index += 1
            else:
                chunk_op = op.copy().reset_key()
                chunk_op._path = pth
                new_chunk = chunk_op.new_chunk(
                    None,
                    shape=shape,
                    index=(chunk_index, 0),
                    index_value=out_df.index_value,
                    columns_value=out_df.columns_value,
                    dtypes=dtypes)
                out_chunks.append(new_chunk)
                chunk_index += 1

        if op.incremental_index:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (out_df.shape[1], ))
        return new_op.new_dataframes(None,
                                     out_df.shape,
                                     dtypes=dtypes,
                                     index_value=out_df.index_value,
                                     columns_value=out_df.columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)
Example #28
    def _tile_no_partitioned(cls, op: "DataFrameReadParquet"):
        chunk_index = 0
        out_chunks = []
        out_df = op.outputs[0]

        dtypes = cls._to_arrow_dtypes(out_df.dtypes, op)
        shape = (np.nan, out_df.shape[1])

        paths = op.path if isinstance(op.path, (tuple, list)) else \
            glob(op.path, storage_options=op.storage_options)

        first_chunk_row_num, first_chunk_raw_bytes = None, None
        for i, pth in enumerate(paths):
            if i == 0:
                with open_file(pth, storage_options=op.storage_options) as f:
                    first_chunk_row_num = get_engine(op.engine).get_row_num(f)
                first_chunk_raw_bytes = file_size(pth, storage_options=op.storage_options)

            if op.groups_as_chunks:
                num_row_groups = pq.ParquetFile(pth).num_row_groups
                for group_idx in range(num_row_groups):
                    chunk_op = op.copy().reset_key()
                    chunk_op._path = pth
                    chunk_op._group_index = group_idx
                    chunk_op._first_chunk_row_num = first_chunk_row_num
                    chunk_op._first_chunk_raw_bytes = first_chunk_raw_bytes
                    chunk_op._num_group_rows = num_row_groups
                    new_chunk = chunk_op.new_chunk(
                        None, shape=shape, index=(chunk_index, 0),
                        index_value=out_df.index_value,
                        columns_value=out_df.columns_value,
                        dtypes=dtypes)
                    out_chunks.append(new_chunk)
                    chunk_index += 1
            else:
                chunk_op = op.copy().reset_key()
                chunk_op._path = pth
                chunk_op._first_chunk_row_num = first_chunk_row_num
                chunk_op._first_chunk_raw_bytes = first_chunk_raw_bytes
                new_chunk = chunk_op.new_chunk(
                    None, shape=shape, index=(chunk_index, 0),
                    index_value=out_df.index_value,
                    columns_value=out_df.columns_value,
                    dtypes=dtypes)
                out_chunks.append(new_chunk)
                chunk_index += 1

        if op.incremental_index:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan,) * len(out_chunks), (out_df.shape[1],))
        return new_op.new_dataframes(None, out_df.shape, dtypes=dtypes,
                                     index_value=out_df.index_value,
                                     columns_value=out_df.columns_value,
                                     chunks=out_chunks, nsplits=nsplits)
Example #29
def read_parquet_metadata(path):
    """{docstring}"""

    pq_file = pq.ParquetFile(path)

    num_rows = pq_file.metadata.num_rows
    num_row_groups = pq_file.num_row_groups
    col_names = pq_file.schema.names

    return num_rows, num_row_groups, col_names
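An equivalent sketch using pq.read_metadata, which parses only the footer instead of constructing a full ParquetFile reader (the function name is just for illustration):

import pyarrow.parquet as pq


def read_parquet_metadata_footer_only(path):
    # read_metadata returns a FileMetaData object without opening a reader
    meta = pq.read_metadata(path)
    return meta.num_rows, meta.num_row_groups, meta.schema.names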
Example #30
        def write_commonmetadata_file():
            with filesystem.open(os.path.join(path, "part.0.parquet")) as f:
                pf = pq.ParquetFile(f)

            all_metadata = copy.copy(pf.metadata.metadata)
            all_metadata[b'spatialpandas'] = b_spatial_metadata

            new_schema = pf.schema.to_arrow_schema().with_metadata(all_metadata)
            with filesystem.open(os.path.join(path, "_common_metadata"), 'wb') as f:
                pq.write_metadata(new_schema, f)