Example No. 1
def test_pandas_parquet_configuration_options(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)

    for use_dictionary in [True, False]:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       use_dictionary=use_dictionary)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP']:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       compression=compression)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)
Example No. 2
def test_read_non_existent_file(tmpdir):
    import pyarrow.parquet as pq

    path = 'non-existent-file.parquet'
    try:
        pq.read_table(path)
    except Exception as e:
        assert path in e.args[0]
Example No. 3
def test_creation(parquet):
    # we have existing files in our dir

    d = parquet.client.root
    assert len(list(d.iterdir())) == 1

    pqd = d / 'pq'
    assert len(list(pqd.iterdir())) == 2

    assert len(pq.read_table(str(pqd / 'open.parquet'))) == 50
    assert len(pq.read_table(str(pqd / 'close.parquet'))) == 50
Example No. 4
def test_multithreaded_read():
    df = alltypes_sample(size=10000)

    table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(table, buf, compression='SNAPPY', version='2.0')

    buf.seek(0)
    table1 = pq.read_table(buf, nthreads=4)

    buf.seek(0)
    table2 = pq.read_table(buf, nthreads=1)

    assert table1.equals(table2)
Example No. 5
def test_pandas_parquet_2_0_rountrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        # Pandas only supports ns resolution; Arrow currently only supports ms
        'datetime': np.arange("2016-01-01T00:00:00.001", size,
                              dtype='datetime64[ms]'),
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True)
    A.parquet.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example No. 6
def test_pandas_parquet_1_0_rountrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df)
    A.parquet.write_table(arrow_table, filename.strpath, version="1.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    pdt.assert_frame_equal(df, df_read)
Example No. 7
def test_column_of_lists(tmpdir):
    df, schema = dataframe_with_arrays()

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True, schema=schema)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example No. 8
def test_pandas_parquet_2_0_rountrip(tmpdir):
    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example No. 9
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    df = _test_dataframe(10000)
    arrow_table = A.from_pandas_dataframe(df)
    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example No. 10
def read_parquet(pqfile=os.path.join(_mydir, 'parquet/nrows=all'), method='spark'):
    if method == 'spark':
        from pyspark.sql import SparkSession
        spark = SparkSession.builder.getOrCreate()
        return spark.read.parquet(pqfile)
    elif method == 'pyarrow':
        return pq.read_table(pqfile)
    else:
        raise Exception('bad method {}'.format(method))
Example No. 11
def test_parquet_nested_convenience(tmpdir):
    # ARROW-1684
    import pyarrow.parquet as pq

    df = pd.DataFrame({
        'a': [[1, 2, 3], None, [4, 5], []],
        'b': [[1.], None, None, [6., 7.]],
    })

    path = str(tmpdir / 'nested_convenience.parquet')

    table = pa.Table.from_pandas(df, preserve_index=False)
    _write_table(table, path)

    read = pq.read_table(path, columns=['a'])
    tm.assert_frame_equal(read.to_pandas(), df[['a']])

    read = pq.read_table(path, columns=['a', 'b'])
    tm.assert_frame_equal(read.to_pandas(), df)
Example No. 12
def test_to_parquet_default_writes_nulls(tmpdir):
    check_fastparquet()
    check_pyarrow()
    fn = str(tmpdir.join('test.parquet'))

    df = pd.DataFrame({'c1': [1., np.nan, 2, np.nan, 3]})
    ddf = dd.from_pandas(df, npartitions=1)

    ddf.to_parquet(fn)
    table = pq.read_table(fn)
    assert table[1].null_count == 2
Example No. 13
def load_raw():
    # note manually removed some bad row
    kwargs = get_pandas_read_csv_defaults()
    kwargs['thousands'] = ',' # always do this
    kwargs['parse_dates'] = ['Date']
    kwargs['na_values'] = ['-']
    kwargs['dtype'] = 'str'
    dtype = {
     'Close': 'float',
     'High': 'float',
     'Low': 'float',
     'Market Cap': 'float',
     'Open': 'float',
     'Volume': 'float'
     }

    meta = pd.read_csv(os.path.join(_mydir, 'Top100Cryptos/data/100 List.csv'))
    names = meta.Name.tolist()
    files = [os.path.join(_mydir, 'Top100Cryptos/data/{}.csv'.format(x)) for x in names]
    # files = glob.glob(os.path.join(_mydir, 'Top100Cryptos/data/*.csv'))
    dfs = list()
    datadir = os.path.join(_mydir, 'parsed')
    if not os.path.exists(datadir):
        os.makedirs(datadir)
    for i, (name, f) in enumerate(zip(names, files)):
        mtime = os.path.getmtime(f)
        dirname = os.path.join(datadir, 'name={}/mtime={}'.format(name, mtime))
        filename = os.path.join(dirname, 'data.parquet')
        if not os.path.exists(filename):
            df = pd.read_csv(f, **kwargs)
            df = pa.Table.from_pandas(df)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            print('writing {}'.format(filename))
            pq.write_table(df, filename)
            pq.read_table(datadir)  # smoke-test the partitioned dataset
        else:
            print('{} exists'.format(filename))
    return pq.read_table(datadir)
Example No. 14
    def test_read_multiple_parquet_files_with_uri(self):
        import pyarrow.parquet as pq

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())

        self.hdfs.mkdir(tmpdir)

        expected = self._write_multiple_hdfs_pq_files(tmpdir)
        path = _get_hdfs_uri(tmpdir)
        result = pq.read_table(path)

        pdt.assert_frame_equal(result.to_pandas()
                               .sort_values(by='index').reset_index(drop=True),
                               expected.to_pandas())
Example No. 15
def test_min_chunksize():
    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
    table = pa.Table.from_pandas(data.reset_index())

    buf = io.BytesIO()
    pq.write_table(table, buf, chunk_size=-1)

    buf.seek(0)
    result = pq.read_table(buf)

    assert result.equals(table)

    with pytest.raises(ValueError):
        pq.write_table(table, buf, chunk_size=0)
Example No. 16
def read_parquet(fn):
  """ read a parquet file with pyarrow """
  print("Loading parquet file: %s..." % fn)
  tbl = pq.read_table(fn)
  df = tbl.to_pandas()
  d = df.iloc[:, 0:3]

  # write the first three columns back out as a new parquet file
  table = pa.Table.from_pandas(d)
  pq.write_table(table, 'example.parquet')
Example No. 17
def test_pandas_column_selection(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df)
    A.parquet.write_table(arrow_table, filename.strpath)
    table_read = pq.read_table(filename.strpath, columns=['uint8'])
    df_read = table_read.to_pandas()

    pdt.assert_frame_equal(df[['uint8']], df_read)
Example No. 18
def test_single_pylist_column_roundtrip(tmpdir):
    for dtype in [int, float]:
        filename = tmpdir.join('single_{}_column.parquet'
                               .format(dtype.__name__))
        data = [A.from_pylist(list(map(dtype, range(5))))]
        table = A.Table.from_arrays(('a', 'b'), data, 'table_name')
        A.parquet.write_table(table, filename.strpath)
        table_read = pq.read_table(filename.strpath)
        for col_written, col_read in zip(table.itercolumns(),
                                         table_read.itercolumns()):
            assert col_written.name == col_read.name
            assert col_read.data.num_chunks == 1
            data_written = col_written.data.chunk(0)
            data_read = col_read.data.chunk(0)
            assert data_written.equals(data_read)
Example No. 19
def write_parquet_table_as_partitioned_dataset(parquet_file) -> pq.ParquetDataset:
    """ Write a parquet table as a parititioned dataset (i.e. multiple Parquet files)
    An example of a dataset partitioned by year and month on disk might look like:
        dataset_name/
            year=2018/
                month=09/
                    0.parq
                    1.parq
                month=10/
                    0.parq
                    1.parq
    """
    parquet_table = pq.read_table(parquet_file)  # Read back Parquet File as a Table
    #pq.write_to_dataset(parquet_table, root_path='starships', partition_cols=['created'])
    pq.write_to_dataset(parquet_table, root_path='starships', partition_cols=['year', 'month', 'day'], flavor='spark')
    dataset = pq.ParquetDataset('starships')
    return dataset
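A minimal read-back sketch for the function above (assuming the 'starships' dataset it writes exists on disk; the filter value is hypothetical and relies on a pyarrow version whose read_table supports partition filters):

import pyarrow.parquet as pq

# Read only the row groups under year=2018/... by filtering on a hive partition column.
table = pq.read_table('starships', filters=[('year', '=', 2018)])
print(table.num_rows)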
Example No. 20
    def test_read_write_parquet_files_with_uri(self):
        import pyarrow.parquet as pq

        tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
        self.hdfs.mkdir(tmpdir)
        path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

        size = 5
        df = test_parquet._test_dataframe(size, seed=0)
        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)
        table = pa.Table.from_pandas(df, preserve_index=False)

        pq.write_table(table, path)

        result = pq.read_table(path).to_pandas()

        pdt.assert_frame_equal(result, df)
Example No. 21
def test_pandas_parquet_pyfile_roundtrip(tmpdir):
    filename = tmpdir.join('pandas_pyfile_roundtrip.parquet').strpath
    size = 5
    df = pd.DataFrame({
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': ['foo', 'bar', None, 'baz', 'qux']
    })

    arrow_table = A.from_pandas_dataframe(df)

    with open(filename, 'wb') as f:
        A.parquet.write_table(arrow_table, f, version="1.0")

    data = io.BytesIO(open(filename, 'rb').read())

    table_read = pq.read_table(data)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example No. 22
def _read_map_parquet(healsparse_class, filepath, pixels=None, header=False,
                      degrade_nside=None, weightfile=None, reduction='mean',
                      use_threads=False):
    """
    Internal function to read in a HealSparseMap from a parquet dataset.

    Parameters
    ----------
    healsparse_class : `type`
        Type value of the HealSparseMap class.
    filepath : `str`
        Name of the file path to read.  Must be a parquet dataset.
    pixels : `list`, optional
        List of coverage map pixels to read.
    header : `bool`, optional
        Return the parquet metadata as well as map?  Default is False.
    degrade_nside : `int`, optional
        Degrade map to this nside on read.  None means leave as-is.
        Not yet implemented for parquet.
    weightfile : `str`, optional
        Floating-point map to supply weights for degrade wmean.  Must
        be a HealSparseMap (weighted degrade not supported for
        healpix degrade-on-read).
        Not yet implemented for parquet.
    reduction : `str`, optional
        Reduction method with degrade-on-read.
        (mean, median, std, max, min, and, or, sum, prod, wmean).
        Not yet implemented for parquet.
    use_threads : `bool`, optional
        Use multithreaded reading.

    Returns
    -------
    healSparseMap : `HealSparseMap`
        HealSparseMap from file, covered by pixels
    header : `astropy.io.fits.Header` (if header=True)
        Header metadata for the map file.
    """
    ds = dataset.dataset(filepath, format='parquet', partitioning='hive')
    schema = ds.schema
    # Convert from byte strings
    md = {key.decode(): schema.metadata[key].decode()
          for key in schema.metadata}

    if 'healsparse::filetype' not in md:
        raise RuntimeError("Filepath %s is not a healsparse parquet map." % (filepath))
    if md['healsparse::filetype'] != 'healsparse':
        raise RuntimeError("Filepath %s is not a healsparse parquet map." % (filepath))
    cov_fname = os.path.join(filepath, '_coverage.parquet')
    if not os.path.isfile(cov_fname):
        # Note that this could be reconstructed from the information in the file
        # inefficiently.  This feature could be added in the future.
        raise RuntimeError("Filepath %s is missing coverage map %s" % (filepath, cov_fname))

    nside_sparse = int(md['healsparse::nside_sparse'])
    nside_coverage = int(md['healsparse::nside_coverage'])
    nside_io = int(md['healsparse::nside_io'])
    bitshift_io = _compute_bitshift(nside_io, nside_coverage)

    cov_tab = parquet.read_table(cov_fname, use_threads=use_threads)
    cov_pixels = cov_tab['cov_pix'].to_numpy()
    row_groups = cov_tab['row_group'].to_numpy()

    if pixels is not None:
        _pixels = np.atleast_1d(pixels)
        if len(np.unique(_pixels)) < len(_pixels):
            raise RuntimeError("Input list of pixels must be unique.")

        sub = np.clip(np.searchsorted(cov_pixels, _pixels), 0, cov_pixels.size - 1)
        ok, = np.where(cov_pixels[sub] == _pixels)
        if ok.size == 0:
            raise RuntimeError("None of the specified pixels are in the coverage map.")
        _pixels = np.sort(_pixels[ok])

        _pixels_io = np.right_shift(_pixels, bitshift_io)

        # Figure out row groups...
        matches = np.searchsorted(cov_pixels, _pixels)
        _row_groups_io = row_groups[matches]
    else:
        _pixels = cov_pixels
        _pixels_io = None
        _row_groups_io = None

    cov_map = HealSparseCoverage.make_from_pixels(nside_coverage, nside_sparse, _pixels)

    if md['healsparse::widemask'] == 'True':
        is_wide_mask = True
        wmult = int(md['healsparse::wwidth'])
    else:
        is_wide_mask = False
        wmult = 1

    if md['healsparse::primary'] != '':
        # This is a multi-column table.
        is_rec_array = True
        primary = md['healsparse::primary']
        columns = [name for name in schema.names if name not in ['iopix', 'cov_pix']]
        dtype = [(name, schema.field(name).type.to_pandas_dtype()) for
                 name in columns]
        primary_dtype = schema.field(primary).type.to_pandas_dtype()
    else:
        is_rec_array = False
        primary = None
        dtype = schema.field('sparse').type.to_pandas_dtype()
        primary_dtype = dtype
        columns = ['sparse']

    if md['healsparse::sentinel'] == 'UNSEEN':
        sentinel = primary_dtype(hp.UNSEEN)
    else:
        sentinel = primary_dtype(md['healsparse::sentinel'])

        if is_integer_value(sentinel):
            sentinel = int(sentinel)
        else:
            sentinel = float(sentinel)

    if is_rec_array:
        sparse_map = np.zeros((_pixels.size + 1)*cov_map.nfine_per_cov, dtype=dtype)
        # Fill in the overflow (primary)
        sparse_map[primary][: cov_map.nfine_per_cov] = sentinel
        # Fill in the overflow (not primary)
        for d in dtype:
            if d[0] == primary:
                continue
            sparse_map[d[0]][: cov_map.nfine_per_cov] = check_sentinel(d[1], None)
    else:
        sparse_map = np.zeros((_pixels.size + 1)*cov_map.nfine_per_cov*wmult, dtype=dtype)
        sparse_map[: cov_map.nfine_per_cov*wmult] = sentinel

    if _pixels_io is None:
        # Read the full table
        tab = ds.to_table(columns=columns, use_threads=use_threads)
    else:
        _pixels_io_unique = list(np.unique(_pixels_io))

        fragments = list(ds.get_fragments(filter=dataset.field('iopix').isin(_pixels_io_unique)))
        group_fragments = []
        for pixel_io, fragment in zip(_pixels_io_unique, fragments):
            groups = fragment.split_by_row_group()
            # Only append groups that are relevant
            use, = np.where(_pixels_io == pixel_io)
            for ind in use:
                group_fragments.append(groups[_row_groups_io[ind]])

        ds2 = dataset.FileSystemDataset(group_fragments, schema, ds.format)
        tab = ds2.to_table(columns=columns, use_threads=use_threads)

    if is_rec_array:
        for name in columns:
            sparse_map[name][cov_map.nfine_per_cov:] = tab[name].to_numpy()
    else:
        sparse_map[cov_map.nfine_per_cov*wmult:] = tab['sparse'].to_numpy()

        if is_wide_mask:
            sparse_map = sparse_map.reshape((sparse_map.size // wmult,
                                             wmult)).astype(WIDE_MASK)

    healsparse_map = healsparse_class(cov_map=cov_map, sparse_map=sparse_map,
                                      nside_sparse=nside_sparse, primary=primary,
                                      sentinel=sentinel)

    if header:
        if 'healsparse::header' in md:
            hdr_string = md['healsparse::header']
            hdr = fits.Header.fromstring(hdr_string)
        else:
            hdr = fits.Header()

        return (healsparse_map, hdr)
    else:
        return healsparse_map
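A hedged usage sketch for the internal reader above; in healsparse this code path is normally reached through the public map-reading API, and the dataset path and pixel list here are hypothetical:

# Hypothetical direct call to the internal function documented above.
sparse_map, hdr = _read_map_parquet(HealSparseMap, 'my_map_parquet_dir',
                                    pixels=[100, 101, 102], header=True)
print(hdr)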
Example No. 23
# If you have any questions, suggestions, or comments on this example,
# please use the HDF-EOS Forum (http://hdfeos.org/forums).
#
# If you would like to see an example of any other NASA HDF/HDF-EOS data
# product, feel free to contact us at [email protected] or
# post it at the HDF-EOS Forum (http://hdfeos.org/forums).
#
# This script was tested on a Mac OS X Mavericks machine with the latest
# parquet and arrow libraries compiled from the GitHub repositories.
#
# Last tested: 9/22/2016
# Author: Hyo-Kyung Lee
import pyarrow as A
import pyarrow.parquet as pq
import pandas as pd
import h5py

FILE_NAME='/tmp/GSSTF_NCEP.3.1987.07.01.he5'
with h5py.File(FILE_NAME, mode='r') as f:
    dset_var = f['/HDFEOS/GRIDS/NCEP/Data Fields/SST']
    values = dset_var[0,:]
data = {}
data['i4'] = values.astype('i4')
filename='GSSTF.parquet'
df=pd.DataFrame(data)
arrow_table = A.from_pandas_dataframe(df)
A.parquet.write_table(arrow_table, filename, version="2.0")
table_read = pq.read_table(filename)
df_read = table_read.to_pandas()
print(df_read)
Example No. 24
def test_read_non_existing_file(use_legacy_dataset):
    # ensure we have a proper error message
    with pytest.raises(FileNotFoundError):
        pq.read_table('i-am-not-existing.parquet')
Example No. 25
def test_read_table_doesnt_warn(datadir, use_legacy_dataset):
    with pytest.warns(None) as record:
        pq.read_table(datadir / 'v0.7.1.parquet',
                      use_legacy_dataset=use_legacy_dataset)

    assert len(record) == 0
Example No. 26
    def load_data(self,
                  data_name='train',
                  batch_size=4096,
                  _get_features=False,
                  feed_mode='batch'):
        """Interface to load training, test or prediction data, called from
        model_manager.

        # Arguments
            data_name: phase of the model to feed data to; can be 'train',
              'test', 'validation', or 'prediction'. Using the first two
              letters also works, e.g. 'tr' for 'train'.
            batch_size: number of samples in each batch; only relevant for
              the training phase.
            _get_features: if True, only compute the feature names/count and
              return.
            feed_mode: can be 'batch', which loads the data once and reads it
              batch by batch to avoid exhausting RAM; 'generator', which loads
              data in small batches and repeats infinitely over the whole
              dataset; or 'all', which loads all the data into memory.

        # Return
            Loaded data (arrays or a generator), depending on `feed_mode`.
        """

        self.batch_size = batch_size

        if self.data_format == 'parquet':

            if _get_features:
                if len(self.trains) > 0:
                    p = self.trains[0]
                else:
                    p = self.tests[0]
                tmp = pq.read_table(p).to_pandas()
                df_sample = self.dataframe_process(self.args, tmp)
                self.features = df_sample.columns.values[:-1]
                self.nb_features = self.features.shape[0]
                return

            # train dataset
            if 'tr' in data_name:
                if self.online_learning:
                    self.trains.sort()
                ps = self.trains
                if len(ps) == 0:
                    return
                if feed_mode == 'all':
                    if self.load_cache:
                        return self._cache_data('train')
                    print('Loading train data from %d parquet files' % len(ps))
                    df = self._read_all_data(ps)
                    df = self.dataframe_process(self.args, df)
                    data = df.values
                    self._check_label(df)
                    x, y = data[:, :-1], data[:, -1:]
                    x, y = DataLoader._resample_data(self.resample, x, y)
                    x = self.sep_cols(self.column_emb, x)
                    return self._cache_data('train', (x, y))
                else:
                    return self._parquet_read_generator(
                        ps, feed_mode, batch_size, self.resample)

            # validation dataset
            if 'va' in data_name or 'te' in data_name:
                ps = self.tests
                if len(ps) == 0:
                    return
                if feed_mode == 'all':
                    if self.load_cache:
                        return self._cache_data('validation')
                    df = pd.DataFrame()
                    print('Loading val data from %d parquet files' % len(ps))
                    df = self._read_all_data(ps)
                    df = self.dataframe_process(self.args, df)
                    data = df.values
                    self._check_label(df)
                    x, y = data[:, :-1], data[:, -1:]
                    x = self.sep_cols(self.column_emb, x)
                    return self._cache_data('validation', (x, y))
                else:
                    return self._parquet_read_generator(
                        ps, feed_mode, batch_size)

            # prediction dataset
            if 'pr' in data_name:
                ps = self.predictions
                if len(ps) == 0:
                    return
                if feed_mode == 'all':
                    if self.load_cache:
                        return self._cache_data('prediction')
                    df = pd.DataFrame()
                    print('Loading pred data from %d parquet files' % len(ps))
                    df = self._read_all_data(ps)
                    df[self.label] = 0  # add a null column as 'label'
                    df = self.dataframe_process(self.args, df)
                    data = df.values
                    self._check_label(df)
                    x = data[:, :-1]
                    x = self.sep_cols(self.column_emb, x)
                    return self._cache_data('prediction', x)
                else:
                    return self._parquet_read_generator(
                        ps, feed_mode, batch_size, True)

            raise ValueError('Invalid `data_name`: ' + data_name)

        if self.data_format == 'csv':
            if _get_features:
                if len(self.trains) > 0:
                    p = self.trains[0]
                else:
                    p = self.tests[0]
                tmp = pd.read_csv(p, chunksize=2)
                df_sample = self.dataframe_process(self.args, next(tmp))
                self.features = df_sample.columns.values[:-1]
                self.nb_features = self.features.shape[0]
                return

            # train dataset
            if 'tr' in data_name:
                if self.online_learning:
                    self.trains.sort()
                ps = self.trains
                if len(ps) == 0:
                    return
                if feed_mode == 'all':
                    if self.load_cache:
                        return self._cache_data('train')
                    print('Loading train data from %d csv files' % len(ps))
                    df = self._read_all_data(ps)
                    df = self.dataframe_process(self.args, df)
                    data = df.values
                    self._check_label(df)
                    x, y = data[:, :-1], data[:, -1:]
                    x, y = DataLoader._resample_data(self.resample, x, y)
                    x = self.sep_cols(self.column_emb, x)
                    return self._cache_data('train', (x, y))
                else:
                    return self._csv_read_generator(ps, feed_mode, batch_size,
                                                    self.resample)

            # validation dataset
            if 'va' in data_name or 'te' in data_name:
                ps = self.tests
                if len(ps) == 0:
                    return
                if feed_mode == 'all':
                    if self.load_cache:
                        return self._cache_data('validation')
                    print('Loading val data from %d csv files' % len(ps))
                    df = self._read_all_data(ps)
                    df = self.dataframe_process(self.args, df)
                    data = df.values
                    self._check_label(df)
                    x, y = data[:, :-1], data[:, -1:]
                    x = self.sep_cols(self.column_emb, x)
                    return self._cache_data('validation', (x, y))
                else:
                    return self._csv_read_generator(ps, feed_mode, batch_size)

            # prediction dataset
            if 'pred' in data_name:
                ps = self.predictions
                if len(ps) == 0:
                    return
                if feed_mode == 'all':
                    if self.load_cache:
                        return self._cache_data('prediction')
                    print('Loading pred data from %d csv files' % len(ps))
                    df = self._read_all_data(ps)
                    df[self.label] = 0
                    df = self.dataframe_process(self.args, df)
                    data = df.values
                    self._check_label(df)
                    x = data[:, :-1]
                    x = self.sep_cols(self.column_emb, x)
                    return self._cache_data('prediction', x)
                else:
                    return self._csv_read_generator(ps, feed_mode, batch_size,
                                                    True)

            raise ValueError('Invalid `data_name`: ' + data_name)
        raise ValueError('Invalid `data_format`: ' + self.data_format)
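A hedged usage sketch for the method above (construction of the data loader is project-specific and assumed here):

# Hypothetical loader instance; constructor arguments are assumptions.
loader = DataLoader(args)

# Load all training data into memory at once...
train_data = loader.load_data(data_name='train', batch_size=4096, feed_mode='all')

# ...or stream it batch by batch to keep memory bounded.
train_gen = loader.load_data(data_name='tr', batch_size=4096, feed_mode='generator')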
Example No. 27
def open_parquet(filename, as_numpy=True):
    table = pq.read_table(filename)
    return from_table(table, as_numpy=as_numpy)
Example No. 28
 def test_impl():
     df = pq.read_table('kde.parquet').to_pandas()
     S = df.points
     return S.sort_values()
Example No. 29
 def _read_col_from_path(self, path):
     # print("reading from path: ", path)
     # return pd.read_parquet(path, **self._read_kwargs)['_']
     return pq.read_table(path, columns=['_']).to_pandas()['_']
Example No. 30
 def test_impl():
     df = pq.read_table('kde.parquet').to_pandas()
     S = df.points
     return S.nsmallest(4)
Example No. 31
 def test_impl():
     df = pq.read_table('kde.parquet').to_pandas()
     S = df.points
     return S.median()
Example No. 32
def run(args):
    wp = args.output_prefix + "_weights.txt.gz"
    if os.path.exists(wp):
        logging.info("Weights output exists already, delete it or move it")
        return

    sp = args.output_prefix + "_summary.txt.gz"
    if os.path.exists(sp):
        logging.info("Summary output exists already, delete it or move it")
        return

    cp = args.output_prefix + "_covariance.txt.gz"
    if os.path.exists(cp):
        logging.info("Covariance output exists already, delete it or move it")
        return

    r = args.output_prefix + "_run.txt.gz"
    if os.path.exists(r):
        logging.info("Run output exists already, delete it or move it")
        return

    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    data = pq.ParquetFile(args.data)
    available_data = {x for x in data.metadata.schema.names}

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities.load_gene_annotation(args.data_annotation, args.chromosome, args.sub_batches, args.sub_batch, args.simplify_data_annotation)
    data_annotation = data_annotation[data_annotation.gene_id.isin(available_data)]
    if args.gene_whitelist:
        logging.info("Applying gene whitelist")
        data_annotation = data_annotation[data_annotation.gene_id.isin(set(args.gene_whitelist))]
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(args.features_annotation).read_row_group(args.chromosome-1).to_pandas()

    if args.output_rsids:
        if not args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]:
            logging.warning("Several variants map to a same rsid (hint: multiple INDELS?).\n"
                            "Can't proceed. Consider the using the --keep_highest_frequency_rsid flag, or models will be ill defined.")
            return

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(features_metadata, data_annotation, args.window)
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_call_filter:
        logging.info("Filtering variants by average call rate")
        features_metadata = features_metadata[features_metadata.avg_call > args.variant_call_filter]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_r2_filter:
        logging.info("Filtering variants by imputation R2")
        features_metadata = features_metadata[features_metadata.r2 > args.variant_r2_filter]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_variance_filter:
        logging.info("Filtering variants by (dosage/2)'s variance")
        features_metadata = features_metadata[features_metadata["std"]/2 > numpy.sqrt(args.variant_variance_filter)]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.discard_palindromic_snps:
        logging.info("Discarding palindromic snps")
        features_metadata = Genomics.discard_gtex_palindromic_variants(features_metadata)
        logging.info("Kept %d", features_metadata.shape[0])

    if args.rsid_whitelist:
        logging.info("Filtering features annotation for whitelist")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(whitelist)]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.only_rsids:
        logging.info("discarding non-rsids")
        features_metadata = StudyUtilities.trim_variant_metadata_to_rsids_only(features_metadata)
        logging.info("Kept %d", features_metadata.shape[0])

        if args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]:
            logging.info("Keeping only the highest frequency entry for every rsid")
            k = features_metadata[["rsid", "allele_1_frequency", "id"]]
            k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"] = 1 - k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"]
            k = k.sort_values(by=["rsid", "allele_1_frequency"], ascending=False)
            k = k.groupby("rsid").first().reset_index()
            features_metadata = features_metadata[features_metadata.id.isin(k.id)]
            logging.info("Kept %d", features_metadata.shape[0])
        else:
            logging.info("rsids are unique, no need to restrict to highest frequency entry")

    if args.features_weights:
        logging.info("Loading weights")
        x_weights = get_weights(args.features_weights, {x for x in features_metadata.id})
        logging.info("Filtering features metadata to those available in weights")
        features_metadata = features_metadata[features_metadata.id.isin(x_weights.id)]
        logging.info("Kept %d entries", features_metadata.shape[0])
    else:
        x_weights = None

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    s = numpy.random.randint(1e8)
    set_seed(s)
    if args.run_tag:
        d = pandas.DataFrame({"run":[args.run_tag], "cv_seed":[s]})[["run", "cv_seed"]]
        Utilities.save_dataframe(d, r)

    WEIGHTS_FIELDS=["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]
    SUMMARY_FIELDS=["gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model",
                    "test_R2_avg", "test_R2_sd", "cv_R2_avg", "cv_R2_sd", "in_sample_R2", "nested_cv_fisher_pval",
                    "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]

    train = train_elastic_net_wrapper if args.mode == "elastic_net" else train_ols

    available_individuals = check_missing(args, data, features)

    with gzip.open(wp, "w") as w:
        w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode())
        with gzip.open(sp, "w") as s:
            s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode())
            with gzip.open(cp, "w") as c:
                c.write("GENE RSID1 RSID2 VALUE\n".encode())
                for i,data_annotation_ in enumerate(data_annotation.itertuples()):
                    if args.MAX_M and  i>=args.MAX_M:
                        logging.info("Early abort")
                        break
                    logging.log(9, "processing %i/%i:%s", i+1, data_annotation.shape[0], data_annotation_.gene_id)

                    if args.repeat:
                        for j in range(0, args.repeat):
                            logging.log(9, "%i-th reiteration", j)
                            process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, j, nested_folds=args.nested_cv_folds, use_individuals=available_individuals)
                    else:
                        process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, nested_folds=args.nested_cv_folds, use_individuals=available_individuals)

    logging.info("Finished")
Example No. 33
from bokeh.embed import server_document
from bokeh.transform import factor_cmap

#################################################################################
# This just loads in the data...
# A lot of this was built off of the bokeh "crossfilter" demo:
# https://github.com/bokeh/bokeh/blob/branch-2.3/examples/app/crossfilter/main.py

start_date = dt.datetime(2017,7,1)
end_date = dt.datetime(2022,3,1)

background = "#ffffff"

file = "./data"+ "/data.parquet"

df = pq.read_table(file).to_pandas()

df.sort_index(inplace=True)

options = df.index.unique(0).to_list()

#print(options)

product = "HS CODE 72, IRON AND STEEL"

level = "US Dollars"

#################################################################################
#These are functions used in the plot...

def growth_trade(foo):
Example No. 34
import sys
import os
import pandas as pd
import pyarrow.parquet as pq
import pyarrow.orc as orc
'''
parquet_file = pq.ParquetFile(sys.argv[1])
print(parquet_file.schema)
'''

orc_name = os.path.splitext(sys.argv[1])[0] + ".orc"

table = pq.read_table(sys.argv[1])

print("Writing ", orc_name)
orc.write_table(table, orc_name)
Example No. 35
 def _load(self):
     # might not be optimal, but it works, we can always see if we can
     # do mmapping later on
     table = pq.read_table(self.path)
     self._load_table(table)
Example No. 36
def parquet_read_table(op, client, scope, **kwargs):
    path = client.dictionary[op.name]
    table = pq.read_table(str(path))
    df = table.to_pandas()
    return df
Example No. 37
    def from_file(path: str,
                  file_format: str = None,
                  name: str = None,
                  perform_gzip: bool = True,
                  dtype: dict = None):
        """
		- File is read in with pyarrow, converted to bytes, compressed by default, and stored as a SQLite blob field.
		- Note: If you do not remove your file's index columns before importing them, then they will be included in your Dataset. The ordered nature of this column represents potential bias during analysis. You can drop these and other columns in memory when creating a Featureset from your Dataset.
		- Note: If no column names are provided, then they will be inserted automatically.
		- `path`: Local or absolute path
		- `file_format`: Accepts uncompressed formats including parquet, csv, and tsv (a csv with `delimiter='\t'`). This tag is used to tell pyarrow how to handle the file. We do not infer the format from the file path because (a) we don't want to force file extensions, (b) we want to make sure users know what file formats we support.
		- `name`: if none specified, then `path` string will be used.
		- `perform_gzip`: Whether or not to perform gzip compression on the file. We have observed up to 90% compression rates during testing.
		"""

        # create some files with no column names
        # do some testing with sparse null column names...
        # do some testing with all null column names...
        accepted_formats = ['csv', 'tsv', 'parquet']
        if file_format not in accepted_formats:
            print(
                "Error - Accepted file formats include uncompressed csv, tsv, and parquet."
            )
        else:
            # Defaults.
            if name is None:
                name = path
            if perform_gzip is None:
                perform_gzip = True

            #ToDo prevent ff combos like '.csv' with 'parquet' vice versa.

            # File formats.
            if (file_format == 'tsv') or (file_format is None):
                parse_opt = pc.ParseOptions(delimiter='\t')
                tbl = pc.read_csv(path, parse_options=parse_opt)
                file_format = 'tsv'
            elif (file_format == 'csv'):
                parse_opt = pc.ParseOptions(delimiter=',')
                tbl = pc.read_csv(path, parse_options=parse_opt)
            elif (file_format == 'parquet'):
                tbl = pq.read_table(path)

            #ToDo - handle columns with no name.
            columns = tbl.column_names

            with open(path, "rb") as f:
                bytesio = io.BytesIO(f.read())
                data = bytesio.getvalue()
                if perform_gzip:
                    data = gzip.compress(data)
                    is_compressed = True
                else:
                    is_compressed = False

            d = Dataset.create(name=name,
                               data=data,
                               dtype=dtype,
                               file_format=file_format,
                               is_compressed=is_compressed,
                               columns=columns)
            return d
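A hedged usage sketch for the method above (the file path is a placeholder and the surrounding Dataset storage layer is assumed):

# Hypothetical call; 'data/iris.csv' is a placeholder path.
d = Dataset.from_file(path='data/iris.csv',
                      file_format='csv',
                      name='iris demo',
                      perform_gzip=True)
print(d.columns)  # column names captured from the pyarrow table before compression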
Example No. 38
 def read(self, file_, extension):
     return parquet.read_table(file_).to_pandas()
Example No. 39
    path_processed_data = os.path.join(path_source_data, "ProcessedData")
else:
    raise FileNotFoundError(
        "Define the path_working, path_source_data, gtfs_dir, \
                            ZippedFilesloc, and path_processed_data in a new elif block"
    )

# User-Defined Package
import wmatarawnav as wr
# Globals
AnalysisRoutes = ['79']
ZipParentFolderName = "October 2019 Rawnav"

#1 Analyze Route ---Subset RawNav data.
########################################################################################
FinDat = pq.read_table(source=os.path.join(
    path_processed_data, "Route79_Partition.parquet")).to_pandas()
FinDat.route = FinDat.route.astype('str')
FinDat.drop(
    columns=["SatCnt", 'Blank', 'LatRaw', 'LongRaw', '__index_level_0__'],
    inplace=True)
#Check for duplicate IndexLoc
assert (FinDat.groupby(['filename', 'IndexTripStartInCleanData',
                        'IndexLoc'])['IndexLoc'].count().values.max() == 1)

FinDat.loc[:, "Count"] = FinDat.groupby(
    ['filename', 'IndexTripStartInCleanData',
     'IndexLoc'])['IndexLoc'].transform("count")
FinDatCheck = FinDat[
    FinDat.Count >
    1]  # Check what is happening with the file here :'rawnav06435191012.txt' on Friday.
FinDatCheck.filename.unique()
Example No. 40
 def read(self, path, extension):
     with path.open('rb') as file_:
         return parquet.read_table(file_).to_pandas()
Example No. 41
import pyarrow.parquet as pq
import numpy as np
import pandas as pd
import pyarrow as pa

df = pd.read_csv(
    '/Users/sbommireddy/Downloads/business-price-indexes-september-2019-quarter-csv.csv'
)
table = pa.Table.from_pandas(df, preserve_index=False)
#write parquet file
pq.write_table(table, 'business-price-indexes-september-2019-quarter.parquet')

t = pq.read_table('business-price-indexes-september-2019-quarter.parquet')

print(t.to_pandas())

parquet_file1 = pq.ParquetFile(
    'business-price-indexes-september-2019-quarter.parquet')
print("Print metadata")
print(parquet_file1.metadata)
Example No. 42
    # 2.1. Read-in Data
    ###################
    # Reduce rawnav data to runs present in the summary file after filtering.

    xwalk_seg_pattern_stop_fil = xwalk_seg_pattern_stop.query(
        'seg_name_id == @seg')

    seg_routes = list(xwalk_seg_pattern_stop_fil.route.drop_duplicates())

    rawnav_dat = (wr.read_cleaned_rawnav(
        analysis_routes_=seg_routes,
        path=os.path.join(path_processed_data, "rawnav_data.parquet")).drop(
            columns=['blank', 'lat_raw', 'long_raw', 'sat_cnt']))

    segment_summary = (pq.read_table(source=os.path.join(
        path_processed_data, "segment_summary_2017_test.parquet"),
                                     filters=[['seg_name_id', "=", seg]],
                                     use_pandas_metadata=True).to_pandas())

    segment_summary_fil = (segment_summary.query('~(flag_too_far_any\
                  | flag_wrong_order_any\
                  | flag_too_long_odom\
                  | flag_secs_total_mismatch\
                  | flag_odom_total_mismatch)'))

    stop_index = (
        pq.read_table(source=os.path.join(path_processed_data,
                                          "stop_index.parquet"),
                      filters=[[('route', '=', route)]
                               for route in seg_routes],
                      columns=[
                          'seg_name_id', 'route', 'pattern', 'stop_id',
Example No. 43
def prepData(s_data):
    train_x = []
    test_x = []
    train_y = []
    test_y = []
    p_flg = 0
    flg = 0
    totalcnt = 0
    cnt = 0
    df = pq.read_table(s_data).to_pandas()
    df = df[['signal', 'originalsize']]

    samplecnt = 0
    for path in paths:

        table = pq.read_table(path)
        df = table.to_pandas()
        #         print(df)
        df = df[['signal', 'originalsize']]

        for idx, row in df.iterrows():

            flg = samplecnt

            signal = np.array(list(row[0]))
            signal = zeropadding10(signal)
            signal = np.array(signal)
            signal = signal.astype('float32') / 255.

            originalsize = np.array(extendAry(row[1]))
            originalsize = zeropadding10(originalsize)

            if cnt == 0:
                print(path.replace(s_data + "/", ""))
                fmer = path.replace(s_data + "/", "").replace(".pq", "")
                plt.title(fmer)
                plt.plot(signal)
                #                 plt.show()
                #                 fig = plt.figure(fmer)
                plt.savefig("/groups2/gac50430/nanopore/dataset4DL/figs/" + fmer + ".png")
                plt.clf()
            #                 plt.plot(originalsize)
            #                 plt.show()

            testidx = (idx % 12 >= 10)
            if testidx:
                test_x.append(signal)
                test_x.append(originalsize)
                test_y.append(flg)
            else:
                train_x.append(signal)
                train_x.append(originalsize)
                train_y.append(flg)

            cnt = cnt + 1
            totalcnt = totalcnt + 1

            if cnt % 12000 == 0:
                print(samplecnt, totalcnt, path, totalcnt, idx, row)
            if cnt == 36000:
                break

        samplecnt = samplecnt + 1

    print("totalcnt", totalcnt)

    train_x = np.array(train_x)
    test_x = np.array(test_x)
    train_y = np.array(train_y)
    test_y = np.array(test_y)
    num_classes = np.unique(train_y).size

    print("train_x.shape", train_x.shape)
    print("test_x.shape", train_x.shape)
    print("train_y.shape", train_x.shape)
    print("test_y.shape", train_x.shape)

    print(num_classes, 'classes')

    print('y_train shape:', train_y.shape)
    print('y_test shape:', test_y.shape)

    train_x = np.reshape(train_x, (-1, DATA_LENGTH, 2))
    test_x = np.reshape(test_x, (-1, DATA_LENGTH, 2))
    train_y = np.reshape(train_y, (-1, 1,))
    test_y = np.reshape(test_y, (-1, 1,))

    test_y = test_y - 1
    train_y = train_y - 1
    train_y = keras.utils.to_categorical(train_y, num_classes)
    test_y = keras.utils.to_categorical(test_y, num_classes)

    print('train_x:', train_x.shape)
    print('train_y:', train_y.shape)
    print('test_x shape:', test_x.shape)
    print('test_y shape:', test_y.shape)

    return train_x, test_x, train_y, test_y, num_classes
Example No. 44
 def test_impl():
     df = pq.read_table("groupby3.pq").to_pandas()
     A = df.groupby('A')['B'].agg(lambda x: x.max() - x.min())
     return A.sum()
Example No. 45
def test_read_non_existent_file(tempdir, use_legacy_dataset):
    path = 'non-existent-file.parquet'
    try:
        pq.read_table(path, use_legacy_dataset=use_legacy_dataset)
    except Exception as e:
        assert path in e.args[0]
Example No. 46
import pyarrow.parquet as pq
import pandas as pd
table = pq.read_table('/root/Downloads/BGU_PROJECT/data.parquet')
table.to_pandas()

from keras import regularizers
from keras.layers.core import Dropout
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Input, Embedding, Dense, LSTM
from string import printable
from keras.preprocessing import sequence
from python_utils import save_model, load_model
from keras.utils.vis_utils import plot_model
from keras.callbacks import CSVLogger


class LSTMC:
    def __init__(self,
                 max_len=75,
                 emb_dim=32,
                 max_vocab_len=100,
                 lstm_output_size=32,
                 w_reg=regularizers.l2(1e-4)):
        super().__init__()
        self.max_len = max_len
        self.csv_logger = CSVLogger('table', append=True, separator=';')
Example No. 47
File: 1.py  Project: interma/misc
# coding: utf-8

'''
CREATE TABLE user_pq (
id bigint NOT NULL DEFAULT 0,
name varchar,
age int
) with (appendonly=true, orientation=parquet);
insert into user_pq(id,name,age) values (generate_series(1,1000), 'interma', trunc(random() * 99 + 1));
hawq extract -d postgres -o user_pq.yml test_sa.user_pq
'''

import pyarrow.parquet as pq
t = pq.read_table('user_pq.parquet')
t
f = pq.ParquetFile('user_pq.parquet')
f.metadata
f.schema
f.read_row_group(0)
Example No. 48
def test_export_parquet(tmpdir_factory):
    """Test export of DataFrame to parquet"""
    Settings.tidy = False
    Settings.humanize = True
    Settings.si_units = False

    # Request data.
    request = DwdObservationRequest(
        parameter=DwdObservationDataset.CLIMATE_SUMMARY,
        resolution=DwdObservationResolution.DAILY,
        start_date="2019",
        end_date="2020",
    ).filter_by_station_id(
        station_id=[1048],
    )

    df = request.values.all().df

    # Save to Parquet file.
    filename = tmpdir_factory.mktemp("data").join("observations.parquet")
    ExportMixin(df=df).to_target(f"file://{filename}")

    # Read back Parquet file.
    table = pq.read_table(filename)

    # Validate dimensions.
    assert table.num_columns == 19
    assert table.num_rows == 366

    # Validate column names.
    assert table.column_names == [
        "station_id",
        "dataset",
        "date",
        "qn_3",
        "wind_gust_max",
        "wind_speed",
        "qn_4",
        "precipitation_height",
        "precipitation_form",
        "sunshine_duration",
        "snow_depth",
        "cloud_cover_total",
        "pressure_vapor",
        "pressure_air_site",
        "temperature_air_mean_200",
        "humidity",
        "temperature_air_max_200",
        "temperature_air_min_200",
        "temperature_air_min_005",
    ]

    # Validate content.
    data = table.to_pydict()

    assert data["date"][0] == datetime.datetime(2019, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)
    assert data["temperature_air_min_005"][0] == 1.5
    assert data["date"][-1] == datetime.datetime(2020, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)
    assert data["temperature_air_min_005"][-1] == -4.6

    os.unlink(filename)
Example No. 49
def df_to_parquet_table(df: pd.DataFrame) -> pa.Table:
    """ Convert DataFrame to Pyarrow Table
    Example:
    pyarrow.Table
    MGLT: string
    cargo_capacity: string
    consumables: string
    cost_in_credits: string
    created: string
    crew: string
    edited: string
    films: string
    hyperdrive_rating: string
    length: string
    manufacturer: string
    max_atmosphering_speed: string
    model: string
    name: string
    passengers: string
    pilots: double
    starship_class: string
    url: string
    __index_level_0__: int64
    metadata
    --------
    {b'pandas': b'{"columns": [{"field_name": "MGLT", "pandas_type": "unicode", "m'
            b'etadata": null, "name": "MGLT", "numpy_type": "object"}, {"field'
            b'_name": "cargo_capacity", "pandas_type": "unicode", "metadata": '
            b'null, "name": "cargo_capacity", "numpy_type": "object"}, {"field'
            b'_name": "consumables", "pandas_type": "unicode", "metadata": nul'
            b'l, "name": "consumables", "numpy_type": "object"}, {"field_name"'
            b': "cost_in_credits", "pandas_type": "unicode", "metadata": null,'
            b' "name": "cost_in_credits", "numpy_type": "object"}, {"field_nam'
            b'e": "created", "pandas_type": "unicode", "metadata": null, "name'
            b'": "created", "numpy_type": "object"}, {"field_name": "crew", "p'
            b'andas_type": "unicode", "metadata": null, "name": "crew", "numpy'
            b'_type": "object"}, {"field_name": "edited", "pandas_type": "unic'
            b'ode", "metadata": null, "name": "edited", "numpy_type": "object"'
            b'}, {"field_name": "films", "pandas_type": "unicode", "metadata":'
            b' null, "name": "films", "numpy_type": "object"}, {"field_name": '
            b'"hyperdrive_rating", "pandas_type": "unicode", "metadata": null,'
            b' "name": "hyperdrive_rating", "numpy_type": "object"}, {"field_n'
            b'ame": "length", "pandas_type": "unicode", "metadata": null, "nam'
            b'e": "length", "numpy_type": "object"}, {"field_name": "manufactu'
            b'rer", "pandas_type": "unicode", "metadata": null, "name": "manuf'
            b'acturer", "numpy_type": "object"}, {"field_name": "max_atmospher'
            b'ing_speed", "pandas_type": "unicode", "metadata": null, "name": '
            b'"max_atmosphering_speed", "numpy_type": "object"}, {"field_name"'
            b': "model", "pandas_type": "unicode", "metadata": null, "name": "'
            b'model", "numpy_type": "object"}, {"field_name": "name", "pandas_'
            b'type": "unicode", "metadata": null, "name": "name", "numpy_type"'
            b': "object"}, {"field_name": "passengers", "pandas_type": "unicod'
            b'e", "metadata": null, "name": "passengers", "numpy_type": "objec'
            b't"}, {"field_name": "pilots", "pandas_type": "float64", "metadat'
            b'a": null, "name": "pilots", "numpy_type": "float64"}, {"field_na'
            b'me": "starship_class", "pandas_type": "unicode", "metadata": nul'
            b'l, "name": "starship_class", "numpy_type": "object"}, {"field_na'
            b'me": "url", "pandas_type": "unicode", "metadata": null, "name": '
            b'"url", "numpy_type": "object"}, {"field_name": "__index_level_0_'
            b'_", "pandas_type": "int64", "metadata": null, "name": null, "num'
            b'py_type": "int64"}], "column_indexes": [{"field_name": null, "pa'
            b'ndas_type": "unicode", "metadata": {"encoding": "UTF-8"}, "name"'
            b': null, "numpy_type": "object"}], "pandas_version": "0.22.0", "i'
            b'ndex_columns": ["__index_level_0__"]}'}
    """
    pyarrow_deathstar_table = pa.Table.from_pandas(df)  # Create PyArrow Table from Pandas DF
    print(pyarrow_deathstar_table)
    pq.write_table(pyarrow_deathstar_table, 'deathstar.parquet')  # Convert PyArrow Table to Parquet Table / File
    parquet_table = pq.read_table('deathstar.parquet')  # Read back Parquet File as a Table
    parquet_table = pq.ParquetFile('deathstar.parquet')  # Read back Parquet File as a ParquetFile for finer-grained read and write
    print(parquet_table.metadata)
    #<pyarrow._parquet.FileMetaData object at 0x7fb755c29458>
    #  created_by: parquet-cpp version 1.4.1-SNAPSHOT
    #  num_columns: 19
    #  num_rows: 1
    #  num_row_groups: 1
    #  format_version: 1.0
    #  serialized_size: 4574

    print(parquet_table.schema)
    #<pyarrow._parquet.ParquetSchema object at 0x7efc80565408>
    #MGLT: BYTE_ARRAY UTF8
    #cargo_capacity: BYTE_ARRAY UTF8
    #consumables: BYTE_ARRAY UTF8
    #cost_in_credits: BYTE_ARRAY UTF8
    #created: BYTE_ARRAY UTF8
    #crew: BYTE_ARRAY UTF8
    #edited: BYTE_ARRAY UTF8
    #films: BYTE_ARRAY UTF8
    #hyperdrive_rating: BYTE_ARRAY UTF8
    #length: BYTE_ARRAY UTF8
    #manufacturer: BYTE_ARRAY UTF8
    #max_atmosphering_speed: BYTE_ARRAY UTF8
    #model: BYTE_ARRAY UTF8
    #name: BYTE_ARRAY UTF8
    #passengers: BYTE_ARRAY UTF8
    #pilots: DOUBLE
    #starship_class: BYTE_ARRAY UTF8
    #url: BYTE_ARRAY UTF8
    #__index_level_0__: INT64
    return parquet_table
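A hedged usage sketch for the function above (the toy starship DataFrame is hypothetical):

import pandas as pd

# Minimal single-row frame standing in for the real starships data.
df = pd.DataFrame({'name': ['Death Star'],
                   'model': ['DS-1 Orbital Battle Station'],
                   'pilots': [float('nan')]})
parquet_file = df_to_parquet_table(df)  # writes and re-reads 'deathstar.parquet'
print(parquet_file.num_row_groups)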
Example No. 50
    exit()

# %% Convert Stata file to parquet
# if False:
# 	# %% Initial data load
# 	logger.info('Reading Stata File')
# 	data = pd.read_stata(
# '/Users/janschaefer/Dropbox/10_Thesis/02_Data/save_step4.dta')
#
# 	# %% writing parquet
# 	logger.info('Writing parquet to disk.')
# 	pq.write_table(pa.Table.from_pandas(data), dataPath+'/data.parquet')

# %% Initial data load
logger.info("Reading Parquet File")
data = pq.read_table(dataPath + "/data.parquet").to_pandas()

# %% Data Summary
# logger.info('Data with types:\n%s', data.dtypes)
logger.info("Dimensions of data: %s", data.shape)
# data = data.head(1000)

# %%  create needTranslation subset

file = open(filePath + "/iterate.txt", "r")
iterate = file.read()
file.close()

file = open("iterate.bak", "w")
file.write(str(iterate))
file.close()
Example No. 51
def _read_table(*args, **kwargs):
    import pyarrow.parquet as pq

    table = pq.read_table(*args, **kwargs)
    table.validate(full=True)
    return table
Example No. 52
def read_parquet(filename):
    print(f'Loading {filename}')
    table = pq.read_table(filename)
    return table.to_pandas()
Example No. 53
    pq.write_table(table, output, compression='SNAPPY', coerce_timestamps='ms')


if __name__ == '__main__':
    schema = pa.schema([
        pa.field('name', pa.string()),
        pa.field('labels', pa.list_(pa.string())),
        pa.field('created', pa.timestamp('ms')),
        pa.field('valid', pa.bool_()),
        pa.field('status', pa.int64()),
    ])
    data = [
        {
            'name': 'a',
            'labels': ['A', 'B'],
            'created': int(28800000 + 1000 * time.mktime(datetime.datetime(2018, 8, 1).date().timetuple())),
            'valid': True,
            'status': 1,
        },
        {
            'name': 'b',
            'labels': ['B', 'C'],
            'created': int(28800000 + 1000 * time.mktime(datetime.datetime(2018, 8, 2).date().timetuple())),
            'valid': False,
            'status': 2,
        },
    ]
    json_to_parquet(data, 'a', schema)
    table2 = pq.read_table('a')
    print(table2.to_pandas())
Example No. 54
def _read_table(*args, **kwargs):
    import pyarrow.parquet as pq
    return pq.read_table(*args, **kwargs)
Example No. 55
def _read_table(*args, **kwargs):
    import pyarrow.parquet as pq
    return pq.read_table(*args, **kwargs)
Example No. 56
from flask import Flask, render_template, flash, request
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField
import pandas as pd
import pyarrow.parquet as pq
import json

# app configuration
DEBUG = True
app = Flask(__name__)
app.config.from_object(__name__)
app.config['SECRET_KEY'] = '5i11yg00s3'

PQPATH = '/Users/lxu213/data/ad-free-search-engine/spark-warehouse/updated_adwords/tf_idf.parquet'
data = pq.read_table(PQPATH, nthreads=4).to_pandas()


class SearchBox(Form):
    query = TextField(validators=[validators.required()])


@app.route("/", methods=['GET', 'POST'])
def hello():
    form = SearchBox(request.form)
    print(form.errors)
    query = ''
    kw_dict = []

    if request.method == 'POST':
        query = request.form['query']
        kw_data = data[['val', 'tf-idf']].loc[data['keywords'].isin(
            query.lower().split())][:50]
Example No. 57
    model = Ridge(fit_intercept=True)
    X = x[:, np.newaxis]
    model.fit(X, y)

    if (mute == False):
        print(
            f"\t coef_: {model.coef_[0]:.05f}, int_: {model.intercept_:.05f}")

    return model


if __name__ == '__main__':
    # Use ridge regression method to estimate the
    # slope of the cloud size distribution
    df = pq.read_table(f'tracking/clouds_{120:08d}.pq', nthreads=6).to_pandas()

    df_size = get_cloud_area(df)
    model = calc_cloud_slope(df_size)

    hist, bin_edges = np.histogram(df_size['area'], bins='fd')
    m_ = (hist > 0)
    x, y = np.log10(bin_edges[1:][m_]), np.log10(hist[m_])

    #---- Plotting
    fig = plt.figure(1, figsize=(3, 3))
    fig.clf()
    sns.set_context('paper')
    sns.set_style(
        'ticks', {
            'axes.grid': False,
Example No. 58
# -*- coding: utf-8 -*-
"""
Created on Tue May 19 03:18:19 2020

@author: WylieTimmerman
"""
import pandas as pd, os, numpy as np, pyproj, sys, zipfile, glob, logging
import pyarrow as pa
import pyarrow.parquet as pq

path_sp = r"C:\OD\Foursquare ITP\Foursquare ITP SharePoint Site - Shared Documents\WMATA Queue Jump Analysis"
path_processed_data = os.path.join(path_sp,
                                   r"Client Shared Folder\data\02-processed")
path_interim_data = os.path.join(path_sp,
                                 r"Client Shared Folder\data\01-interim")

FinDat = pq.read_table(source=os.path.join(
    path_interim_data, "Route79_Partition_20200519.parquet")).to_pandas()

FinDat.to_csv(os.path.join(path_interim_data, "Route79_20200519.csv"))