def test_pandas_parquet_configuration_options(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df)

    for use_dictionary in [True, False]:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       use_dictionary=use_dictionary)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP']:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       compression=compression)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)
def test_read_non_existent_file(tmpdir):
    import pyarrow.parquet as pq

    path = 'non-existent-file.parquet'
    try:
        pq.read_table(path)
    except Exception as e:
        assert path in e.args[0]
def test_creation(parquet):
    # we have existing files in our dir
    d = parquet.client.root
    assert len(list(d.iterdir())) == 1

    pqd = d / 'pq'
    assert len(list(pqd.iterdir())) == 2

    assert len(pq.read_table(str(pqd / 'open.parquet'))) == 50
    assert len(pq.read_table(str(pqd / 'close.parquet'))) == 50
def test_multithreaded_read():
    df = alltypes_sample(size=10000)

    table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(table, buf, compression='SNAPPY', version='2.0')

    buf.seek(0)
    table1 = pq.read_table(buf, nthreads=4)

    buf.seek(0)
    table2 = pq.read_table(buf, nthreads=1)

    assert table1.equals(table2)
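# The test above uses the older nthreads keyword. In later pyarrow releases
# that argument was replaced by a boolean use_threads flag; a minimal sketch
# of the equivalent multithreaded read with the newer keyword (assuming a
# reasonably recent pyarrow):
import io

import pyarrow as pa
import pyarrow.parquet as pq

buf = io.BytesIO()
pq.write_table(pa.table({'x': list(range(10))}), buf)
buf.seek(0)
table = pq.read_table(buf, use_threads=True)  # nthreads=... in old releases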
def test_pandas_parquet_2_0_roundtrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        # Pandas only supports ns resolution; Arrow, at the moment, only ms
        'datetime': np.arange("2016-01-01T00:00:00.001", size,
                              dtype='datetime64[ms]'),
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True)
    A.parquet.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
def test_pandas_parquet_1_0_roundtrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = A.from_pandas_dataframe(df)
    A.parquet.write_table(arrow_table, filename.strpath, version="1.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    pdt.assert_frame_equal(df, df_read)
def test_column_of_lists(tmpdir):
    df, schema = dataframe_with_arrays()
    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True,
                                       schema=schema)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
def test_pandas_parquet_2_0_roundtrip(tmpdir):
    df = alltypes_sample(size=10000)
    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    df = _test_dataframe(10000)
    arrow_table = A.from_pandas_dataframe(df)
    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
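# The snippet above relies on the legacy paio.InMemoryOutputStream helper.
# A minimal sketch of the same in-memory round trip with the current pyarrow
# API (pa.BufferOutputStream / pa.BufferReader), shown as an assumption about
# newer releases rather than part of the original test:
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'a': [1, 2, 3]})
sink = pa.BufferOutputStream()
pq.write_table(table, sink)
buf = sink.getvalue()                      # pa.Buffer holding the file bytes
round_tripped = pq.read_table(pa.BufferReader(buf))
assert round_tripped.equals(table)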
def read_parquet(pqfile=os.path.join(_mydir, 'parquet/nrows=all'),
                 method='spark'):
    if method == 'spark':
        from pyspark.sql import SparkSession
        spark = SparkSession.builder.getOrCreate()
        return spark.read.parquet(pqfile)
    elif method == 'pyarrow':
        return pq.read_table(pqfile)
    else:
        raise Exception('bad method {}'.format(method))
def test_parquet_nested_convenience(tmpdir):
    # ARROW-1684
    import pyarrow.parquet as pq

    df = pd.DataFrame({
        'a': [[1, 2, 3], None, [4, 5], []],
        'b': [[1.], None, None, [6., 7.]],
    })

    path = str(tmpdir / 'nested_convenience.parquet')

    table = pa.Table.from_pandas(df, preserve_index=False)
    _write_table(table, path)

    read = pq.read_table(path, columns=['a'])
    tm.assert_frame_equal(read.to_pandas(), df[['a']])

    read = pq.read_table(path, columns=['a', 'b'])
    tm.assert_frame_equal(read.to_pandas(), df)
def test_to_parquet_default_writes_nulls(tmpdir):
    check_fastparquet()
    check_pyarrow()
    fn = str(tmpdir.join('test.parquet'))

    df = pd.DataFrame({'c1': [1., np.nan, 2, np.nan, 3]})
    ddf = dd.from_pandas(df, npartitions=1)

    ddf.to_parquet(fn)
    table = pq.read_table(fn)
    assert table[1].null_count == 2
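# For a single Parquet file, the null count asserted above is also available
# from the row-group statistics without materializing the table; the file
# name and column index below are placeholders, not part of the test:
import pyarrow.parquet as pq

md = pq.ParquetFile('test.parquet').metadata
stats = md.row_group(0).column(1).statistics
print(stats.null_count)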
def load_raw():
    # note: manually removed some bad rows
    kwargs = get_pandas_read_csv_defaults()
    kwargs['thousands'] = ','  # always do this
    kwargs['parse_dates'] = ['Date']
    kwargs['na_values'] = ['-']
    kwargs['dtype'] = 'str'
    dtype = {
        'Close': 'float',
        'High': 'float',
        'Low': 'float',
        'Market Cap': 'float',
        'Open': 'float',
        'Volume': 'float'
    }
    meta = pd.read_csv(os.path.join(_mydir, 'Top100Cryptos/data/100 List.csv'))
    names = meta.Name.tolist()
    files = [os.path.join(_mydir, 'Top100Cryptos/data/{}.csv'.format(x))
             for x in names]
    # files = glob.glob(os.path.join(_mydir, 'Top100Cryptos/data/*.csv'))
    dfs = list()
    datadir = os.path.join(_mydir, 'parsed')
    if not os.path.exists(datadir):
        os.makedirs(datadir)
    for i, (name, f) in enumerate(zip(names, files)):
        mtime = os.path.getmtime(f)
        dirname = os.path.join(datadir, 'name={}/mtime={}'.format(name, mtime))
        filename = os.path.join(dirname, 'data.parquet')
        if not os.path.exists(filename):
            df = pd.read_csv(f, **kwargs)
            table = pa.Table.from_pandas(df)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            print('writing {}'.format(filename))
            pq.write_table(table, filename)
            pq.read_table(datadir)  # test
        else:
            print('{} exists'.format(filename))
    return pq.read_table(datadir)  # test
def test_read_multiple_parquet_files_with_uri(self):
    import pyarrow.parquet as pq

    tmpdir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())
    self.hdfs.mkdir(tmpdir)

    expected = self._write_multiple_hdfs_pq_files(tmpdir)
    path = _get_hdfs_uri(tmpdir)
    result = pq.read_table(path)

    pdt.assert_frame_equal(result.to_pandas()
                           .sort_values(by='index').reset_index(drop=True),
                           expected.to_pandas())
def test_min_chunksize():
    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
    table = pa.Table.from_pandas(data.reset_index())

    buf = io.BytesIO()
    pq.write_table(table, buf, chunk_size=-1)

    buf.seek(0)
    result = pq.read_table(buf)

    assert result.equals(table)

    with pytest.raises(ValueError):
        pq.write_table(table, buf, chunk_size=0)
def read_parquet(fn):
    """Read a parquet file, keep its first three columns, and write them back out."""
    print("Loading parquet file: %s..." % fn)
    tbl = pq.read_table(fn)
    df = tbl.to_pandas()
    d = df.iloc[:, 0:3]
    table = pa.Table.from_pandas(d)
    pq.write_table(table, 'example.parquet')
    return d
def test_pandas_column_selection(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = A.from_pandas_dataframe(df)
    A.parquet.write_table(arrow_table, filename.strpath)
    table_read = pq.read_table(filename.strpath, columns=['uint8'])
    df_read = table_read.to_pandas()

    pdt.assert_frame_equal(df[['uint8']], df_read)
def test_single_pylist_column_roundtrip(tmpdir):
    for dtype in [int, float]:
        filename = tmpdir.join('single_{}_column.parquet'
                               .format(dtype.__name__))
        data = [A.from_pylist(list(map(dtype, range(5))))]
        table = A.Table.from_arrays(('a', 'b'), data, 'table_name')
        A.parquet.write_table(table, filename.strpath)
        table_read = pq.read_table(filename.strpath)
        for col_written, col_read in zip(table.itercolumns(),
                                         table_read.itercolumns()):
            assert col_written.name == col_read.name
            assert col_read.data.num_chunks == 1
            data_written = col_written.data.chunk(0)
            data_read = col_read.data.chunk(0)
            assert data_written.equals(data_read)
def write_parquet_table_as_partitioned_dataset(parquet_file) -> pq.ParquetDataset:
    """
    Write a parquet table as a partitioned dataset (i.e. multiple Parquet files).

    An example of a dataset partitioned by year and month on disk might look like:

    dataset_name/
        year=2018/
            month=09/
                0.parq
                1.parq
            month=10/
                0.parq
                1.parq
    """
    parquet_table = pq.read_table(parquet_file)  # Read back Parquet file as a Table
    # pq.write_to_dataset(parquet_table, root_path='starships', partition_cols=['created'])
    pq.write_to_dataset(parquet_table, root_path='starships',
                        partition_cols=['year', 'month', 'day'], flavor='spark')
    dataset = pq.ParquetDataset('starships')
    return dataset
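# A dataset written with write_to_dataset() as above can be read back
# selectively. A minimal sketch, assuming the 'starships' directory exists
# with year/month/day partition columns (on older pyarrow versions the same
# filters argument is passed to pq.ParquetDataset instead of read_table):
import pyarrow.parquet as pq

table = pq.read_table(
    'starships',
    filters=[('year', '=', 2018), ('month', '=', 9)],  # AND of conditions
)
print(table.num_rows)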
def test_read_write_parquet_files_with_uri(self):
    import pyarrow.parquet as pq

    tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
    self.hdfs.mkdir(tmpdir)
    path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

    size = 5
    df = test_parquet._test_dataframe(size, seed=0)
    # Hack so that we don't have a dtype cast in v1 files
    df['uint32'] = df['uint32'].astype(np.int64)
    table = pa.Table.from_pandas(df, preserve_index=False)

    pq.write_table(table, path)

    result = pq.read_table(path).to_pandas()
    pdt.assert_frame_equal(result, df)
def test_pandas_parquet_pyfile_roundtrip(tmpdir):
    filename = tmpdir.join('pandas_pyfile_roundtrip.parquet').strpath
    size = 5
    df = pd.DataFrame({
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': ['foo', 'bar', None, 'baz', 'qux']
    })

    arrow_table = A.from_pandas_dataframe(df)

    with open(filename, 'wb') as f:
        A.parquet.write_table(arrow_table, f, version="1.0")

    with open(filename, 'rb') as f:
        data = io.BytesIO(f.read())

    table_read = pq.read_table(data)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
def _read_map_parquet(healsparse_class, filepath, pixels=None, header=False, degrade_nside=None, weightfile=None, reduction='mean', use_threads=False): """ Internal function to read in a HealSparseMap from a parquet dataset. Parameters ---------- healsparse_class : `type` Type value of the HealSparseMap class. filepath : `str` Name of the file path to read. Must be a parquet dataset. pixels : `list`, optional List of coverage map pixels to read. header : `bool`, optional Return the parquet metadata as well as map? Default is False. degrade_nside : `int`, optional Degrade map to this nside on read. None means leave as-is. Not yet implemented for parquet. weightfile : `str`, optional Floating-point map to supply weights for degrade wmean. Must be a HealSparseMap (weighted degrade not supported for healpix degrade-on-read). Not yet implemented for parquet. reduction : `str`, optional Reduction method with degrade-on-read. (mean, median, std, max, min, and, or, sum, prod, wmean). Not yet implemented for parquet. use_threads : `bool`, optional Use multithreaded reading. Returns ------- healSparseMap : `HealSparseMap` HealSparseMap from file, covered by pixels header : `astropy.io.fits.Header` (if header=True) Header metadata for the map file. """ ds = dataset.dataset(filepath, format='parquet', partitioning='hive') schema = ds.schema # Convert from byte strings md = {key.decode(): schema.metadata[key].decode() for key in schema.metadata} if 'healsparse::filetype' not in md: raise RuntimeError("Filepath %s is not a healsparse parquet map." % (filepath)) if md['healsparse::filetype'] != 'healsparse': raise RuntimeError("Filepath %s is not a healsparse parquet map." % (filepath)) cov_fname = os.path.join(filepath, '_coverage.parquet') if not os.path.isfile(cov_fname): # Note that this could be reconstructed from the information in the file # inefficiently. This feature could be added in the future. raise RuntimeError("Filepath %s is missing coverage map %s" % (filepath, cov_fname)) nside_sparse = int(md['healsparse::nside_sparse']) nside_coverage = int(md['healsparse::nside_coverage']) nside_io = int(md['healsparse::nside_io']) bitshift_io = _compute_bitshift(nside_io, nside_coverage) cov_tab = parquet.read_table(cov_fname, use_threads=use_threads) cov_pixels = cov_tab['cov_pix'].to_numpy() row_groups = cov_tab['row_group'].to_numpy() if pixels is not None: _pixels = np.atleast_1d(pixels) if len(np.unique(_pixels)) < len(_pixels): raise RuntimeError("Input list of pixels must be unique.") sub = np.clip(np.searchsorted(cov_pixels, _pixels), 0, cov_pixels.size - 1) ok, = np.where(cov_pixels[sub] == _pixels) if ok.size == 0: raise RuntimeError("None of the specified pixels are in the coverage map.") _pixels = np.sort(_pixels[ok]) _pixels_io = np.right_shift(_pixels, bitshift_io) # Figure out row groups... matches = np.searchsorted(cov_pixels, _pixels) _row_groups_io = row_groups[matches] else: _pixels = cov_pixels _pixels_io = None _row_groups_io = None cov_map = HealSparseCoverage.make_from_pixels(nside_coverage, nside_sparse, _pixels) if md['healsparse::widemask'] == 'True': is_wide_mask = True wmult = int(md['healsparse::wwidth']) else: is_wide_mask = False wmult = 1 if md['healsparse::primary'] != '': # This is a multi-column table. 
is_rec_array = True primary = md['healsparse::primary'] columns = [name for name in schema.names if name not in ['iopix', 'cov_pix']] dtype = [(name, schema.field(name).type.to_pandas_dtype()) for name in columns] primary_dtype = schema.field(primary).type.to_pandas_dtype() else: is_rec_array = False primary = None dtype = schema.field('sparse').type.to_pandas_dtype() primary_dtype = dtype columns = ['sparse'] if md['healsparse::sentinel'] == 'UNSEEN': sentinel = primary_dtype(hp.UNSEEN) else: sentinel = primary_dtype(md['healsparse::sentinel']) if is_integer_value(sentinel): sentinel = int(sentinel) else: sentinel = float(sentinel) if is_rec_array: sparse_map = np.zeros((_pixels.size + 1)*cov_map.nfine_per_cov, dtype=dtype) # Fill in the overflow (primary) sparse_map[primary][: cov_map.nfine_per_cov] = sentinel # Fill in the overflow (not primary) for d in dtype: if d[0] == primary: continue sparse_map[d[0]][: cov_map.nfine_per_cov] = check_sentinel(d[1], None) else: sparse_map = np.zeros((_pixels.size + 1)*cov_map.nfine_per_cov*wmult, dtype=dtype) sparse_map[: cov_map.nfine_per_cov*wmult] = sentinel if _pixels_io is None: # Read the full table tab = ds.to_table(columns=columns, use_threads=use_threads) else: _pixels_io_unique = list(np.unique(_pixels_io)) fragments = list(ds.get_fragments(filter=dataset.field('iopix').isin(_pixels_io_unique))) group_fragments = [] for pixel_io, fragment in zip(_pixels_io_unique, fragments): groups = fragment.split_by_row_group() # Only append groups that are relevant use, = np.where(_pixels_io == pixel_io) for ind in use: group_fragments.append(groups[_row_groups_io[ind]]) ds2 = dataset.FileSystemDataset(group_fragments, schema, ds.format) tab = ds2.to_table(columns=columns, use_threads=use_threads) if is_rec_array: for name in columns: sparse_map[name][cov_map.nfine_per_cov:] = tab[name].to_numpy() else: sparse_map[cov_map.nfine_per_cov*wmult:] = tab['sparse'].to_numpy() if is_wide_mask: sparse_map = sparse_map.reshape((sparse_map.size // wmult, wmult)).astype(WIDE_MASK) healsparse_map = healsparse_class(cov_map=cov_map, sparse_map=sparse_map, nside_sparse=nside_sparse, primary=primary, sentinel=sentinel) if header: if 'healsparse::header' in md: hdr_string = md['healsparse::header'] hdr = fits.Header.fromstring(hdr_string) else: hdr = fits.Header() return (healsparse_map, hdr) else: return healsparse_map
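# The partial-read path above uses pyarrow's dataset API to pull out only the
# row groups that cover the requested coverage pixels. A stand-alone sketch of
# the same idea; the 'healsparse_map.parquet' path, the 'iopix' partition
# column, and the 'sparse' column are assumptions for illustration:
import pyarrow.dataset as dataset

ds = dataset.dataset('healsparse_map.parquet', format='parquet',
                     partitioning='hive')
wanted = [17, 42]  # partition values to read

# Keep only fragments (files) whose partition value matches, then split each
# fragment into row groups and rebuild a dataset from the selected groups.
fragments = list(ds.get_fragments(filter=dataset.field('iopix').isin(wanted)))
row_group_fragments = []
for fragment in fragments:
    groups = fragment.split_by_row_group()
    row_group_fragments.extend(groups[:1])  # e.g. first row group per file

ds_subset = dataset.FileSystemDataset(row_group_fragments, ds.schema, ds.format)
table = ds_subset.to_table(columns=['sparse'], use_threads=False)
print(table.num_rows)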
# If you have any questions, suggestions, or comments on this example,
# please use the HDF-EOS Forum (http://hdfeos.org/forums).
#
# If you would like to see an example of any other NASA HDF/HDF-EOS data
# product, feel free to contact us at [email protected] or
# post it at the HDF-EOS Forum (http://hdfeos.org/forums).
#
# This script was tested on a Mac OS X Mavericks machine with the latest
# parquet and arrow compiled from the GitHub repository.
#
# Last tested: 9/22/2016
# Author: Hyo-Kyung Lee

import pyarrow as A
import pyarrow.parquet as pq
import pandas as pd
import h5py

FILE_NAME = '/tmp/GSSTF_NCEP.3.1987.07.01.he5'
with h5py.File(FILE_NAME, mode='r') as f:
    dset_var = f['/HDFEOS/GRIDS/NCEP/Data Fields/SST']
    values = dset_var[0, :]

data = {}
data['i4'] = values.astype('i4')
filename = 'GSSTF.parquet'
df = pd.DataFrame(data)
arrow_table = A.from_pandas_dataframe(df)
A.parquet.write_table(arrow_table, filename, version="2.0")
table_read = pq.read_table(filename)
df_read = table_read.to_pandas()
print(df_read)
def test_read_non_existing_file(use_legacy_dataset):
    # ensure we have a proper error message
    with pytest.raises(FileNotFoundError):
        pq.read_table('i-am-not-existing.parquet')
def test_read_table_doesnt_warn(datadir, use_legacy_dataset):
    with pytest.warns(None) as record:
        pq.read_table(datadir / 'v0.7.1.parquet',
                      use_legacy_dataset=use_legacy_dataset)

    assert len(record) == 0
def load_data(self, data_name='train', batch_size=4096, _get_features=False, feed_mode='batch'): """Interface to load training, test or prediction data, called from model_manager. # Arguments data_name: phase of model to feed data, can be 'train', 'test' or 'validation', or 'prediction'. Using two initials of them also works, such as 'tr' for 'train'. batch_size: number of data in each batch, only important for training phase. _get_features: get number of features. feed_mode: can be 'batch' which load all the data only once and read them by batch to avoid RAM insufficiency, 'generator' which load data by samll batch and repeat infinitely over the whole data, or 'all' which load all the data into memory. # Return Compiled Keras model """ self.batch_size = batch_size if self.data_format == 'parquet': if _get_features: if len(self.trains) > 0: p = self.trains[0] else: p = self.tests[0] tmp = pq.read_table(p).to_pandas() df_sample = self.dataframe_process(self.args, tmp) self.features = df_sample.columns.values[:-1] self.nb_features = self.features.shape[0] return # train dataset if 'tr' in data_name: if self.online_learning: self.trains.sort() ps = self.trains if len(ps) == 0: return if feed_mode == 'all': if self.load_cache: return self._cache_data('train') print('Loading train data from %d parquet files' % len(ps)) df = self._read_all_data(ps) df = self.dataframe_process(self.args, df) data = df.values self._check_label(df) x, y = data[:, :-1], data[:, -1:] x, y = DataLoader._resample_data(self.resample, x, y) x = self.sep_cols(self.column_emb, x) return self._cache_data('train', (x, y)) else: return self._parquet_read_generator( ps, feed_mode, batch_size, self.resample) # validation dataset if 'va' in data_name or 'te' in data_name: ps = self.tests if len(ps) == 0: return if feed_mode == 'all': if self.load_cache: return self._cache_data('validation') df = pd.DataFrame() print('Loading val data from %d parquet files' % len(ps)) df = self._read_all_data(ps) df = self.dataframe_process(self.args, df) data = df.values self._check_label(df) x, y = data[:, :-1], data[:, -1:] x = self.sep_cols(self.column_emb, x) return self._cache_data('validation', (x, y)) else: return self._parquet_read_generator( ps, feed_mode, batch_size) # prediction dataset if 'pr' in data_name: ps = self.predictions if len(ps) == 0: return if feed_mode == 'all': if self.load_cache: return self._cache_data('prediction') df = pd.DataFrame() print('Loading pred data from %d parquet files' % len(ps)) df = self._read_all_data(ps) df[self.label] = 0 # add a null column as 'label' df = self.dataframe_process(self.args, df) data = df.values self._check_label(df) x = data[:, :-1] x = self.sep_cols(self.column_emb, x) return self._cache_data('prediction', x) else: return self._parquet_read_generator( ps, feed_mode, batch_size, True) raise ValueError('Invalid `data_name`: ' + data_name) if self.data_format == 'csv': if _get_features: if len(self.trains) > 0: p = self.trains[0] else: p = self.tests[0] tmp = pd.read_csv(p, chunksize=2) df_sample = self.dataframe_process(self.args, next(tmp)) self.features = df_sample.columns.values[:-1] self.nb_features = self.features.shape[0] return # train dataset if 'tr' in data_name: if self.online_learning: self.trains.sort() ps = self.trains if len(ps) == 0: return if feed_mode == 'all': if self.load_cache: return self._cache_data('train') print('Loading train data from %d csv files' % len(ps)) df = self._read_all_data(ps) df = self.dataframe_process(self.args, df) data = df.values 
self._check_label(df) x, y = data[:, :-1], data[:, -1:] x, y = DataLoader._resample_data(self.resample, x, y) x = self.sep_cols(self.column_emb, x) return self._cache_data('train', (x, y)) else: return self._csv_read_generator(ps, feed_mode, batch_size, self.resample) # validation dataset if 'va' in data_name or 'te' in data_name: ps = self.tests if len(ps) == 0: return if feed_mode == 'all': if self.load_cache: return self._cache_data('validation') print('Loading val data from %d csv files' % len(ps)) df = self._read_all_data(ps) df = self.dataframe_process(self.args, df) data = df.values self._check_label(df) x, y = data[:, :-1], data[:, -1:] x = self.sep_cols(self.column_emb, x) return self._cache_data('validation', (x, y)) else: return self._csv_read_generator(ps, feed_mode, batch_size) # prediction dataset if 'pred' in data_name: ps = self.predictions if len(ps) == 0: return if feed_mode == 'all': if self.load_cache: return self._cache_data('prediction') print('Loading pred data from %d csv files' % len(ps)) df = self._read_all_data(ps) df[self.label] = 0 df = self.dataframe_process(self.args, df) data = df.values self._check_label(df) x = data[:, :-1] x = self.sep_cols(self.column_emb, x) return self._cache_data('prediction', x) else: return self._csv_read_generator(ps, feed_mode, batch_size, True) raise ValueError('Invalid `data_name`: ' + data_name) raise ValueError('Invalid `data_format`: ' + data_format)
def open_parquet(filename, as_numpy=True):
    table = pq.read_table(filename)
    return from_table(table, as_numpy=as_numpy)
def test_impl():
    df = pq.read_table('kde.parquet').to_pandas()
    S = df.points
    return S.sort_values()
def _read_col_from_path(self, path):
    # print("reading from path: ", path)
    # return pd.read_parquet(path, **self._read_kwargs)['_']
    return pq.read_table(path, columns=['_']).to_pandas()['_']
def test_impl():
    df = pq.read_table('kde.parquet').to_pandas()
    S = df.points
    return S.nsmallest(4)
def test_impl():
    df = pq.read_table('kde.parquet').to_pandas()
    S = df.points
    return S.median()
def run(args): wp = args.output_prefix + "_weights.txt.gz" if os.path.exists(wp): logging.info("Weights output exists already, delete it or move it") return sp = args.output_prefix + "_summary.txt.gz" if os.path.exists(sp): logging.info("Summary output exists already, delete it or move it") return cp = args.output_prefix + "_covariance.txt.gz" if os.path.exists(wp): logging.info("covariance output exists already, delete it or move it") return r = args.output_prefix + "_run.txt.gz" if os.path.exists(wp): logging.info("run output exists already, delete it or move it") return logging.info("Starting") Utilities.ensure_requisite_folders(args.output_prefix) logging.info("Opening data") data = pq.ParquetFile(args.data) available_data = {x for x in data.metadata.schema.names} logging.info("Loading data annotation") data_annotation = StudyUtilities.load_gene_annotation(args.data_annotation, args.chromosome, args.sub_batches, args.sub_batch, args.simplify_data_annotation) data_annotation = data_annotation[data_annotation.gene_id.isin(available_data)] if args.gene_whitelist: logging.info("Applying gene whitelist") data_annotation = data_annotation[data_annotation.gene_id.isin(set(args.gene_whitelist))] logging.info("Kept %i entries", data_annotation.shape[0]) logging.info("Opening features annotation") if not args.chromosome: features_metadata = pq.read_table(args.features_annotation).to_pandas() else: features_metadata = pq.ParquetFile(args.features_annotation).read_row_group(args.chromosome-1).to_pandas() if args.output_rsids: if not args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]: logging.warning("Several variants map to a same rsid (hint: multiple INDELS?).\n" "Can't proceed. 
Consider the using the --keep_highest_frequency_rsid flag, or models will be ill defined.") return if args.chromosome and args.sub_batches: logging.info("Trimming variants") features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(features_metadata, data_annotation, args.window) logging.info("Kept %d", features_metadata.shape[0]) if args.variant_call_filter: logging.info("Filtering variants by average call rate") features_metadata = features_metadata[features_metadata.avg_call > args.variant_call_filter] logging.info("Kept %d", features_metadata.shape[0]) if args.variant_r2_filter: logging.info("Filtering variants by imputation R2") features_metadata = features_metadata[features_metadata.r2 > args.variant_r2_filter] logging.info("Kept %d", features_metadata.shape[0]) if args.variant_variance_filter: logging.info("Filtering variants by (dosage/2)'s variance") features_metadata = features_metadata[features_metadata["std"]/2 > numpy.sqrt(args.variant_variance_filter)] logging.info("Kept %d", features_metadata.shape[0]) if args.discard_palindromic_snps: logging.info("Discarding palindromic snps") features_metadata = Genomics.discard_gtex_palindromic_variants(features_metadata) logging.info("Kept %d", features_metadata.shape[0]) if args.rsid_whitelist: logging.info("Filtering features annotation for whitelist") whitelist = TextFileTools.load_list(args.rsid_whitelist) whitelist = set(whitelist) features_metadata = features_metadata[features_metadata.rsid.isin(whitelist)] logging.info("Kept %d", features_metadata.shape[0]) if args.only_rsids: logging.info("discarding non-rsids") features_metadata = StudyUtilities.trim_variant_metadata_to_rsids_only(features_metadata) logging.info("Kept %d", features_metadata.shape[0]) if args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]: logging.info("Keeping only the highest frequency entry for every rsid") k = features_metadata[["rsid", "allele_1_frequency", "id"]] k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"] = 1 - k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"] k = k.sort_values(by=["rsid", "allele_1_frequency"], ascending=False) k = k.groupby("rsid").first().reset_index() features_metadata = features_metadata[features_metadata.id.isin(k.id)] logging.info("Kept %d", features_metadata.shape[0]) else: logging.info("rsids are unique, no need to restrict to highest frequency entry") if args.features_weights: logging.info("Loading weights") x_weights = get_weights(args.features_weights, {x for x in features_metadata.id}) logging.info("Filtering features metadata to those available in weights") features_metadata = features_metadata[features_metadata.id.isin(x_weights.id)] logging.info("Kept %d entries", features_metadata.shape[0]) else: x_weights = None logging.info("Opening features") features = pq.ParquetFile(args.features) logging.info("Setting R seed") s = numpy.random.randint(1e8) set_seed(s) if args.run_tag: d = pandas.DataFrame({"run":[args.run_tag], "cv_seed":[s]})[["run", "cv_seed"]] Utilities.save_dataframe(d, r) WEIGHTS_FIELDS=["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"] SUMMARY_FIELDS=["gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model", "test_R2_avg", "test_R2_sd", "cv_R2_avg", "cv_R2_sd", "in_sample_R2", "nested_cv_fisher_pval", "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"] train = train_elastic_net_wrapper if 
args.mode == "elastic_net" else train_ols available_individuals = check_missing(args, data, features) with gzip.open(wp, "w") as w: w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode()) with gzip.open(sp, "w") as s: s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode()) with gzip.open(cp, "w") as c: c.write("GENE RSID1 RSID2 VALUE\n".encode()) for i,data_annotation_ in enumerate(data_annotation.itertuples()): if args.MAX_M and i>=args.MAX_M: logging.info("Early abort") break logging.log(9, "processing %i/%i:%s", i+1, data_annotation.shape[0], data_annotation_.gene_id) if args.repeat: for j in range(0, args.repeat): logging.log(9, "%i-th reiteration", j) process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, j, nested_folds=args.nested_cv_folds, use_individuals=available_individuals) else: process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, nested_folds=args.nested_cv_folds, use_individuals=available_individuals) logging.info("Finished")
from bokeh.embed import server_document
from bokeh.transform import factor_cmap

#################################################################################
# This just loads in the data...
# A lot of this was built off of the "crossfilter" demo:
# https://github.com/bokeh/bokeh/blob/branch-2.3/examples/app/crossfilter/main.py

start_date = dt.datetime(2017, 7, 1)
end_date = dt.datetime(2022, 3, 1)

background = "#ffffff"

file = "./data" + "/data.parquet"
df = pq.read_table(file).to_pandas()
df.sort_index(inplace=True)

options = df.index.unique(0).to_list()
# print(options)

product = "HS CODE 72, IRON AND STEEL"
level = "US Dollars"

#################################################################################
# These are functions used in the plot...

def growth_trade(foo):
import sys
import os
import pandas as pd
import pyarrow.parquet as pq
import pyarrow.orc as orc

'''
parquet_file = pq.ParquetFile(sys.argv[1])
print(parquet_file.schema)
'''

orc_name = os.path.splitext(sys.argv[1])[0] + ".orc"
table = pq.read_table(sys.argv[1])
print("Writing ", orc_name)
orc.write_table(table, orc_name)
def _load(self):
    # might not be optimal, but it works; we can always see if we can
    # do mmapping later on
    table = pq.read_table(self.path)
    self._load_table(table)
def parquet_read_table(op, client, scope, **kwargs):
    path = client.dictionary[op.name]
    table = pq.read_table(str(path))
    df = table.to_pandas()
    return df
def from_file(path: str, file_format: str = None, name: str = None, perform_gzip: bool = True, dtype: dict = None): """ - File is read in with pyarrow, converted to bytes, compressed by default, and stored as a SQLite blob field. - Note: If you do not remove your file's index columns before importing them, then they will be included in your Dataset. The ordered nature of this column represents potential bias during analysis. You can drop these and other columns in memory when creating a Featureset from your Dataset. - Note: If no column names are provided, then they will be inserted automatically. - `path`: Local or absolute path - `file_format`: Accepts uncompressed formats including parquet, csv, and tsv (a csv with `delimiter='\t'`). This tag is used to tell pyarrow how to handle the file. We do not infer the path because (a) we don't want to force file extensions, (b) we want to make sure users know what file formats we support. - `name`: if none specified, then `path` string will be used. - `perform_gzip`: Whether or not to perform gzip compression on the file. We have observed up to 90% compression rates during testing. """ # create some files with no column names # do some testing with sparse null column names... # do some testing with all null column names... accepted_formats = ['csv', 'tsv', 'parquet'] if file_format not in accepted_formats: print( "Error - Accepted file formats include uncompressed csv, tsv, and parquet." ) else: # Defaults. if name is None: name = path if perform_gzip is None: perform_gzip = True #ToDo prevent ff combos like '.csv' with 'parquet' vice versa. # File formats. if (file_format == 'tsv') or (file_format is None): parse_opt = pc.ParseOptions(delimiter='\t') tbl = pc.read_csv(path, parse_options=parse_opt) file_format = 'tsv' elif (file_format == 'csv'): parse_opt = pc.ParseOptions(delimiter=',') tbl = pc.read_csv(path) elif (file_format == 'parquet'): tbl = pq.read_table(path) #ToDo - handle columns with no name. columns = tbl.column_names with open(path, "rb") as f: bytesio = io.BytesIO(f.read()) data = bytesio.getvalue() if perform_gzip: data = gzip.compress(data) is_compressed = True else: is_compressed = False d = Dataset.create(name=name, data=data, dtype=dtype, file_format=file_format, is_compressed=is_compressed, columns=columns) return d
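# A minimal usage sketch for the from_file() helper defined above. The file
# path and name are made up; the returned object is whatever Dataset.create()
# produces, so the trailing attribute access is an assumption:
d = from_file(
    path='data/iris.tsv',   # hypothetical local file
    file_format='tsv',      # must be one of: csv, tsv, parquet
    name='iris',
    perform_gzip=True,      # gzip-compress the stored blob (the default)
)
print(d.columns)            # assumed attribute on the created Dataset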
def read(self, file_, extension): return parquet.read_table(file_).to_pandas()
path_processed_data = os.path.join(path_source_data, "ProcessedData") else: raise FileNotFoundError( "Define the path_working, path_source_data, gtfs_dir, \ ZippedFilesloc, and path_processed_data in a new elif block" ) # User-Defined Package import wmatarawnav as wr # Globals AnalysisRoutes = ['79'] ZipParentFolderName = "October 2019 Rawnav" #1 Analyze Route ---Subset RawNav data. ######################################################################################## FinDat = pq.read_table(source=os.path.join( path_processed_data, "Route79_Partition.parquet")).to_pandas() FinDat.route = FinDat.route.astype('str') FinDat.drop( columns=["SatCnt", 'Blank', 'LatRaw', 'LongRaw', '__index_level_0__'], inplace=True) #Check for duplicate IndexLoc assert (FinDat.groupby(['filename', 'IndexTripStartInCleanData', 'IndexLoc'])['IndexLoc'].count().values.max() == 1) FinDat.loc[:, "Count"] = FinDat.groupby( ['filename', 'IndexTripStartInCleanData', 'IndexLoc'])['IndexLoc'].transform("count") FinDatCheck = FinDat[ FinDat.Count > 1] # Check what is happening with the file here :'rawnav06435191012.txt' on Friday. FinDatCheck.filename.unique()
def read(self, path, extension):
    with path.open('rb') as file_:
        return parquet.read_table(file_).to_pandas()
import pyarrow.parquet as pq
import numpy as np
import pandas as pd
import pyarrow as pa

df = pd.read_csv(
    '/Users/sbommireddy/Downloads/business-price-indexes-september-2019-quarter-csv.csv'
)
table = pa.Table.from_pandas(df, preserve_index=False)

# write parquet file
pq.write_table(table, 'business-price-indexes-september-2019-quarter.parquet')

t = pq.read_table('business-price-indexes-september-2019-quarter.parquet')
print(t.to_pandas())

parquet_file1 = pq.ParquetFile(
    'business-price-indexes-september-2019-quarter.parquet')
print("Print metadata")
print(parquet_file1.metadata)
# 2.1. Read-in Data ################### # Reduce rawnav data to runs present in the summary file after filtering. xwalk_seg_pattern_stop_fil = xwalk_seg_pattern_stop.query( 'seg_name_id == @seg') seg_routes = list(xwalk_seg_pattern_stop_fil.route.drop_duplicates()) rawnav_dat = (wr.read_cleaned_rawnav( analysis_routes_=seg_routes, path=os.path.join(path_processed_data, "rawnav_data.parquet")).drop( columns=['blank', 'lat_raw', 'long_raw', 'sat_cnt'])) segment_summary = (pq.read_table(source=os.path.join( path_processed_data, "segment_summary_2017_test.parquet"), filters=[['seg_name_id', "=", seg]], use_pandas_metadata=True).to_pandas()) segment_summary_fil = (segment_summary.query('~(flag_too_far_any\ | flag_wrong_order_any\ | flag_too_long_odom\ | flag_secs_total_mismatch\ | flag_odom_total_mismatch)')) stop_index = ( pq.read_table(source=os.path.join(path_processed_data, "stop_index.parquet"), filters=[[('route', '=', route)] for route in seg_routes], columns=[ 'seg_name_id', 'route', 'pattern', 'stop_id',
def prepData(s_data): train_x = [] test_x = [] train_y = [] test_y = [] p_flg = 0 flg = 0 totalcnt = 0 df = pq.read_table(s_data).to_pandas() df = df[['signal', 'originalsize']] samplecnt = 0 for path in paths: table = pq.read_table(path) df = table.to_pandas() # print(df) df = df[['signal', 'originalsize']] for idx, row in df.iterrows(): flg = samplecnt signal = np.array(list(row[0])) signal = zeropadding10(signal) signal = np.array(signal) signal = signal.astype('float32') / 255. originalsize = np.array(extendAry(row[1])) originalsize = zeropadding10(originalsize) if cnt == 0: print(path.replace(s_data + "/", "")) fmer = path.replace(s_data + "/", "").replace(".pq", "") plt.title(fmer) plt.plot(signal) # plt.show() # fig = plt.figure(fmer) plt.savefig("/groups2/gac50430/nanopore/dataset4DL/figs/" + fmer + ".png") plt.clf() # plt.plot(originalsize) # plt.show() testidx = (idx % 12 >= 10) if testidx: test_x.append(signal) test_x.append(originalsize) test_y.append(flg) else: train_x.append(signal) train_x.append(originalsize) train_y.append(flg) cnt = cnt + 1 totalcnt = totalcnt + 1 if cnt % 12000 == 0: print(samplecnt, totalcnt, path, totalcnt, idx, row) if cnt == 36000: break samplecnt = samplecnt + 1 print("totalcnt", totalcnt) train_x = np.array(train_x) test_x = np.array(test_x) train_y = np.array(train_y) test_y = np.array(test_y) num_classes = np.unique(train_y).size print("train_x.shape", train_x.shape) print("test_x.shape", train_x.shape) print("train_y.shape", train_x.shape) print("test_y.shape", train_x.shape) print(num_classes, 'classes') print('y_train shape:', train_y.shape) print('y_test shape:', test_y.shape) train_x = np.reshape(train_x, (-1, DATA_LENGTH, 2)) test_x = np.reshape(test_x, (-1, DATA_LENGTH, 2)) train_y = np.reshape(train_y, (-1, 1,)) test_y = np.reshape(test_y, (-1, 1,)) test_y = test_y - 1 train_y = train_y - 1 train_y = keras.utils.to_categorical(train_y, num_classes) test_y = keras.utils.to_categorical(test_y, num_classes) print('train_x:', train_x.shape) print('train_y:', train_y.shape) print('test_x shape:', test_x.shape) print('test_y shape:', test_y.shape) return train_x, test_x, train_y, test_y, num_classes
def test_impl():
    df = pq.read_table("groupby3.pq").to_pandas()
    A = df.groupby('A')['B'].agg(lambda x: x.max() - x.min())
    return A.sum()
def test_read_non_existent_file(tempdir, use_legacy_dataset):
    path = 'non-existent-file.parquet'
    try:
        pq.read_table(path, use_legacy_dataset=use_legacy_dataset)
    except Exception as e:
        assert path in e.args[0]
from keras import regularizers
from keras.layers.core import Dropout
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Input, Embedding, Dense, LSTM
from string import printable
from keras.preprocessing import sequence
from python_utils import save_model, load_model
from keras.utils.vis_utils import plot_model
from keras.callbacks import CSVLogger
import pyarrow.parquet as pq
import pandas as pd

table = pq.read_table('/root/Downloads/BGU_PROJECT/data.parquet')
table.to_pandas()


class LSTMC:
    def __init__(self, max_len=75, emb_dim=32, max_vocab_len=100,
                 lstm_output_size=32, w_reg=regularizers.l2(1e-4)):
        super().__init__()
        self.max_len = max_len
        self.csv_logger = CSVLogger('table', append=True, separator=';')
# coding: utf-8
'''
CREATE TABLE user_pq (
    id bigint NOT NULL DEFAULT 0,
    name varchar,
    age int
) with (appendonly=true, orientation=parquet);

insert into user_pq(id, name, age)
values (generate_series(1, 1000), 'interma', trunc(random() * 99 + 1));

hawq extract -d postgres -o user_pq.yml test_sa.user_pq
'''
import pyarrow.parquet as pq

# Read the whole table.
t = pq.read_table('user_pq.parquet')
print(t)

# Inspect file metadata and read individual row groups.
f = pq.ParquetFile('user_pq.parquet')
print(f.metadata)
print(f.schema)
print(f.read_row_group(0))
def test_export_parquet(tmpdir_factory): """Test export of DataFrame to parquet""" Settings.tidy = False Settings.humanize = True Settings.si_units = False # Request data. request = DwdObservationRequest( parameter=DwdObservationDataset.CLIMATE_SUMMARY, resolution=DwdObservationResolution.DAILY, start_date="2019", end_date="2020", ).filter_by_station_id( station_id=[1048], ) df = request.values.all().df # Save to Parquet file. filename = tmpdir_factory.mktemp("data").join("observations.parquet") ExportMixin(df=df).to_target(f"file://{filename}") # Read back Parquet file. table = pq.read_table(filename) # Validate dimensions. assert table.num_columns == 19 assert table.num_rows == 366 # Validate column names. assert table.column_names == [ "station_id", "dataset", "date", "qn_3", "wind_gust_max", "wind_speed", "qn_4", "precipitation_height", "precipitation_form", "sunshine_duration", "snow_depth", "cloud_cover_total", "pressure_vapor", "pressure_air_site", "temperature_air_mean_200", "humidity", "temperature_air_max_200", "temperature_air_min_200", "temperature_air_min_005", ] # Validate content. data = table.to_pydict() assert data["date"][0] == datetime.datetime(2019, 1, 1, 0, 0, tzinfo=datetime.timezone.utc) assert data["temperature_air_min_005"][0] == 1.5 assert data["date"][-1] == datetime.datetime(2020, 1, 1, 0, 0, tzinfo=datetime.timezone.utc) assert data["temperature_air_min_005"][-1] == -4.6 os.unlink(filename)
def df_to_parquet_table(df: pd.DataFrame) -> pa.Table: """ Convert DataFrame to Pyarrow Table Example: pyarrow.Table MGLT: string cargo_capacity: string consumables: string cost_in_credits: string created: string crew: string edited: string films: string hyperdrive_rating: string length: string manufacturer: string max_atmosphering_speed: string model: string name: string passengers: string pilots: double starship_class: string url: string __index_level_0__: int64 metadata -------- {b'pandas': b'{"columns": [{"field_name": "MGLT", "pandas_type": "unicode", "m' b'etadata": null, "name": "MGLT", "numpy_type": "object"}, {"field' b'_name": "cargo_capacity", "pandas_type": "unicode", "metadata": ' b'null, "name": "cargo_capacity", "numpy_type": "object"}, {"field' b'_name": "consumables", "pandas_type": "unicode", "metadata": nul' b'l, "name": "consumables", "numpy_type": "object"}, {"field_name"' b': "cost_in_credits", "pandas_type": "unicode", "metadata": null,' b' "name": "cost_in_credits", "numpy_type": "object"}, {"field_nam' b'e": "created", "pandas_type": "unicode", "metadata": null, "name' b'": "created", "numpy_type": "object"}, {"field_name": "crew", "p' b'andas_type": "unicode", "metadata": null, "name": "crew", "numpy' b'_type": "object"}, {"field_name": "edited", "pandas_type": "unic' b'ode", "metadata": null, "name": "edited", "numpy_type": "object"' b'}, {"field_name": "films", "pandas_type": "unicode", "metadata":' b' null, "name": "films", "numpy_type": "object"}, {"field_name": ' b'"hyperdrive_rating", "pandas_type": "unicode", "metadata": null,' b' "name": "hyperdrive_rating", "numpy_type": "object"}, {"field_n' b'ame": "length", "pandas_type": "unicode", "metadata": null, "nam' b'e": "length", "numpy_type": "object"}, {"field_name": "manufactu' b'rer", "pandas_type": "unicode", "metadata": null, "name": "manuf' b'acturer", "numpy_type": "object"}, {"field_name": "max_atmospher' b'ing_speed", "pandas_type": "unicode", "metadata": null, "name": ' b'"max_atmosphering_speed", "numpy_type": "object"}, {"field_name"' b': "model", "pandas_type": "unicode", "metadata": null, "name": "' b'model", "numpy_type": "object"}, {"field_name": "name", "pandas_' b'type": "unicode", "metadata": null, "name": "name", "numpy_type"' b': "object"}, {"field_name": "passengers", "pandas_type": "unicod' b'e", "metadata": null, "name": "passengers", "numpy_type": "objec' b't"}, {"field_name": "pilots", "pandas_type": "float64", "metadat' b'a": null, "name": "pilots", "numpy_type": "float64"}, {"field_na' b'me": "starship_class", "pandas_type": "unicode", "metadata": nul' b'l, "name": "starship_class", "numpy_type": "object"}, {"field_na' b'me": "url", "pandas_type": "unicode", "metadata": null, "name": ' b'"url", "numpy_type": "object"}, {"field_name": "__index_level_0_' b'_", "pandas_type": "int64", "metadata": null, "name": null, "num' b'py_type": "int64"}], "column_indexes": [{"field_name": null, "pa' b'ndas_type": "unicode", "metadata": {"encoding": "UTF-8"}, "name"' b': null, "numpy_type": "object"}], "pandas_version": "0.22.0", "i' b'ndex_columns": ["__index_level_0__"]}'} """ pyarrow_deathstar_table = pa.Table.from_pandas(df) # Create PyArrow Table from Pandas DF print(pyarrow_deathstar_table) pq.write_table(pyarrow_deathstar_table, 'deathstar.parquet') # Convert PyArrow Table to Parquet Table / File parquet_table = pq.read_table('deathstar.parquet') # Read back Parquet File as a Table parquet_table = pq.ParquetFile('deathstar.parquet') # Read back Parquet File as a ParquetFile for 
finer-grained read and write print(parquet_table.metadata) #<pyarrow._parquet.FileMetaData object at 0x7fb755c29458> # created_by: parquet-cpp version 1.4.1-SNAPSHOT # num_columns: 19 # num_rows: 1 # num_row_groups: 1 # format_version: 1.0 # serialized_size: 4574 print(parquet_table.schema) #<pyarrow._parquet.ParquetSchema object at 0x7efc80565408> #MGLT: BYTE_ARRAY UTF8 #cargo_capacity: BYTE_ARRAY UTF8 #consumables: BYTE_ARRAY UTF8 #cost_in_credits: BYTE_ARRAY UTF8 #created: BYTE_ARRAY UTF8 #crew: BYTE_ARRAY UTF8 #edited: BYTE_ARRAY UTF8 #films: BYTE_ARRAY UTF8 #hyperdrive_rating: BYTE_ARRAY UTF8 #length: BYTE_ARRAY UTF8 #manufacturer: BYTE_ARRAY UTF8 #max_atmosphering_speed: BYTE_ARRAY UTF8 #model: BYTE_ARRAY UTF8 #name: BYTE_ARRAY UTF8 #passengers: BYTE_ARRAY UTF8 #pilots: DOUBLE #starship_class: BYTE_ARRAY UTF8 #url: BYTE_ARRAY UTF8 #__index_level_0__: INT64 return parquet_table
exit()

# %% Convert Stata file to parquet
# if False:
#     # %% Initial data load
#     logger.info('Reading Stata File')
#     data = pd.read_stata(
#         '/Users/janschaefer/Dropbox/10_Thesis/02_Data/save_step4.dta')
#
#     # %% writing parquet
#     logger.info('Writing parquet to disk.')
#     pq.write_table(pa.Table.from_pandas(data), dataPath + '/data.parquet')

# %% Initial data load
logger.info("Reading Parquet File")
data = pq.read_table(dataPath + "/data.parquet").to_pandas()

# %% Data Summary
# logger.info('Data with types:\n%s', data.dtypes)
logger.info("Dimensions of data: %s", data.shape)
# data = data.head(1000)

# %% create needTranslation subset
file = open(filePath + "/iterate.txt", "r")
iterate = file.read()
file.close()

file = open("iterate.bak", "w")
file.write(str(iterate))
file.close()
def _read_table(*args, **kwargs):
    import pyarrow.parquet as pq

    table = pq.read_table(*args, **kwargs)
    table.validate(full=True)
    return table
def read_parquet(filename):
    print(f'Loading {filename}')
    table = pq.read_table(filename)
    return table.to_pandas()
    pq.write_table(table, output, compression='SNAPPY', coerce_timestamps='ms')


if __name__ == '__main__':
    schema = pa.schema([
        pa.field('name', pa.string()),
        pa.field('labels', pa.list_(pa.string())),
        pa.field('created', pa.timestamp('ms')),
        pa.field('valid', pa.bool_()),
        pa.field('status', pa.int64()),
    ])
    data = [
        {
            'name': 'a',
            'labels': ['A', 'B'],
            'created': int(28800000 + 1000 * time.mktime(
                datetime.datetime(2018, 8, 1).date().timetuple())),
            'valid': True,
            'status': 1,
        },
        {
            'name': 'b',
            'labels': ['B', 'C'],
            'created': int(28800000 + 1000 * time.mktime(
                datetime.datetime(2018, 8, 2).date().timetuple())),
            'valid': False,
            'status': 2,
        },
    ]
    json_to_parquet(data, 'a', schema)

    table2 = pq.read_table('a')
    print(table2.to_pandas())
def _read_table(*args, **kwargs):
    import pyarrow.parquet as pq

    return pq.read_table(*args, **kwargs)
from flask import Flask, render_template, flash, request
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField
import pandas as pd
import pyarrow.parquet as pq
import json

# app configuration
DEBUG = True
app = Flask(__name__)
app.config.from_object(__name__)
app.config['SECRET_KEY'] = '5i11yg00s3'

PQPATH = '/Users/lxu213/data/ad-free-search-engine/spark-warehouse/updated_adwords/tf_idf.parquet'
data = pq.read_table(PQPATH, nthreads=4).to_pandas()


class SearchBox(Form):
    query = TextField(validators=[validators.required()])


@app.route("/", methods=['GET', 'POST'])
def hello():
    form = SearchBox(request.form)
    print(form.errors)

    query = ''
    kw_dict = []
    if request.method == 'POST':
        query = request.form['query']
        kw_data = data[['val', 'tf-idf']].loc[data['keywords'].isin(
            query.lower().split())][:50]
model = Ridge(fit_intercept=True) X = x[:, np.newaxis] model.fit(X, y) if (mute == False): print( f"\t coef_: {model.coef_[0]:.05f}, int_: {model.intercept_:.05f}") return model if __name__ == '__main__': # Use ridge regression method to estimate the # slope of the cloud size distribution df = pq.read_table(f'tracking/clouds_{120:08d}.pq', nthreads=6).to_pandas() df_size = get_cloud_area(df) model = calc_cloud_slope(df_size) hist, bin_edges = np.histogram(df_size['area'], bins='fd') m_ = (hist > 0) x, y = np.log10(bin_edges[1:][m_]), np.log10(hist[m_]) #---- Plotting fig = plt.figure(1, figsize=(3, 3)) fig.clf() sns.set_context('paper') sns.set_style( 'ticks', { 'axes.grid': False,
# -*- coding: utf-8 -*-
"""
Created on Tue May 19 03:18:19 2020

@author: WylieTimmerman
"""

import pandas as pd, os, numpy as np, pyproj, sys, zipfile, glob, logging
import pyarrow as pa
import pyarrow.parquet as pq

path_sp = r"C:\OD\Foursquare ITP\Foursquare ITP SharePoint Site - Shared Documents\WMATA Queue Jump Analysis"
path_processed_data = os.path.join(path_sp, r"Client Shared Folder\data\02-processed")
path_interim_data = os.path.join(path_sp, r"Client Shared Folder\data\01-interim")

FinDat = pq.read_table(source=os.path.join(
    path_interim_data, "Route79_Partition_20200519.parquet")).to_pandas()

FinDat.to_csv(os.path.join(path_interim_data, "Route79_20200519.csv"))