def patable(self):
    """Convert each CSV in the current directory to Parquet.

    Every ``*.csv`` file is reduced to the weather columns of interest,
    sorted, enriched with year/month/day columns derived from
    ``ObservationDate``, written as a single snappy-compressed file next to
    the CSV, and also appended to the ``weather_results`` dataset partitioned
    by ``ObsYear``, ``ObsMonth`` and ``Region``.
    """
    keep_columns = [
        'ForecastSiteCode', 'ObservationTime', 'ObservationDate',
        'ScreenTemperature', 'SiteName', 'Region'
    ]
    sort_keys = ['ForecastSiteCode', 'ObservationDate', 'ObservationTime']

    for entry in os.listdir():
        if not entry.endswith(".csv"):
            continue

        # select required columns and enrich data with partition attributes
        frame = pd.read_csv(os.path.abspath(entry))[keep_columns]
        frame = frame.sort_values(by=sort_keys).reset_index(drop=True)
        observed = pd.DatetimeIndex(frame['ObservationDate'])
        frame['ObsYear'] = observed.year
        frame['ObsMonth'] = observed.month
        frame['ObsDay'] = observed.day
        table = pa.Table.from_pandas(frame)

        # create additional files for testing
        single_file = entry.replace(".csv", ".") + 'parquet.snappy'
        pq.write_table(table, single_file, compression='snappy')
        pq.write_to_dataset(
            table,
            root_path='weather_results',
            partition_cols=['ObsYear', 'ObsMonth', 'Region'])
def _chunk_readwrite(archive_url, dest_path, chunksize, header, encoding,
                     dtype, dataset):
    """Stream a CSV archive into Parquet, one pandas chunk at a time.

    pandas reads the archive in chunks of ``chunksize`` rows and pyarrow
    writes each chunk, either through a single ``ParquetWriter`` or by
    appending to a partitioned dataset.

    Notes
    -----
    * ``dest_path`` can be either a ``file.parquet`` or, in the case of
      partitioned parquet, only the destination folder of the parquet
      partition files.

    Returns
    -------
    ``np.copy`` of the first chunk's schema when ``dataset`` is truthy and at
    least one chunk was read, otherwise an empty list.
    """
    pqwriter = None
    # NOTE(review): the caller-supplied ``header`` is discarded here, so
    # read_csv always receives ``names=[]``. Kept as-is; confirm whether
    # ``names=header`` was meant to use the argument.
    header = []
    chunks = pd.read_csv(archive_url,
                         chunksize=chunksize,
                         names=header,
                         encoding=encoding,
                         dtype=dtype)
    try:
        for i, df in enumerate(chunks):
            table = pa.Table.from_pandas(df)
            if i == 0:
                if dataset:
                    header = np.copy(table.schema)
                else:
                    pqwriter = pq.ParquetWriter(dest_path, table.schema)
            if dataset:
                # NOTE(review): ``partition_cols`` is not defined in this
                # function; presumably a module-level name -- verify.
                pq.write_to_dataset(table,
                                    root_path=dest_path,
                                    partition_cols=partition_cols)
            else:
                pqwriter.write_table(table)
    finally:
        # Close the writer even when a chunk fails, so the file handle is
        # released instead of leaking.
        if pqwriter:
            pqwriter.close()
    return header
def tokenize(partition):
    """Tokenize the ``full_text`` column of one partition and persist it.

    The partition is labelled by the ``tw_year``/``tw_month``/``tw_day``
    values of its first row, gets a new ``tokens`` column (added in place),
    and is appended to the dataset under ``OUTPUT_DIR`` partitioned by those
    three columns. Progress and elapsed time are printed.
    """
    date_parts = [
        partition[column].iloc[0]
        for column in ("tw_year", "tw_month", "tw_day")
    ]
    partition_name = "{}-{}-{}".format(*date_parts)

    start = timer()
    print("Begining Tokenization: {}".format(partition_name))

    tokenizer_options = dict(
        extra_patterns=PATTERNS,
        lowercase=True,
        normalize=3,
        ignore_quotes=False,
        ignore_stopwords=True,
        stem="lemm",
        remove_punct=True,
        remove_numbers=True,
        remove_breaks=True,
        decontract=True,
        hashtags="split",
        twitter_handles='',
        urls=False,
    )
    tokenizer = CrazyTokenizer(**tokenizer_options)
    partition["tokens"] = partition["full_text"].apply(tokenizer.tokenize)

    pq.write_to_dataset(
        pa.Table.from_pandas(partition),
        root_path=OUTPUT_DIR,
        partition_cols=['tw_year', 'tw_month', 'tw_day'])

    end = timer()
    print("Tokenization Finished for {}. Took {} seconds.".format(
        partition_name, end - start))
def write_pq(
        file_path,
        dataf,
        partition_cols=None,
        flavor='spark',
        filesystem=None,
        append=False,
        log=log,
):
    """Write a pandas DataFrame to a Parquet dataset directory.

    Args:
        file_path: Root directory of the dataset.
        dataf: The pandas DataFrame to write.
        partition_cols: Optional list of columns to partition by.
        flavor: Parquet flavor forwarded to pyarrow.
        filesystem: Optional filesystem object forwarded to pyarrow.
        append: When falsy, an existing ``file_path`` is removed first so the
            write acts as an overwrite.
        log: Callable that receives the summary message.

    Returns:
        The number of rows written.
    """
    s_t = now()
    if not append and os.path.exists(file_path):
        shutil.rmtree(file_path, ignore_errors=True)

    table = pa.Table.from_pandas(dataf, nthreads=psutil.cpu_count())
    counter = table.num_rows
    pq.write_to_dataset(
        table,
        root_path=file_path,
        partition_cols=partition_cols,
        flavor=flavor,
        preserve_index=False,
        filesystem=filesystem,
        use_deprecated_int96_timestamps=True,
        compression='snappy')  # will append. delete folder for overwrite

    secs = (now() - s_t).total_seconds()
    # A tiny write can finish within one clock tick; avoid dividing by zero
    # and report the row count itself as a lower bound for the rate.
    rate = round(counter / secs, 1) if secs > 0 else float(counter)
    log("Wrote: {} rows to {} [{} r/s].".format(counter, file_path, rate))
    return counter
def _make_parquet_file(
    filename,
    nrows=NROWS,
    ncols=2,
    force=True,
    directory=False,
    partitioned_columns=[],
):
    """Generate a parquet file or directory for tests.

    Args:
        filename: The name of test file, that should be created.
        nrows: Number of rows for the dataframe.
        ncols: Number of cols for the dataframe.
        force: Create a new file/directory even if one already exists.
        directory: Create a partitioned directory using pyarrow.
        partitioned_columns: Create a partitioned directory using pandas.
            Will be ignored if directory=True.
    """
    needs_write = force or not os.path.exists(filename)
    if needs_write:
        column_data = {}
        for x in range(ncols):
            column_data[f"col{x + 1}"] = np.arange(nrows)
        df = pandas.DataFrame(column_data)

        if directory:
            # Start from a clean slate: remove a stale tree, otherwise
            # create the (empty) target directory.
            if not os.path.exists(filename):
                os.makedirs(filename)
            else:
                shutil.rmtree(filename)
            pq.write_to_dataset(pa.Table.from_pandas(df), root_path=filename)
        elif len(partitioned_columns) > 0:
            df.to_parquet(filename, partition_cols=partitioned_columns)
        else:
            df.to_parquet(filename)

    # Recorded in the module-level list whether or not a write happened.
    filenames.append(filename)
def parse(xml_dir, output, fs, seed):
    """Parse cTAKES XML files and write each feature set as a dataset.

    Every file returned by ``get_files(fs, xml_dir)`` is parsed; for each
    key of each parse result, the non-empty rows are written with
    ``pq.write_to_dataset`` under ``<output>/<key>``.
    """
    xml_dir = xml_dir.rstrip('/')  # remove trailing /
    output = output.rstrip('/')
    # create output directory if it does not exist
    os.makedirs(output, exist_ok=True)

    if seed is not None:
        seed_value = int(seed)
        random.seed(seed_value)
        npr.seed(int(seed))

    parser = CtakesXmlParser()
    parsed = [parser.parse(path) for path in get_files(fs, xml_dir)]

    def filenamer(x):
        # Not called by the active code path; only the previously
        # commented-out partition_filename_cb variants referenced it.
        print(x)
        try:
            return '-'.join(x) + '.parquet'
        except TypeError:
            return str(x) + '.parquet'

    for result in parsed:
        for key, val in result.items():
            feature_df = pd.DataFrame(list(val))
            if feature_df.shape[0] <= 0:
                continue
            table = pa.Table.from_pandas(feature_df)
            pq.write_to_dataset(table, output + f'/{key}', filesystem=None)
def df2parquet(self,
               pandasDF,
               bucket: str,
               folder: str,
               file: str,
               overwrite: bool = False,
               engine: str = 'auto',
               compression: str = 'snappy',
               use_dictionary: bool = False,
               coerce_timestamps: str = 'ms',
               partition_cols: list = None,
               row_group_size: int = None,
               **kwargs):
    """Write a pandas DataFrame to S3 as a Spark-flavoured Parquet dataset.

    The target is ``s3://<bucket>/<folder>/<file>``, or
    ``s3://<bucket>/<file>`` when ``folder`` is None. The write goes through
    ``self._s3fs``.

    Note: ``overwrite``, ``engine``, ``row_group_size`` and ``**kwargs`` are
    accepted but not used by this implementation.
    """
    # Identity check: ``!= None`` would go through __ne__ and is unidiomatic.
    if folder is not None:
        s3Path = "s3://%s/%s/%s" % (bucket, folder, file)
    else:
        s3Path = "s3://%s/%s" % (bucket, file)

    pq.write_to_dataset(
        table=pa.Table.from_pandas(pandasDF),
        root_path=s3Path,
        partition_cols=partition_cols,
        filesystem=self._s3fs,
        preserve_index=False,
        compression=compression,
        flavor='spark',  # Enable Spark compatibility
        coerce_timestamps=coerce_timestamps,  # Limit timestamps to ms
        allow_truncated_timestamps=True,  # Don't raise during truncation
        use_dictionary=use_dictionary,
        version='2.0')
def move_labels_to_datalake(label_files, wikis):
    """Push editor traits for each label file to a Parquet dataset on HDFS.

    Any existing dataset at the target path is removed first. Each non-None
    label file is loaded, converted to rows via ``get_editor_traits``,
    pickled locally (overwriting the previous pickle) and appended to the
    dataset partitioned by ``wiki``.
    """
    fs = pa.hdfs.connect(host='an-coord1001.eqiad.wmnet', port=10000)
    fs = fs.connect()

    parquet_path = "/user/nathante/ores_bias_data/ores_label_editors"
    if fs.exists(parquet_path):
        fs.rm(parquet_path, recursive=True)

    out_schema = [
        'wiki', 'ns', 'pageid', 'title', 'revid', 'parentid', 'user',
        'userid'
    ]

    print("collecting userids")
    for label_file, context in zip(label_files, wikis):
        if label_file is None:
            continue
        rows = get_editor_traits(load_labels(label_file), context, out_schema)
        pddf = pd.DataFrame(rows)
        pddf.to_pickle("ores_label_editors.pickle")
        pq.write_to_dataset(pa.Table.from_pandas(pddf),
                            root_path=parquet_path,
                            partition_cols=['wiki'],
                            filesystem=fs,
                            flavor='spark')
        print("pushed labels for {0}".format(context))
def one_off_update(product='etfs'):
    """Download a fixed date range from Yahoo for one product, once.

    Data for every name in ``_meta[product]`` between 2010-01-01 and
    2018-11-29 is fetched and written as a dataset partitioned by
    ``product`` and ``name``. If the product's partition directory already
    exists, nothing is downloaded and None is returned; otherwise the
    downloaded DataFrame is returned.
    """
    start, end = datetime.date(2010, 1, 1), datetime.date(2018, 11, 29)
    names = _meta[product]

    base = os.path.join('raw/yahoo/')
    if not os.path.exists(base):
        os.makedirs(base)

    # filenames = glob.glob(os.path.join(base, '*.parquet'))
    # TODO: check dates on existing for the update
    filename = os.path.join(base, '{}_to_{}'.format(start, end))
    # this is terrible
    filename_check = os.path.join(filename, 'product={}'.format(product))
    if os.path.exists(filename_check):
        print("{} exists".format(filename_check))
        return

    print('getting {} names'.format(len(names)))
    df = get_data_yahoo(names, start, end)
    df['product'] = product

    # partitioning by name is less efficient storage wise but makes for
    # better joins in the next step
    pq.write_to_dataset(pa.Table.from_pandas(df, preserve_index=False),
                        root_path=filename,
                        partition_cols=['product', 'name'],
                        preserve_index=False)
    return df
def write_data(self, data):
    """Write one batch of records to the topic's Parquet dataset.

    Args:
        data: Mapping that provides ``topic`` (sub-directory name),
            ``records`` (input for ``pd.DataFrame.from_dict``), ``schema``
            (pyarrow schema applied to the table) and ``partition_cols``.

    The dataset root is ``<self.root_output_dir>/<topic>/``; files are
    ZSTD-compressed with 100000-row row groups.
    """
    cdir = "{}/{}/".format(self.root_output_dir, data["topic"])
    # exist_ok avoids the check-then-create race between concurrent writers.
    os.makedirs(cdir, exist_ok=True)

    df = pd.DataFrame.from_dict(data["records"])
    table = pa.Table.from_pandas(df,
                                 schema=data["schema"],
                                 preserve_index=False)
    pq.write_to_dataset(
        table,
        root_path=cdir,
        partition_cols=data['partition_cols'],
        version="2.0",
        compression='ZSTD',
        row_group_size=100000,
    )
def _make_parquet_file(
    filename,
    row_size=NROWS,
    force=True,
    directory=False,
    partitioned_columns=[],
):
    """Generate a two-column parquet file or directory for tests.

    Args:
        filename: The name of test file, that should be created.
        row_size: Number of rows for the dataframe.
        force: Create a new file/directory even if one already exists.
        directory: Create a partitioned directory using pyarrow.
        partitioned_columns: Create a partitioned directory using pandas.
            Will be ignored if directory=True.
    """
    df = pandas.DataFrame(
        {name: np.arange(row_size) for name in ("col1", "col2")}
    )

    reuse_existing = os.path.exists(filename) and not force
    if not reuse_existing:
        if directory:
            if not os.path.exists(filename):
                os.mkdir(filename)
            else:
                shutil.rmtree(filename)
            pq.write_to_dataset(pa.Table.from_pandas(df), root_path=filename)
        elif len(partitioned_columns) > 0:
            df.to_parquet(filename, partition_cols=partitioned_columns)
        else:
            df.to_parquet(filename)

    # Always tracked in the module-level list, even when nothing was written.
    filenames.append(filename)
def _send_to_s3(self, force=False):
    """Copy in-memory batches to s3.

    A table is flushed when ``force`` is set or when it holds more than
    ``CACHE_SIZE`` batches. The site-visits index is written as
    newline-separated JSON; every other table is written as a Spark-flavoured
    Parquet dataset partitioned by ``instance_id``. Flushed tables are reset
    to an empty list.
    """
    for table_name, batches in self._batches.items():
        should_flush = force or len(batches) > CACHE_SIZE
        if not should_flush:
            continue

        if table_name != SITE_VISITS_INDEX:
            try:
                table = pa.Table.from_batches(batches)
                pq.write_to_dataset(table,
                                    self._s3_bucket_uri % table_name,
                                    filesystem=self._fs,
                                    preserve_index=False,
                                    partition_cols=['instance_id'],
                                    compression='snappy',
                                    flavor='spark')
            except pa.lib.ArrowInvalid as e:
                # Best effort: log the failure and still drop the batches.
                self.logger.error(
                    "Error while sending record:\n%s\n%s\n%s\n" %
                    (table_name, type(e), e))
        else:
            out_str = '\n'.join(json.dumps(x) for x in batches)
            if not isinstance(out_str, six.binary_type):
                out_str = out_str.encode('utf-8')
            digest = hashlib.md5(out_str).hexdigest()
            fname = '%s/site_index/instance-%s-%s.json.gz' % (
                self.dir, self._instance_id, digest)
            self._write_str_to_s3(out_str, fname)

        self._batches[table_name] = list()
def write_file(self, stream_name:str, data:DataStream.data, file_mode:str) -> bool:
    """
    Write a pandas or pyspark DataFrame to a file storage system

    Args:
        stream_name (str): name of the stream
        data (object): pandas or pyspark DataFrame object
        file_mode (str): write mode for the pyspark writer; it is not used
            for pandas DataFrames
    Returns:
        bool: True if data is stored successfully or throws an Exception.
    Raises:
        Exception: if DataFrame write operation fails
    """
    data_path = self._get_storage_path(stream_name=stream_name)
    if isinstance(data, pd.DataFrame):
        try:
            table = pa.Table.from_pandas(data, preserve_index=False)
            pq.write_to_dataset(table, root_path=data_path, partition_cols=["version", "user"])
            return True
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception("Cannot store pandas dataframe: "+str(e)) from e
    else:
        try:
            data.write.partitionBy(["version","user"]).format('parquet').mode(file_mode).save(data_path)
            return True
        except Exception as e:
            raise Exception("Cannot store spark dataframe: "+str(e)) from e
def execute(cls, ctx, op):
    """Write this chunk's DataFrame to parquet and store an empty result.

    A ``*`` in ``op.path`` is replaced by the chunk index; without a
    wildcard the chunk is written to ``<path>/<index>.parquet``. When
    ``op.partition_cols`` is set only the pyarrow engine is supported and the
    data is written with ``pq.write_to_dataset``.

    Raises:
        NotImplementedError: ``partition_cols`` given with a non-pyarrow
            engine.
    """
    df = ctx[op.input.key]
    out = op.outputs[0]
    i = op.outputs[0].index[0]
    path = op.path
    has_wildcard = False
    if '*' in path:
        path = path.replace('*', str(i))
        has_wildcard = True

    if op.partition_cols is None:
        if not has_wildcard:
            fs = get_fs(path, op.storage_options)
            path = fs.pathsep.join([path.rstrip(fs.pathsep), f'{i}.parquet'])
        # Normalise once: the fastparquet branch used to unpack
        # ``op.additional_kwargs`` directly and failed when it was None.
        extra_kwargs = op.additional_kwargs or dict()
        if op.engine == 'fastparquet':
            df.to_parquet(path, engine=op.engine, compression=op.compression,
                          index=op.index, open_with=open_file,
                          **extra_kwargs)
        else:
            with open_file(path, mode='wb',
                           storage_options=op.storage_options) as f:
                df.to_parquet(f, engine=op.engine,
                              compression=op.compression,
                              index=op.index, **extra_kwargs)
    else:
        if op.engine == 'pyarrow':
            pq.write_to_dataset(pa.Table.from_pandas(df), path,
                                partition_cols=op.partition_cols)
        else:  # pragma: no cover
            raise NotImplementedError('Only support pyarrow engine when '
                                      'specify `partition_cols`.')
    ctx[out.key] = pd.DataFrame()
async def normalization() -> None:
    """
    Consume records forever: wait for the next Record on the queue, transform
    its payload, append it to the parquet dataset, then checkpoint the
    record's offset.

    This function runs as a separate task in the asyncio event loop

    :return: None
    """
    while True:
        record: Record = await app.data_queue.get()

        raw_frame: DataFrame = pandas.DataFrame.from_records(record.payload)
        transformed: DataFrame = pandas_transform(df=raw_frame)

        dataset_root = str(app.parquet_path.joinpath('restaurant_inspections'))
        parquet.write_to_dataset(
            table=pyarrow.Table.from_pandas(transformed),
            root_path=dataset_root,
            compression='snappy',
            partition_cols=['geohash', 'year', 'month', 'day'])

        # Checkpoint only after the write call has returned.
        await set_current_offset(offset=record.checkpoint_offset)
def run_partition_test(input_file: str, output_dir: str, filters: Optional[list] = None):
    """Partition a parquet file by ``start_year`` and read it back filtered.

    Prints the input's metadata and schema, writes a timestamp-suffixed
    partitioned copy under ``output_dir`` plus a ``_common_metadata`` file,
    then reads the copy back with fixed filters and prints it.
    """
    stamp_ms = int(time() * 1000)

    print('Parquet metadata: ' + str(pq.read_metadata(input_file)))
    print('Parquet schema: ' + pq.read_schema(input_file).to_string())
    data = pq.read_table(source=input_file, filters=filters)

    # Write a dataset and collect metadata information of all written files
    metadata_collector = []
    root_path = output_dir + 'partitioned_' + str(stamp_ms)
    pq.write_to_dataset(data,
                        root_path=root_path,
                        partition_cols=['start_year'],
                        metadata_collector=metadata_collector)

    # Write the ``_common_metadata`` parquet file without row groups
    # statistics
    pq.write_metadata(data.schema, root_path + '/_common_metadata')

    # Writing the ``_metadata`` file with row group statistics of all files
    # is disabled because it gives the following error:
    #   File "pyarrow/_parquet.pyx", line 616, in
    #   pyarrow._parquet.FileMetaData.append_row_groups
    #   RuntimeError: AppendRowGroups requires equal schemas.
    # data.schema has one more column than partitioned files when
    # partitioning by one column.
    # Related? https://github.com/dask/dask/issues/6243
    # pq.write_metadata(data.schema, root_path + '/_metadata',
    #                   metadata_collector=metadata_collector)

    # Read from partitioned dataset using the new generic Dataset API
    start_year = 2018
    value = 50000
    read_filters = [('start_year', '>=', start_year), ('value', '>', value)]
    # alternative: filters=[('start_year', '>=', start_year)]
    table = pq.read_table(root_path, filters=read_filters)
    print(table.to_pandas())
def _write_partition_pyarrow(
    df, path, fs, filename, write_index, partition_on, metadata_path=None, **kwargs
):
    """Write one pandas partition with pyarrow.

    With ``partition_on`` the table is appended to the dataset at ``path``;
    otherwise it is written to ``filename`` opened through ``fs``. When
    ``metadata_path`` is given, the table schema is also written there.
    """
    import pyarrow as pa
    from pyarrow import parquet

    table = pa.Table.from_pandas(df, preserve_index=write_index)

    if not partition_on:
        with fs.open(filename, "wb") as sink:
            parquet.write_table(table, sink, **kwargs)
    else:
        parquet.write_to_dataset(
            table,
            path,
            partition_cols=partition_on,
            preserve_index=write_index,
            filesystem=fs,
            **kwargs
        )

    if metadata_path is None:
        return

    with fs.open(metadata_path, "wb") as sink:
        # Get only arguments specified in the function
        kwargs_meta = {}
        for key, value in kwargs.items():
            if key in _pyarrow_write_metadata_kwargs:
                kwargs_meta[key] = value
        parquet.write_metadata(table.schema, sink, **kwargs_meta)
def _test_write_to_dataset_with_partitions(base_path, filesystem=None):
    """Round-trip a frame through a dataset partitioned on two columns."""
    # ARROW-1400
    import pyarrow.parquet as pq

    partition_by = ['group1', 'group2']
    output_df = pd.DataFrame({
        'group1': list('aaabbbbccc'),
        'group2': list('eefeffgeee'),
        'num': list(range(10)),
        'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]')
    })
    cols = output_df.columns.tolist()

    pq.write_to_dataset(pa.Table.from_pandas(output_df), base_path,
                        partition_by, filesystem=filesystem)

    # Read data back in and compare with original DataFrame
    dataset = pq.ParquetDataset(base_path, filesystem=filesystem)
    input_df = dataset.read().to_pandas()

    # Partitioned columns added to the end of the DataFrame when read
    trailing_cols = input_df.columns.tolist()[-1 * len(partition_by):]
    assert partition_by == trailing_cols

    # Partitioned columns become 'categorical' dtypes
    input_df = input_df[cols]
    for col in partition_by:
        output_df[col] = output_df[col].astype('category')
    assert output_df.equals(input_df)
def download_update_one_from_yahoo(product,
                                   name,
                                   start_date=None,
                                   end_date=None):
    """Download new Yahoo data for one name and append it to the raw dataset.

    Args:
        product: Product label stored in the ``product`` column.
        name: Instrument name to download; stored in the ``name`` column.
        start_date: First date to fetch. When None it is taken from
            ``get_missing_dates(product, name)``.
        end_date: Last date to fetch, forwarded to ``get_data_yahoo``.

    Returns None in all cases; returns early (without downloading) when
    ``start_date`` is not older than yesterday.

    Raises:
        Exception: when ``get_missing_dates`` reports missing back dates.
    """
    # example tan, if start_date is None use last avail date in the raw data
    output_dirname = download_update_target()

    # Stays None when the caller supplies start_date; previously the name was
    # unbound in that case and the check below raised UnboundLocalError.
    missing_back_dates = None
    if start_date is None:
        missing_back_dates, start_date = get_missing_dates(product, name)

    # TODO: worrying about exact timing
    if start_date >= (datetime.date.today() - datetime.timedelta(days=1)):
        print("{} {} has data up to but not including {}".format(
            product, name, start_date))
        return

    df = get_data_yahoo([name], start=start_date, end=end_date)
    if missing_back_dates:
        raise Exception('nip')

    print('writing {}'.format(output_dirname))
    df['product'] = product  # TODO: better ways to do this but this is safer
    df['name'] = name  # TODO: better ways to do this but this is safer
    df = df.reset_index()
    df.columns = [x.lower() for x in df.columns]
    table = pa.Table.from_pandas(df, preserve_index=False)
    # this CAN lead to duplicate entries so deduping is necessary, idea is
    # that this is basically append-only like
    pq.write_to_dataset(table,
                        root_path=output_dirname,
                        partition_cols=['product', 'name'],
                        preserve_index=False)
def save_training_data(dataframe, path):
    """Write ``dataframe`` to ``path`` on S3 as a parquet dataset.

    The frame is converted to a pyarrow table and written through a fresh
    ``S3FileSystem``; progress is printed before and after.
    """
    filesystem = S3FileSystem()
    arrow_table = pa.Table.from_pandas(dataframe)

    print(f"Saving for machine learning team on {path}")
    pq.write_to_dataset(arrow_table, root_path=path, filesystem=filesystem)
    print("OK")
def _flush(self) -> None:
    """
    Intermediate data flush that's triggered during the buffering operation.
    Uploads data stored in memory to the S3.

    Each buffered table is written as a three-column parquet dataset under
    ``<s3_bucket>/airbyte_output/<unique_dir>/<table>``. Afterwards all
    buffered table names are recorded as updated, the buffer is cleared and
    the value counter is reset.
    """
    for table, data in self._buffer.items():
        if not data:
            # zip(*[]) yields nothing, so the three-way unpack below would
            # raise ValueError; there is nothing to upload for this table.
            continue
        key_list, ts_list, payload = zip(*data)
        upload_data = [
            pa.array(key_list),
            pa.array(ts_list),
            pa.array(payload)
        ]
        pa_table = pa.table(upload_data,
                            names=[
                                "_airbyte_ab_id", "_airbyte_emitted_at",
                                "_airbyte_data"
                            ])
        pq.write_to_dataset(
            table=pa_table,
            root_path=
            f"{self.s3_bucket}/airbyte_output/{self.unique_dir}/{table}",
            filesystem=self.fs)

    # Update tables
    self._updated_tables.update(self._buffer.keys())
    self._buffer.clear()
    self._values = 0
def convert_bin_to_parquet(static_path: str) -> None:
    """
    Converts the footprint data from a binary file to a parquet dataset.

    Every event is appended to ``<static_path>/footprint.parquet``
    partitioned by ``event_id``, and the footprint's bin count and
    uncertainty flag are saved to ``footprint_parquet_meta.json``.

    Args:
        static_path: (str) the path to the static file

    Returns: None
    """
    skipped_types = {'z', 'csv', 'parquet'}
    with Footprint.load(static_path=static_path,
                        ignore_file_type=skipped_types) as footprint_obj:
        index_data = footprint_obj.footprint_index

        # bool() keeps the flag a plain Python bool for json.dump
        has_uncertainty = bool(footprint_obj.has_intensity_uncertainty == 1)
        meta_data = {
            "num_intensity_bins": footprint_obj.num_intensity_bins,
            "has_intensity_uncertainty": has_uncertainty
        }

        dataset_root = f'{static_path}/footprint.parquet'
        for event_id in index_data.keys():
            df = pd.DataFrame(footprint_obj.get_event(event_id))
            df["event_id"] = event_id
            pq.write_to_dataset(
                pa.Table.from_pandas(df),
                root_path=dataset_root,
                partition_cols=['event_id'],
                compression="BROTLI"
            )

        meta_path = f'{static_path}/footprint_parquet_meta.json'
        with open(meta_path, 'w') as outfile:
            json.dump(meta_data, outfile)
def _send_to_s3(self, force=False):
    """Copy in-memory batches to s3.

    Tables holding at most ``CACHE_SIZE`` batches are skipped unless
    ``force`` is set. The site-visits index goes out as newline-joined JSON,
    all other tables as a parquet dataset partitioned by ``instance_id``.
    Each flushed table's batch list is then replaced by an empty list.
    """
    for table_name, batches in self._batches.items():
        below_threshold = len(batches) <= CACHE_SIZE
        if below_threshold and not force:
            continue

        is_site_index = table_name == SITE_VISITS_INDEX
        if is_site_index:
            lines = [json.dumps(entry) for entry in batches]
            out_str = '\n'.join(lines)
            if not isinstance(out_str, six.binary_type):
                out_str = out_str.encode('utf-8')
            fname = '%s/site_index/instance-%s-%s.json.gz' % (
                self.dir,
                self._instance_id,
                hashlib.md5(out_str).hexdigest(),
            )
            self._write_str_to_s3(out_str, fname)
        else:
            destination_template = self._s3_bucket_uri
            try:
                table = pa.Table.from_batches(batches)
                pq.write_to_dataset(
                    table,
                    destination_template % table_name,
                    filesystem=self._fs,
                    preserve_index=False,
                    partition_cols=['instance_id'],
                    compression='snappy',
                    flavor='spark',
                )
            except pa.lib.ArrowInvalid as e:
                # Logged and dropped: the batches are still cleared below.
                message = "Error while sending record:\n%s\n%s\n%s\n" % (
                    table_name, type(e), e)
                self.logger.error(message)

        self._batches[table_name] = list()
def write_to_s3_parquet(s3_: Union[s3fs.S3FileSystem, None],
                        df: pd.DataFrame = None,
                        path: str = None,
                        partition_cols: List[str] = None):
    """Write ``df`` to ``path`` as a parquet dataset without index columns.

    The frame is converted to a pyarrow table, any column whose name
    contains ``__index_level_`` is dropped, and the table is written through
    the ``s3_`` filesystem, optionally partitioned by ``partition_cols``.
    """
    assert df is not None

    table = pyarrow.Table.from_pandas(df)
    index_columns = []
    for column_name in table.column_names:
        if '__index_level_' in column_name:
            index_columns.append(column_name)
    table = table.drop(index_columns)

    print('Writing parquet to {}'.format(path))
    pq.write_to_dataset(table=table,
                        root_path=path,
                        filesystem=s3_,
                        partition_cols=partition_cols,
                        preserve_index=False)
def lambda_handler(event, context):
    """Convert each CSV object named in an S3 event to a parquet dataset.

    For every entry in ``event['Records']`` the object key is read from
    ``BUCKET_NAME`` as CSV and appended to the dataset at
    ``OUTPUT_OBJECT_PATH``, partitioned by ``PARTITION_COLUMNS``.

    Args:
        event: Event dict with a ``Records`` list; each record provides
            ``record['s3']['object']['key']``.
        context: Lambda context object (unused).
    """
    # One client serves every record; building it per record was redundant.
    s3 = boto3.client('s3', region_name=BUCKET_REGION)

    for record in event['Records']:
        input_file_name = record['s3']['object']['key'].replace(
            INPUT_OBJECT_PATH, '')
        output_file_name = input_file_name.replace('.csv', '.parquet')

        # Getting data from the bucket
        print("Reading data from the following path:")
        print(INPUT_OBJECT_PATH + input_file_name)
        bucket_object = s3.get_object(
            Bucket=BUCKET_NAME, Key=INPUT_OBJECT_PATH + input_file_name)
        input_body = pd.read_csv(bucket_object['Body'])

        print("Creating parquet file in the following path: ")
        print(OUTPUT_OBJECT_PATH)

        # Transforming to parquet
        data_table = pyar.Table.from_pandas(input_body)
        pypa.write_to_dataset(table=data_table,
                              root_path=OUTPUT_OBJECT_PATH,
                              partition_cols=PARTITION_COLUMNS,
                              filesystem=S3_FILE_SYSTEM)
        print("Transformation from CSV to parquet finished successfully.")
        # NOTE(review): write_to_dataset is only given the root path, so
        # this printed name may not match the file actually created --
        # confirm.
        print("New parquet file created in " + OUTPUT_OBJECT_PATH +
              output_file_name)
def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset):
    """Check the pandas index survives dataset and single-file round trips."""
    # ARROW-8251 - preserve pandas index in roundtrip
    df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]})
    df.index = pd.Index(['a', 'b', 'c'], name="idx")
    table = pa.table(df)

    # Expected result for the partitioned case: partition column last and
    # categorical.
    df_cat = df[["col", "part"]].copy()
    df_cat["part"] = df_cat["part"].astype("category")

    def read_back(location):
        loaded = pq.read_table(
            location, use_legacy_dataset=use_legacy_dataset
        )
        return loaded.to_pandas()

    case1 = str(tempdir / "case1")
    pq.write_to_dataset(
        table, case1, partition_cols=['part'],
        use_legacy_dataset=use_legacy_dataset
    )
    tm.assert_frame_equal(read_back(case1), df_cat)

    case2 = str(tempdir / "case2")
    pq.write_to_dataset(
        table, case2, use_legacy_dataset=use_legacy_dataset
    )
    tm.assert_frame_equal(read_back(case2), df)

    single_file = str(tempdir / "data.parquet")
    pq.write_table(table, single_file)
    tm.assert_frame_equal(read_back(single_file), df)
def _test_write_to_dataset_no_partitions(base_path, filesystem=None):
    """Check repeated unpartitioned writes append one file per call."""
    # ARROW-1400
    import pyarrow.parquet as pq

    frame_data = {
        'group1': list('aaabbbbccc'),
        'group2': list('eefeffgeee'),
        'num': list(range(10)),
        'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]'),
    }
    output_df = pd.DataFrame(frame_data)
    cols = output_df.columns.tolist()
    output_table = pa.Table.from_pandas(output_df)

    if filesystem is None:
        filesystem = LocalFileSystem.get_instance()

    # Without partitions, append files to root_path
    n = 5
    for _ in range(n):
        pq.write_to_dataset(output_table, base_path, filesystem=filesystem)

    output_files = []
    for file in filesystem.ls(base_path):
        if file.endswith(".parquet"):
            output_files.append(file)
    assert len(output_files) == n

    # Deduplicated incoming DataFrame should match
    # original outgoing Dataframe
    dataset = pq.ParquetDataset(base_path, filesystem=filesystem)
    input_df = dataset.read().to_pandas().drop_duplicates()
    input_df = input_df[cols]
    assert output_df.equals(input_df)
def test_write_to_dataset_pandas_preserve_extensiondtypes(
    tempdir, use_legacy_dataset
):
    """Check an ``Int64`` extension column survives parquet round trips."""
    # ARROW-8251 - preserve pandas extension dtypes in roundtrip
    if Version(pd.__version__) < Version("1.0.0"):
        pytest.skip("__arrow_array__ added to pandas in 1.0.0")

    df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
    df['col'] = df['col'].astype("Int64")
    table = pa.table(df)
    expected = df[["col"]]

    def check_roundtrip(location):
        result = pq.read_table(
            location, use_legacy_dataset=use_legacy_dataset
        ).to_pandas()
        tm.assert_frame_equal(result[["col"]], expected)

    case1 = str(tempdir / "case1")
    pq.write_to_dataset(
        table, case1, partition_cols=['part'],
        use_legacy_dataset=use_legacy_dataset
    )
    check_roundtrip(case1)

    case2 = str(tempdir / "case2")
    pq.write_to_dataset(
        table, case2, use_legacy_dataset=use_legacy_dataset
    )
    check_roundtrip(case2)

    single_file = str(tempdir / "data.parquet")
    pq.write_table(table, single_file)
    check_roundtrip(single_file)
def _test_write_to_dataset_with_partitions(base_path, filesystem=None):
    """Write a frame partitioned by two columns and verify the read-back."""
    # ARROW-1400
    import pyarrow.parquet as pq

    frame_data = {
        'group1': list('aaabbbbccc'),
        'group2': list('eefeffgeee'),
        'num': list(range(10)),
        'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]'),
    }
    output_df = pd.DataFrame(frame_data)
    cols = output_df.columns.tolist()
    partition_by = ['group1', 'group2']

    output_table = pa.Table.from_pandas(output_df)
    pq.write_to_dataset(output_table, base_path, partition_by,
                        filesystem=filesystem)

    # Read data back in and compare with original DataFrame
    input_table = pq.ParquetDataset(base_path, filesystem=filesystem).read()
    input_df = input_table.to_pandas()

    # Partitioned columns added to the end of the DataFrame when read
    n_partition_cols = len(partition_by)
    input_df_cols = input_df.columns.tolist()
    assert partition_by == input_df_cols[-1 * n_partition_cols:]

    # Partitioned columns become 'categorical' dtypes
    input_df = input_df[cols]
    for col in partition_by:
        as_category = output_df[col].astype('category')
        output_df[col] = as_category
    assert output_df.equals(input_df)
def _make_parquet_file(row_size=SMALL_ROW_SIZE,
                       force=False,
                       directory=False,
                       partitioned_columns=[]):
    """Generate the test parquet file/directory at TEST_PARQUET_FILENAME.

    Args:
        row_size: Number of rows for the dataframe.
        force: Create a new file/directory even if one already exists.
        directory: Create a partitioned directory using pyarrow.
        partitioned_columns: Create a partitioned directory using pandas.
            Will be ignored if directory=True.
    """
    values = np.arange(row_size)
    df = pandas.DataFrame({"col1": values, "col2": np.arange(row_size)})

    # Keep an existing artefact unless the caller forces regeneration.
    if os.path.exists(TEST_PARQUET_FILENAME) and not force:
        return

    if directory:
        if not os.path.exists(TEST_PARQUET_FILENAME):
            os.mkdir(TEST_PARQUET_FILENAME)
        else:
            shutil.rmtree(TEST_PARQUET_FILENAME)
        pq.write_to_dataset(pa.Table.from_pandas(df),
                            root_path=TEST_PARQUET_FILENAME)
        return

    if len(partitioned_columns) > 0:
        df.to_parquet(TEST_PARQUET_FILENAME,
                      partition_cols=partitioned_columns)
    else:
        df.to_parquet(TEST_PARQUET_FILENAME)
def test_parquet_row_group_fragments(tempdir):
    """Check fragments can be rebuilt from a path, with or without row groups."""
    import pyarrow as pa
    import pyarrow.dataset as ds
    import pyarrow.parquet as pq

    table = pa.table({'a': ['a', 'a', 'b', 'b'], 'b': [1, 2, 3, 4]})

    # write_to_dataset currently requires pandas
    pq.write_to_dataset(table,
                        str(tempdir / "test_parquet_dataset"),
                        partition_cols=["a"])

    dataset = ds.dataset(str(tempdir / "test_parquet_dataset/"),
                         format="parquet",
                         partitioning="hive")

    first = list(dataset.get_fragments())[0]
    parquet_format = first.format

    # Once for the whole file, once restricted to row group 1.
    parquet_format.make_fragment(
        first.path,
        first.filesystem,
        partition_expression=first.partition_expression)
    parquet_format.make_fragment(
        first.path,
        first.filesystem,
        partition_expression=first.partition_expression,
        row_groups={1})
def write_df_to_parquet_to_s3(df: pd.DataFrame,
                              filename: str,
                              s3_bucketname: str,
                              s3_bucketkey=None):
    """Write ``df`` to a local parquet file and to a dataset on S3.

    The S3 dataset root is ``s3://<bucket>/[<key>/]<filename>``.

    Args:
        df: Frame to write.
        filename: Local file name; must end in ``.parquet``.
        s3_bucketname: Bucket name without the ``s3://`` prefix.
        s3_bucketkey: Optional key prefix placed before ``filename``.
    """
    # TODO: Need to figure out how to modify this file so it doesn't write
    # the parquet file into the current working directory and then
    # subsequently upload to S3. We want it to just upload directly to S3
    # (w/o having to write it to the current working directory)
    assert 's3://' not in s3_bucketname, 'prefix "s3://" not required'
    assert filename[-8:] == '.parquet', 'filename must have suffix ".parquet"'

    if 's3://' not in s3_bucketname:
        s3_bucketname = 's3://' + s3_bucketname

    table = pa.Table.from_pandas(df)
    pq.write_table(table, filename)

    key_to_use = (
        filename if s3_bucketkey is None else s3_bucketkey + '/' + filename
    )
    outputfile = s3_bucketname + '/' + key_to_use

    s3 = S3FileSystem()
    pq.write_to_dataset(table=table, root_path=outputfile, filesystem=s3)
def setup(self, num_partitions, num_threads):
    """Create a temporary dataset partitioned on ``num1`` for the benchmark.

    Two random integer columns of ``self.size`` rows are generated: ``num1``
    drawn from ``range(0, num_partitions)`` and ``num2`` from
    ``range(0, 1000)``.

    Raises:
        NotImplementedError: when ``pq`` is None.
    """
    if pq is None:
        raise NotImplementedError

    self.tmpdir = tempfile.mkdtemp('benchmark_parquet')

    partition_ids = range(0, num_partitions)
    value_range = range(0, 1000)
    num1 = [random.choice(partition_ids) for _ in range(self.size)]
    num2 = [random.choice(value_range) for _ in range(self.size)]

    frame = pd.DataFrame({'num1': num1, 'num2': num2})
    pq.write_to_dataset(pa.Table.from_pandas(frame), self.tmpdir, ['num1'])
def setup(self, num_partitions, num_threads):
    """Create a temporary dataset partitioned on ``num1`` for the benchmark.

    Uses a ``RandomState`` seeded with 42, so the generated ``num1`` and
    ``num2`` columns (``self.size`` rows each) are reproducible.

    Raises:
        NotImplementedError: when ``pq`` is None.
    """
    if pq is None:
        raise NotImplementedError("Parquet support not enabled")

    self.tmpdir = tempfile.mkdtemp('benchmark_parquet')

    rng = np.random.RandomState(42)
    columns = {
        'num1': rng.randint(0, num_partitions, size=self.size),
        'num2': rng.randint(0, 1000, size=self.size),
    }
    table = pa.Table.from_pandas(pd.DataFrame(columns))
    pq.write_to_dataset(table, self.tmpdir, ['num1'])
def write_parquet_table_as_partitioned_dataset(parquet_file) -> pq.ParquetDataset:
    """Write a parquet table as a partitioned dataset (multiple Parquet files).

    The file is read back as a table and written under ``starships``,
    partitioned by ``year``, ``month`` and ``day``; the resulting dataset is
    returned.

    An example of a dataset partitioned by year and month on disk might
    look like:

        dataset_name/
            year=2018/
                month=09/
                    0.parq
                    1.parq
                month=10/
                    0.parq
                    1.parq
    """
    root = 'starships'
    # Read back Parquet File as a Table
    source_table = pq.read_table(parquet_file)
    # An earlier variant partitioned on partition_cols=['created'] instead.
    pq.write_to_dataset(source_table,
                        root_path=root,
                        partition_cols=['year', 'month', 'day'],
                        flavor='spark')
    return pq.ParquetDataset('starships')
def _write_partition_pyarrow(df, open_with, path, fs, filename, write_index,
                             partition_on, metadata_path=None, **kwargs):
    """Write one pandas partition with pyarrow.

    With ``partition_on`` the table is appended to the dataset at ``path``
    (``kwargs`` are not forwarded on that path); otherwise it is written to
    ``filename`` opened via ``open_with``. When ``metadata_path`` is given,
    the table schema is written there too.
    """
    import pyarrow as pa
    from pyarrow import parquet

    table = pa.Table.from_pandas(df, preserve_index=write_index)

    if not partition_on:
        with open_with(filename, 'wb') as sink:
            parquet.write_table(table, sink, **kwargs)
    else:
        parquet.write_to_dataset(table, path, partition_cols=partition_on,
                                 filesystem=fs)

    if metadata_path is None:
        return

    with open_with(metadata_path, 'wb') as sink:
        # Get only arguments specified in the function
        kwargs_meta = dict(
            (key, value) for key, value in kwargs.items()
            if key in _pyarrow_write_metadata_kwargs
        )
        parquet.write_metadata(table.schema, sink, **kwargs_meta)
def run_raw(nrows=None, force=False):
    """Parse the raw ETF and stock CSV files and save them as parquet.

    Args:
        nrows: Optional cap on rows read per file; also selects the output
            path via ``raw_target``.
        force: When True an existing output directory is deleted and
            rebuilt; otherwise an existing output short-circuits the run.

    The output dataset is partitioned by ``product`` and ``name``.
    """
    # not a lot of data here ... do it in memory, forget about parallelism
    outfile = raw_target(nrows=nrows)
    if force and os.path.exists(outfile):
        shutil.rmtree(outfile)
    if os.path.exists(outfile):
        print('{} exists. force=True to rerun'.format(outfile))
        return

    for k in ['ETFs', 'Stocks']:
        data = list()
        product = k.lower()
        filenames = raw_source(k)
        t = time.time()
        for i, filename in enumerate(filenames, start=1):
            name, market = os.path.basename(filename).split('.')[:2]
            lc = mylib.io.get_capped_line_count(filename)
            if lc >= 2:
                df = pd.read_csv(filename, nrows=nrows, encoding='utf-8')
                df['product'] = product
                df['market'] = market
                df['name'] = name
                df['Date'] = pd.to_datetime(df.Date)
                data.append(df)
            else:
                print('skipping {}'.format(filename))
            if i % 10 == 0:
                print('{} of {} eta {} seconds for {}'.format(
                    i, len(filenames),
                    (time.time() - t) * (len(filenames) - i) / i, product))

        if not data:
            # pd.concat raises ValueError on an empty list; with no usable
            # files there is nothing to write for this product.
            print('no data for {}'.format(product))
            continue

        df = pd.concat(data, axis=0)
        df = df.sort_values(['name', 'Date'])
        # TODO: add market back later if relevant
        df = df.drop(['market', 'OpenInt'], axis=1)
        df.columns = [x.lower() for x in df.columns]
        print('writing {}'.format(outfile))
        # plain to_parquet seems to be always writing the index, also
        # partition_cols not in pandas yet
        table = pa.Table.from_pandas(df, preserve_index=False)
        # partitioning by name is less efficient storage wise but makes for
        # better joins in the next step
        pq.write_to_dataset(table,
                            root_path=outfile,
                            partition_cols=['product', 'name'],
                            preserve_index=False)
def download_update_one_from_yahoo(product, name, start_date=None, end_date=None):
    """Fetch new Yahoo data for one name and append it to the raw dataset.

    Args:
        product: Product label stored in the ``product`` column.
        name: Instrument name to download; stored in the ``name`` column.
        start_date: First date to fetch. When None it comes from
            ``get_missing_dates(product, name)``.
        end_date: Last date to fetch, forwarded to ``get_data_yahoo``.

    Returns None in all cases; nothing is downloaded when ``start_date`` is
    not older than yesterday.

    Raises:
        Exception: when ``get_missing_dates`` reports missing back dates.
    """
    # example tan, if start_date is None use last avail date in the raw data
    output_dirname = download_update_target()

    # Default to None so an explicit start_date no longer leaves this name
    # unbound for the check after the download.
    missing_back_dates = None
    if start_date is None:
        missing_back_dates, start_date = get_missing_dates(product, name)

    # TODO: worrying about exact timing
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    if start_date >= yesterday:
        print("{} {} has data up to but not including {}".format(product, name, start_date))
        return

    df = get_data_yahoo([name], start=start_date, end=end_date)
    if missing_back_dates:
        raise Exception('nip')

    print('writing {}'.format(output_dirname))
    df['product'] = product  # TODO: better ways to do this but this is safer
    df['name'] = name  # TODO: better ways to do this but this is safer
    df = df.reset_index()
    df.columns = [x.lower() for x in df.columns]
    table = pa.Table.from_pandas(df, preserve_index=False)
    # this CAN lead to duplicate entries so deduping is necessary, idea is
    # that this is basically append-only like
    pq.write_to_dataset(table, root_path=output_dirname,
                        partition_cols=['product', 'name'],
                        preserve_index=False)
def one_off_update(product='etfs'):
    """Fetch the fixed 2010-01-01..2018-11-29 Yahoo history for a product.

    The names come from ``_meta[product]``. The result is written under
    ``raw/yahoo/<start>_to_<end>`` partitioned by ``product`` and ``name``
    and returned. When that product's partition directory already exists the
    function prints a notice and returns None without downloading.
    """
    start = datetime.date(2010, 1, 1)
    end = datetime.date(2018, 11, 29)
    names = _meta[product]

    base = os.path.join('raw/yahoo/')
    base_missing = not os.path.exists(base)
    if base_missing:
        os.makedirs(base)

    # filenames = glob.glob(os.path.join(base, '*.parquet'))
    # TODO: check dates on existing for the update
    range_label = '{}_to_{}'.format(start, end)
    filename = os.path.join(base, range_label)
    product_dir = 'product={}'.format(product)
    filename_check = os.path.join(filename, product_dir)  # this is terrible
    if os.path.exists(filename_check):
        print("{} exists".format(filename_check))
        return

    print('getting {} names'.format(len(names)))
    df = get_data_yahoo(names, start, end)
    df['product'] = product
    table = pa.Table.from_pandas(df, preserve_index=False)
    # partitioning by name is less efficient storage wise but makes for
    # better joins in the next step
    write_options = dict(root_path=filename,
                         partition_cols=['product', 'name'],
                         preserve_index=False)
    pq.write_to_dataset(table, **write_options)
    return df
def read_transform_write(infile, outfile):
    """Read a parquet file, enrich it in place and write it as a dataset.

    The frame read from ``infile`` is passed to ``enrich_pandas_single`` with
    ``inplace=True`` and written, without its index, to the dataset rooted at
    ``outfile``.
    """
    print('{} -> {}'.format(infile, outfile))

    frame = pd.read_parquet(infile)
    enrich_pandas_single(frame, inplace=True)

    pq.write_to_dataset(
        pa.Table.from_pandas(frame, preserve_index=False),
        root_path=outfile,
        preserve_index=False,
    )