def test_file_path_e2e(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    expect = pdx.read_avro(tf.name)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    assert_frame_equal(expect, dataframe)
def write(self, df: pd.DataFrame, schema_key: str, file_key: str) -> None:
    schema = self.__parse_schema(path=schema_key)
    pandavro.to_avro(file_key, df, schema=schema, append=False, codec='snappy')
def test_delegation(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    expect = pdx.from_avro(tf.name)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    assert_frame_equal(expect, dataframe)
def test_buffer_e2e(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    with open(tf.name, 'rb') as f:
        expect = pdx.from_avro(BytesIO(f.read()))
    assert_frame_equal(expect, dataframe)
def test_get_batch_features_with_file(client): file_fs1 = client.get_feature_set(name="file_feature_set", version=1) N_ROWS = 10 time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) features_1_df = pd.DataFrame( { "datetime": [time_offset] * N_ROWS, "entity_id": [i for i in range(N_ROWS)], "feature_value1": [f"{i}" for i in range(N_ROWS)], } ) client.ingest(file_fs1, features_1_df, timeout=480) # Rename column (datetime -> event_timestamp) features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"}) to_avro(df=features_1_df[["event_timestamp", "entity_id"]], file_path_or_buffer="file_feature_set.avro") time.sleep(15) feature_retrieval_job = client.get_batch_features( entity_rows="file://file_feature_set.avro", feature_refs=[f"{PROJECT_NAME}/feature_value1:1"] ) output = feature_retrieval_job.to_dataframe() print(output.head()) assert output["entity_id"].to_list() == [int(i) for i in output["feature_value1"].to_list()]
def run(self) -> bool:
    """
    Runs the command

    Returns:
        False on failure
    """
    # load schema
    schema = self.get_schema()

    # load sql file
    sql_query = self.get_sql_query()

    # query data frame from db
    logger.log('Read data from SQL', format=logger.Format.ITALICS)
    df = read_dataframe(self.db_alias, sql_query)

    # write avro file
    avro_file_path = f"{pathlib.Path(config.data_dir()) / self.file_name}"
    logger.log(f'Write to AVRO file {avro_file_path}', format=logger.Format.ITALICS)
    pdx.to_avro(avro_file_path, df, schema=schema)

    return True
def _save_avro(df: LocalDataFrame, p: FileParser, **kwargs: Any):
    """Save pandas dataframe as avro.
    If providing your own schema, using the ``schema`` argument is preferred.

    :param schema: Avro schema that determines the dtypes saved
    """
    import pandavro as pdx

    kw = ParamDict(kwargs)

    # pandavro defaults
    schema = None
    append = False
    times_as_micros = True

    if "schema" in kw:
        schema = kw["schema"]
        del kw["schema"]

    if "append" in kw:
        # default is overwrite (False) instead of append (True)
        append = kw["append"]
        del kw["append"]

    if "times_as_micros" in kw:
        times_as_micros = kw["times_as_micros"]
        del kw["times_as_micros"]

    pdf = df.as_pandas()
    pdx.to_avro(
        p.uri,
        pdf,
        schema=schema,
        append=append,
        times_as_micros=times_as_micros,
        **kw,
    )
def write_with_compression(df, compression):
    start = timer()
    pdx.to_avro(OUTPUT_FILE_PATH, df, codec=compression)
    end = timer()
    print('Time to write avro with {} compression: {} seconds'.format(
        compression, end - start))
    print('Resulting size: {}'.format(
        util.get_readable_file_size(OUTPUT_FILE_PATH)))
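A minimal driver sketch for the benchmark above, not part of the original module: it assumes `df`, `write_with_compression`, and `OUTPUT_FILE_PATH` are defined as in the surrounding code. pandavro passes `codec` through to fastavro, which supports at least the 'null', 'deflate', and 'snappy' codecs ('snappy' requires the python-snappy package).

# Illustrative sketch: time each supported codec against the same frame.
# Assumes `df` and write_with_compression() exist as defined above.
for codec in ('null', 'deflate', 'snappy'):
    write_with_compression(df, codec)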
def test_buffer_e2e(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    with open(tf.name, 'rb') as f:
        expect = pdx.read_avro(BytesIO(f.read()))
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    assert_frame_equal(expect, dataframe)
def test_append(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe[0:int(dataframe.shape[0] / 2)])
    pdx.to_avro(tf.name, dataframe[int(dataframe.shape[0] / 2):], append=True)
    expect = pdx.from_avro(tf.name)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    assert_frame_equal(expect, dataframe)
def test_batch_get_historical_features_with_file(client):
    file_fs1 = client.get_feature_set(name="file_feature_set")

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    features_1_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "entity_id": [i for i in range(N_ROWS)],
            "feature_value1": [f"{i}" for i in range(N_ROWS)],
        }
    )

    # The feature set may already be READY (the direct runner marks it ready right
    # after the job is submitted), but the Kafka consumer may not be configured yet,
    # so give the ingestion job some time to warm up.
    wait_retry_backoff(
        retry_fn=(
            lambda: (
                None,
                client.get_feature_set(name="file_feature_set").status
                == FeatureSetStatus.STATUS_READY,
            )
        ),
        timeout_secs=480,
        timeout_msg="Wait for FeatureSet to be READY",
    )
    time.sleep(20)

    client.ingest(file_fs1, features_1_df, timeout=480)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"})

    to_avro(
        df=features_1_df[["event_timestamp", "entity_id"]],
        file_path_or_buffer="file_feature_set.avro",
    )

    time.sleep(10)

    def check():
        feature_retrieval_job = client.get_historical_features(
            entity_rows="file://file_feature_set.avro",
            feature_refs=["feature_value1"],
            project=PROJECT_NAME,
        )

        output = feature_retrieval_job.to_dataframe(timeout_sec=180)
        print(output.head())

        assert output["entity_id"].to_list() == [
            int(i) for i in output["feature_value1"].to_list()
        ]
        clean_up_remote_files(feature_retrieval_job.get_avro_files())

    wait_for(check, timedelta(minutes=10))
def save(self, save_path: str) -> None:
    save_file = self.setup_save_file(save_path=save_path, extension="avro")
    structured_data = self._get_structured_data()
    dataframe = pandas.DataFrame.from_dict(structured_data)
    pandavro.to_avro(save_file, dataframe)
    self._log_save(save_file)
def serialize_panda_df_to_str(df: pd.DataFrame, schema: Dict) -> str:
    with io.BytesIO() as bytes_io:
        # else we get: ValueError: NaTType does not support timestamp
        # it's really a pandavro issue, see https://github.com/fastavro/fastavro/issues/313
        # TODO(talebz): Create a Pandavro issue for this!
        df = df.replace({np.nan: None})
        pandavro.to_avro(bytes_io, df, schema=schema)
        bytes_io.seek(0)
        return base64.b64encode(bytes_io.read()).decode("utf-8")
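A hedged counterpart for reading such a string back, assuming the base64 payload is exactly the Avro container produced above; `deserialize_str_to_panda_df` is a hypothetical helper name, not part of the original module.

import base64
import io

import pandas as pd
import pandavro


def deserialize_str_to_panda_df(payload: str) -> pd.DataFrame:
    # Decode the base64 text back into the raw Avro container bytes.
    raw = base64.b64decode(payload)
    # pandavro.read_avro accepts a file-like object, so wrap the bytes in BytesIO.
    return pandavro.read_avro(io.BytesIO(raw))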
def _save_avro(df: LocalDataFrame, p: FileParser, columns: Any = None, **kwargs: Any):
    """Save pandas dataframe as avro.
    If providing your own schema, using the ``schema`` argument is preferred.
    """
    kw = ParamDict(kwargs)

    # pandavro defaults
    schema = None
    append = False
    times_as_micros = True

    if "schema" in kw:
        schema = kw["schema"]
        if schema is None:
            if columns is not None:
                schema = _convert_pyarrow_to_avro_schema(df, columns)
        else:
            if columns:
                # both schema and columns provided
                raise Exception("set columns to None when schema is provided")
        del kw["schema"]

    if "infer_schema" in kw:
        infer_schema = kw["infer_schema"]
        if infer_schema and (schema is not None):
            # infer_schema set to True but schema was provided
            raise Exception("set infer_schema to False when schema is provided")
        del kw["infer_schema"]

    if "append" in kw:
        # default is overwrite (False) instead of append (True)
        append = kw["append"]
        del kw["append"]

    if "times_as_micros" in kw:
        times_as_micros = kw["times_as_micros"]
        del kw["times_as_micros"]

    pdf = df.as_pandas()
    pdx.to_avro(
        p.uri,
        pdf,
        schema=schema,
        append=append,
        times_as_micros=times_as_micros,
        **kw,
    )
def test_batch_get_historical_features_with_gs_path(client, gcs_path):
    gcs_fs1 = client.get_feature_set(name="gcs_feature_set")

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    features_1_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "entity_id": [i for i in range(N_ROWS)],
            "feature_value2": [f"{i}" for i in range(N_ROWS)],
        }
    )
    client.ingest(gcs_fs1, features_1_df, timeout=360)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"})

    # Output file to local
    file_name = "gcs_feature_set.avro"
    to_avro(
        df=features_1_df[["event_timestamp", "entity_id"]],
        file_path_or_buffer=file_name,
    )

    uri = urlparse(gcs_path)
    bucket = uri.hostname
    ts = int(time.time())
    remote_path = str(uri.path).strip("/") + f"/{ts}/{file_name}"

    # Upload file to gcs
    storage_client = storage.Client(project=None)
    bucket = storage_client.get_bucket(bucket)
    blob = bucket.blob(remote_path)
    blob.upload_from_filename(file_name)

    time.sleep(10)

    def check():
        feature_retrieval_job = client.get_historical_features(
            entity_rows=f"{gcs_path}/{ts}/*",
            feature_refs=["feature_value2"],
            project=PROJECT_NAME,
        )

        output = feature_retrieval_job.to_dataframe(timeout_sec=180)
        print(output.head())

        assert output["entity_id"].to_list() == [
            int(i) for i in output["feature_value2"].to_list()
        ]
        clean_up_remote_files(feature_retrieval_job.get_avro_files())
        blob.delete()

    wait_for(check, timedelta(minutes=5))
def main():
    df = pd.DataFrame({
        "Boolean": [True, False, True, False],
        "Float64": np.random.randn(4),
        "Int64": np.random.randint(0, 10, 4),
        "String": ['foo', 'bar', 'foo', 'bar'],
        "DateTime64": [pd.Timestamp('20190101'), pd.Timestamp('20190102'),
                       pd.Timestamp('20190103'), pd.Timestamp('20190104')],
    })
    pdx.to_avro(OUTPUT_PATH, df)
    saved = pdx.read_avro(OUTPUT_PATH)
    print(saved)
def save_transformed_data(data):
    try:
        print('saving as Parquet')
        data.to_parquet(cs.TRANSFORMED_DATA_PATH_PARQUET)

        print('saving as AVRO')
        data.to_csv('filtered.csv')
        new_data = pd.read_csv('filtered.csv', keep_default_na=False)
        pdx.to_avro(cs.TRANSFORMED_DATA_PATH_AVRO, new_data)

        print('saving as JSON gzip')
        data.to_json(cs.TRANSFORMED_DATA_PATH_JSON, compression='gzip')
    except Exception as error:
        print(error)
def _write_avro(df, tmpfile, times_as_micros=False, *args, **kwargs):
    """
    Saves a DataFrame to Avro format

    Args:
        df (pd.DataFrame): The DataFrame to be written to Avro
        tmpfile (tempfile.NamedTemporaryFile): Connection to the file to be written to
        times_as_micros (bool): Whether to save timestamps as microseconds
            (the pandavro default) or as milliseconds
    """
    pdx.to_avro(tmpfile, df, times_as_micros=times_as_micros, *args, **kwargs)
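A small usage sketch for the wrapper above; the sample frame, its column names, and the temporary file are assumptions for illustration, not part of the original code.

import tempfile

import pandas as pd

# Illustrative only: a tiny frame pushed through the _write_avro wrapper above,
# overriding its milliseconds default to write microsecond timestamps instead.
df = pd.DataFrame({"id": [1, 2], "ts": pd.to_datetime(["2021-01-01", "2021-01-02"])})
with tempfile.NamedTemporaryFile(suffix=".avro") as tmp:
    _write_avro(df, tmp.name, times_as_micros=True)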
def put(self, data: pd.DataFrame, short_description: str) -> DatasetVersion:
    ns: str = self.__namespace or '_'
    ds: str = self.__name

    file: BytesIO = BytesIO()
    pandavro.to_avro(file, data)
    file.seek(0)

    resp = client.put(
        '/datasets/' + ns + '/' + ds + '/versions',
        files={'message': short_description, 'file': file},
    )
    return self.version(resp.json())
def test_get_batch_features_with_gs_path(client, gcs_path):
    gcs_fs1 = FeatureSet(
        "gcs_feature_set",
        features=[Feature("feature_value", ValueType.STRING)],
        entities=[Entity("entity_id", ValueType.INT64)],
        max_age=Duration(seconds=100),
    )
    client.apply(gcs_fs1)
    gcs_fs1 = client.get_feature_set(name="gcs_feature_set", version=1)

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    features_1_df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "entity_id": [i for i in range(N_ROWS)],
        "feature_value": [f"{i}" for i in range(N_ROWS)],
    })
    client.ingest(gcs_fs1, features_1_df)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"})

    # Output file to local
    file_name = "gcs_feature_set.avro"
    to_avro(df=features_1_df, file_path_or_buffer=file_name)

    uri = urlparse(gcs_path)
    bucket = uri.hostname
    ts = int(time.time())
    remote_path = str(uri.path).strip("/") + f"{ts}/{file_name}"

    # Upload file to gcs
    storage_client = storage.Client(project=None)
    bucket = storage_client.get_bucket(bucket)
    blob = bucket.blob(remote_path)
    blob.upload_from_filename(file_name)

    feature_retrieval_job = client.get_batch_features(
        entity_rows=f"{gcs_path}{ts}/*",
        feature_ids=["gcs_feature_set:1:feature_value"],
    )

    output = feature_retrieval_job.to_dataframe()
    print(output.head())

    assert output["entity_id"].to_list() == [
        int(i) for i in output["gcs_feature_set_v1_feature_value"].to_list()
    ]
def converter_csv_to_avro(INPUT_PATH, OUTPUT_PATH, converter_to_datetime):
    df = pd.read_csv(INPUT_PATH)

    # Transform string columns to datetime
    for columns_to_converter in converter_to_datetime:
        df[columns_to_converter] = pd.to_datetime(df[columns_to_converter])

    print(df.info())

    pdx.to_avro(OUTPUT_PATH, df)  # Convert
    saved = pdx.read_avro(OUTPUT_PATH)  # Read back only as a check
    print(saved)
    return
def export_dataframe_to_local(
    df: pd.DataFrame, dir_path: Optional[str] = None
) -> Tuple[str, str, str]:
    """
    Exports a pandas DataFrame to the local filesystem.

    Args:
        df (pd.DataFrame): Pandas DataFrame to save.
        dir_path (Optional[str]): Absolute directory path '/data/project/subfolder/'.

    Returns:
        Tuple[str, str, str]: Tuple of directory path, file name and destination path.
            The destination path can be obtained by concatenating the directory
            path and file name.
    """
    # Create local staging location if not provided
    if dir_path is None:
        dir_path = tempfile.mkdtemp()

    file_name = _get_file_name()
    dest_path = f"{dir_path}/{file_name}"

    # Temporarily rename datetime column to event_timestamp. Ideally we would
    # force the schema with our avro writer instead.
    df.columns = [
        "event_timestamp" if col == "datetime" else col for col in df.columns
    ]

    try:
        # Export dataset to file in local path
        to_avro(df=df, file_path_or_buffer=dest_path)
    except Exception:
        raise
    finally:
        # Revert event_timestamp column to datetime
        df.columns = [
            "datetime" if col == "event_timestamp" else col for col in df.columns
        ]

    return dir_path, file_name, dest_path
def export_dataframe_to_local(df: pd.DataFrame, dir_path: Optional[str] = None):
    """
    Exports a pandas dataframe to the local filesystem

    :param df: Pandas dataframe to save
    :param dir_path: (optional) Absolute directory path '/data/project/subfolder/'
    :return: Tuple of directory path, file name and destination path
    """
    # Create local staging location if not provided
    if dir_path is None:
        dir_path = tempfile.mkdtemp()

    file_name = f'{datetime.now().strftime("%d-%m-%Y_%I-%M-%S_%p")}_{str(uuid.uuid4())[:8]}.avro'
    dest_path = f"{dir_path}/{file_name}"

    # Export dataset to file in local path
    to_avro(df=df, file_path_or_buffer=dest_path)

    return dir_path, file_name, dest_path
def test_dataframe_kwargs(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)

    # include columns
    columns = ['Boolean', 'Int64']
    expect = pdx.read_avro(tf.name, columns=columns)
    df = dataframe[columns]
    assert_frame_equal(expect, df)

    # exclude columns
    columns = ['String', 'Boolean']
    expect = pdx.read_avro(tf.name, exclude=columns)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    df = dataframe.drop(columns, axis=1)
    assert_frame_equal(expect, df)

    # specify index
    index = 'String'
    expect = pdx.read_avro(tf.name, index=index)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    df = dataframe.set_index(index)
    assert_frame_equal(expect, df)
def test_batch_get_batch_features_with_file(client):
    file_fs1 = client.get_feature_set(name="file_feature_set")

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    features_1_df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "entity_id": [i for i in range(N_ROWS)],
        "feature_value1": [f"{i}" for i in range(N_ROWS)],
    })
    client.ingest(file_fs1, features_1_df, timeout=480)

    # Add a buffer to the timestamps to avoid rounding errors
    features_1_df["datetime"] = features_1_df["datetime"] + pd.Timedelta(seconds=1)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"})

    to_avro(
        df=features_1_df[["event_timestamp", "entity_id"]],
        file_path_or_buffer="file_feature_set.avro",
    )

    time.sleep(15)

    feature_retrieval_job = client.get_batch_features(
        entity_rows="file://file_feature_set.avro",
        feature_refs=["feature_value1"],
        project=PROJECT_NAME,
    )

    output = feature_retrieval_job.to_dataframe()
    clean_up_remote_files(feature_retrieval_job.get_avro_files())
    print(output.head())

    assert output["entity_id"].to_list() == [
        int(i) for i in output["feature_value1"].to_list()
    ]
def test_get_batch_features_with_file(client):
    file_fs1 = FeatureSet(
        "file_feature_set",
        features=[Feature("feature_value", ValueType.STRING)],
        entities=[Entity("entity_id", ValueType.INT64)],
        max_age=Duration(seconds=100),
    )
    client.apply(file_fs1)
    file_fs1 = client.get_feature_set(name="file_feature_set", version=1)

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    features_1_df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "entity_id": [i for i in range(N_ROWS)],
        "feature_value": [f"{i}" for i in range(N_ROWS)],
    })
    client.ingest(file_fs1, features_1_df)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"})

    to_avro(df=features_1_df, file_path_or_buffer="file_feature_set.avro")

    feature_retrieval_job = client.get_batch_features(
        entity_rows="file://file_feature_set.avro",
        feature_ids=["file_feature_set:1:feature_value"],
    )

    output = feature_retrieval_job.to_dataframe()
    print(output.head())

    assert output["entity_id"].to_list() == [
        int(i) for i in output["file_feature_set_v1_feature_value"].to_list()
    ]
def export_dataframe_to_local(df: pd.DataFrame, dir_path: Optional[str] = None):
    """
    Exports a pandas dataframe to the local filesystem

    Args:
        df: Pandas dataframe to save
        dir_path: (optional) Absolute directory path '/data/project/subfolder/'
    """
    # Create local staging location if not provided
    if dir_path is None:
        dir_path = tempfile.mkdtemp()

    file_name = f'{datetime.now().strftime("%d-%m-%Y_%I-%M-%S_%p")}_{str(uuid.uuid4())[:8]}.avro'
    dest_path = f"{dir_path}/{file_name}"

    # Temporarily rename datetime column to event_timestamp. Ideally we would
    # force the schema with our avro writer instead.
    df.columns = [
        "event_timestamp" if col == "datetime" else col for col in df.columns
    ]

    try:
        # Export dataset to file in local path
        to_avro(df=df, file_path_or_buffer=dest_path)
    except Exception:
        raise
    finally:
        # Revert event_timestamp column to datetime
        df.columns = [
            "datetime" if col == "event_timestamp" else col for col in df.columns
        ]

    return dir_path, file_name, dest_path
def avro_data_path(self):
    final_results = tempfile.mktemp()
    pandavro.to_avro(file_path_or_buffer=final_results, df=TEST_DATA_FRAME)
    return final_results
def test_delegation(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    expect = pdx.from_avro(tf.name)
    assert_frame_equal(expect, dataframe)
def test_file_path_e2e(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    expect = pdx.read_avro(tf.name)
    assert_frame_equal(expect, dataframe)