def test_pandas_update_patch(ingestion_test_repo): df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch") old = ingestion_test_repo.commit() df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="patch") new = ingestion_test_repo.commit() old.checkout() new.checkout() assert ingestion_test_repo.run_sql( "SELECT fruit_id, timestamp, name FROM test_table ORDER BY fruit_id" ) == [ (1, dt(2018, 1, 1, 0, 11, 11), "apple"), (2, dt(2018, 1, 2, 0, 22, 22), "orange"), (3, dt(2018, 12, 31, 23, 59, 49), "mayonnaise"), (4, dt(2018, 12, 30, 0, 0), "chandelier"), ] assert sorted(ingestion_test_repo.diff("test_table", old, new)) == [ (False, (3, dt(2018, 1, 3, 0, 33, 33), "mayonnaise")), (False, (4, dt(2018, 1, 4, 0, 44, 44), "mustard")), (True, (3, dt(2018, 12, 31, 23, 59, 49), "mayonnaise")), (True, (4, dt(2018, 12, 30, 0, 0), "chandelier")), ]
def test_pandas_read_basic(ingestion_test_repo): df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch") old = ingestion_test_repo.commit() # We currently don't detect the index column name since it's an arbitrary query that's passed. output = sql_to_df("SELECT * FROM test_table", repository=ingestion_test_repo, index_col="fruit_id") assert_frame_equal(base_df, output)
def test_pandas_update_type_changes_weaker(ingestion_test_repo): df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch") ingestion_test_repo.commit() altered_df = upd_df_1.copy() altered_df["name"] = [4, 5, 6] altered_df["name"] = altered_df["name"].astype(int) # Type changes are passed through to Postgres to see if it can coerce them -- in this case # 'name' remains a string, so a string '4' is written for fruit_id = 2. df_to_table(altered_df, ingestion_test_repo, "test_table", if_exists="patch") assert ingestion_test_repo.run_sql("SELECT name FROM test_table WHERE fruit_id = 2") == [("4",)]
def build_repo(): repo = Repository(namespace="abc", repository="1234") repo.delete() repo.init() df_to_table(fake_data(8), repository=repo, table="unit_test", if_exists='replace') new_img = repo.commit() new_img.checkout() return repo
def test_pandas_update_type_changes_stricter(ingestion_test_repo): df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch") ingestion_test_repo.commit() altered_df = upd_df_1.copy() altered_df.index = altered_df.index.map(str) # fruit_id (integer) is a string in this case -- Postgres should try to coerce # it back into integer to patch into the target table. df_to_table(altered_df, ingestion_test_repo, "test_table", if_exists="patch") assert ingestion_test_repo.run_sql("SELECT name FROM test_table WHERE fruit_id = 4") == [ ("chandelier",) ]
def test_evil_pandas_dataframes(ingestion_test_repo): # Test corner cases we found in the real world df = _str_to_df(load_csv("evil_df.csv"), has_ts=False) df_to_table(df, ingestion_test_repo, "test_table", if_exists="patch") assert ingestion_test_repo.run_sql( "SELECT id, job_title, some_number FROM test_table ORDER BY id" ) == [ # Make sure backslashes don't break ingestion -- not exactly sure what the intention # in the original dataset was (job title is "PRESIDENT\"). (1, "PRESIDENT\\", 25), # Test characters that can be used as separators still make it into fields (2, "\t", 26), ]
def test_pandas_kv(ingestion_test_repo): # Test reads and writes with a key-value type dataframe. df = pd.read_csv(os.path.join(INGESTION_RESOURCES_CSV, "base_df_kv.csv"), index_col=0) df_to_table(df, ingestion_test_repo, "test_table") # Test patching works without specifying an index col df = pd.read_csv(os.path.join(INGESTION_RESOURCES_CSV, "patch_df_kv.csv")) df_to_table(df, ingestion_test_repo, "test_table", if_exists="patch") assert_frame_equal( sql_to_df("SELECT * FROM test_table", repository=ingestion_test_repo), pd.DataFrame(data=[(1, "banana"), (2, "kumquat"), (3, "pendulum")], columns=["key", "value"]), )
def test_pandas_basic_insert(ingestion_test_repo): df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch") ingestion_test_repo.commit() assert ingestion_test_repo.head.get_table("test_table").table_schema == [ TableColumn(1, "fruit_id", "bigint", True), TableColumn(2, "timestamp", "timestamp without time zone", False), TableColumn(3, "name", "text", False), ] assert ingestion_test_repo.run_sql( "SELECT fruit_id, timestamp, name FROM test_table ORDER BY fruit_id" ) == [ (1, dt(2018, 1, 1, 0, 11, 11), "apple"), (2, dt(2018, 1, 2, 0, 22, 22), "orange"), (3, dt(2018, 1, 3, 0, 33, 33), "mayonnaise"), (4, dt(2018, 1, 4, 0, 44, 44), "mustard"), ]
def test_pandas_read_other_checkout(ingestion_test_repo): df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch") old = ingestion_test_repo.commit() df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="patch") new = ingestion_test_repo.commit() # Record the second version of the table patched_df = sql_to_df("SELECT * FROM test_table", repository=ingestion_test_repo) # Check out the old version but run the query against the new image -- new version should come out. old.checkout() output_1 = sql_to_df("SELECT * FROM test_table", image=new) assert_frame_equal(output_1, patched_df) # Test works with hashes output_2 = sql_to_df( "SELECT * FROM test_table", image=new.image_hash[:10], repository=ingestion_test_repo ) assert_frame_equal(output_2, patched_df)
def test_pandas_no_processing_insert(ingestion_test_repo): # Make sure everything still works when we don't have a PK. df = pd.read_csv(os.path.join(INGESTION_RESOURCES_CSV, "base_df.csv")) df_to_table(df, ingestion_test_repo, "test_table") ingestion_test_repo.commit() assert ingestion_test_repo.head.get_table("test_table").table_schema == [ TableColumn(1, "fruit_id", "bigint", False), TableColumn(2, "timestamp", "text", False), TableColumn(3, "name", "text", False), ] assert ingestion_test_repo.run_sql( "SELECT fruit_id, timestamp, name FROM test_table ORDER BY fruit_id" ) == [ (1, "2018-01-01 00:11:11", "apple"), (2, "2018-01-02 00:22:22", "orange"), (3, "2018-01-03 00:33:33", "mayonnaise"), (4, "2018-01-04 00:44:44", "mustard"), ]
def test_pandas_update_replace(ingestion_test_repo): df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch") old = ingestion_test_repo.commit() df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="replace") new = ingestion_test_repo.commit() old.checkout() new.checkout() assert ingestion_test_repo.run_sql( "SELECT fruit_id, timestamp, name FROM test_table ORDER BY fruit_id" ) == [ (2, dt(2018, 1, 2, 0, 22, 22), "orange"), (3, dt(2018, 12, 31, 23, 59, 49), "mayonnaise"), (4, dt(2018, 12, 30, 0, 0), "chandelier"), ] # Since the table was replaced, we store it as a new snapshot instead of a patch. assert len(ingestion_test_repo.images["latest"].get_table("test_table").objects) == 1
def test_pandas_update_different_schema(ingestion_test_repo): # Currently patches with dataframes with different columns are unsupported df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch") ingestion_test_repo.commit() # Delete the 'timestamp' column truncated_df = upd_df_1["timestamp"] with pytest.raises(ValueError) as e: df_to_table(truncated_df, ingestion_test_repo, "test_table", if_exists="patch") assert "Schema changes are unsupported" in str(e.value) # Rename a column renamed_df = upd_df_1.copy() renamed_df.columns = ["timestamp", "name_rename"] with pytest.raises(ValueError) as e: df_to_table(renamed_df, ingestion_test_repo, "test_table", if_exists="patch") assert "Schema changes are unsupported" in str(e.value)
def run(self, request: DataFrameToTableRequest, repo_uri: str = None, **kwargs: Any): """ Args: Returns: """ assert repo_uri, 'Must specify repo_uri.' repo_info = parse_repo(repo_uri) repo = Repository(namespace=repo_info.namespace, repository=repo_info.repository) df_to_table(request.data_frame, repository=repo, table=request.table, if_exists=request.if_exists, schema_check=self.schema_check)
def run(self, data_frame: pd.DataFrame, table: str = None, if_exists: str = None, repo_uri: str = None, **kwargs: Any): """ Args: Returns: """ assert repo_uri, 'Must specify repo_uri.' repo_info = parse_repo(repo_uri) repo = Repository(namespace=repo_info.namespace, repository=repo_info.repository) df_to_table(data_frame, repository=repo, table=table, if_exists=if_exists)
def test_can_push(self): checkout = SemanticCheckoutTask(upstream_repos=dict( abc=f'sgr://{remote_name}/abc/1234?tag=1', ), ) workspaces = checkout.run() push = PushRepoTask(workspaces=workspaces, ) df_to_table(fake_data(10), repository=self.repo, table="unit_test", if_exists='replace') self.repo.commit() runner = TaskRunner(task=push) with raise_on_exception(): with prefect.context(): state = runner.run() if state.is_failed(): print(state) self.fail()
def test_can_commit(self): old_image_hash = self.repo.head.image_hash checkout = SemanticCheckoutTask(upstream_repos=dict( abc='sgr:///abc/1234?tag=1', ), ) workspaces = checkout.run() commit = CommitTask(workspaces=workspaces, ) df_to_table(fake_data(10), repository=self.repo, table="unit_test", if_exists='replace') runner = TaskRunner(task=commit) with raise_on_exception(): with prefect.context(): state = runner.run() if state.is_failed(): print(state) self.fail() self.assertNotEqual(self.repo.head.image_hash, old_image_hash)
def test_pandas_read_lq_checkout(ingestion_test_repo): df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch") old = ingestion_test_repo.commit() # Record a DF with just the values for fruit_id = 3 and 4 (ones that will be updated). query = "SELECT * FROM test_table WHERE fruit_id IN (3, 4)" old_3_4 = sql_to_df(query, repository=ingestion_test_repo) df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="patch") new = ingestion_test_repo.commit() new_3_4 = sql_to_df(query, repository=ingestion_test_repo) # Uncheckout the repo but don't delete it. ingestion_test_repo.uncheckout() assert ingestion_test_repo.head is None output_old = sql_to_df(query, image=old, use_lq=True) assert_frame_equal(old_3_4, output_old) output_new = sql_to_df(query, image=new, use_lq=True) assert_frame_equal(new_3_4, output_new) # Make sure we didn't do an actual checkout. assert ingestion_test_repo.head is None
def test_pandas_read_roundtripping(ingestion_test_repo): df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch") old = ingestion_test_repo.commit() df_to_table(upd_df_1, ingestion_test_repo, "test_table", if_exists="patch") new = ingestion_test_repo.commit() df = sql_to_df("SELECT * FROM test_table", repository=ingestion_test_repo, index_col="fruit_id") # Pandas update syntax: update index 4 (fruit ID) to have a new timestamp. df.at[4, "timestamp"] = dt(2018, 1, 1, 1, 1, 1) # Write the whole df back in patch mode -- despite that, the changeset will only contain the updated cell. df_to_table(df, ingestion_test_repo, "test_table", if_exists="patch") new_2 = ingestion_test_repo.commit() assert ingestion_test_repo.diff("test_table", new, new_2) == [ (False, (4, dt(2018, 12, 30, 0, 0), "chandelier")), (True, (4, dt(2018, 1, 1, 1, 1, 1), "chandelier")), ]
def write(self, value_: Any, **kwargs: Any) -> Result: """ Writes the result to a repository on Splitgraph Args: - value_ (Any): the value to write; will then be stored as the `value` attribute of the returned `Result` instance - **kwargs (optional): if provided, will be used to format the `table`, `comment`, and `tag` Returns: - Result: returns a new `Result` with both `value`, `comment`, `table`, and `tag` attributes """ if self.schema is not None: errors = self.schema.validate(value_) if errors: raise SchemaValidationError(errors) new = self.format(**kwargs) new.value = value_ repo_info = parse_repo(new.location) repo = Repository(namespace=repo_info.namespace, repository=repo_info.repository) remote = Repository.from_template(repo, engine=get_engine( repo_info.remote_name, autocommit=True)) assert isinstance(value_, pd.DataFrame) if not repository_exists(repo) and self.auto_init_repo: self.logger.info("Creating repo {}/{}...".format( repo.namespace, repo.repository)) repo.init() # TODO: Retrieve the repo from bedrock first self.logger.info("Starting to upload result to {}...".format( new.location)) with self.atomic(repo.engine): self.logger.info("checkout") img = repo.head img.checkout(force=True) self.logger.info("df to table") df_to_table(new.value, repository=repo, table=repo_info.table, if_exists='replace') self.logger.info("commit") new_img = repo.commit(comment=new.comment, chunk_size=10000) new_img.tag(repo_info.tag) # if (repo.diff(new.table, img, new_img)): if self.auto_push: self.logger.info("push") repo.push( remote, handler="S3", overwrite_objects=True, overwrite_tags=True, reupload_objects=True, ) self.logger.info("Finished uploading result to {}...".format( new.location)) return new
def write(self, value_: Any, **kwargs: Any) -> Result: """ Writes the result to a repository on Splitgraph Args: - value_ (Any): the value to write; will then be stored as the `value` attribute of the returned `Result` instance - **kwargs (optional): if provided, will be used to format the `table`, `comment`, and `tag` Returns: - Result: returns a new `Result` with both `value`, `comment`, `table`, and `tag` attributes """ cfg = patch_config(create_config_dict(), self.env or dict()) engine = PostgresEngine(name='SplitgraphResult', conn_params=cfg) engine.initialize() repo = Repository(namespace=self.namespace, repository=self.repo_name, engine=engine) assert isinstance(value_, pd.DataFrame) assert engine.connected if not repository_exists(repo) and self.auto_init_repo: self.logger.info("Creating repo {}/{}...".format(repo.namespace, repo.repository)) repo.init() # TODO: Retrieve the repo from bedrock first new = self.format(**kwargs) new.value = value_ self.logger.info("Starting to upload result to {}...".format(new.table)) with self.atomic(engine): self.logger.info("checkout") img = repo.head img.checkout(force=True) self.logger.info("df to table") df_to_table(new.value, repository=repo, table=new.table, if_exists='replace') self.logger.info("commit") new_img = repo.commit(comment=new.comment, chunk_size=10000) new_img.tag(new.tag) # if (repo.diff(new.table, img, new_img)): if self.auto_push: self.logger.info("push") repo.push( self.get_upstream(repo), handler="S3", overwrite_objects=True, overwrite_tags=True, reupload_objects=True, ) engine.close() self.logger.info("Finished uploading result to {}...".format(new.table)) return new