def test_one_table_one_identifier(syn, tables, sample_table):
    """Querying one table for a single identifier returns only matching rows.

    Bug fix: the original computed ``result[0]....equals(reference)`` but
    discarded the boolean — the test could never fail. Use
    ``pd.testing.assert_frame_equal`` like the sibling tests so a mismatch
    actually raises.
    """
    result = query_across_tables(syn, tables=tables["schema"][0]["id"],
                                 identifier="ABC")
    reference = sample_table.query("externalId == 'ABC'").reset_index(
            drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
def test_one_table_one_query(syn, tables, sample_table):
    """A single explicit query clause filters one table correctly."""
    table_id = tables["schema"][0]["id"]
    result = query_across_tables(syn, tables=table_id,
                                 query=["bool_property = true"])
    expected = sample_table.query("bool_property == True")
    expected = expected.reset_index(drop=True)
    actual = result[0].reset_index(drop=True)
    pd.testing.assert_frame_equal(actual, expected)
def test_one_table_multiple_identifier(syn, tables, sample_table):
    """Passing a list of identifiers returns the union of matching rows."""
    table_id = tables["schema"][0]["id"]
    result = query_across_tables(syn, tables=table_id,
                                 identifier=["ABC", "FGH"])
    expected = sample_table.query(
            "externalId == 'ABC' or "
            "externalId == 'FGH'")
    expected = expected.reset_index(drop=True)
    actual = result[0].reset_index(drop=True)
    pd.testing.assert_frame_equal(actual, expected)
def test_change_identifier_col(syn, tables, sample_table):
    """`identifier_col` redirects the identifier match to another column."""
    table_id = tables["schema"][0]["id"]
    result = query_across_tables(syn, tables=table_id, identifier="red",
                                 identifier_col="str_property")
    expected = sample_table.query("str_property == 'red'")
    expected = expected.reset_index(drop=True)
    actual = result[0].reset_index(drop=True)
    pd.testing.assert_frame_equal(actual, expected)
def test_change_substudy_col(syn, tables, sample_table):
    """`substudy_col` redirects the substudy membership test to another column."""
    table_id = tables["schema"][0]["id"]
    result = query_across_tables(syn, tables=table_id, substudy="blue",
                                 substudy_col="str_property")
    mask = [('blue' in value) for value in sample_table.str_property]
    expected = sample_table[mask].reset_index(drop=True)
    actual = result[0].reset_index(drop=True)
    pd.testing.assert_frame_equal(actual, expected)
def test_one_table_function_identifier(syn, tables, sample_table):
    """A callable identifier acts as a row predicate on the identifier column."""
    table_id = tables["schema"][0]["id"]
    result = query_across_tables(syn, tables=table_id,
                                 identifier=lambda s: s.startswith("A"))
    mask = [value.startswith("A") for value in sample_table.externalId]
    expected = sample_table[mask].reset_index(drop=True)
    actual = result[0].reset_index(drop=True)
    pd.testing.assert_frame_equal(actual, expected)
def test_one_table_one_substudy(syn, tables, sample_table):
    """Filtering on one substudy keeps rows whose memberships contain it."""
    table_id = tables["schema"][0]["id"]
    result = query_across_tables(syn, tables=table_id, substudy="my-study")
    mask = [('my-study' in memberships)
            for memberships in sample_table.substudyMemberships]
    expected = sample_table[mask].reset_index(drop=True)
    actual = result[0].reset_index(drop=True)
    pd.testing.assert_frame_equal(actual, expected)
def test_one_table_multiple_substudy(syn, tables, sample_table):
    """A list of substudies keeps rows belonging to any of them.

    Fixes two defects in the original:
    * a dead first ``reference`` assignment (immediately overwritten, and
      containing the typo ``'my_study'``) is removed;
    * the deprecated ``pd.np`` alias (removed in pandas 1.0) is replaced
      with a plain-Python boolean mask, which pandas indexes the same way.
    """
    result = query_across_tables(syn, tables=tables["schema"][0]["id"],
                                 substudy=["my-study", "other-study"])
    # Row qualifies if its memberships string mentions either substudy.
    in_my_study = [('my-study' in s) for s in sample_table.substudyMemberships]
    in_other_study = [('other-study' in s)
                      for s in sample_table.substudyMemberships]
    mask = [a or b for a, b in zip(in_my_study, in_other_study)]
    reference = sample_table[mask].reset_index(drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
def test_one_everything(syn, tables, sample_table):
    """Query clause, substudy, and identifier filters compose on one table."""
    table_id = tables["schema"][0]["id"]
    result = query_across_tables(syn, tables=table_id,
                                 query=["str_property = 'blue'"],
                                 substudy="other-study", identifier="DEF")
    expected = sample_table.query(
            "str_property == 'blue' and externalId == 'DEF'")
    membership_mask = [('other-study' in memberships)
                       for memberships in expected.substudyMemberships]
    expected = expected[membership_mask].reset_index(drop=True)
    actual = result[0].reset_index(drop=True)
    pd.testing.assert_frame_equal(actual, expected)
def test_mix_everything(syn, tables, sample_table):
    """All filter kinds compose across multiple tables simultaneously."""
    table_ids = [schema["id"] for schema in tables["schema"]]
    wanted_ids = ["DEF", "ABC", "FGH"]
    result = query_across_tables(syn, tables=table_ids,
                                 query=["bool_property = true"],
                                 substudy="other-study",
                                 identifier=wanted_ids)
    expected = sample_table.query("bool_property == True")
    expected = expected[[('other-study' in memberships)
                         for memberships in expected.substudyMemberships]]
    expected = expected[[ext_id in wanted_ids
                         for ext_id in expected.externalId]]
    expected = expected.reset_index(drop=True)
    # Every queried table should produce the same filtered frame.
    assert (result[0].reset_index(drop=True).equals(expected)
            and result[1].reset_index(drop=True).equals(expected))
def export_tables(syn, table_mapping, source_tables=None, target_project=None,
                  update=True, reference_col="recordId",
                  copy_file_handles=None, **kwargs):
    """Copy rows from one Synapse table to another. Or copy tables to a new
    table in a separate project.

    Parameters
    ----------
    syn : synapseclient.Synapse
    table_mapping : dict, list, or str
        If exporting records of one or more tables to other, preexisting
        tables, table_mapping is a dictionary containing Synapse ID key/value
        mappings from source to target tables. If exporting table records
        to not yet created tables in a seperate project, table_mapping can
        be a list or string.
    source_tables : dict
        A mapping from Synapse ID (str) values of source tables to
        pandas.DataFrame where each DataFrame consists of rows to be copied
        from the source table. This is a more explicit way to specify which
        rows will be copied from source to target tables. To implicitly
        specify which rows will be copied, pass named arguments to **kwargs
        that will be used in a call to
        synapsebridgehelpers.query_across_tables.
    target_project : str, default None
        If exporting table records to not yet created tables in a seperate
        project, specify the target project's Synapse ID here.
    update : bool, default True
        When exporting records of one or more tables to other, preexisting
        tables, whether to append new records to the target tables or
        completely overwrite the table records. Note that rows in the target
        table that match on the `reference_col` of rows in the source table
        will not be updated to match the values in the source table even if
        `update` is True.
    reference_col : str or list
        If `update` is True, use this column(s) as the table index to
        determine which records are already present in the target table.
    copy_file_handles : bool, default None
        Whether to copy the file handles from the source table to the target
        table. By default, we will attempt to copy the source table records
        without copying any file handles (copy_file_handles = None) and, if
        an error is thrown, we will then copy the file handles before
        attempting to store the table again. If the user explicitly sets
        copy_file_handles = False, an exception will be raised if any of the
        file handles in the source table are not owned by the user. Setting
        copy_file_handles = True always creates copies of file handles,
        whether the user owns them or not.
    **kwargs
        Additional named arguments to pass to
        synapsebridgehelpers.query_across_tables

    Returns
    -------
    dict
        Maps each source table Synapse ID to a 2-tuple of
        (target table Synapse ID, pandas.DataFrame of the records that
        were stored to the target).

    Raises
    ------
    TypeError
        If `table_mapping` is a list/str without a `target_project`, if
        `table_mapping` is not a list, str, or dict, or if `update` is True
        while `reference_col` is None.
    """
    results = {}
    if isinstance(table_mapping, (list, str)):  # export to brand new tables
        if target_project is None:
            raise TypeError("If passing a list to table_mapping, "
                            "target_project must be set.")
        if source_tables is None:
            # No explicit rows given: pull them via query_across_tables.
            new_records = synapsebridgehelpers.query_across_tables(
                    syn, tables=table_mapping, as_data_frame=True, **kwargs)
            if isinstance(table_mapping, str):
                source_tables = {table_mapping: new_records[0]}
            else:
                source_tables = {t: df for t, df
                                 in zip(table_mapping, new_records)}
        for source_id, source_table in source_tables.items():
            source_table_info = syn.get(source_id)
            source_table_cols = list(syn.getTableColumns(source_id))
            if copy_file_handles:
                # User explicitly asked for copies of all file handles.
                source_table = replace_file_handles(
                        syn, df=source_table, source_table_id=source_id,
                        source_table_cols=source_table_cols)
            try:
                target_table = _store_dataframe_to_table(
                        syn, df=source_table, df_cols=source_table_cols,
                        parent_id=target_project,
                        table_name=source_table_info["name"], used=source_id)
            except sc.core.exceptions.SynapseHTTPError as e:
                # Presumed cause: we don't own the file handles.
                if copy_file_handles:
                    # We already copied them, so something else is wrong.
                    raise sc.core.exceptions.SynapseHTTPError(
                            "There was an issue storing records from {} "
                            "to {}.".format(source_id, target_project)) from e
                elif copy_file_handles is False:
                    # User explicitly specified no copies -- surface error.
                    raise e
                else:
                    # Default (None): retry after copying file handles.
                    source_table = replace_file_handles(
                            syn, df=source_table, source_table_id=source_id,
                            source_table_cols=source_table_cols)
                    target_table = _store_dataframe_to_table(
                            syn, df=source_table, df_cols=source_table_cols,
                            parent_id=target_project,
                            table_name=source_table_info["name"],
                            used=source_id)
            results[source_id] = (target_table.tableId, source_table)
    elif isinstance(table_mapping, dict):  # export to preexisting tables
        tables = list(table_mapping)
        if source_tables is None:
            new_records = synapsebridgehelpers.query_across_tables(
                    syn, tables, **kwargs)
            source_tables = {t: df for t, df in zip(tables, new_records)}
        for source, target in table_mapping.items():
            source_table = source_tables[source]
            if source_table.shape[0] == 0:
                # Nothing to copy from this source table.
                continue
            target_table = syn.tableQuery("select * from {}".format(target))
            target_table = target_table.asDataFrame()
            # has the schema changed?
            source_cols = list(syn.getTableColumns(source))
            target_cols = list(syn.getTableColumns(target))
            schema_comparison = compare_schemas(source_cols=source_cols,
                                                target_cols=target_cols,
                                                source_table=source_table,
                                                target_table=target_table)
            try:
                # error after updating schema -> data may be lost from
                # target table, so dump the frame on any failure below
                if sum(list(map(len, schema_comparison.values()))) > 0:
                    synchronize_schemas(syn,
                                        schema_comparison=schema_comparison,
                                        source=source, target=target,
                                        source_cols=source_cols,
                                        target_cols=target_cols)
                    # synchronize schema of pandas DataFrame with Synapse
                    for col in schema_comparison["removed"]:
                        target_table = target_table.drop(col, axis=1)
                    target_table = target_table.rename(
                            schema_comparison["renamed"], axis=1)
                    target_table = _sanitize_dataframe(syn, target_table,
                                                       target)
                    target_table = target_table.reset_index(drop=True)
                    syn.store(
                            sc.Table(target, target_table,
                                     headers=source_cols))
            except Exception as e:
                dump_on_error(target_table, e, syn, source, target)
            if update:
                if reference_col is not None:
                    # Index both frames on the reference column so set
                    # difference below finds genuinely new records.
                    source_table = source_table.set_index(reference_col,
                                                          drop=False)
                    target_table = target_table.set_index(reference_col,
                                                          drop=False)
                else:
                    raise TypeError(
                            "If updating target tables with new records "
                            "from a source table, you must specify a "
                            "reference column as a basis for comparison.")
                new_records = source_table.loc[source_table.index.difference(
                        target_table.index)]
                if len(new_records):
                    source_table_info = syn.get(source)
                    source_table_cols = list(syn.getTableColumns(source))
                    if (copy_file_handles):
                        new_records = replace_file_handles(
                                syn, df=new_records, source_table_id=source,
                                source_table_cols=source_table_cols)
                    try:
                        target_table = _store_dataframe_to_table(
                                syn, df=new_records,
                                df_cols=source_table_cols,
                                table_id=target, used=source)
                    except sc.core.exceptions.SynapseHTTPError as e:
                        # Presumed cause: we don't own the file handles.
                        if copy_file_handles:
                            # Already copied them; something else is wrong.
                            raise sc.core.exceptions.SynapseHTTPError(
                                    "There was an issue storing records "
                                    "from {} "
                                    "to {}.".format(source, target)) from e
                        elif copy_file_handles is False:
                            # user specified no copies
                            raise e
                        else:
                            # Default (None): retry after copying handles.
                            source_table = replace_file_handles(
                                    syn, df=new_records,
                                    source_table_id=source,
                                    source_table_cols=source_table_cols)
                            target_table = _store_dataframe_to_table(
                                    syn, df=new_records,
                                    df_cols=source_table_cols,
                                    table_id=target, used=source)
                    results[source] = (target, new_records)
            else:
                # delete existing rows, store upstream rows
                target_table = syn.tableQuery(
                        "select * from {}".format(target))
                syn.delete(target_table.asRowSet())
                source_cols = list(syn.getTableColumns(source))
                table_to_store = source_table
                if copy_file_handles:
                    table_to_store = replace_file_handles(
                            syn, df=source_table, source_table_id=source,
                            source_table_cols=source_cols)
                try:
                    target_table = _store_dataframe_to_table(
                            syn, df=table_to_store, df_cols=source_cols,
                            table_id=target, used=source)
                except sc.core.exceptions.SynapseHTTPError as e:
                    # Presumed cause: we don't own the file handles.
                    if copy_file_handles:
                        # Already copied them; something else is wrong.
                        raise sc.core.exceptions.SynapseHTTPError(
                                "There was an issue storing records from {} "
                                "to {}.".format(source, target)) from e
                    elif copy_file_handles is False:
                        # user specified no copies
                        raise e
                    else:
                        # Default (None): retry after copying file handles.
                        table_to_store = replace_file_handles(
                                syn, df=table_to_store,
                                source_table_id=source,
                                source_table_cols=source_cols)
                        target_table = _store_dataframe_to_table(
                                syn, df=table_to_store, df_cols=source_cols,
                                table_id=target, used=source)
                results[source] = (target, table_to_store)
    else:
        raise TypeError(
                "table_mapping must be either a list (if exporting "
                "tables to a target_project), str (if exporting a single "
                "table to a project), or a dict (if exporting "
                "tables to preexisting tables).")
    return results
def test_str_query(syn, tables, sample_table):
    """A raw query-string template is applied to each queried table."""
    table_ids = [schema["id"] for schema in tables["schema"]]
    result = query_across_tables(syn, tables=table_ids,
                                 query="select * from {}")
    expected = sample_table.reset_index(drop=True)
    actual = result[0].reset_index(drop=True)
    pd.testing.assert_frame_equal(actual, expected)
def test_one_table(syn, tables, sample_table):
    """Querying one table with no filters returns the whole table.

    Bug fix: the original computed ``reference`` (the index-reset fixture)
    but then compared against the raw ``sample_table``, leaving ``reference``
    unused. Compare against ``reference`` so the index-insensitive
    comparison matches the sibling tests.
    """
    result = query_across_tables(syn, tables=tables["schema"][0]["id"])
    reference = sample_table.reset_index(drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
def test_str_query_exception(syn, tables):
    """A raw query string combined with other filters raises TypeError."""
    table_ids = [schema["id"] for schema in tables["schema"]]
    with pytest.raises(TypeError):
        query_across_tables(syn, table_ids, query="select * from {}",
                            substudy="my-study")
def test_two_tables(syn, tables, sample_table):
    """Querying two tables returns one identical frame per table."""
    table_ids = [schema["id"] for schema in tables["schema"]]
    result = query_across_tables(syn, tables=table_ids)
    expected = sample_table.reset_index(drop=True)
    assert (result[0].reset_index(drop=True).equals(expected)
            and result[1].reset_index(drop=True).equals(expected))