async def _create(self):
    name = '_TEMP_{0}_TEMP_'.format(str(uuid.uuid4()))
    logging.info('Creating file view project: {0}'.format(name))
    self.view_project = await SynapseProxy.storeAsync(syn.Project(name=name))
    logging.info('Creating file view: {0}'.format(name))
    cols = [
        syn.Column(name=self.COL_ID, columnType='ENTITYID'),
        syn.Column(name=self.COL_DATAFILEHANDLEID, columnType='FILEHANDLEID'),
        syn.Column(name=self.COL_NAME, columnType='STRING', maximumSize=256)
    ]
    schema = syn.EntityViewSchema(name=name,
                                  columns=cols,
                                  properties=None,
                                  parent=self.view_project,
                                  scopes=[self.scope],
                                  includeEntityTypes=[syn.EntityViewType.FILE],
                                  addDefaultViewColumns=False,
                                  addAnnotationColumns=False)
    self.view = await SynapseProxy.storeAsync(schema)
def test_table_query():
    """Test command line ability to do table query."""
    cols = []
    cols.append(synapseclient.Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(synapseclient.Column(name='foo', columnType='STRING',
                                     enumValues=['foo', 'bar', 'bat']))
    cols.append(synapseclient.Column(name='x', columnType='DOUBLE'))
    cols.append(synapseclient.Column(name='age', columnType='INTEGER'))
    cols.append(synapseclient.Column(name='cartoon', columnType='BOOLEAN'))

    project_entity = project

    schema1 = syn.store(synapseclient.Schema(name=str(uuid.uuid4()),
                                             columns=cols,
                                             parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [['Chris', 'bar', 11.23, 45, False],
             ['Jen', 'bat', 14.56, 40, False],
             ['Jane', 'bat', 17.89, 6, False],
             ['Henry', 'bar', 10.12, 1, False]]

    row_reference_set1 = syn.store(
        synapseclient.RowSet(schema=schema1,
                             rows=[synapseclient.Row(r) for r in data1]))

    # Test query
    output = run('synapse', '--skip-checks', 'query',
                 'select * from %s' % schema1.id)
    output_rows = output.rstrip("\n").split("\n")

    # Check the length of the output
    assert len(output_rows) == 5, "got %s rows" % (len(output_rows),)

    # Check that headers are correct.
    # Should be column names in schema plus the ROW_ID and ROW_VERSION
    my_headers_set = output_rows[0].split("\t")
    expected_headers_set = ["ROW_ID", "ROW_VERSION"] + list(map(lambda x: x.name, cols))
    assert my_headers_set == expected_headers_set, "%r != %r" % (my_headers_set,
                                                                 expected_headers_set)
def create_cols(table_type, syn=None):
    if table_type == MC10_SENSOR_NAME:
        cols = [sc.Column(name="task_id", columnType="STRING"),
                sc.Column(name="sensor_location", columnType="STRING"),
                sc.Column(name="mc10_accelerometer", columnType="FILEHANDLEID"),
                sc.Column(name="mc10_gyroscope", columnType="FILEHANDLEID"),
                sc.Column(name="mc10_emg", columnType="FILEHANDLEID")]
    elif table_type == SMARTWATCH_SENSOR_NAME:
        cols = [sc.Column(name="task_id", columnType="STRING"),
                sc.Column(name="smartwatch_accelerometer", columnType="FILEHANDLEID")]
    elif table_type == "scores":
        cols = list(syn.getTableColumns(SCORES))
        for c in cols:
            c.pop('id')
            if c['name'] in SCORES_COL_MAP:
                c['name'] = SCORES_COL_MAP[c['name']]
        cols = [sc.Column(name="task_id", columnType="STRING")] + cols
    else:
        raise TypeError("table_type must be one of [{}, {}, {}]".format(
            MC10_SENSOR_NAME, SMARTWATCH_SENSOR_NAME, "scores"))
    return cols
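# A minimal usage sketch for create_cols() above. This is not part of the
# original snippet: the logged-in `syn` client, the MC10_SENSOR_NAME constant
# (defined elsewhere in that module), and the parent project id below are
# assumptions / hypothetical placeholders.
import synapseclient as sc

def _example_store_sensor_table(syn, parent_project_id="syn00000000"):
    # Build the column models for the MC10 sensor table and store an empty schema.
    cols = create_cols(MC10_SENSOR_NAME, syn=syn)
    schema = sc.Schema(name="MC10 Sensor Measurements",
                       columns=cols,
                       parent=parent_project_id)
    return syn.store(schema)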
def _create_view(self, entity_types):
    name = '_TEMP_{0}_VIEW_'.format(str(uuid.uuid4()))
    cols = [
        syn.Column(name=self.COL_BENEFACTORID, columnType='ENTITYID'),
        syn.Column(name=self.COL_PROJECTID, columnType='ENTITYID')
    ]
    schema = syn.EntityViewSchema(name=name,
                                  columns=cols,
                                  properties=None,
                                  parent=self.view_project,
                                  scopes=[self.scope],
                                  includeEntityTypes=entity_types,
                                  addDefaultViewColumns=False,
                                  addAnnotationColumns=False)
    return SynapseProxy.client().store(schema)
def _keyValCols(keys, values, asSynapseCols):
    """ Get Synapse Column compatible objects from `keys` and `values`.

    Parameters
    ----------
    keys : list
        Column names.
    values : list
        `defaultValue`s of each column.
    asSynapseCols : bool
        Whether to return as synapseclient.Column objects.

    Returns
    -------
    A list of dictionaries compatible with synapseclient.Column objects.
    """
    sanitize = lambda v: v if pd.notnull(v) else ''
    keys = list(map(sanitize, keys))
    values = list(map(sanitize, values))
    val_length = map(lambda v: len(v) if v else 50, values)
    cols = [{'name': k,
             'maximumSize': l,
             'columnType': "STRING",
             "defaultValue": v}
            for k, v, l in zip(keys, values, val_length)]
    if asSynapseCols:
        cols = list(map(lambda c: sc.Column(**c), cols))
    return cols
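# A hedged usage sketch for _keyValCols() above, assuming `pd` is pandas as in
# the snippet. The example keys/values are hypothetical; the expected results
# follow directly from the logic above (nulls become '', empty defaults get a
# fallback maximumSize of 50).
def _example_key_val_cols():
    cols = _keyValCols(["assay", "species"], ["rnaSeq", None], asSynapseCols=False)
    assert cols[0] == {'name': 'assay', 'maximumSize': 6,
                       'columnType': 'STRING', 'defaultValue': 'rnaSeq'}
    assert cols[1]['defaultValue'] == ''  # null default sanitized to empty string
    assert cols[1]['maximumSize'] == 50   # empty default falls back to size 50
    return cols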
def test_add_column(syn, new_tables):
    source_table = new_tables["schema"][0]
    target_table = new_tables["schema"][1]
    new_column = sc.Column(columnType="STRING", maximumSize=5, name="new_col")
    source_table.addColumn(new_column)
    source_table = syn.store(source_table)
    target_table = synchronize_schemas(
        syn,
        schema_comparison={"added": ["new_col"]},
        source=source_table["id"],
        target=target_table["id"])
    source_cols = [c["name"] for c in syn.getTableColumns(source_table["id"])]
    target_cols = [c["name"] for c in syn.getTableColumns(target_table["id"])]
    assert all([c in source_cols for c in target_cols])
def update_samples_in_release_table(syn, file_mapping, release,
                                    samples_in_release_synid):
    """
    Updates the 'samples in release' table, which tracks the samples of each
    release. 1 means the sample exists in the release, 0 means it does not.

    Args:
        syn: synapse object
        file_mapping: file mapping generated from file mapping function
        release: GENIE release number (ie. 5.3-consortium)
        samples_in_release_synid: Synapse Id of 'samples in release' Table
    """
    clinical_ent = syn.get(file_mapping["clinical"], followLink=True)
    clinicaldf = pd.read_csv(clinical_ent.path, sep="\t", comment="#")
    cols = [
        i["name"] for i in list(syn.getTableColumns(samples_in_release_synid))
    ]
    if release not in cols:
        schema = syn.get(samples_in_release_synid)
        syn_col = synapseclient.Column(name=release,
                                       columnType="INTEGER",
                                       defaultValue=0)
        new_column = syn.store(syn_col)
        schema.addColumn(new_column)
        schema = syn.store(schema)
    # Columns of samples in release
    samples_per_release = syn.tableQuery(
        'SELECT SAMPLE_ID, "{}" FROM {}'.format(release,
                                                samples_in_release_synid))
    samples_per_releasedf = samples_per_release.asDataFrame()
    new_samples = clinicaldf[["SAMPLE_ID"]][
        ~clinicaldf.SAMPLE_ID.isin(samples_per_releasedf.SAMPLE_ID)]
    new_samples[release] = 1
    old_samples = clinicaldf[["SAMPLE_ID"]][
        clinicaldf.SAMPLE_ID.isin(samples_per_releasedf.SAMPLE_ID)]
    old_samples[release] = 1
    samples_in_releasedf = new_samples.append(old_samples)
    process_functions.updateDatabase(
        syn,
        samples_per_releasedf,
        samples_in_releasedf,
        samples_in_release_synid,
        ["SAMPLE_ID"],
    )
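# A hedged usage sketch for update_samples_in_release_table() above. The
# Synapse ids and the file_mapping dict below are hypothetical placeholders;
# in practice file_mapping comes from the file-mapping step referenced in the
# docstring.
def _example_update_release_table(syn):
    file_mapping = {"clinical": "syn11111111"}  # assumed mapping output
    update_samples_in_release_table(
        syn,
        file_mapping=file_mapping,
        release="5.3-consortium",
        samples_in_release_synid="syn22222222",  # 'samples in release' table
    )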
def table_schema(project_obj):
    cols = [synapseclient.Column(name="recordId", columnType="INTEGER"),
            synapseclient.Column(name="externalId", columnType="STRING"),
            synapseclient.Column(name="substudyMemberships", columnType="STRING"),
            synapseclient.Column(name="bool_property", columnType="BOOLEAN"),
            synapseclient.Column(name="str_property", columnType="STRING"),
            synapseclient.Column(name="raw_data", columnType="FILEHANDLEID")]
    schema = synapseclient.Schema(name=str(uuid.uuid4()),
                                  columns=cols,
                                  parent=project_obj["id"])
    return schema
def _create_table(syn: Synapse, name: str, col_config: List[dict],
                  parent: str) -> Schema:
    """Create Synapse Table

    Args:
        syn: Synapse connection
        name: Table name
        col_config: Column dict configuration
        parent: Synapse id of project

    Returns:
        Stored Synapse Table
    """
    cols = [synapseclient.Column(**col) for col in col_config]
    schema = synapseclient.Schema(name=name, columns=cols, parent=parent)
    schema = syn.store(schema)
    return schema
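# A hedged usage sketch for _create_table() above. The column dicts mirror
# synapseclient.Column keyword arguments; the table name and parent project id
# are hypothetical placeholders.
def _example_create_table(syn):
    col_config = [
        {"name": "SAMPLE_ID", "columnType": "STRING", "maximumSize": 50},
        {"name": "release", "columnType": "INTEGER", "defaultValue": 0},
    ]
    return _create_table(syn, name="Example Table",
                         col_config=col_config, parent="syn00000000")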
def _create_table(syn: Synapse, name: str, col_config: List[dict],
                  parent: str) -> Schema:
    """Create Synapse Table

    Args:
        syn: Synapse connection
        name: Table name
        col_config: Column dict configuration
        parent: Synapse id of project

    Returns:
        Stored Synapse Table
    """
    cols = [synapseclient.Column(**col) for col in col_config]
    schema = process_functions._create_schema(syn, table_name=name,
                                              parentid=parent, columns=cols)
    return schema
def createColumnsFromJson(path, defaultMaximumSize=250):
    """
    Create a list of Synapse Table Columns from a Synapse annotations JSON file.
    This creates a list of columns; if the column is a 'STRING' and
    defaultMaximumSize is specified, change the default maximum size for that
    column.

    :param path: Path to the annotations JSON file.
    :param defaultMaximumSize: Maximum size to apply to STRING columns.
    :return: A list of synapseclient.Column objects.
    """
    with open(path) as json_file:
        data = json.load(json_file)
    cols = []
    for d in data:
        d['enumValues'] = [a['value'] for a in d['enumValues']]
        if d['columnType'] == 'STRING' and defaultMaximumSize:
            d['maximumSize'] = defaultMaximumSize
        cols.append(synapseclient.Column(**d))
    return cols
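# A hedged sketch of the annotations JSON that createColumnsFromJson() above
# appears to expect, inferred from the loop: a list of column definitions whose
# enumValues entries are objects with a "value" field. The file name and its
# contents are hypothetical.
#
# example_annotations.json:
# [
#   {"name": "assay", "columnType": "STRING",
#    "enumValues": [{"value": "rnaSeq"}, {"value": "wholeGenomeSeq"}]},
#   {"name": "yearCollected", "columnType": "INTEGER", "enumValues": []}
# ]
def _example_columns_from_json():
    return createColumnsFromJson("example_annotations.json", defaultMaximumSize=100)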
def addToScope(syn, target, scope):
    """
    Add further Folders/Projects to the scope of a file view.

    Parameters
    ----------
    syn : synapseclient.Synapse
    target : str, synapseclient.Schema
        The Synapse ID of the file view to update or its schema.
    scope : str, list
        The Synapse IDs of the entities to add to the scope.

    Returns
    -------
    synapseclient.Schema
    """
    scope = [scope] if isinstance(scope, str) else scope
    target = syn.get(target) if isinstance(target, str) else target
    cols = list(syn.getTableColumns(target.id))
    totalScope = target['scopeIds']
    for s in scope:
        totalScope.append(s)
    # We need to preserve columns that are currently in the file view
    # but aren't automatically created when building a new
    # synapseclient.EntityViewSchema.
    defaultCols = getDefaultColumnsForScope(syn, totalScope)
    defaultCols = [sc.Column(**c) for c in defaultCols]
    colNames = [c['name'] for c in cols]
    for c in defaultCols:
        # Preexisting columns have priority over defaults
        if c['name'] not in colNames:
            cols.append(c)
    schema = sc.EntityViewSchema(name=target.name,
                                 parent=target.parentId,
                                 columns=cols,
                                 scopes=totalScope,
                                 add_default_columns=False)
    schema = syn.store(schema)
    return schema
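# A hedged usage sketch for addToScope() above. Both Synapse ids are
# hypothetical placeholders for an existing file view and a folder to add to
# its scope.
def _example_add_to_scope(syn):
    return addToScope(syn, target="syn33333333", scope="syn44444444")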
def createFileView(self, name, parent, scope, addCols=None, schema=None):
    """
    Create and store a file view for further manipulation.

    Parameters
    ----------
    name : str
        The name of the file view.
    parent : str
        Synapse ID of project to store file view within.
    scope : str or list
        Synapse IDs of items to include in file view.
    addCols : dict, list, or str
        Columns to add in addition to the default file view columns.
    schema : str or pandas.DataFrame
        A path to a .json file specifying a schema the file view should
        conform to -- or a pandas.DataFrame already in flattened format.
        (See `schema.flattenJson`).

    If `addCols` is a dict:
        Add keys as columns. If a key's value is `None`, then insert an empty
        column. Otherwise, set the `defaultValue` of the column to that value.
        After setting `self.view` to the pandas DataFrame version of the newly
        created file view, all rows in each column will be set to its
        `defaultValue` (unless there is no `defaultValue`, in which case the
        column will be empty). The file view will not be updated on Synapse
        until `self.publish` is called.
    If `addCols` is a list:
        Add columns to schema with no `defaultValue`. `self.view` will be
        unchanged from the file view that is stored on Synapse.
    If `addCols` is a str:
        Assumes the string is a filepath. Attempts to read in the filepath as
        a two-column .csv file, and then proceeds as if `addCols` was a dict,
        where the first column are the keys and the second column are the
        values.

    Returns
    -------
    Synapse ID of newly created file view.
    """
    self.backup("createFileView")

    # Fetch default keys, plus any preexisting annotation keys
    cols = utils.getDefaultColumnsForScope(self.syn, scope)

    # Store flattened schema, add keys to active columns list.
    if self.schema is None:
        self.schema = (schemaModule.flattenJson(schema)
                       if isinstance(schema, str) else schema)
    if self.schema is not None:
        for k in self.schema.index.unique():
            self.addActiveCols(k)
        schemaCols = utils.makeColumns(list(self.schema.index.unique()),
                                       asSynapseCols=False)
        cols = self._getUniqueCols(schemaCols, cols)

    # Add keys defined during initialization
    if self._activeCols:
        activeCols = utils.makeColumns(self._activeCols, asSynapseCols=False)
        cols = self._getUniqueCols(activeCols, cols)

    # Add keys passed to addCols
    if addCols:
        if isinstance(addCols, dict):
            unspecifiedCols = [k for k in addCols if addCols[k] is None]
            self.addActiveCols(unspecifiedCols)
        elif isinstance(addCols, list):
            self.addActiveCols(addCols)
        newCols = utils.makeColumns(addCols, asSynapseCols=False)
        cols = self._getUniqueCols(newCols, cols)

    # Store columns to Synapse as EntityViewSchema. Default column values
    # are added to `self.view` but not yet stored to Synapse.
    cols = [sc.Column(**c) for c in cols]
    entityViewSchema = sc.EntityViewSchema(name=name,
                                           columns=cols,
                                           parent=parent,
                                           scopes=scope)
    self._entityViewSchema = self.syn.store(entityViewSchema)
    self.view = utils.synread(self.syn, self._entityViewSchema.id)
    self._index = self.view.index
    if isinstance(addCols, dict):
        self.addDefaultValues(addCols, False)
    return self._entityViewSchema.id
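# A hedged usage sketch for createFileView() above. `manager` stands in for an
# instance of the class this method belongs to; the project and folder ids are
# hypothetical placeholders.
def _example_create_file_view(manager):
    return manager.createFileView(
        name="Curated File View",
        parent="syn00000000",                          # project to store the view in
        scope=["syn55555555"],                         # folder(s) to include
        addCols={"assay": "rnaSeq", "species": None},  # default value / empty column
    )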
def main(syn):
    # Basic setup of the project
    project_name = "Testing Synapse Genie"

    # Determine the short and long names of the centers.
    center_abbreviations = ['AAA', 'BBB', 'CCC']
    center_names = center_abbreviations

    # Create the project
    project = synapseclient.Project(project_name)
    project = syn.store(project)

    # Create a folder for log files generated by the GENIE processes
    # of validation and updating the database tables
    logs_folder = synapseclient.Folder(name='Logs', parent=project)
    logs_folder = syn.store(logs_folder)

    # Folder for individual center folders
    root_center_folder = synapseclient.Folder(name='Centers', parent=project)
    root_center_folder = syn.store(root_center_folder)

    # The folders for each center where they will upload files for validation
    # and submission. There is one folder per center.
    # This currently deviates from the original GENIE setup of having an
    # 'Input' and 'Staging' folder for each center.
    center_folders = [
        synapseclient.Folder(name=name, parent=root_center_folder)
        for name in center_abbreviations
    ]
    center_folders = [syn.store(folder) for folder in center_folders]

    # Make some fake data that only contains basic text to check
    # for validation.
    n_files = 5  # number of files per center to create
    for folder in center_folders:
        for idx in range(n_files):
            tmp = tempfile.NamedTemporaryFile(prefix=f'TEST-{folder.name}',
                                              suffix='.txt')
            with open(tmp.name, mode='w') as fh:
                fh.write(random.choice(['ERROR', 'VALID', 'NOPE']))
            synfile = syn.store(synapseclient.File(tmp.name, parent=folder))

    # Set up the table that holds the validation status of all submitted files.
    status_schema = create_status_table(syn, project)

    # Set up the table that maps the center abbreviation to the folder where
    # their data is uploaded. This is used by the GENIE framework to find the
    # files to validate for a center.
    center_map_table_defs = [
        {'name': 'name', 'columnType': 'STRING', 'maximumSize': 250},
        {'name': 'center', 'columnType': 'STRING', 'maximumSize': 50},
        {'name': 'inputSynId', 'columnType': 'ENTITYID'},
        # {'name': 'stagingSynId', 'columnType': 'ENTITYID'},
        {'name': 'release', 'defaultValue': 'false', 'columnType': 'BOOLEAN'}
        # {'id': '68438',
        #  'name': 'mutationInCisFilter',
        #  'defaultValue': 'true',
        #  'columnType': 'BOOLEAN',
        #  'concreteType': 'org.sagebionetworks.repo.model.table.ColumnModel'}
    ]
    center_map_cols = [
        synapseclient.Column(**col) for col in center_map_table_defs
    ]
    center_schema = synapseclient.Schema(name='Center Table',
                                         columns=center_map_cols,
                                         parent=project)
    center_schema = syn.store(center_schema)

    # Add the center folders created above to this table.
    center_folder_ids = [folder.id for folder in center_folders]
    center_df = pandas.DataFrame(
        dict(name=center_names,
             center=center_abbreviations,
             inputSynId=center_folder_ids))
    tbl = synapseclient.Table(schema=center_schema, values=center_df)
    tbl = syn.store(tbl)

    # Create a table that stores the error logs for each submitted file.
    error_col_defs = [
        {'name': 'id', 'columnType': 'ENTITYID'},
        {'name': 'center', 'columnType': 'STRING', 'maximumSize': 50,
         'facetType': 'enumeration'},
        {'name': 'errors', 'columnType': 'LARGETEXT'},
        {'name': 'name', 'columnType': 'STRING', 'maximumSize': 500},
        # {'name': 'versionNumber', 'columnType': 'STRING', 'maximumSize': 50},
        {'name': 'fileType', 'columnType': 'STRING', 'maximumSize': 50}
    ]
    error_map_cols = [synapseclient.Column(**col) for col in error_col_defs]
    error_schema = synapseclient.Schema(name='Error Table',
                                        columns=error_map_cols,
                                        parent=project)
    error_schema = syn.store(error_schema)

    # Create a table that maps the various database tables to a short name.
    # This table is used in many GENIE functions to find the correct table to
    # update or to get the state of something from.
    db_map_col_defs = [
        {'name': 'Database', 'columnType': 'STRING', 'maximumSize': 50},
        {'name': 'Id', 'columnType': 'ENTITYID'}
    ]
    db_map_cols = [synapseclient.Column(**col) for col in db_map_col_defs]
    db_map_schema = synapseclient.Schema(name='DB Mapping Table',
                                         columns=db_map_cols,
                                         parent=project)
    db_map_schema = syn.store(db_map_schema)

    # Add dbMapping annotation
    project.annotations.dbMapping = db_map_schema.tableId
    project = syn.store(project)

    # Add the tables we already created to the mapping table.
    dbmap_df = pandas.DataFrame(
        dict(Database=['centerMapping', 'validationStatus', 'errorTracker',
                       'dbMapping', 'logs'],
             Id=[center_schema.id, status_schema.id, error_schema.id,
                 db_map_schema.id, logs_folder.id]))
    db_map_tbl = synapseclient.Table(schema=db_map_schema, values=dbmap_df)
    db_map_tbl = syn.store(db_map_tbl)

    # Make a top level folder for output. Processing for some file types
    # copies a file from one place to another.
    output_folder = synapseclient.Folder(name='Output', parent=project)
    output_folder = syn.store(output_folder)

    output_folder_map = []

    # default_table_col_defs = status_table_col_defs = [
    #     {'name': 'PRIMARY_KEY', 'columnType': 'STRING'}
    # ]
    # default_table_cols = [synapseclient.Column(**col)
    #                       for col in default_table_col_defs]

    default_primary_key = 'PRIMARY_KEY'

    # For each file type format in the format registry, create an output
    # folder and a table. Some GENIE file types copy a file to a new place,
    # and some update a table. Having both means that both of these operations
    # will be available at the beginning.
    # The mapping between the file type and the folder or table has a
    # consistent naming. The key ('Database' value) is {file_type}_folder or
    # {file_type}_table.

    # Determine which file formats are going to be used.
    format_registry = config.collect_format_types(['example_registry'])

    for file_type, obj in format_registry.items():
        file_type_folder = synapseclient.Folder(name=file_type,
                                                parent=output_folder)
        file_type_folder = syn.store(file_type_folder)
        output_folder_map.append(dict(Database=f"{file_type}_folder",
                                      Id=file_type_folder.id))

        file_type_schema = synapseclient.Schema(name=file_type, parent=project)
        file_type_schema.annotations.primaryKey = default_primary_key
        file_type_schema = syn.store(file_type_schema)
        output_folder_map.append(dict(Database=f"{file_type}_table",
                                      Id=file_type_schema.id))

    # Add the folders and tables created to the mapping table.
    db_map_tbl = synapseclient.Table(
        schema=db_map_schema, values=pandas.DataFrame(output_folder_map))
    db_map_tbl = syn.store(db_map_tbl)
def test_create_and_update_file_view():
    # Create a folder
    folder = Folder(str(uuid.uuid4()),
                    parent=project,
                    description='creating a file-view')
    folder = syn.store(folder)

    # Create dummy file with annotations in our folder
    path = utils.make_bogus_data_file()
    file_annotations = dict(fileFormat='jpg',
                            dataType='image',
                            artist='Banksy',
                            medium='print',
                            title='Girl With Ballon')
    schedule_for_cleanup(path)
    a_file = File(path, parent=folder, annotations=file_annotations)
    a_file = syn.store(a_file)
    schedule_for_cleanup(a_file)

    # Add new columns for the annotations on this file and get their IDs
    my_added_cols = [
        syn.store(synapseclient.Column(name=k, columnType="STRING"))
        for k in file_annotations.keys()
    ]
    my_added_cols_ids = [c['id'] for c in my_added_cols]
    view_default_ids = [
        c['id'] for c in syn._get_default_entity_view_columns('file')
    ]
    col_ids = my_added_cols_ids + view_default_ids
    scopeIds = [folder['id'].lstrip('syn')]

    # Create an empty entity-view with defined scope as folder
    entity_view = EntityViewSchema(name=str(uuid.uuid4()),
                                   scopeIds=scopeIds,
                                   addDefaultViewColumns=True,
                                   addAnnotationColumns=False,
                                   type='file',
                                   columns=my_added_cols,
                                   parent=project)
    entity_view = syn.store(entity_view)
    schedule_for_cleanup(entity_view)

    assert_equals(set(scopeIds), set(entity_view.scopeIds))
    assert_equals(set(col_ids), set(entity_view.columnIds))
    assert_equals('file', entity_view.type)

    # Get the current view-schema
    view = syn.tableQuery("select * from %s" % entity_view.id)
    schedule_for_cleanup(view.filepath)
    view_dict = list(
        csv.DictReader(io.open(view.filepath, encoding="utf-8", newline='')))

    # Check that all of the annotations were retrieved from the view
    assert set(file_annotations.keys()).issubset(set(view_dict[0].keys()))

    updated_a_file = syn.get(a_file.id, downloadFile=False)

    # Check that the values are the same as what was set,
    # both in the view and on the entity itself
    for k, v in file_annotations.items():
        assert_equals(view_dict[0][k], v)
        assert_equals(updated_a_file.annotations[k][0], v)

    # Make a change to the view and store
    view_dict[0]['fileFormat'] = 'PNG'

    with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as temp:
        schedule_for_cleanup(temp.name)
        temp_filename = temp.name

    with io.open(temp_filename, mode='w', encoding="utf-8", newline='') as temp_file:
        dw = csv.DictWriter(temp_file,
                            fieldnames=view_dict[0].keys(),
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator=str(os.linesep))
        dw.writeheader()
        dw.writerows(view_dict)
        temp_file.flush()

    new_view = syn.store(synapseclient.Table(entity_view.id, temp_filename))
    new_view_dict = list(
        csv.DictReader(io.open(temp_filename, encoding="utf-8", newline='')))
    assert_equals(new_view_dict[0]['fileFormat'], 'PNG')

    # Query for the change
    start_time = time.time()

    new_view_results = syn.tableQuery("select * from %s" % entity_view.id)
    schedule_for_cleanup(new_view_results.filepath)
    new_view_dict = list(
        csv.DictReader(
            io.open(new_view_results.filepath, encoding="utf-8", newline='')))
    # Query until the change is seen.
    while new_view_dict[0]['fileFormat'] != 'PNG':
        # Check timeout
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        # Query again
        new_view_results = syn.tableQuery("select * from %s" % entity_view.id)
        new_view_dict = list(
            csv.DictReader(
                io.open(new_view_results.filepath, encoding="utf-8", newline='')))
    # Paranoid check
    assert_equals(new_view_dict[0]['fileFormat'], 'PNG')
def create_cols(table_type, syn=None):
    if table_type == MC10_SENSOR_NAME:
        cols = [sc.Column(name="measurement_id", columnType="STRING"),
                sc.Column(name="sensor_location", columnType="STRING"),
                sc.Column(name="mc10_accelerometer", columnType="FILEHANDLEID"),
                sc.Column(name="mc10_gyroscope", columnType="FILEHANDLEID"),
                sc.Column(name="mc10_emg", columnType="FILEHANDLEID")]
    elif table_type == SMARTWATCH_SENSOR_NAME:
        cols = [sc.Column(name="measurement_id", columnType="STRING"),
                sc.Column(name="smartwatch_accelerometer", columnType="FILEHANDLEID")]
    elif table_type == "diary":
        cols = list(syn.getTableColumns(DIARY))
        cols = [sc.Column(name="measurement_id", columnType="STRING"),
                sc.Column(name="subject_id", columnType="INTEGER"),
                sc.Column(name="timestamp", columnType="DATE"),
                sc.Column(name="activity_intensity", columnType="INTEGER"),
                sc.Column(name="dyskinesia", columnType="INTEGER"),
                sc.Column(name="on_off", columnType="INTEGER"),
                sc.Column(name="tremor", columnType="INTEGER"),
                sc.Column(name="activity_intensity_reported_timestamp", columnType="DATE"),
                sc.Column(name="dyskinesia_reported_timestamp", columnType="DATE"),
                sc.Column(name="on_off_reported_timestamp", columnType="DATE"),
                sc.Column(name="tremor_reported_timestamp", columnType="DATE")]
    else:
        raise TypeError("table_type must be one of [{}, {}, {}]".format(
            MC10_SENSOR_NAME, SMARTWATCH_SENSOR_NAME, "diary"))
    return cols
def test_migrate_project(request, syn, schedule_for_cleanup,
                         storage_location_id):
    test_name = request.node.name
    project_name = "{}-{}".format(test_name, uuid.uuid4())
    project = synapseclient.Project(name=project_name)
    project_entity = syn.store(project)

    file_0_path = _create_temp_file()
    schedule_for_cleanup(file_0_path)
    file_0_name = "{}-{}".format(test_name, 1)
    file_0 = synapseclient.File(name=file_0_name,
                                path=file_0_path,
                                parent=project_entity)
    file_0_entity = syn.store(file_0)
    default_storage_location_id = file_0_entity._file_handle['storageLocationId']

    folder_1_name = "{}-{}-{}".format(test_name, 1, uuid.uuid4())
    folder_1 = synapseclient.Folder(parent=project_entity, name=folder_1_name)
    folder_1_entity = syn.store(folder_1)

    file_1_path = _create_temp_file()
    schedule_for_cleanup(file_1_path)
    file_1_name = "{}-{}".format(test_name, 1)
    file_1 = synapseclient.File(name=file_1_name,
                                path=file_1_path,
                                parent=folder_1_entity)
    file_1_entity = syn.store(file_1)

    file_2_path = _create_temp_file()
    schedule_for_cleanup(file_2_path)
    file_2_name = "{}-{}".format(test_name, 2)
    file_2 = synapseclient.File(name=file_2_name,
                                path=file_2_path,
                                parent=folder_1_entity)
    file_2_entity = syn.store(file_2)

    # file 3 shares the same file handle id as file 1
    file_3_path = file_1_path
    file_3_name = "{}-{}".format(test_name, 3)
    file_3 = synapseclient.File(name=file_3_name,
                                path=file_3_path,
                                parent=folder_1_entity)
    file_3.dataFileHandleId = file_1_entity.dataFileHandleId
    file_3_entity = syn.store(file_3)

    table_1_cols = [
        synapseclient.Column(name='file_col_1', columnType='FILEHANDLEID'),
        synapseclient.Column(name='num', columnType='INTEGER'),
        synapseclient.Column(name='file_col_2', columnType='FILEHANDLEID'),
    ]
    table_1 = syn.store(
        synapseclient.Schema(name=test_name,
                             columns=table_1_cols,
                             parent=folder_1_entity))
    table_1_file_col_1_1 = _create_temp_file()
    table_1_file_handle_1 = syn.uploadFileHandle(table_1_file_col_1_1, table_1)
    table_1_file_col_1_2 = _create_temp_file()
    table_1_file_handle_2 = syn.uploadFileHandle(table_1_file_col_1_2, table_1)
    table_1_file_col_2_1 = _create_temp_file()
    table_1_file_handle_3 = syn.uploadFileHandle(table_1_file_col_2_1, table_1)
    table_1_file_col_2_2 = _create_temp_file()
    table_1_file_handle_4 = syn.uploadFileHandle(table_1_file_col_2_2, table_1)

    data = [
        [table_1_file_handle_1['id'], 1, table_1_file_handle_2['id']],
        [table_1_file_handle_3['id'], 2, table_1_file_handle_4['id']],
    ]
    table_1_entity = syn.store(
        synapseclient.RowSet(schema=table_1,
                             rows=[synapseclient.Row(r) for r in data]))

    db_path = tempfile.NamedTemporaryFile(delete=False).name
    schedule_for_cleanup(db_path)

    index_result = synapseutils.index_files_for_migration(
        syn,
        project_entity,
        storage_location_id,
        db_path,
        file_version_strategy='new',
        include_table_files=True,
    )
    counts_by_status = index_result.get_counts_by_status()
    assert counts_by_status['INDEXED'] == 8
    assert counts_by_status['ERRORED'] == 0

    migration_result = synapseutils.migrate_indexed_files(syn, db_path, force=True)

    file_0_entity_updated = syn.get(utils.id_of(file_0_entity), downloadFile=False)
    file_1_entity_updated = syn.get(utils.id_of(file_1_entity), downloadFile=False)
    file_2_entity_updated = syn.get(utils.id_of(file_2_entity), downloadFile=False)
    file_3_entity_updated = syn.get(utils.id_of(file_3_entity), downloadFile=False)
    file_handles = [
        f['_file_handle'] for f in (
            file_0_entity_updated,
            file_1_entity_updated,
            file_2_entity_updated,
            file_3_entity_updated,
        )
    ]

    table_1_id = utils.id_of(table_1_entity)
    results = syn.tableQuery("select file_col_1, file_col_2 from {}".format(
        utils.id_of(table_1_entity)))
    table_file_handles = []
    for row in results:
        for file_handle_id in row[2:]:
            file_handle = syn._getFileHandleDownload(
                file_handle_id, table_1_id,
                objectType='TableEntity')['fileHandle']
            table_file_handles.append(file_handle)
    file_handles.extend(table_file_handles)

    _assert_storage_location(file_handles, storage_location_id)
    assert storage_location_id != default_storage_location_id

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        query_result = cursor.execute(
            "select status, count(*) from migrations where type in (?, ?) group by status",
            (_MigrationType.FILE.value,
             _MigrationType.TABLE_ATTACHED_FILE.value)).fetchall()
        counts = {r[0]: r[1] for r in query_result}

        # should only be one status and they should all be migrated
        # should be 3 migrated files entities + 4 migrated table attached files
        assert len(counts) == 1
        assert counts[_MigrationStatus.MIGRATED.value] == 8

    csv_file = tempfile.NamedTemporaryFile(delete=False)
    schedule_for_cleanup(csv_file.name)
    migration_result.as_csv(csv_file.name)
    with open(csv_file.name, 'r') as csv_file_in:
        csv_contents = csv_file_in.read()

    table_1_id = table_1_entity['tableId']

    # assert the content of the csv. we don't assert any particular order of
    # the lines but the presence of the expected lines and the correct # of lines
    csv_lines = csv_contents.split('\n')
    assert "id,type,version,row_id,col_name,from_storage_location_id,from_file_handle_id,to_file_handle_id,status,exception" in csv_lines  # noqa
    assert f"{file_0_entity.id},file,,,,{default_storage_location_id},{file_0_entity.dataFileHandleId},{file_0_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_1_entity.id},file,,,,{default_storage_location_id},{file_1_entity.dataFileHandleId},{file_1_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_2_entity.id},file,,,,{default_storage_location_id},{file_2_entity.dataFileHandleId},{file_2_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_3_entity.id},file,,,,{default_storage_location_id},{file_3_entity.dataFileHandleId},{file_3_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,1,file_col_1,{default_storage_location_id},{table_1_file_handle_1['id']},{table_file_handles[0]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,1,file_col_2,{default_storage_location_id},{table_1_file_handle_2['id']},{table_file_handles[1]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,2,file_col_1,{default_storage_location_id},{table_1_file_handle_3['id']},{table_file_handles[2]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,2,file_col_2,{default_storage_location_id},{table_1_file_handle_4['id']},{table_file_handles[3]['id']},MIGRATED," in csv_lines  # noqa
    assert "" in csv_lines  # expect trailing newline in a csv
def test_command_get_recursive_and_query():
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""
    project_entity = project

    # Create Folders in Project
    folder_entity = syn.store(
        synapseclient.Folder(name=str(uuid.uuid4()), parent=project_entity))
    folder_entity2 = syn.store(
        synapseclient.Folder(name=str(uuid.uuid4()), parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []
    for i in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        schedule_for_cleanup(f)
        file_entity = synapseclient.File(f, parent=folder_entity2)
        file_entity = syn.store(file_entity)
        file_entities.append(file_entity)
        schedule_for_cleanup(f)

    # Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    schedule_for_cleanup(f)
    file_entity = synapseclient.File(f, parent=folder_entity)
    file_entity = syn.store(file_entity)
    file_entities.append(file_entity)

    # get -r uses syncFromSynapse() which uses getChildren(), which is not
    # immediately consistent, but faster than chunked queries.
    time.sleep(2)

    # Test recursive get
    run('synapse', '--skip-checks', 'get', '-r', folder_entity.id)

    # Verify that we downloaded files:
    new_paths = [
        os.path.join('.', folder_entity2.name, os.path.basename(f))
        for f in uploaded_paths[:-1]
    ]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert_true(os.path.exists(downloaded))
        assert_true(filecmp.cmp(downloaded, uploaded))
        schedule_for_cleanup(downloaded)

    # Test query get using a Table with an entity column
    # This should be replaced when Table File Views are implemented in the client
    cols = [synapseclient.Column(name='id', columnType='ENTITYID')]
    schema1 = syn.store(
        synapseclient.Schema(name='Foo Table',
                             columns=cols,
                             parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]
    syn.store(
        synapseclient.RowSet(schema=schema1,
                             rows=[synapseclient.Row(r) for r in data1]))
    time.sleep(3)  # get -q is eventually consistent

    # Test Table/View query get
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from %s" % schema1.id)

    # Verify that we downloaded files:
    new_paths = [
        os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]
    ]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert_true(os.path.exists(downloaded))
        assert_true(filecmp.cmp(downloaded, uploaded))
        schedule_for_cleanup(downloaded)

    schedule_for_cleanup(new_paths[0])
def test_command_get_recursive_and_query():
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""
    project_entity = project

    # Create Folders in Project
    folder_entity = syn.store(synapseclient.Folder(name=str(uuid.uuid4()),
                                                   parent=project_entity))
    folder_entity2 = syn.store(synapseclient.Folder(name=str(uuid.uuid4()),
                                                    parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []
    for i in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        schedule_for_cleanup(f)
        file_entity = synapseclient.File(f, parent=folder_entity2)
        file_entity = syn.store(file_entity)
        file_entities.append(file_entity)
        schedule_for_cleanup(f)

    # Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    schedule_for_cleanup(f)
    file_entity = synapseclient.File(f, parent=folder_entity)
    file_entity = syn.store(file_entity)
    file_entities.append(file_entity)

    # The function under test uses queries which are eventually consistent,
    # but not immediately after creating the entities
    start_time = time.time()
    while syn.query("select id from entity where id=='%s'"
                    % file_entity.id).get('totalNumberOfResults') <= 0:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)

    ### Test recursive get
    output = run('synapse', '--skip-checks', 'get', '-r', folder_entity.id)

    # Verify that we downloaded files:
    new_paths = [os.path.join('.', folder_entity2.name, os.path.basename(f))
                 for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)

    ### Test query get
    ### Note: We're not querying on annotations because tests can fail if there
    ### are lots of jobs queued as happens when staging is syncing
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from file where parentId=='%s'" % folder_entity2.id)

    # Verify that we downloaded files from folder_entity2
    new_paths = [os.path.join('.', os.path.basename(f))
                 for f in uploaded_paths[:-1]]
    for downloaded, uploaded in zip(new_paths, uploaded_paths[:-1]):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)
    schedule_for_cleanup(new_paths[0])

    ### Test query get using a Table with an entity column
    ### This should be replaced when Table File Views are implemented in the client
    cols = []
    cols.append(synapseclient.Column(name='id', columnType='ENTITYID'))

    schema1 = syn.store(synapseclient.Schema(name='Foo Table',
                                             columns=cols,
                                             parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]
    print(data1)
    row_reference_set1 = syn.store(
        synapseclient.RowSet(columns=cols,
                             schema=schema1,
                             rows=[synapseclient.Row(r) for r in data1]))

    ### Test Table/View query get
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from %s" % schema1.id)

    # Verify that we downloaded files:
    new_paths = [os.path.join('.', os.path.basename(f))
                 for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)
    schedule_for_cleanup(new_paths[0])
def main():
    import argparse
    parser = argparse.ArgumentParser(
        description='Convert JSON to Synapse Table Schema')
    parser.add_argument('path', type=str, help='Path (or URL) to JSON file')
    parser.add_argument('--projectId', type=str,
                        help='Synapse Project ID to store schema')
    parser.add_argument('-n', '--dry_run', action="store_true", default=False,
                        help='Dry run')
    parser.add_argument('--synapseJSONSchema', action="store_true",
                        default=False,
                        help="JSON is already in Synapse Table Schema format")
    args = parser.parse_args()

    syn = synapseclient.login(silent=True)
    project = syn.get(args.projectId)

    f = urllib.request.urlopen(path2url(args.path))
    data = json.load(f)

    url_path = urllib.parse.urlsplit(args.path).path
    filename = os.path.split(url_path)[1]
    schema_name = os.path.splitext(filename)[0]

    if args.synapseJSONSchema:
        schema = synapseclient.Schema(name=schema_name, parent=project)
        schema.columns_to_store = data
    else:
        cols = []
        for k, v in data.items():
            # Handle null values, assume that they will be strings
            if not v:
                column_type = "STRING"
            elif bool in map(type, v):
                column_type = "BOOLEAN"
            elif int in map(type, v):
                column_type = "INTEGER"
            elif float in map(type, v):
                column_type = "DOUBLE"
            else:
                column_type = "STRING"
            cols.append(synapseclient.Column(name=k,
                                             columnType=column_type,
                                             enumValues=v,
                                             maximumSize=250))
        schema = synapseclient.Schema(name=schema_name, columns=cols,
                                      parent=project)

    if args.dry_run:
        schema_as_list = list(map(dict, schema.columns_to_store))
        new_schema_as_list = []
        _key_order = ['name', 'description', 'columnType', 'maximumSize',
                      'enumValues']
        for col in schema_as_list:
            col['description'] = ""
            col['source'] = ""
            new_enum_values = []
            for v in col['enumValues']:
                new_value_ordered_dict = collections.OrderedDict()
                new_value_ordered_dict['value'] = v
                new_value_ordered_dict['description'] = ""
                new_value_ordered_dict['source'] = ""
                new_enum_values.append(new_value_ordered_dict)
            col['enumValues'] = new_enum_values
            new_ordered_dict = collections.OrderedDict()
            for k in _key_order:
                new_ordered_dict[k] = col[k]
            new_schema_as_list.append(new_ordered_dict)
        print(json.dumps(new_schema_as_list, indent=2))
    else:
        schema = syn.store(schema)
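# A hedged sketch of the input JSON that main() above appears to expect when
# --synapseJSONSchema is not given, inferred from the type-detection loop: a
# mapping from column name to a list of allowed values, from which the column
# type is derived. The file name and its contents are hypothetical.
#
# example_vocab.json:
# {
#   "diagnosis": ["Alzheimer's disease", "control"],
#   "isCellLine": [true, false],
#   "yearsEducation": [12, 16, 20]
# }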
def store_tables(syn, raw_data_curated, scores_curated, meds_curated,
                 sleep_curated, feedback_curated):
    # sensor measurements
    raw_data_cols = [
        sc.Column(name="subject_id", columnType="STRING", maximumSize=6),
        sc.Column(name="device", columnType="STRING", maximumSize=10),
        sc.Column(name="participant_day", columnType="INTEGER"),
        sc.Column(name="timestamp_start", columnType="DOUBLE"),
        sc.Column(name="timestamp_end", columnType="DOUBLE"),
        sc.Column(name="source_file", columnType="ENTITYID"),
        sc.Column(name="data_file_handle_id", columnType="FILEHANDLEID")
    ]
    raw_data_schema = sc.Schema(name="Sensor Measurements",
                                columns=raw_data_cols,
                                parent=PROJECT)
    raw_data_table = sc.Table(raw_data_schema, raw_data_curated)
    syn.store(raw_data_table)

    # task scores
    scores_curated_table = sc.table.build_table("Task Scores", PROJECT,
                                                scores_curated)
    syn.store(scores_curated_table)

    # medication diary
    meds_cols = [
        sc.Column(name="subject_id", columnType="STRING", maximumSize=6),
        sc.Column(name="timestamp", columnType="INTEGER"),
        sc.Column(name="pd_related_medications", columnType="STRING",
                  maximumSize=120),
        sc.Column(name="other_medications", columnType="STRING",
                  maximumSize=120)
    ]
    meds_curated_clean = clean_numeric_cols(meds_curated, ["timestamp"])
    meds_schema = sc.Schema(name="Medication Diary",
                            columns=meds_cols,
                            parent=PROJECT)
    meds_table = sc.Table(meds_schema, meds_curated_clean)
    syn.store(meds_table)

    # sleep diary
    sleep_cols = [
        sc.Column(name="subject_id", columnType="STRING", maximumSize=6),
        sc.Column(name="sleep", columnType="INTEGER"),
        sc.Column(name="wake", columnType="INTEGER")
    ]
    sleep_curated_clean = clean_numeric_cols(sleep_curated, ["sleep", "wake"])
    sleep_schema = sc.Schema(name="Sleep Diary",
                             columns=sleep_cols,
                             parent=PROJECT)
    sleep_table = sc.Table(sleep_schema, sleep_curated_clean)
    syn.store(sleep_table)

    # feedback survey
    feedback_cols = [
        sc.Column(name="subject_id", columnType="STRING", maximumSize=6),
        sc.Column(name="charge_smartphone", columnType="INTEGER"),
        sc.Column(name="charge_pebble", columnType="INTEGER"),
        sc.Column(name="experience_watches", columnType="INTEGER"),
        sc.Column(name="experience_devices", columnType="INTEGER"),
        sc.Column(name="clearness_diary", columnType="INTEGER"),
        sc.Column(name="accuracy_diary", columnType="INTEGER"),
        sc.Column(name="additional_feedback_device_phone", columnType="LARGETEXT"),
        sc.Column(name="additional_feedback_diary", columnType="LARGETEXT"),
        sc.Column(name="additional_feedback_experiment", columnType="LARGETEXT")
    ]
    feedback_curated_clean = clean_numeric_cols(feedback_curated, [
        "charge_smartphone", "charge_pebble", "charge_pebble",
        "experience_watches", "experience_devices", "clearness_diary",
        "accuracy_diary"
    ])
    feedback_schema = sc.Schema(name="Feedback Survey",
                                columns=feedback_cols,
                                parent=PROJECT)
    feedback_table = sc.Table(feedback_schema, feedback_curated_clean)
    syn.store(feedback_table)