def updateDatabase(database, new_dataset, databaseSynId, checkBy):
    """
    Updates a Synapse table by a row identifier with another dataset that
    has the same number and order of columns

    :param database:      The Synapse table (pandas dataframe)
    :param new_dataset:   New dataset (pandas dataframe)
    :param databaseSynId: Synapse Id of the database table
    :param checkBy:       Column to compare both datasets by

    :returns: None
    """
    # Rows whose identifier exists in both datasets are candidates for update
    updatedSet = database.apply(lambda x: _updateRows(x, new_dataset, checkBy), axis=1)
    updatedSet = updatedSet[~updatedSet[checkBy].isnull()]
    # All new rows
    newSet = new_dataset[~new_dataset[checkBy].isin(database[checkBy])]
    # All deleted rows (this assumes that any data that doesn't show up in the
    # newly uploaded data should be deleted...)
    deleteSets = database[~database[checkBy].isin(new_dataset[checkBy])]

    if not deleteSets.empty:
        syn.delete(Table(syn.get(databaseSynId), deleteSets))
    else:
        print("No deleted rows")

    if not updatedSet.empty:
        syn.store(Table(syn.get(databaseSynId), updatedSet))
    else:
        print("No updated rows")

    if not newSet.empty:
        syn.store(Table(syn.get(databaseSynId), newSet))
    else:
        print("No new rows")
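# --- Usage sketch (added; not part of the original source) -------------------
# A minimal, hedged example of how updateDatabase() above might be called.
# Assumptions: the module-level `syn` client used above, the _updateRows()
# helper defined elsewhere in this module, and a hypothetical table
# "syn000000" keyed by a "sample_id" column.
def example_updateDatabase(new_csv_path):
    import pandas as pd
    # Current contents of the (hypothetical) Synapse table
    current = syn.tableQuery("select * from syn000000").asDataFrame()
    # Incoming data with the same columns, in the same order
    incoming = pd.read_csv(new_csv_path)[current.columns]
    updateDatabase(current, incoming, "syn000000", checkBy="sample_id")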
def process_new_table(args, syn):
    """
    Function: process_new_table

    Purpose: Create an annotations table with the specified name under the
             specified Synapse parent ID using the specified JSON schema. This
             function is called when the "new_table" option is specified when
             the program is called.

    Arguments: JSON schema file reference
               Synapse parent ID
               Synapse table name
               A Synapse client object
    """
    # Define column names for the Synapse table.
    dcc_column_names = [
        Column(name="key", columnType="STRING", maximumSize=100),
        Column(name="description", columnType="STRING", maximumSize=250),
        Column(name="columnType", columnType="STRING", maximumSize=50),
        Column(name="maximumSize", columnType="DOUBLE"),
        Column(name="value", columnType="STRING", maximumSize=250),
        Column(name="valueDescription", columnType="LARGETEXT"),
        Column(name="source", columnType="STRING", maximumSize=250),
        Column(name="module", columnType="STRING", maximumSize=100)
    ]

    syn_table_df = process_schema(args.json_schema_file)

    # Build and populate the Synapse table.
    table_schema = Schema(name=args.synapse_table_name,
                          columns=dcc_column_names,
                          parent=args.parent_synapse_id)
    dcc_table = syn.store(Table(table_schema, syn_table_df))
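# --- Usage sketch (added; not part of the original source) -------------------
# process_new_table() expects an argparse-style namespace. This is only a
# sketch of how those arguments could be wired up, assuming the attribute
# names used above (json_schema_file, parent_synapse_id, synapse_table_name);
# the Synapse ID is hypothetical.
def example_process_new_table():
    import argparse
    import synapseclient

    parser = argparse.ArgumentParser(description="Create a DCC annotations table")
    parser.add_argument("json_schema_file", help="Path to the JSON schema file")
    parser.add_argument("parent_synapse_id", help="Parent project/folder ID, e.g. syn000000 (hypothetical)")
    parser.add_argument("synapse_table_name", help="Name for the new table")
    args = parser.parse_args(["schema.json", "syn000000", "DCC annotations"])

    syn = synapseclient.login()  # assumes cached credentials
    process_new_table(args, syn)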
def __syn_store(self, data):
    """store data to Synapse

    Notes:
        Synapse frequently encounters SSL and other connection errors. This
        method will retry the push however many times are defined in the
        application config setting SYNAPSE_RETRIES. Sleeps three seconds
        between attempts.

    Args:
        data: (dict) should match SYN_SCHEMA defined above

    Returns:
        None
    """
    retries = secrets.SYNAPSE_RETRIES

    while retries > 0:
        try:
            syn.store(Table(SYN_SCHEMA, data))
            retries = 0
        except SSLError:
            pass
        except SynapseHTTPError:
            pass
        except Exception as e:
            add_log_entry(
                f'consent failed to push to Synapse with <{str(e)}>',
                self.internal_id)
            retries = 0

        retries -= 1
        time.sleep(3)
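# --- Generalized sketch (added; not part of the original source) -------------
# The retry loop above can be factored into a small standalone helper. This is
# only a sketch under the same assumptions (a module-level `syn` client and
# Table import); the broad `except Exception` stands in for the SSL/HTTP
# errors caught above.
def store_with_retries(schema, data, retries=3, wait_seconds=3):
    """Try to store `data` into the table `schema`, retrying on any error."""
    for attempt in range(1, retries + 1):
        try:
            return syn.store(Table(schema, data))
        except Exception:
            if attempt == retries:
                raise
            time.sleep(wait_seconds)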
def _copyTable(syn, entity, destinationId, updateExisting=False):
    """
    Copies synapse Tables

    :param entity:         A synapse ID of Table Schema
    :param destinationId:  Synapse ID of a project that the Table wants to be
                           copied to
    :param updateExisting: Can choose to update files that have the same name.
                           Defaults to False
    """
    print("Getting table %s" % entity)
    myTableSchema = syn.get(entity)

    # CHECK: If the Table name already exists, raise a ValueError
    existingEntity = syn.findEntityId(myTableSchema.name, parent=destinationId)
    if existingEntity is not None:
        raise ValueError(
            'An entity named "%s" already exists in this location. '
            'Table could not be copied' % myTableSchema.name)

    d = syn.tableQuery('select * from %s' % myTableSchema.id,
                       includeRowIdAndRowVersion=False)

    colIds = myTableSchema.columnIds
    newTableSchema = Schema(name=myTableSchema.name,
                            parent=destinationId,
                            columns=colIds)
    print("Created new table using schema %s" % newTableSchema.name)

    newTable = Table(schema=newTableSchema, values=d.filepath)
    newTable = syn.store(newTable)
    return newTable.schema.id
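# --- Usage sketch (added; not part of the original source) -------------------
# A minimal example of calling _copyTable(); the Synapse IDs are hypothetical:
# "syn111111" is an existing table, "syn222222" the destination project.
def example_copyTable(syn):
    new_schema_id = _copyTable(syn, "syn111111", destinationId="syn222222")
    print("Copied table schema: %s" % new_schema_id)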
def test_tables_csv(syn, project):
    # Define schema
    cols = [
        Column(name='Name', columnType='STRING'),
        Column(name='Born', columnType='INTEGER'),
        Column(name='Hipness', columnType='DOUBLE'),
        Column(name='Living', columnType='BOOLEAN')
    ]
    schema = Schema(name='Jazz Guys', columns=cols, parent=project)

    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    # The following creates a CSV file and uploads it to create a new table
    table = syn.store(Table(schema, data))

    # Query and download an identical CSV
    results = syn.tableQuery("select * from %s" % table.schema.id,
                             resultsAs="csv",
                             includeRowIdAndRowVersion=False)

    # Test that the CSV file came back as expected
    for expected_row, row in zip(data, results):
        assert expected_row == row, "expected %s but got %s" % (expected_row, row)
def add_new_rows_to_table(df, replace_table=False, dry_run=False):
    """Add rows for synapse IDs not already represented in the table,
    or replace the whole table"""
    schema = syn.get(TABLE_SYNAPSE_ID)

    if replace_table:
        # Delete previous entries in the pilot-63-progress table
        results = syn.tableQuery('select * from %s' % utils.id_of(schema),
                                 resultsAs='rowset')
        if not dry_run:
            syn.delete(results)
    else:
        results = syn.tableQuery('select synapse_id from %s' % utils.id_of(schema),
                                 includeRowIdAndRowVersion=False)
        synapse_ids = [row[0] for row in results]
        df = df[[synapse_id not in synapse_ids for synapse_id in df['synapse_id']]]

    if df.shape[0] > 0:
        if dry_run:
            print("Dry run: would have added %d rows to pilot-63-progress table" % df.shape[0])
        else:
            print("Adding %d rows to pilot-63-progress table" % df.shape[0])
            syn.store(Table(schema, df))
        return df.shape[0]
    else:
        print("No new rows for pilot-63-progress table")
        return None
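# --- Usage sketch (added; not part of the original source) -------------------
# add_new_rows_to_table() relies on the module-level `syn` client and the
# TABLE_SYNAPSE_ID constant defined elsewhere in this script. A minimal,
# hedged call pattern; the CSV path is illustrative only.
def example_add_new_rows(progress_csv):
    import pandas as pd
    df = pd.read_csv(progress_csv)
    # Preview what would change, then actually append the new rows
    add_new_rows_to_table(df, dry_run=True)
    n_added = add_new_rows_to_table(df)
    print("rows added: %s" % n_added)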
def upload(directory, synID, synName, dataFrameList):
    """
    Upload the data to a Synapse table

    Input:
        directory:     The name of the directory holding the data
        synID:         Synapse ID of the project where the table will be stored
        synName:       Name to be given to the new table
        dataFrameList: List of dataframes with all of the data
    """
    df = pd.DataFrame()
    print("Creating dataframe")
    for entry in dataFrameList:
        df = df.append(entry, ignore_index=True)

    # Some of these columns can be longer than 1000 characters.
    # Cut them down to 1000 chars max
    df = df.applymap(lambda x: str(x)[:1000])

    print("Writing to file")
    df.to_csv('%s/allData.csv' % directory, encoding='utf-8', index=False)

    print("Uploading to Synapse")
    schema = Schema(name=synName, columns=as_table_columns(df), parent=synID)
    syn.store(Table(schema, df))
def write_synapse_table(table_data, synapse_project_id, table_name='',
                        username='', password=''):
    """
    Write data to a Synapse table.

    Parameters
    ----------
    table_data : Pandas DataFrame
        Synapse table contents
    synapse_project_id : string
        Synapse ID for project within which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Examples
    --------
    >>> import os
    >>> import pandas as pd
    >>> from mhealthx.xio import write_synapse_table
    >>> path = os.environ['MHEALTHX_OUTPUT']
    >>> table = os.path.join(path, 'feature_tables',
    ...     'tap_row0_v0_9d44a388-5d7e-4271-8705-2faa66204486.csv')
    >>> table_data = pd.read_csv(table)
    >>> username = ''
    >>> password = ''
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Contents of table'
    >>> write_synapse_table(table_data, synapse_project_id, table_name, username, password)

    """
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse(skip_checks=True)

    # Log in to Synapse:
    if username and password:
        syn.login(username, password, silent=True)
    else:
        syn.login(silent=True)

    #table_data.index = range(table_data.shape[0])

    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id,
                    includeRowIdAndRowVersion=False)

    syn.store(Table(schema, table_data))
def test_store_table_datetime(syn, project):
    current_datetime = datetime.fromtimestamp(round(time.time(), 3))
    schema = syn.store(
        Schema("testTable", [Column(name="testerino", columnType='DATE')], project))
    rowset = RowSet(rows=[Row([current_datetime])], schema=schema)
    syn.store(Table(schema, rowset))

    query_result = syn.tableQuery("select * from %s" % utils.id_of(schema),
                                  resultsAs="rowset")
    assert current_datetime == query_result.rowset['rows'][0]['values'][0]
def test_synapse_integer_columns_with_missing_values_from_dataframe(
        syn, project, schedule_for_cleanup):
    # SYNPY-267
    cols = [
        Column(name='x', columnType='STRING'),
        Column(name='y', columnType='INTEGER'),
        Column(name='z', columnType='DOUBLE')
    ]
    schema = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    line_terminator = str(os.linesep)
    # Write rows to a CSV file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as temp:
        schedule_for_cleanup(temp.name)
        # The 2nd row is missing a value in its integer column
        temp.write('x,y,z' + line_terminator
                   + 'a,1,0.9' + line_terminator
                   + 'b,,0.8' + line_terminator
                   + 'c,3,0.7' + line_terminator)
        temp.flush()
        filename = temp.name

    # Create a table from the CSV
    table = Table(schema, filename)
    df = table.asDataFrame()

    table_from_dataframe = Table(schema, df)
    assert table.filepath != table_from_dataframe.filepath
    df2 = table_from_dataframe.asDataFrame()
    assert_frame_equal(df, df2)
def build_synapse_table():
    """build the table in Synapse to match the schema defined above"""
    table = Table(SYN_SCHEMA, values=[BLANK_CONSENT])
    table = syn.store(table)

    # Remove the blank placeholder row used to create the table
    results = syn.tableQuery("select * from %s where study_id = '%s'"
                             % (table.tableId, BLANK_CONSENT[0]))
    syn.delete(results)

    syn.setProvenance(entity=table.tableId,
                      activity=synapseclient.Activity(
                          name='Created',
                          description='Table generated by gTap.'))
def test_tables_pandas(syn, project):
    # create a pandas DataFrame
    df = pd.DataFrame({
        'A': ("foo", "bar", "baz", "qux", "asdf"),
        'B': tuple(0.42 * i for i in range(5)),
        'C': (101, 202, 303, 404, 505),
        'D': (False, True, False, True, False),
        # additional data types supported since SYNPY-347
        'int64': tuple(np.int64(range(5))),
        'datetime64': tuple(np.datetime64(d) for d in
                            ['2005-02-01', '2005-02-02', '2005-02-03',
                             '2005-02-04', '2005-02-05']),
        'string_': tuple(np.string_(s) for s in
                         ['urgot', 'has', 'dark', 'mysterious', 'past'])
    })

    cols = as_table_columns(df)
    cols[0].maximumSize = 20
    schema = Schema(name="Nifty Table", columns=cols, parent=project)

    # store in Synapse
    table = syn.store(Table(schema, df))

    # retrieve the table and verify
    results = syn.tableQuery('select * from %s' % table.schema.id, resultsAs='csv')
    df2 = results.asDataFrame(convert_to_datetime=True)

    # simulate rowId-version rownames for comparison
    df.index = ['%s_1' % i for i in range(1, 6)]
    df['string_'] = df['string_'].transform(str)  # SYNPY-717
    df['datetime64'] = df['datetime64'].apply(
        lambda x: pd.Timestamp(x).tz_localize('UTC'))

    # compare the round-tripped DataFrame with the original
    assert_frame_equal(df2, df)
def process_overwrite_table(args, syn):
    """
    Function: process_overwrite_table

    Purpose: Overwrite the specified annotations table with data contained in
             the specified JSON schema. This function is called when the
             "overwrite_table" option is specified when the program is called.

    Arguments: JSON schema file reference
               Synapse ID of the table to be overwritten
               A Synapse client object
    """
    syn_table_df = process_schema(args.json_schema_file)

    # Delete the old records from the Synapse table and then write out the
    # new ones.
    dcc_val_table = syn.get(args.table_synapse_id)
    results = syn.tableQuery(f"select * from {dcc_val_table.id}")
    delete_out = syn.delete(results)
    table_out = syn.store(Table(dcc_val_table.id, syn_table_df))
def countAndUpdateTable(input, tableId):
    i, fileMeta = input
    print('updating table:%s with file %s(%s) %s'
          % (tableId, fileMeta['name'], fileMeta.id, fileMeta['basename']))
    ent = syn.get(fileMeta.id)

    if fileMeta.fileType == 'bed5':
        data = pd.read_csv(ent.path, sep='\t')
        nFeatures = 0
        samples = list(set(data.Sample.dropna()))
    else:  # All other fileTypes
        data = pd.read_csv(ent.path, sep='\t', index_col=0)
        nFeatures, nSamples = data.shape
        samples = data.columns

    metadata = pd.DataFrame([fileMeta] * len(samples))
    metadata['nFeatures'] = nFeatures
    metadata['samples'] = samples
    metadata['patient_barcode'] = [x[:12] for x in metadata.samples]
    metadata.drop(['tissue', u'md5', u'assembly'], axis=1, inplace=True)
    metadata.nFeatures = metadata.nFeatures.astype('int')

    cols = syn.tableQuery('select * from %s limit 1' % tableId).asDataFrame().columns

    # Update rows in the table
    print('adding %s rows' % metadata.shape[0])
    t = syn.store(Table(tableId, metadata[cols]))
    return metadata
def update_global_scores_table(global_data):
    import challenge_config as config
    from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns

    # 'principalId', 'name', 'score_lb', 'score_mean', 'score_ub', 'rank'
    cols = [
        Column(name='UserID', columnType='STRING', maximumSize=100),
        Column(name='Name', columnType='STRING', maximumSize=100),
        Column(name='score_lb', columnType='DOUBLE'),
        Column(name='score_mean', columnType='DOUBLE'),
        Column(name='score_ub', columnType='DOUBLE'),
        Column(name='rank', columnType='DOUBLE'),
    ]
    schema = Schema(name='Global Scores', columns=cols,
                    parent=config.CHALLENGE_SYN_ID)

    # Clear out the existing rows before storing the new scores
    results = syn.tableQuery("select * from {}".format('syn7237020'))
    if len(results) > 0:
        a = syn.delete(results.asRowSet())
    table = syn.store(Table(schema, global_data))

    results = syn.tableQuery("select * from {}".format(table.tableId))
    for row in results:
        print(row)
    return
def test_create_and_update_file_view(syn, project, schedule_for_cleanup):
    # Create a folder
    folder = Folder(str(uuid.uuid4()), parent=project,
                    description='creating a file-view')
    folder = syn.store(folder)

    # Create a dummy file with annotations in our folder
    path = utils.make_bogus_data_file()
    file_annotations = dict(fileFormat='jpg', dataType='image', artist='Banksy',
                            medium='print', title='Girl With Ballon')
    schedule_for_cleanup(path)
    a_file = File(path, parent=folder, annotations=file_annotations)
    a_file = syn.store(a_file)
    schedule_for_cleanup(a_file)

    # Add new columns for the annotations on this file and get their IDs
    my_added_cols = [
        syn.store(Column(name=k, columnType="STRING"))
        for k in file_annotations.keys()
    ]
    my_added_cols_ids = [c['id'] for c in my_added_cols]
    view_default_ids = [
        c['id'] for c in syn._get_default_view_columns(
            "entityview", EntityViewType.FILE.value)
    ]
    col_ids = my_added_cols_ids + view_default_ids
    scopeIds = [folder['id'].lstrip('syn')]

    # Create an empty entity-view with the folder as its defined scope
    entity_view = EntityViewSchema(name=str(uuid.uuid4()),
                                   scopeIds=scopeIds,
                                   addDefaultViewColumns=True,
                                   addAnnotationColumns=False,
                                   type='file',
                                   columns=my_added_cols,
                                   parent=project)

    entity_view = syn.store(entity_view)
    schedule_for_cleanup(entity_view)

    assert set(scopeIds) == set(entity_view.scopeIds)
    assert set(col_ids) == set(entity_view.columnIds)
    assert EntityViewType.FILE.value == entity_view.viewTypeMask

    # Get the current view-schema
    view = syn.tableQuery("select * from %s" % entity_view.id)
    schedule_for_cleanup(view.filepath)

    view_dict = list(
        csv.DictReader(io.open(view.filepath, encoding="utf-8", newline='')))

    # Check that all of the annotations were retrieved from the view
    assert set(file_annotations.keys()).issubset(set(view_dict[0].keys()))

    updated_a_file = syn.get(a_file.id, downloadFile=False)

    # Check that the values are the same as what was set,
    # both in the view and on the entity itself
    for k, v in file_annotations.items():
        assert view_dict[0][k] == v
        assert updated_a_file.annotations[k][0] == v

    # Make a change to the view and store it
    view_dict[0]['fileFormat'] = 'PNG'

    with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as temp:
        schedule_for_cleanup(temp.name)
        temp_filename = temp.name

    with io.open(temp_filename, mode='w', encoding="utf-8", newline='') as temp_file:
        dw = csv.DictWriter(temp_file,
                            fieldnames=view_dict[0].keys(),
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator=str(os.linesep))
        dw.writeheader()
        dw.writerows(view_dict)
        temp_file.flush()

    syn.store(Table(entity_view.id, temp_filename))
    new_view_dict = list(
        csv.DictReader(io.open(temp_filename, encoding="utf-8", newline='')))
    assert new_view_dict[0]['fileFormat'] == 'PNG'

    # Query for the change
    start_time = time.time()

    new_view_results = syn.tableQuery("select * from %s" % entity_view.id)
    schedule_for_cleanup(new_view_results.filepath)
    new_view_dict = list(
        csv.DictReader(io.open(new_view_results.filepath,
                               encoding="utf-8", newline='')))
    # Query until the change is seen.
    while new_view_dict[0]['fileFormat'] != 'PNG':
        # Check timeout
        assert time.time() - start_time < QUERY_TIMEOUT_SEC
        # Query again
        new_view_results = syn.tableQuery("select * from %s" % entity_view.id)
        new_view_dict = list(
            csv.DictReader(io.open(new_view_results.filepath,
                                   encoding="utf-8", newline='')))
    # Paranoid check
    assert new_view_dict[0]['fileFormat'] == 'PNG'
def process_rows(syn, voice_table_id=VOICE_TABLE, last_row_version=None,
                 limit=None, offset=None, data_columns=DATA_COLUMNS,
                 completed=None, out_files=None,
                 opensmile_conf=OPENSMILE_CONF_PATH,
                 cleanup=True, append=True):
    """
    Perform audio feature extraction for rows in a Synapse table containing
    file references. For each .m4a audio file:

        1. convert to .wav with ffmpeg
        2. extract features using openSMILE
        3. accumulate metadata and features in an output .csv file

    Parameters:
        syn: connection to Synapse
        voice_table_id (str): Synapse ID of voice table.
        last_row_version: Compute features only for rows whose version is
            greater than the one given. Enables processing rows added to the
            source table since the script was last run.
        limit: Maximum number of rows to process
        offset: Process rows starting from an offset from the 1st row
        data_columns (list of str): Column names holding file handles
        completed (list of str): Optional paths to files holding completed
            feature data (parallel to data_columns)
        out_files (list of str): Paths to files into which to write feature
            data (parallel to data_columns)
        opensmile_conf (str): Path to opensmile configuration file
        cleanup (bool): remove temporary files
        append (bool): append rows to output files

    Notes:
        The voice table is expected to hold metadata columns and fileHandles to
        raw voice data from the mPower app in m4a format. For each column in
        data_columns, the program will produce one output file, which will have
        one row of metadata and features for each row in the source table.
    """
    ##----------------------------------------------------------
    ## Query source table with limit and offset
    ##----------------------------------------------------------
    query = "SELECT {cols} FROM {table_id}".format(
        cols=','.join(double_quote(KEEP_COLUMNS + data_columns)),
        table_id=voice_table_id)
    if last_row_version:
        query += " WHERE ROW_VERSION > {0}".format(last_row_version)
    if limit:
        query += " LIMIT {0}".format(limit)
    if offset:
        query += " OFFSET {0}".format(offset)
    results = syn.tableQuery(query)

    ##----------------------------------------------------------
    ## Load results as a DataFrame, but specify string dtype for FileHandleId
    ## columns so Pandas doesn't infer their type as int64 or float (with nans
    ## for missing values).
    ##----------------------------------------------------------
    df = pd.read_csv(results.filepath,
                     dtype={col: 'string' for col in data_columns})
    df.index = ["%s_%s" % (id, version)
                for id, version in zip(df["ROW_ID"], df["ROW_VERSION"])]
    del df["ROW_ID"]
    del df["ROW_VERSION"]

    ##----------------------------------------------------------
    ## Don't redo rows for which all data columns have
    ## already been processed
    ##----------------------------------------------------------
    completed_dfs = {}
    if completed:
        completed_rows = pd.Series(True, index=df.index)
        for i, column in enumerate(data_columns):
            ## read .csv
            completed_dfs[column] = pd.read_csv(
                completed[i], dtype={col: 'string' for col in data_columns})
            ## fix calculatedMeds column name
            completed_dfs[column] = completed_dfs[column].rename(
                columns={'medTimepoint': 'calculatedMeds'})
            ## reorder metadata columns to match query results,
            ## assuming completed dfs have metadata columns similar to:
            ## recordId, createdOn, appVersion, medTimepoint, ROW_VERSION,
            ## healthCode, phoneInfo, ROW_ID, audio_countdown.m4a, audio_audio.m4a,
            ## and cols 10:72 are the 62 GeMAPS features computed by openSMILE
            ## starting with 'F0semitoneFrom27.5Hz_sma3nz_amean'
            column_index = df.columns.append(
                completed_dfs[column].columns[10:72])
            completed_dfs[column] = completed_dfs[column][column_index]
            ## track rows with all data columns completed
            completed_rows = completed_rows & df.recordId.isin(
                completed_dfs[column].recordId)
    else:
        completed_rows = pd.Series(False, index=df.index)
    df_to_download = df.loc[~completed_rows, :]

    ##----------------------------------------------------------
    ## Bulk download audio data in .m4a format
    ##----------------------------------------------------------
    file_map = syn.downloadTableColumns(Table(results.tableId, df_to_download),
                                        data_columns)

    ##----------------------------------------------------------
    ## unix time stamps -> nicely formatted dates
    ##----------------------------------------------------------
    df.createdOn = df.createdOn.apply(utils.from_unix_epoch_time)

    ##----------------------------------------------------------
    ## process audio files
    ##----------------------------------------------------------
    for i in range(df.shape[0]):
        row = df.iloc[[i], :]
        print("processing:", i, row['recordId'].values[0])

        for column, out_file in zip(data_columns, out_files):
            ## check if we've already processed this record
            if completed_rows.iloc[i]:
                out_row = completed_dfs[column].loc[
                    completed_dfs[column].recordId == df.recordId.iloc[i], :]
                print("already computed!")
            else:
                file_handle_id = df[column].iloc[i]
                ## Pandas represents missing values as nan
                if isinstance(file_handle_id, float) and math.isnan(file_handle_id):
                    continue
                try:
                    filepath = file_map[file_handle_id]
                except KeyError:
                    print('No file path for file handle id "%s".' % file_handle_id)
                    continue

                try:
                    ##----------------------------------------------------------
                    ## convert to wav
                    ##----------------------------------------------------------
                    basename, ext = os.path.splitext(os.path.basename(filepath))
                    wave_file = basename + ".wav"
                    if os.path.exists(wave_file):
                        os.remove(wave_file)
                    command = "ffmpeg -i {infile} -ac 2 {outfile}".format(
                        infile=filepath, outfile=wave_file)
                    output = subprocess.check_output(command, shell=True,
                                                     stderr=subprocess.STDOUT)

                    ##----------------------------------------------------------
                    ## extract features with openSMILE
                    ## example: SMILExtract -I output.wav -C ./openSMILE-2.1.0/config/gemaps/GeMAPSv01a.conf --csvoutput features.csv
                    ##----------------------------------------------------------
                    features_file = basename + ".csv"
                    if os.path.exists(features_file):
                        os.remove(features_file)
                    command = ("SMILExtract -I {input_file} -C {conf_file}"
                               " --csvoutput {output_file}").format(
                                   input_file=wave_file,
                                   conf_file=opensmile_conf,
                                   output_file=features_file)
                    output = subprocess.check_output(command, shell=True,
                                                     stderr=subprocess.STDOUT)

                    ##----------------------------------------------------------
                    ## merge metadata and features
                    ##----------------------------------------------------------
                    features = pd.read_csv(features_file, sep=';', index_col=None)
                    ## get rid of useless column
                    features.drop('name', axis=1, inplace=True)
                    ## force the indexes to be equal so they will concat into 1 row
                    features.index = row.index
                    out_row = pd.concat((row, features), axis=1)
                except Exception as ex1:
                    try:
                        sys.stderr.write(
                            "~~>Exception while processing record:{record}\n".format(
                                record=row['recordId']))
                    except Exception as ex2:
                        sys.stderr.write("~~~>Exception while processing record.\n")
                finally:
                    if cleanup:
                        ## openSMILE wants to output an .arff file whether you ask
                        ## it to or not. Worse yet, it appends, so it keeps
                        ## growing. Let's clean it up.
                        opensmile_arff_file = "output.arff"
                        for v in ['wave_file', 'features_file', 'opensmile_arff_file']:
                            try:
                                if v in locals() and os.path.exists(locals()[v]):
                                    os.remove(locals()[v])
                            except Exception as ex:
                                sys.stderr.write('Error cleaning up temp files: ')
                                sys.stderr.write(str(ex))
                                sys.stderr.write('\n')

            ## append row to output .csv file
            append = (append or i > 0)
            with open(out_file, 'a' if append else 'w') as f:
                out_row.to_csv(f, header=(not append), index=False,
                               quoting=csv.QUOTE_NONNUMERIC)

    print("processing rows complete!")
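# --- Usage sketch (added; not part of the original source) -------------------
# A minimal, hedged example of driving process_rows(). It assumes ffmpeg and
# SMILExtract are on the PATH and that VOICE_TABLE / DATA_COLUMNS /
# OPENSMILE_CONF_PATH are defined as in this module; the output file names are
# illustrative only and must be parallel to data_columns.
def example_process_rows():
    import synapseclient
    syn = synapseclient.login()  # assumes cached credentials
    process_rows(syn,
                 limit=10,
                 out_files=["countdown_features.csv", "audio_features.csv"],
                 cleanup=True,
                 append=False)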
if not oldValues.empty:
    if not all([old == new for old, new in zip(oldValues.values[0], newValues)]):
        projectTrackerDf[projectTrackerDf['projectEntity'] == synId] = newValues
    else:
        removeSamples.append(synId)
else:
    projectTrackerDf = projectTrackerDf.append(
        pd.DataFrame([newValues],
                     columns=['projectEntity', 'numberOfFiles',
                              'numberOfContributors', 'lateModified', 'Active']))

newUploads = projectTrackerDf[~projectTrackerDf['projectEntity'].isin(removeSamples)]
if not newUploads.empty:
    newUploads['lateModified'] = newUploads['lateModified'].apply(int)
    newUploads['numberOfFiles'] = newUploads['numberOfFiles'].apply(int)
    newUploads['numberOfContributors'] = newUploads['numberOfContributors'].apply(int)
    newUploads['lateModified'][newUploads['lateModified'] == 0] = ""
    schema = syn.get(projectUploadActivitySynId)
    tablestore = Table(schema, newUploads, etag=projectTracker.etag)
    tablestore = syn.store(tablestore)
else:
    print("No updates!")

# Table 2: Files by assay type
# Assay Type -- Number of Files -- Number of Cell Lines
# > assay, grab number of unique synapseid, sampleIdentifier
# https://www.synapse.org/#!Synapse:syn4939478/wiki/411658
ntap_generated_data_synId = "syn7805078"
ntap_generated_data = syn.tableQuery('SELECT * FROM %s' % ntap_generated_data_synId)
ntap_generated_data_df = ntap_generated_data.asDataFrame()
annot_synIds = ["syn7506024", "syn7805075", "syn7992153"]
assaysNumSynId = {}
def opensmile_features_to_synapse(in_files, synapse_project_id, table_name,
                                  username, password):
    """
    Save openSMILE's SMILExtract audio features to a Synapse table.

    Parameters
    ----------
    in_files : list of strings
        full path to the input files
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    table_name : string
        schema name of table
    synapse_table_id : string
        Synapse table ID

    Examples
    --------
    >>> from mhealthx.features import opensmile_features_to_synapse
    >>> in_files = ['/home/arno/smile/test1.wav.csv',
    ...             '/home/arno/smile/test2.wav.csv',
    ...             '/home/arno/smile/test3.wav.csv']
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Phonation openSMILE feature table'
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_table_id = opensmile_features_to_synapse(in_files,
    ...     synapse_project_id, table_name, username, password)

    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    from mhealthx.io_data import concatenate_tables_to_synapse_table as cat

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Store each file as a row in a Synapse table:
    # read every input file and concatenate the resulting frames
    frames = [pd.read_csv(in_file) for in_file in in_files]
    table_data, project_id = cat(frames, synapse_project_id, table_name,
                                 username, password)

    # Create table schema:
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id)

    # Store as Synapse table:
    table = syn.store(Table(schema, table_data))
    synapse_table_id = str(table.tableId)

    return table_data, table_name, synapse_table_id
def createAMPADTable(keyFile, clinicalFile):
    """
    Create the AMP AD table with merged data from keyFile with clinicalFile.
    If any of the supplementary files exist for a particular dataset, change
    the binary classifiers to the Synapse ID holding the data and reset 0 to
    null for the table.

    Input:
        keyFile:      Dataframe with the keys and information regarding what
                      exists for each patient
        clinicalFile: Dataframe with clinical data for various patients
    """
    toUpload = []
    clinicalHeader = clinicalFile.columns.values
    #seenList = []

    # Iterate through each project within keyFile
    for i, row in keyFile.iterrows():
        # Create an empty list for the new row to be added to the Synapse table
        newRow = []
        # Ignore binary variables, which all end in '_data'
        for item in row.iteritems():
            if (item[0] == 'niagas_data'):
                if (not pd.isnull(row.niagas_data)):
                    newRow.append(arrayExpressionSynID)
                else:
                    newRow.append(float('nan'))
            elif (not item[0].endswith('_data')):
                newRow.append(item[1])

        # Check if the row has clinical data
        if (row.clinical_data):
            # Create a reference to the clinicalFile project ID
            clinicalKeyList = clinicalFile['projid']
            # Get the index of the projid in the clinical file
            index = clinicalKeyList[clinicalKeyList == row.projid].index.tolist()
            if (len(index) == 1):
                index = index[0]
                #seenList.append(row.projid)
                for entry in clinicalFile.iloc[index][1:]:
                    newRow.append(entry)
            # If the length of the index list is 0, the key file thinks there
            # is clinical information for this patient, but it does not exist
            # in the clinical file
            elif (len(index) == 0):
                print("Key file indicates that projID %s should have "
                      "clinical information, but it does not exist in "
                      "the clinical information file" % row.projid)
                for _ in range(1, len(clinicalHeader)):
                    newRow.append(float('nan'))
            # If the length of the index list is greater than 1, the projID
            # appears more than once in the file. Warn the user.
            else:
                print("projID %s appears more than once in clinical file at "
                      "positions %s" % (row.projid, index))
                for _ in range(1, len(clinicalHeader)):
                    newRow.append(float('nan'))
        else:
            for _ in range(1, len(clinicalHeader)):
                newRow.append(float('nan'))

        # Check if the row has gwas data
        if (row.gwas_data):
            newRow.append(genotypeSynID)
            newRow.append(imputedGenotypeSynID)
        else:
            newRow.append(float('nan'))
            newRow.append(float('nan'))

        if (row.mwas_data):
            newRow.append(methylationSynID)
        else:
            newRow.append(float('nan'))

        if (row.mirna_data):
            newRow.append(mirnaSynID)
        else:
            newRow.append(float('nan'))

        if (row.mrna_data):
            newRow.append(rnaseqSynID)
        else:
            newRow.append(float('nan'))

        toUpload.append(newRow)

    df = pd.DataFrame(toUpload)
    columns = list(keyFile.columns.values)
    index = columns.index('clinical_data') - 1
    columns.remove('clinical_data')
    columns.remove('gwas_data')
    columns.insert(index + 1, 'genotype data')
    columns.insert(index + 2, 'imputed genotype data')
    for i in range(1, len(clinicalHeader)):
        columns.insert(index + i, clinicalHeader[i])
    df.columns = columns

    df.to_csv('mergedTables.csv', encoding='utf-8', index=False)

    print("Uploading to Synapse")
    schema = Schema(name='AMP AD Samples Table', columns=as_table_columns(df),
                    parent='syn2580853')
    syn.store(Table(schema, df))
def updateDatabase(syn, database, new_dataset, databaseSynId, uniqueKeyCols,
                   toDelete=False):
    """
    Updates a Synapse table by a row identifier with another dataset that
    has the same number and order of columns

    :param database:      The Synapse table (pandas dataframe)
    :param new_dataset:   New dataset (pandas dataframe)
    :param databaseSynId: Synapse Id of the database table
    :param uniqueKeyCols: Column(s) that make up the unique key
    :param toDelete:      If True, delete rows that are absent from new_dataset

    :returns: None
    """
    checkBy = 'UNIQUE_KEY'
    database = database.fillna("")
    new_dataset = new_dataset.fillna("")
    # Columns must be in the same order
    new_dataset = new_dataset[database.columns]
    database[uniqueKeyCols] = database[uniqueKeyCols].applymap(str)
    database[checkBy] = database[uniqueKeyCols].apply(
        lambda x: ' '.join(x), axis=1)
    new_dataset[uniqueKeyCols] = new_dataset[uniqueKeyCols].applymap(str)
    new_dataset[checkBy] = new_dataset[uniqueKeyCols].apply(
        lambda x: ' '.join(x), axis=1)

    updateSet = new_dataset[new_dataset[checkBy].isin(database[checkBy])]
    updatingDatabase = database[database[checkBy].isin(new_dataset[checkBy])]

    allRowIds = database.index.values
    rowIds = updatingDatabase.index.values

    # If you input the exact same dataframe, there is nothing to update
    if updateSet.empty and updatingDatabase.empty:
        differentRows = []
    else:
        allRowIds = database.index.values
        rowIds = updatingDatabase.index.values
        updateSet.index = updateSet[checkBy]
        updatingDatabase.index = updatingDatabase[checkBy]
        updateSet = updateSet.loc[updatingDatabase.index]
        differences = updateSet != updatingDatabase
        differentRows = differences.apply(sum, axis=1) > 0

    if sum(differentRows) > 0:
        updatingDatabase.loc[differentRows] = updateSet.loc[differentRows]
        toUpdate = updatingDatabase.loc[differentRows]
        toUpdate.index = [
            rowId for rowId, row in zip(rowIds, differentRows) if row
        ]
        del toUpdate[checkBy]
        print("Updating rows")
        table = syn.store(Table(syn.get(databaseSynId), toUpdate))
    else:
        print("No updated rows")

    # All deleted rows (this assumes that any data that doesn't show up in the
    # newly uploaded data should be deleted...)
    if toDelete:
        database.index = allRowIds
        deleteSets = database[~database[checkBy].isin(new_dataset[checkBy])]
        del deleteSets[checkBy]
        if not deleteSets.empty:
            print("Deleting Rows")
            deleteRows = syn.delete(Table(syn.get(databaseSynId), deleteSets))
        else:
            print("No deleted rows")

    # All new rows
    newSet = new_dataset[~new_dataset[checkBy].isin(database[checkBy])]
    if not newSet.empty:
        print("Adding Rows")
        del newSet[checkBy]
        table = syn.store(Table(syn.get(databaseSynId), newSet))
    else:
        print("No new rows")
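# --- Usage sketch (added; not part of the original source) -------------------
# A hedged example of calling the keyed updateDatabase() above. The table ID
# and key columns are hypothetical; the new data must have the same columns as
# the existing table.
def example_keyed_update(syn, new_csv_path):
    import pandas as pd
    databaseSynId = "syn000000"
    database = syn.tableQuery("select * from %s" % databaseSynId).asDataFrame()
    new_dataset = pd.read_csv(new_csv_path)
    updateDatabase(syn, database, new_dataset, databaseSynId,
                   uniqueKeyCols=['center', 'sample_id'], toDelete=False)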
def archive(evaluation, destination=None, token=None, name=None, query=None):
    """
    Archive the submissions for the given evaluation queue and store them in
    the destination Synapse folder.

    :param evaluation:  a synapse evaluation queue or its ID
    :param destination: a synapse folder or its ID
    :param query:       a query that will return the desired submissions. At
                        least the ID must be returned. Defaults to
                        _select * from evaluation_[EVAL_ID] where status=="SCORED"_.
    """
    challenge = {'5877348': 'FusionDetection',
                 '5952651': 'IsoformQuantification'}

    if not query:
        query = 'select * from evaluation_%s where status=="SCORED"' % utils.id_of(evaluation)
    path = challenge[utils.id_of(evaluation)]

    ## For each submission, download its associated file and write a line of metadata
    results = Query(query=query)
    if 'objectId' not in results.headers:
        raise ValueError('Can\'t find the required field "objectId" in the results '
                         'of the query: "{0}"'.format(query))

    for result in results:
        # Check if the folder has already been created in Synapse
        # (this is used as a tool to check submissions that have already been cached)
        new_map = []
        mapping = syn.get("syn7348150")
        submissionId = result[results.headers.index('objectId')]
        check = syn.query('select id,name from folder where parentId == "%s" and name == "%s"'
                          % (destination, submissionId))
        if check['totalNumberOfResults'] == 0:
            os.mkdir(submissionId)
            submission = syn.getSubmission(submissionId, downloadLocation=submissionId)

            if submission.entity.externalURL is None:
                newFilePath = submission.filePath.replace(' ', '_')
                shutil.move(submission.filePath, newFilePath)
                # Store CWL file in bucket
                os.system('gsutil cp -R %s gs://smc-rna-cache/%s' % (submissionId, path))
                with open(newFilePath, "r") as cwlfile:
                    docs = yaml.load(cwlfile)
                merged = docs['$graph']
                docker = []
                for tools in merged:
                    if tools['class'] == 'CommandLineTool':
                        if tools.get('requirements', None) is not None:
                            for i in tools['requirements']:
                                if i.get('dockerPull', None) is not None:
                                    docker.append(i['dockerPull'])
                        if tools.get('hints', None) is not None:
                            for i in tools['hints']:
                                if i.get('dockerPull', None) is not None:
                                    docker.append(i['dockerPull'])
                    if tools['class'] == 'Workflow':
                        hints = tools.get("hints", None)
                        if hints is not None:
                            for i in tools['hints']:
                                if os.path.basename(i['class']) == "synData":
                                    temp = syn.get(i['entity'])
                                    # Create synid and index mapping
                                    new_map.append([temp.id,
                                                    "gs://smc-rna-cache/%s/%s/%s"
                                                    % (path, submissionId, temp.name)])
                                    # Store index files
                                    os.system('gsutil cp %s gs://smc-rna-cache/%s/%s'
                                              % (temp.path, path, submissionId))
                                    os.system('rm -rf ~/.synapseCache/*')
            else:
                os.system('rm %s' % os.path.join(submissionId, submission.name))
                test = subprocess.check_call(
                    ["python",
                     os.path.join(os.path.dirname(__file__),
                                  "../../SMC-RNA-Eval/sbg-download.py"),
                     "--token", token, submission.name, submissionId])
                os.system('gsutil cp -R %s gs://smc-rna-cache/%s' % (submissionId, path))
                # Pull down docker containers
                with open("%s/submission.cwl" % submissionId, "r") as cwlfile:
                    docs = yaml.load(cwlfile)
                merged = docs['steps']
                docker = []
                for tools in merged:
                    for hint in tools['run']['hints']:
                        if hint['class'] == 'DockerRequirement':
                            docker.append(hint['dockerPull'])
                    for require in tools['run']['requirements']:
                        if require.get('requirements') is not None:
                            for i in require.get('requirements'):
                                if i['class'] == 'DockerRequirement':
                                    docker.append(i['dockerPull'])

            os.system('rm -rf %s' % submissionId)
            if len(new_map) > 0:
                table = syn.store(Table(mapping, new_map))

            # Pull, save, and store docker containers
            docker = set(docker)
            for i in docker:
                fileName = os.path.basename(i).replace(":", "_")
                os.system('sudo -i docker pull %s' % i)
                #os.system('sudo -i docker save %s' % i)
                os.system('sudo docker save -o %s.tar %s' % (fileName, i))
                os.system('sudo chmod a+r %s.tar' % fileName)
                os.system('gsutil cp %s.tar gs://smc-rna-cache/%s/%s'
                          % (fileName, path, submissionId))
                os.remove("%s.tar" % fileName)

            submission_parent = syn.store(Folder(submissionId, parent=destination))