def findFilesAlreadyInSynapse(): """Determine the files already stored in Synapse""" allFiles= synapseHelpers.query2df(syn.chunkedQuery("select * from file where benefactorId=='syn2351328'"), False) print 'Found', len(allFiles), 'files in Synapse. Fetching urls...' def get(id): print id return syn.get(id, downloadFile=False) entities = p.map(get, allFiles.id) return entities
def get_annotations(self):
    """Fetch annotations for every file under the configured benefactor.

    Runs a chunked Synapse query (the result set is large), normalizes each
    row through ``self.fixRow``, converts the rows to a DataFrame, caches it
    on ``self.annots`` and returns it.
    """
    params = dict(cols=",".join(self._COLS), id=self._INPUT_BENEFACTOR_ID)
    query = "select %(cols)s from file where benefactorId=='%(id)s'" % params
    logger.debug(query)
    # chunkedQuery streams results; wrap them in a dict so the payload has
    # the same shape a plain (non-chunked) query result would have.
    rows = self.syn.chunkedQuery(query)
    wrapped = dict(results=map(self.fixRow, rows))
    self.annots = synapseHelpers.query2df(wrapped, filterSynapseFields=False)
    return self.annots
'BM4-PCG': ['Precentral Gyrus', 'PCG'], 'BM44-IFG': ['Inferior Frontal Gyrus', 'IFG'], 'BM46-PFC': ['Dorsolateral Prefrontal Cortex', 'PFC'], 'BM7-SPL': ['Superior Parietal Lobule', 'SPL'], 'BM8-FC': ['Prefrontal Cortex', 'FC'], 'BMa-AMYG': ['Amygdala','AMYG'], 'BMb-CD': ['Caudate Nucleus','CD'], 'BMc-HIPP': ['Hippocampus', 'HIPP'], 'BMd-NAc': ['Nucleus Accumbens','NAc'], 'Bme-PT': ['Putamen','PT']} PLATFORM_MAP = {'133AB': 'AffymetrixU133AB', 'Plus2': 'AffymetrixU133Plus2'} query = 'select id, name from entity where parentId=="%s"' %OLDPARENTID df = synapseHelpers.query2df(syn.chunkedQuery(query)) for i in range(1,df.shape[0]): row = df.ix[i, :] ent = syn.get(row.id) fStudy, fTissue, fPlatform, fDatatype, fRest = ent.name.split('_') name = 'AMP-AD_MSBB_MSSM_%s_%s_%s' % (PLATFORM_MAP[fPlatform], TISSUEABRMAP[fTissue][0], fRest) print name os.rename(ent.path, name) f = File(name, parentId=NEWPARENTID, name=name[7:]) f.consortium = 'AMP-AD' f.study = 'MSBB' f.center = 'MSSM' f.dataType = 'mRNA' f.disease = 'Alzheimers Disease'
import synapseclient
from synapseclient import File
import synapseHelpers
import pandas as pd
import os, urllib, urlparse
import multiprocessing.dummy as mp
from collections import Counter

# All somatic DNA call files in the PCAWG project.
QUERY = ("select * from file where projectId=='syn2351328' "
         "and dataType=='DNA' and call_type=='somatic'")

syn = synapseclient.Synapse(skip_checks=True)
syn.login(silent=True)

if __name__ == '__main__':
    df = synapseHelpers.query2df(syn.chunkedQuery(QUERY), True,
                                 ['name', 'id', 'parentId'])
    # Keep only the variant-call subtypes of interest.
    df = df[[x in ('snv_mnv', 'sv', 'indel', 'cnv') for x in df.dataSubType]]
    # Prettify source names: first underscore-delimited token, uppercased;
    # non-string centers (NaN) become ''.
    df['source'] = [c.split('_')[0].upper() if isinstance(c, basestring) else ''
                    for c in df.center]
    # FIX: use .loc instead of chained indexing (df.source[mask] = ...),
    # which assigns through a potential copy and can silently fail to
    # update df (pandas SettingWithCopy hazard).
    df.loc[df.workflow_name == 'SangerPancancerCgpCnIndelSnvStr',
           'source'] = 'Sanger'
    # Number of unique samples per (source, workflow) x dataSubType.
    counts = pd.pivot_table(df, 'sample_id',
                            rows=['source', 'workflow_name'],
                            cols=['dataSubType'],
                            aggfunc=lambda x: len(set(x)))
    # Display number of samples
    # counts.plot(kind='bar')  # Attempt at getting rid of missing bars
id = syn._findEntityIdByNameAndParent(name, parentId) if id is None: return False activity = syn.getProvenance(id) used = set([ '%s.%s' % (x['reference']['targetId'], x['reference']['targetVersionNumber']) for x in activity['used'] if x['wasExecuted'] == False ]) currentVersions = set(['%s.%s' % (x.id, x.versionNumber) for x in files]) return currentVersions == used mp = Pool(8) syn = synapseclient.login(silent=True) allFiles = query2df(syn.chunkedQuery(query_str)) for platform, dataSubType, name in platforms: print platform, dataSubType, filteredMeta = allFiles[(allFiles.platform == platform) & (allFiles.dataSubType == dataSubType) & (allFiles.acronym != 'PANCAN')] files = mp.map(syn.get, filteredMeta.id) if isUptodate(name, files, args.parentId): print ' is up to date' continue if list(set(filteredMeta.fileType))[0] in ['seg', 'bed']: dfs = mp.map(lambda f: pd.read_csv(f.path, sep='\t'), files) df = pd.concat(dfs, axis=0) df.to_csv(args.filepath + name, sep='\t', index=False) nSamples = len(set(df.Sample)) nFeatures = 0
metadata['nFeatures'] = nFeatures metadata['samples'] = samples metadata['patient_barcode'] = [x[:12] for x in metadata.samples] metadata.drop(['tissue', u'md5', u'assembly'], axis=1, inplace=True) metadata.nFeatures = metadata.nFeatures.astype('int') cols = syn.tableQuery('select * from %s limit 1' %args.tableId).asDataFrame().columns #Update rows in table print 'adding', metadata.shape[0] t = syn.store(Table(tableId, metadata[cols])) return metadata if __name__ == '__main__': parser = argparse.ArgumentParser(description=('Updates a Synapse Table with ' 'sample sampling information')) parser.add_argument('-t', '--table', dest='tableId', default='syn3281840', help='Table where results are stored (e.g. syn3281840) ') parser.add_argument('-p', '--project', dest='projectId', default='syn2812961', help='Project (benefactorId) where output files are stored. (e.g. syn2812961)') args = parser.parse_args() files = synapseHelpers.query2df(syn.chunkedQuery(FILEQUERY % args.projectId), savedSynapseFields=('id', 'name', 'versionNumber')) updatedFiles= findUpdates(files, args.tableId) print 'NEED TO UPDATE:', updatedFiles.shape[0], 'FILES' deleteAffectedRows(updatedFiles, args.tableId) dfs = [countAndUpdateTable(row, tableId=args.tableId) for row in updatedFiles.iterrows()]
return False activity = syn.getProvenance(id) used = set( [ "%s.%s" % (x["reference"]["targetId"], x["reference"]["targetVersionNumber"]) for x in activity["used"] if x["wasExecuted"] == False ] ) currentVersions = set(["%s.%s" % (x.id, x.versionNumber) for x in files]) return currentVersions == used mp = Pool(8) syn = synapseclient.login(silent=True) allFiles = query2df(syn.chunkedQuery(query_str)) for platform, dataSubType, name in platforms: print platform, dataSubType, filteredMeta = allFiles[ (allFiles.platform == platform) & (allFiles.dataSubType == dataSubType) & (allFiles.acronym != "PANCAN") ] files = mp.map(syn.get, filteredMeta.id) if isUptodate(name, files, args.parentId): print " is up to date" continue if list(set(filteredMeta.fileType))[0] in ["seg", "bed"]: dfs = mp.map(lambda f: pd.read_csv(f.path, sep="\t"), files) df = pd.concat(dfs, axis=0) df.to_csv(args.filepath + name, sep="\t", index=False) nSamples = len(set(df.Sample)) nFeatures = 0
def getChangeSet(version, platform): """Extracts the old whitelist id and version of used and filters the changes down to a specific platform.""" old_whitelist = syn.get(WHITELISTID, version=version) whitelist = pd.read_csv(whitelistEntity.path, sep='\t') oldToRemove = set(whitelist.ix[whitelist.Do_not_use & (whitelist.platform==platform), 'aliquot_barcode']) return oldToRemove #mp = Pool(8) syn = synapseclient.login(silent=True) whitelistEntity = syn.get(WHITELISTID) whitelist = pd.read_csv(whitelistEntity.path, sep='\t') inputFiles = synapseHelpers.query2df(syn.chunkedQuery(QUERY_STR)) code=synapseHelpers.thisCodeInSynapse(parentId='syn1774100') for i, row in inputFiles.iterrows(): print row.id, row['name'], inputFileEntity = syn.get(row.id) outFileName = row['name'][:-4]+'_whitelisted'+row['name'][-4:] toRemove = set(whitelist.ix[whitelist.Do_not_use & (whitelist.platform == row['platform']), 'aliquot_barcode']) if isUptodate(outFileName, [inputFileEntity], toRemove, row['platform']): print ' is up to date - but update provenance' e = syn.get(getFileIdFromName(outFileName), downloadFile=False) syn.store(e, used=[inputFileEntity, whitelistEntity], executed=code) continue