def loadOneSample(a):
    """Process a single json annotation file ``a``:

    1) Find the parent Folder where to store the file (or make directories)
    2) Fetch the md5 of any existing file and compare
    3) If new or different md5, upload the file.

    Relies on module-level globals: ``args``, ``syn``, ``getParentFolder``,
    ``File``, ``Activity``.
    """
    logging.debug("Loading:" + a)
    with open(a) as handle:
        meta = json.load(handle)
    # Data file sits next to the annotation file, minus the ".json" suffix.
    # BUG FIX: the dot was unescaped (r'.json$'), which could also strip the
    # character before "json" on names not ending in a literal ".json".
    dpath = re.sub(r'\.json$', '', a)
    # Skip the rest if the data file is empty or we are restricted to a
    # different acronym via --acronym.
    if os.stat(dpath).st_size == 0 or (
            args.acronym != meta['annotations']['acronym'] and
            args.acronym is not None):
        return
    parentId = getParentFolder(syn, args.project, meta)
    # Determine if we are updating an existing file and if we should update
    # based on md5.
    query = "select id from entity where parentId=='%s' and name=='%s'" % (
        parentId, meta['name'])
    res = list(syn.chunkedQuery(query))
    if len(res) != 0:
        tmp_ent = syn.get(res[0]['entity.id'], downloadFile=False)
        # Upload only when the stored md5 differs from the annotation's md5.
        upload = (tmp_ent.md5 != meta['annotations']['md5'])
        logging.debug("\tFound: %s and upload (MD5 %s match)" % (
            tmp_ent.id, 'DOESN\'T' if upload else 'does'))
    else:
        logging.debug("\tNot found:" + meta['name'])
        upload = True
    # Dry-run mode: report what would be uploaded without pushing.
    if upload and not args.push:
        logging.info("\tWILL UPLOAD: %s" % meta['name'])
    if upload and args.push:
        entity = File(dpath, name=meta['name'], parentId=parentId,
                      annotations=meta['annotations'])
        if 'provenance' in meta:
            # Fix labels for urls: Synapse requires a name on used URLs.
            for u in meta['provenance']['used']:
                if 'name' not in u and 'url' in u:
                    u['name'] = u['url']
            prov = Activity(data=meta['provenance'])
            prov.executed('https://github.com/Sage-Bionetworks/tcgaImport')
        else:
            prov = None
        logging.debug('\tUploading:%s' % entity.name)
        entity = syn.store(entity, activity=prov)
        logging.debug('\tCreated/Updated: **** %s ****' % entity.id)
def add_workflow_step_to_synapse(inFilePath, stepDict, step='1',
                                 software=None, parentid=None, syn=None,
                                 stepIDs=None, inFilename=None):
    '''Uploads files with provenance and annotations to Synapse.

    :param inFilePath: local path of the file to store
    :param stepDict: step configuration; reads 'softwareName', 'used',
        'depends', 'executed', 'actName', 'description', 'fileDescription',
        'store' and optionally 'annotations'
    :param step: step label (currently unused in the body)
    :param software: software name; defaults to stepDict['softwareName']
    :param parentid: Synapse ID of the parent container
    :param syn: logged-in Synapse client
    :param stepIDs: mapping of step name -> previously created entity ID,
        used to resolve 'depends'
    :param inFilename: name for the Synapse entity; defaults to the basename
        of inFilePath
    :return: the Synapse ID of the stored file
    '''
    usedList = None
    if not inFilename:
        inFilename = os.path.basename(inFilePath.strip())
    if not software:
        software = stepDict['softwareName']
    if 'used' in stepDict:
        usedList = stepDict['used'].strip().split(',')
        if 'depends' in stepDict:
            usedList.append(stepIDs[stepDict['depends']])
    elif 'depends' in stepDict:
        # NOTE(review): kept as a bare ID (not a list) as in the original;
        # confirm Activity.used accepts a scalar in this client version.
        usedList = stepIDs[stepDict['depends']]
    # 'executed' is a ';'-separated list of "target[,version]" items.
    execList = stepDict['executed'].strip().split(';')
    act = Activity(name=stepDict['actName'],
                   description=stepDict['description'])
    if usedList is not None:
        act.used(usedList)
    for item in execList:
        splitItem = item.split(',')
        target = splitItem[0]
        version = 1
        if len(splitItem) > 1:
            version = splitItem[1]
        if target.startswith('http'):
            # URLs are recorded by name (basename) rather than by version.
            act.executed(url=target, name=os.path.basename(target))
        else:
            act.executed(target=target, targetVersion=version)
    step_file = File(path=inFilePath, name=inFilename,
                     description=stepDict['fileDescription'],
                     parentId=parentid,
                     synapseStore=str2bool(stepDict['store']))
    step_file = syn.store(step_file, activity=act, forceVersion=False)
    if 'annotations' in stepDict:
        syn.setAnnotations(step_file, annotations=stepDict['annotations'])
    # BUG FIX: was a Python 2-only print statement; the print(...) form
    # behaves the same under Python 2 and is valid under Python 3.
    print('new entity id %s' % step_file.id)
    return step_file.id
def loadOneSample(a):
    """Goes through a single json annotation file a and:
    1) Finds the parent Folder where to store the file (or makes directories)
    2) Fetches the md5 of any existing file and compares
    3) If new or different md5 upload file.

    Relies on module-level globals: ``args``, ``syn``, ``getParentFolder``,
    ``File``, ``Activity``.
    """
    logging.debug("Loading:" + a)
    with open(a) as handle:
        meta = json.load(handle)
    # BUG FIX: escape the dot — r'.json$' could also consume the character
    # before "json" on filenames not ending in a literal ".json".
    dpath = re.sub(r'\.json$', '', a)
    # Skip the rest of the loop if data file is empty or we are not doing
    # the current acronyms.
    if os.stat(dpath).st_size == 0 or (
            args.acronym != meta['annotations']['acronym'] and
            args.acronym is not None):
        return
    parentId = getParentFolder(syn, args.project, meta)
    # Determine if we are updating an existing file and if we should update
    # based on md5.
    query = "select id from entity where parentId=='%s' and name=='%s'" % (
        parentId, meta['name'])
    res = list(syn.chunkedQuery(query))
    if len(res) != 0:
        tmp_ent = syn.get(res[0]['entity.id'], downloadFile=False)
        upload = (tmp_ent.md5 != meta['annotations']['md5'])
        logging.debug("\tFound: %s and upload (MD5 %s match)" % (
            tmp_ent.id, 'DOESN\'T' if upload else 'does'))
    else:
        logging.debug("\tNot found:" + meta['name'])
        upload = True
    # Prepare the entity for upload (dry run unless --push was given).
    if upload and not args.push:
        logging.info("\tWILL UPLOAD: %s" % meta['name'])
    if upload and args.push:
        entity = File(dpath, name=meta['name'], parentId=parentId,
                      annotations=meta['annotations'])
        if 'provenance' in meta:
            # Fix labels for urls
            for u in meta['provenance']['used']:
                if 'name' not in u and 'url' in u:
                    u['name'] = u['url']
            prov = Activity(data=meta['provenance'])
            prov.executed('https://github.com/Sage-Bionetworks/tcgaImport')
        else:
            prov = None
        logging.debug('\tUploading:%s' % entity.name)
        entity = syn.store(entity, activity=prov)
        logging.debug('\tCreated/Updated: **** %s ****' % entity.id)
def test_activity_used_execute_methods():
    """test activity creation and used and execute methods"""
    a = Activity(name='Fuzz', description='hipster beard dataset')
    a.used({'id': 'syn101', 'versionNumber': 42,
            'concreteType': 'org.sagebionetworks.repo.model.FileEntity'})
    a.executed('syn102', targetVersion=1)
    usedEntities = a['used']
    # BUG FIX: was `len(usedEntities), 2` — a no-op tuple expression that
    # asserted nothing; one used plus one executed record are expected.
    assert len(usedEntities) == 2
    assert a['name'] == 'Fuzz'
    assert a['description'] == 'hipster beard dataset'
    # The used (non-executed) reference keeps its version and flag.
    used_syn101 = utils._find_used(
        a, lambda res: res['reference']['targetId'] == 'syn101')
    assert used_syn101['reference']['targetVersionNumber'] == 42
    assert not used_syn101['wasExecuted']
    # The executed reference is flagged wasExecuted.
    used_syn102 = utils._find_used(
        a, lambda res: res['reference']['targetId'] == 'syn102')
    assert used_syn102['reference']['targetVersionNumber'] == 1
    assert used_syn102['wasExecuted']
import synapseclient
from synapseclient import File, Activity

syn = synapseclient.Synapse()
syn.login()

### Ensembl raw counts
# Annotations attached to the uploaded count matrix.
annotDict = {
    'fileType': 'count',
    'normalized': 'no',
    'summaryLevel': 'gene',
}

# Provenance: raw gene counts produced by HTSeq from the BAM + GTF inputs.
act = Activity(name='Counting', description='Raw gene counts using HTSeq.')
act.used(['syn2290932', 'syn2215531'])  # syn2290932 is BAM, syn2215531 is GTF
act.executed('syn2243147')  # syn2243147 is htseq

counts = File(
    path='/projects/CommonMind/data/FROM_CORE/Production/readCounts/'
         'CMC.DataFreeze.CountMatrix_V7.ensemble.Clean.txt',
    name='PFC_CountMatrix_ensembl.txt',
    description='Gene counts for all BAMs summarized using Ensembl gene '
                'models. QC counts (e.g. "ambiguous") from HTSeq are not '
                'included.',
    parentId='syn2290933',
    synapseStore=True)
counts = syn.store(counts, activity=act)
syn.setAnnotations(counts, annotations=annotDict)
def test_activity_used_url():
    """test activity creation with UsedURLs"""
    u1 = 'http://xkcd.com'
    u2 = {'name': 'The Onion', 'url': 'http://theonion.com'}
    u3 = {'name': 'Seriously advanced code',
          'url': 'https://github.com/cbare/Pydoku/blob/ef88069f70823808f3462410e941326ae7ffbbe0/solver.py',
          'wasExecuted': True}
    u4 = {'name': 'Heavy duty algorithm',
          'url': 'https://github.com/cbare/Pydoku/blob/master/solver.py'}
    a = Activity(name='Foobarbat',
                 description='Apply foo to a bar and a bat',
                 used=[u1, u2, u3],
                 executed=[u3, u4])
    a.executed(url='http://cran.r-project.org/web/packages/glmnet/index.html',
               name='glm.net')
    a.used(url='http://earthquake.usgs.gov/earthquakes/feed/geojson/2.5/day',
           name='earthquakes')

    def find_by_name(name):
        """Locate the used/executed record carrying the given name."""
        return utils._find_used(
            a, lambda res: 'name' in res and res['name'] == name)

    # Bare-string URL recorded as a plain (non-executed) used resource.
    rec = utils._find_used(a, lambda res: 'url' in res and res['url'] == u1)
    assert rec is not None
    assert rec['url'] == u1
    assert not rec['wasExecuted']

    rec = find_by_name('The Onion')
    assert rec is not None
    assert rec['url'] == 'http://theonion.com'
    assert not rec['wasExecuted']

    # u3 appears in both lists; the executed flag from the dict is kept.
    rec = find_by_name('Seriously advanced code')
    assert rec is not None
    assert rec['url'] == u3['url']
    assert rec['wasExecuted'] == u3['wasExecuted']

    rec = find_by_name('Heavy duty algorithm')
    assert rec is not None
    assert rec['url'] == u4['url']
    assert rec['wasExecuted']

    # URLs added via the executed()/used() methods after construction.
    rec = find_by_name('glm.net')
    assert rec is not None
    assert rec['url'] == 'http://cran.r-project.org/web/packages/glmnet/index.html'
    assert rec['wasExecuted']

    rec = find_by_name('earthquakes')
    assert rec is not None
    assert rec['url'] == 'http://earthquake.usgs.gov/earthquakes/feed/geojson/2.5/day'
    assert not rec['wasExecuted']
import synapseclient
from synapseclient import File, Activity

syn = synapseclient.Synapse()
syn.login()

### Ensembl raw counts
annotDict = dict()
annotDict['fileType'] = 'count'
annotDict['normalized'] = 'no'
annotDict['summaryLevel'] = 'gene'

# Provenance: raw gene counts produced by HTSeq from the BAM + GTF inputs.
act = Activity(name='Counting', description='Raw gene counts using HTSeq.')
act.used(['syn2290932', 'syn2215531'])  # syn2290932 is BAM, syn2215531 is GTF
act.executed('syn2243147')  # syn2243147 is htseq

# BUG FIX: the implicit string concatenation in `description` dropped the
# spaces between 'BAMs'/'summarized' and 'HTSeq'/'are', producing
# "BAMssummarized" and "HTSeqare"; the trailing spaces restore them.
counts = File(path='/projects/CommonMind/data/FROM_CORE/Production/readCounts/CMC.'
                   'DataFreeze.CountMatrix_V7.ensemble.Clean.txt',
              name='PFC_CountMatrix_ensembl.txt',
              description='Gene counts for all BAMs '
                          'summarized using Ensembl gene models. '
                          'QC counts (e.g. "ambiguous") from HTSeq '
                          'are not included.',
              parentId='syn2290933',
              synapseStore=True)
counts = syn.store(counts, activity=act)
syn.setAnnotations(counts, annotations=annotDict)

# Need to check:
# - annotations: standards?
# - synapseclient: auto-return synapse id on upload
# - syn.store: specify activity independently?
# - Activity.executed: executable file, or just script?