def uploadToSynapse(f):
    """Given a filepath extracts metadata and uploads to Synapse.

    The file itself is not copied: a ``File`` entity pointing at
    ``URLBASE + f`` is created with ``synapseStore=False`` and stored under
    the folder registered for its center in ``DIRS``.

    Args:
        f: '/'-separated path of the file. Unless the path contains
            'OICR_BL' or 'CRG/clindel/somatic', the center is the fifth
            path component. The remaining annotations are parsed from the
            '.'-separated fields of the file name.

    Returns:
        None. Paths whose center is not recognised (including paths too
        short to carry a center component) are reported on stdout and
        skipped.
    """
    sample_id = workflow_name = date = call_type = dataType = fileType = ''
    url = URLBASE + f
    parts = f.split('/')
    filename = parts[-1]

    if 'OICR_BL' in f:
        center = 'oicr_bl'
    elif 'CRG/clindel/somatic' in f:
        center = 'crg_clindel'
    else:
        # A path with fewer than five components carries no center; '' is
        # in neither list below, so such a path is skipped instead of
        # raising IndexError.
        center = parts[4] if len(parts) > 4 else ''

    if center in ('yale', 'wustl', 'LOHcomplete'):
        if filename == 'bd829214-f230-4331-b234-def10bbe7938CNV.vcf.gz':
            # This one file has no '.' between sample id and data type, so
            # its fields are spelled out by hand.
            sample_id, dataType, fileType = (
                'bd829214-f230-4331-b234-def10bbe7938', 'cnv', 'vcf')
        else:
            sample_id, dataType = filename.lower().split('.')[:2]
            fileType = [i for i in filename.split('.')[2:] if i != 'gz'][-1]
    elif center in ('broad', 'BSC', 'oicr_sga', 'mda_kchen', 'MDA_HGSC',
                    'mcgill_popsv', 'sfu', 'UCSC', 'oicr_bl',
                    'Synteka_pgm21', 'crg_clindel'):
        # Normalise 'indels' to 'indel' first, then split into fields.
        fields = filename.replace('indels', 'indel').split('.')
        sample_id, workflow_name, date, call_type, dataType = fields[:5]
        fileType = [i for i in filename.split('.')[5:] if i != 'gz'][-1]
    else:
        print('Not uploading: %s' % (f,))
        return

    print(' '.join([center, workflow_name, date, call_type, dataType, fileType]))

    # synapseStore=False: only the URL is recorded, the data stays where it is.
    entity = File(url, parentId=DIRS[center], synapseStore=False)
    entity.center = center.lower()
    entity.sample_id = sample_id
    entity.workflow_name = workflow_name
    entity.date = date
    entity.call_type = call_type
    entity.dataType = 'DNA'
    entity.disease = 'Cancer'
    # The data type parsed from the file name is stored as the sub type.
    entity.dataSubType = dataType
    entity.fileType = fileType
    # TODO: analysis_id_tumor is not populated yet.
    syn.store(entity, forceVersion=False)
def storeFile(fileName, stagingID, used, center, annotations, meta=False):
    """Annotate ``fileName`` and store it in Synapse under ``stagingID``.

    Args:
        fileName: Path of the local file to upload.
        stagingID: Synapse id of the parent container.
        used: Provenance passed to ``syn.store`` as its ``used`` argument.
        center: Value for the ``center`` annotation.
        annotations: Mapping read with ``.get`` for the keys 'dataSubType',
            'dataType', 'fileType', 'platform' and 'tissueSource'; a missing
            key yields an empty string.
        meta: When true, ``fileType`` is forced to "txt" and ``dataType``
            to "meta", overriding the values from ``annotations``.

    Returns:
        Whatever ``syn.store`` returns for the stored entity.
    """
    print("STORING FILES")
    fileEnt = File(fileName, parent=stagingID)
    fileEnt.center = center
    fileEnt.dataSubType = annotations.get("dataSubType", '')
    fileEnt.dataType = annotations.get("dataType", '')
    fileEnt.disease = 'cancer'
    fileEnt.fileType = annotations.get("fileType", '')
    # The literal was garbled to 'H**o Sapiens'; restored to the organism name.
    fileEnt.organism = 'Homo Sapiens'
    fileEnt.platform = annotations.get("platform", '')
    fileEnt.tissueSource = annotations.get("tissueSource", '')
    fileEnt.consortium = 'GENIE'
    if meta:
        fileEnt.fileType = "txt"
        fileEnt.dataType = "meta"
    fileEnt.fileStage = "staging"
    # Provenance goes in under the `used` keyword; `annotations=` is not a
    # provenance argument of syn.store, so the link was never recorded.
    ent = syn.store(fileEnt, used=used)
    return ent
'tissueType': ['Frontal Pole', 'Superior Temporal Gyrus', 'Parahippocampal Gyrus'], 'tissueTypeAbrv': ['FP', 'STG', 'PHG'], 'name': 'AMP-AD_MSBB_MSSM_IlluminaHiSeq2500_mRNA_normalized-sex-race-age-RIN-PMI-batch-site.corrected.csv' }, } for id, v in toMove.items(): ent = syn.get(id) print v['name'] os.rename(ent.path, v['name']) f = File(v['name'], parentId=v['parentId'], name=v['name'][7:-4]) print f.name f.consortium, f.study, f.center, f.disease = consortium, study, center, disease f.dataType = v['dataType'] f.platfrom = v['platform'] if 'tissueTypeAbrv' in v: f.tissueTypeAbrv = v['tissueTypeAbrv'] f.tissueType = v['tissueType'] f.fileType = fileType f.organism = organism f = syn.store( f, used=[id], executed=[ 'https://github.com/Sage-Bionetworks/ampAdScripts/blob/e71bbde262625e6999ea9defd98e10fce8f3c542/Mount-Sinai/migrateMSBBMetaAndRNASeq.py' ], activityName='Data migration')
# Maps the platform token found in the old entity names to the platform
# annotation value / new file-name component.
PLATFORM_MAP = {'133AB': 'AffymetrixU133AB',
                'Plus2': 'AffymetrixU133Plus2'}

# List the children of the old parent, then re-upload each one under
# NEWPARENTID with a normalised name and a full set of annotations.
query = 'select id, name from entity where parentId=="%s"' % OLDPARENTID
df = synapseHelpers.query2df(syn.chunkedQuery(query))
# NOTE(review): the loop starts at 1, so the row at index 0 is never
# migrated; confirm this is intentional (e.g. already handled by hand).
for i in range(1, df.shape[0]):
    # NOTE(review): `.ix` is unavailable in pandas >= 1.0; switch to
    # `.loc`/`.iloc` once the index type of query2df's result is confirmed.
    row = df.ix[i, :]
    ent = syn.get(row.id)
    # Old names must consist of exactly five '_'-separated fields.
    fStudy, fTissue, fPlatform, fDatatype, fRest = ent.name.split('_')
    name = 'AMP-AD_MSBB_MSSM_%s_%s_%s' % (PLATFORM_MAP[fPlatform],
                                          TISSUEABRMAP[fTissue][0],
                                          fRest)
    print(name)
    # Rename the downloaded copy so the uploaded file carries the new name.
    os.rename(ent.path, name)
    # Entity name = new file name without the 7-character 'AMP-AD_' prefix.
    f = File(name, parentId=NEWPARENTID, name=name[7:])
    f.consortium = 'AMP-AD'
    f.study = 'MSBB'
    f.center = 'MSSM'
    f.dataType = 'mRNA'
    f.disease = 'Alzheimers Disease'
    # The annotation key is 'platform' (it was misspelled 'platfrom').
    f.platform = PLATFORM_MAP[fPlatform]
    f.tissueTypeAbrv = TISSUEABRMAP[fTissue][1]
    f.tissueType = TISSUEABRMAP[fTissue][0]
    f.dataSubType = 'geneExp'
    f.fileType = 'genomicMatrix'
    f.organism = 'human'
    f = syn.store(
        f,
        used=[ent],
        executed=['https://github.com/Sage-Bionetworks/ampAdScripts/blob/4d7d6b78b1e73058483354a1a18bff7422966a4b/Mount-Sinai/migrateMSBBExpression.py'],
        activityName='Data migration')
import os  # os.path.basename below (the original spelled it `of.path`)

DST_FOLDER = 'syn3079564'  #test upload folder

#Create Provenance log
# Keyword names follow the Activity constructor: `description` and
# `executed` (they were misspelled, and a comma was missing after `used`).
# NOTE(review): `executed` is filled from meta_data['used'], exactly as the
# original wrote it; confirm whether a separate 'executed' entry was meant.
provenance = Activity(name=meta_data['activity'],
                      description=meta_data['description'],
                      used=meta_data['used'],
                      executed=meta_data['used'])
#prov = syn.store(prov)

name = os.path.basename(input_path)

#Add metadata to files to be uploaded
f = File(input_path, name=name, parentId=DST_FOLDER)
f.dataType = meta_data['dataType']
# NOTE(review): fileType is filled from meta_data['dataType'], same as
# dataType above; confirm whether meta_data has a dedicated file-type entry.
f.fileType = meta_data['dataType']
f.variant_workflow = meta_data['workflow']
f.variant_workflow_version = meta_data['workflowVersion']
f.call_type = call_type
f.reference_build = meta_data['referenceBuild']
f.center_name = meta_data['center_name']
f.file_md5 = synapseclient.utils.md5_for_file(input_path)
f.study = 'PCAWG 2.0'
f.submitter_donor_id = meta_data['donor_id']
f.alignment_workflow_name = 'Workflow_Bundle_BWA (UCSC Implementation)'
f.alignment_workflow_source_url = 'https://github.com/kellrott/tcga_realign'
f.alignment_workflow_version = '2.6.0'

#Store metadata and file to Synapse
# The actual upload is currently disabled:
#f = syn.store(f, activity = provenance)
path = os.path.join(dirpath, filename) stat = os.stat(path) if stat.st_size > 0: mtime = stat.st_mtime previous_mtime = previous_uploads.get(path, None) if mtime > previous_mtime: print('Uploading {0}...'.format(path)) f = File(path, parent=parents[dirpath], name=filename) #------------------------------------------------------------- # Annotate the file on Synapse: #------------------------------------------------------------- for istr2, str2 in enumerate(types): if filename.endswith(str2): f.fileType = type_names[istr2] # Optionally add "syn.store(f, used='http://..)" # to specify the source location syn.store(f) c = conn.cursor() c.execute('INSERT OR REPLACE INTO files (path, mtime) VALUES ("%s", "%s")' % (path, mtime)) conn.commit() finally: conn.close()
'tissueTypeAbrv': ['FP', 'STG', 'PHG'], 'name' :'AMP-AD_MSBB_MSSM_IlluminaHiSeq2500_mRNA_rawCounts.tsv'}, 'syn2920161':{'parentId' :'syn3157743', #'normalized.sex_race_age_RIN_PMI_batch_site.corrected.csv' 'dataType': 'mRNA', 'platform': 'IlluminaHiSeq2500', 'tissueType':['Frontal Pole', 'Superior Temporal Gyrus','Parahippocampal Gyrus'], 'tissueTypeAbrv': ['FP', 'STG', 'PHG'], 'name' :'AMP-AD_MSBB_MSSM_IlluminaHiSeq2500_mRNA_normalized-sex-race-age-RIN-PMI-batch-site.corrected.csv'}, } for id, v in toMove.items(): ent = syn.get(id) print v['name'] os.rename(ent.path, v['name']) f = File(v['name'], parentId=v['parentId'], name=v['name'][7:-4]) print f.name f.consortium, f.study, f.center, f.disease = consortium, study, center, disease f.dataType = v['dataType'] f.platfrom = v['platform'] if 'tissueTypeAbrv' in v: f.tissueTypeAbrv = v['tissueTypeAbrv'] f.tissueType = v['tissueType'] f.fileType = fileType f.organism = organism f = syn.store(f, used = [id], executed=['https://github.com/Sage-Bionetworks/ampAdScripts/blob/e71bbde262625e6999ea9defd98e10fce8f3c542/Mount-Sinai/migrateMSBBMetaAndRNASeq.py'], activityName='Data migration')