def choose_reference(experiment, biorep_n, server, keypair, sex_specific):
    replicates = [common.encoded_get(urlparse.urljoin(server, rep_uri), keypair, frame='embedded')
                  for rep_uri in experiment['replicates']]
    replicate = next(rep for rep in replicates
                     if rep.get('biological_replicate_number') == biorep_n)
    logging.debug('Replicate uuid %s' % (replicate.get('uuid')))
    organism_uri = replicate.get('library').get('biosample').get('organism')
    organism_obj = common.encoded_get(urlparse.urljoin(server, organism_uri), keypair)
    try:
        organism_name = organism_obj['name']
    except:
        logging.error('%s:rep%d Cannot determine organism.'
                      % (experiment.get('accession'), biorep_n))
        raise
    else:
        logging.debug("Organism name %s" % (organism_name))
    if sex_specific:
        try:
            sex = replicate.get('library').get('biosample').get('sex')
            assert sex in ['male', 'female']
        except:
            logging.warning('%s:rep%d Sex is %s.  Mapping to male reference.'
                            % (experiment.get('accession'), biorep_n, sex))
            sex = 'male'
        logging.debug('Organism %s sex %s' % (organism_name, sex))
    else:
        sex = 'male'
    genome_assembly = args.assembly  # relies on the module-level args from get_args()
    reference = next((ref.get('file') for ref in REFERENCES
                      if ref.get('organism') == organism_name
                      and ref.get('sex') == sex
                      and ref.get('assembly') == genome_assembly), None)
    logging.debug('Found reference %s' % (reference))
    return reference
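# A minimal sketch of the shape each entry in the module-level REFERENCES
# table needs for the lookup in choose_reference() to succeed.  The keys are
# inferred from the generator expression above; the values are hypothetical.
#
# REFERENCES = [
#     {'organism': 'human', 'sex': 'male', 'assembly': 'GRCh38',
#      'file': 'ENCODE Reference Files:/GRCh38/GRCh38_male.tar.gz'},
#     {'organism': 'mouse', 'sex': 'female', 'assembly': 'mm10',
#      'file': 'ENCODE Reference Files:/mm10/mm10_female.tar.gz'},
# ]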
def get_rep_bams(experiment, keypair, server):
    original_files = [common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair)
                      for uri in experiment.get('original_files')]
    # resolve the biorep_n for each fastq
    for fastq in [f for f in original_files if f.get('file_format') == 'fastq']:
        replicate = common.encoded_get(urlparse.urljoin(server, '%s' % (fastq.get('replicate'))), keypair)
        fastq.update({'biorep_n': replicate.get('biological_replicate_number')})
    # resolve the biorep_n's from derived_from for each bam
    for bam in [f for f in original_files if f.get('file_format') == 'bam']:
        biorep_ns = set()
        for derived_from_uri in bam.get('derived_from'):
            derived_from_accession = os.path.basename(derived_from_uri.strip('/'))  # this assumes frame=object
            biorep_ns.add(next(f.get('biorep_n') for f in original_files
                               if f.get('accession') == derived_from_accession))
        if len(biorep_ns) != 1:
            logger.error("%s %s expected 1 biorep_n, found %d, skipping."
                         % (experiment.get('accession'), bam.get('accession'), len(biorep_ns)))
            return
        else:
            bam.update({'biorep_n': biorep_ns.pop()})
    # remove any bam that is older than another bam of the same biorep
    # (resulting in only the most recent surviving)
    bams = [f for f in original_files if f.get('file_format') == 'bam']
    for bam in bams:
        if any(after(f.get('date_created'), bam.get('date_created'))
               for f in bams if f.get('biorep_n') == bam.get('biorep_n')):
            original_files.remove(bam)
    rep1_bam = next(f for f in original_files
                    if f.get('file_format') == 'bam' and f.get('biorep_n') == 1)
    rep2_bam = next(f for f in original_files
                    if f.get('file_format') == 'bam' and f.get('biorep_n') == 2)
    return rep1_bam, rep2_bam
def main():
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    else:
        ids = args.infile

    formats = ['bed_narrowPeak', 'bed_gappedPeak']
    fieldnames = ['file', 'analysis', 'experiment', 'replicates', 'output_name',
                  'file_format', 'output_type', 'target', 'biosample_term_name',
                  'biosample_term_id', 'biosample_type', 'biosample_life_stage',
                  'biosample_age', 'biosample_organism']
    writer = csv.DictWriter(args.outfile, fieldnames, delimiter='\t')
    writer.writeheader()

    for (i, analysis_id) in enumerate(ids):
        analysis_id = analysis_id.rstrip()
        logger.info('%s' % (analysis_id))
        try:
            files = analysis_files(analysis_id, keypair, server, args.assembly)
        except:
            logger.error('%s error finding analysis_files.  Check experiment metadata.' % (analysis_id))
            continue  # without this, a stale `files` from the previous iteration would be reused
        for f in [f_obj for f_obj in files if f_obj.get('file_format') in formats]:
            fid = f['dx'].get_id()
            local_path = os.path.join(args.outdir, fid)
            if not os.path.isfile(local_path):
                if not os.path.exists(args.outdir):
                    os.makedirs(args.outdir)
                dxpy.download_dxfile(fid, local_path)
            replicates = []
            for derived_from in f['derived_from']:
                rep_ns = common.biorep_ns(derived_from, server, keypair)
                for r in rep_ns:
                    replicates.append(r)
            experiment = common.encoded_get(urlparse.urljoin(server, '/experiments/%s' % (f['dataset'])), keypair)
            rep = common.encoded_get(urlparse.urljoin(server, experiment['replicates'][0]), keypair)
            lib = common.encoded_get(urlparse.urljoin(server, rep['library']), keypair)
            biosample = common.encoded_get(urlparse.urljoin(server, lib['biosample']), keypair)
            writer.writerow({
                'file': fid,
                'analysis': analysis_id,
                'experiment': experiment.get('accession'),
                'replicates': replicates,
                'output_name': f.get('name'),
                'file_format': f.get('file_format'),
                'output_type': f.get('output_type'),
                'target': experiment.get('target'),
                'biosample_term_name': experiment.get('biosample_term_name'),
                'biosample_term_id': experiment.get('biosample_term_id'),
                'biosample_type': experiment.get('biosample_type'),
                'biosample_life_stage': biosample.get('life_stage'),
                'biosample_age': biosample.get('age'),
                'biosample_organism': biosample.get('organism')})
def get_rep_fastqs(experiment, keypair, server, repn):
    fastq_valid_status = ['released', 'in progress', 'uploaded']
    logger.debug('in get_rep_fastqs with experiment[accession] %s rep %d'
                 % (experiment.get('accession'), repn))
    original_files = [common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair)
                      for uri in experiment.get('original_files')]
    fastqs = [f for f in original_files
              if f.get('file_format') == 'fastq' and f.get('status') in fastq_valid_status]
    # resolve the biorep_n for each fastq
    rep_fastqs = [f for f in fastqs
                  if common.encoded_get(urlparse.urljoin(server, '%s' % (f.get('replicate'))),
                                        keypair).get('biological_replicate_number') == repn]
    logger.debug('get_rep_fastqs returning %s' % ([f.get('accession') for f in rep_fastqs]))
    return rep_fastqs
def main():
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)
    project = resolve_project(args.project)
    SRR_files = dxpy.find_data_objects(
        name="SRR???????_?.fastq.gz", name_mode='glob',
        classname='file', recurse=True, return_handler=True,
        folder=args.folder, project=args.project)
    for srr_dxfile in SRR_files:
        m = re.search(r'(SRR.{7})_(\d)', srr_dxfile.name)
        if m:
            srr_basename = m.group(1)
            end_num = m.group(2)
        else:
            assert m
        srr_encfiles = common.encoded_get('/'.join([
            server,
            'search/?type=File&external_accession=%s&status!=deleted&status!=replaced&status!=revoked' % (srr_basename)]),
            keypair)['@graph']
        if not srr_encfiles:
            logging.error('%s object not found at ENCODE.  Skipping.' % (srr_basename))
            continue
        elif len(srr_encfiles) > 1:
            logging.error('%s multiple matching objects found at ENCODE.  Skipping.' % (srr_basename))
            continue
        else:
            srr_encfile = srr_encfiles[0]
        # experiment = common.encoded_get('/'.join([server, srr_encfile.get('dataset')]), keypair)
        # replicate = common.encoded_get('/'.join([server, srr_encfile.get('replicate')]), keypair)
        # biorep_n = replicate.get('biological_replicate_number')
        all_fastqs = common.encoded_get('/'.join([
            server,
            'search/?type=File&file_format=fastq&derived_from=/files/%s/&status!=deleted&status!=revoked&status!=replaced' % (srr_basename)]),
            keypair)['@graph']
        if not all_fastqs:
            print("%s: no fastq(s) found.  Skipping." % (srr_dxfile.name))
            continue
        if end_num == '1':
            fastqs = [f for f in all_fastqs
                      if f.get('run_type') == 'single-ended' or f.get('paired_end') == end_num]
        elif end_num in ['2', '3']:  # '3' is treated as the second mate
            fastqs = [f for f in all_fastqs
                      if f.get('run_type') == 'paired-ended' and f.get('paired_end') == '2']
        if not fastqs:
            print("%s: no fastq(s) found for paired_end %s.  Skipping" % (srr_basename, end_num))
            continue
        elif len(fastqs) > 1:
            print("%s: ambiguous matches to %s.  Skipping" % (srr_basename, [f.get('accession') for f in fastqs]))
            continue
        else:
            fastq = fastqs[0]
            newname = '%s.fastq.gz' % (fastq.get('accession'))
            if args.dry_run:
                print('dry_run: Would rename %s to %s' % (srr_dxfile.name, newname))
            else:
                oldname = srr_dxfile.name  # capture before rename so the log shows the old name
                srr_dxfile.set_properties({'srr_filename': oldname})
                srr_dxfile.rename(newname)
                print('%s renamed to %s' % (oldname, newname))
def get_rep_bams(experiment, assembly, keypair, server):
    original_files = [common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair)
                      for uri in experiment.get('original_files')]
    # resolve the biorep_n for each fastq
    for fastq in [f for f in original_files if f.get('file_format') == 'fastq']:
        replicate = common.encoded_get(urlparse.urljoin(server, '%s' % (fastq.get('replicate'))), keypair)
        fastq.update({'biorep_n': replicate.get('biological_replicate_number')})
    # resolve the biorep_n's from derived_from for each bam
    for bam in [f for f in original_files
                if f.get('file_format') == 'bam' and f.get('assembly') == assembly]:
        biorep_ns = set()
        for derived_from_uri in bam.get('derived_from'):
            derived_from_accession = os.path.basename(derived_from_uri.strip('/'))  # this assumes frame=object
            biorep_ns.add(next(f.get('biorep_n') for f in original_files
                               if f.get('accession') == derived_from_accession))
        if len(biorep_ns) != 1:
            logger.error("%s %s expected 1 biorep_n, found %d, skipping."
                         % (experiment.get('accession'), bam.get('accession'), len(biorep_ns)))
            return
        else:
            bam.update({'biorep_n': biorep_ns.pop()})
    # remove any bam that is older than another bam of the same biorep
    # (resulting in only the most recent surviving)
    bams = [f for f in original_files
            if f.get('file_format') == 'bam' and f.get('assembly') == assembly]
    for bam in bams:
        if any(after(f.get('date_created'), bam.get('date_created'))
               for f in bams if f.get('biorep_n') == bam.get('biorep_n')):
            original_files.remove(bam)
    rep1_bam = next(f for f in original_files
                    if f.get('file_format') == 'bam' and f.get('biorep_n') == 1)
    rep2_bam = next(f for f in original_files
                    if f.get('file_format') == 'bam' and f.get('biorep_n') == 2)
    return rep1_bam, rep2_bam
def get_possible_ctl_ta(experiment, repn, server, keypair, default_project,
                        ta_folders, used_control_ids):
    exp_id = experiment['accession']
    # Build a list of the possible_control experiments
    possible_control_experiments = []
    for uri in experiment.get('possible_controls'):
        possible_control_experiment = common.encoded_get(server + uri, keypair)
        target_uri = possible_control_experiment.get('target')
        # For now only use controls with no target or target "Control" (i.e. not IgG)
        if not target_uri or target_uri.split('/')[2].startswith('Control'):
            possible_control_experiments.append(possible_control_experiment)
    logging.debug(pprint.pformat(possible_control_experiments))
    try:
        matching_ta = next(
            ta for ta in [get_rep_ta(e, repn, default_project, ta_folders)
                          for e in possible_control_experiments]
            if ta and ta['id'] not in used_control_ids)
    except StopIteration:
        logging.warning('Failed to find control rep with matching repn')
    else:
        return matching_ta
    try:
        any_ta = next(
            ta for ta in common.flat([get_all_tas(e, default_project, ta_folders)
                                      for e in possible_control_experiments])
            if ta and ta['id'] not in used_control_ids)
    except StopIteration:
        logging.error('Failed to find any possible control')
        return None
    else:
        return any_ta
def biorep_ns(file_accession, server, keypair):
    m = re.match(r'^/?(files)?/?(\w*)', file_accession)
    if m:
        acc = m.group(2)
    else:
        return
    url = urlparse.urljoin(server, '/files/%s' % (acc))
    file_object = common.encoded_get(url, keypair)
    if file_object.get('derived_from'):
        for f in file_object.get('derived_from'):
            for repnum in biorep_ns(f, server, keypair):
                yield repnum
    else:
        url = urlparse.urljoin(server, '%s' % (file_object.get('replicate')))
        replicate_object = common.encoded_get(url, keypair)
        yield replicate_object.get('biological_replicate_number')
def files_to_map(exp_obj, server, keypair, no_sfn_dupes):
    if not exp_obj or not (exp_obj.get('files') or exp_obj.get('original_files')):
        logging.warning('Experiment %s is missing or has no files'
                        % (exp_obj.get('accession') if exp_obj else None))
        files = []
    else:
        files = []
        for file_uri in exp_obj.get('original_files'):
            file_obj = common.encoded_get(urlparse.urljoin(server, file_uri), keypair=keypair)
            if file_obj.get('status') in FILE_STATUSES_TO_MAP and \
               file_obj.get('output_type') == 'reads' and \
               file_obj.get('file_format') in FILE_FORMATS_TO_MAP and \
               file_obj.get('replicate'):
                if file_obj.get('submitted_file_name') in filenames_in(files):
                    if no_sfn_dupes:
                        logging.error('%s:%s Duplicate submitted_file_name found, skipping that file.'
                                      % (exp_obj.get('accession'), file_obj.get('accession')))
                    else:
                        logging.warning('%s:%s Duplicate submitted_file_name found, but allowing duplicates.'
                                        % (exp_obj.get('accession'), file_obj.get('accession')))
                        files.extend([file_obj])
                else:
                    files.extend([file_obj])
            elif file_obj.get('output_type') == 'reads' and \
                 file_obj.get('file_format') in FILE_FORMATS_TO_MAP and \
                 not file_obj.get('replicate'):
                logging.error('%s: Reads file has no replicate' % (file_obj.get('accession')))
    logging.debug('returning from files_to_map with %s' % (pprint.pformat(files)))
    return files
def files_to_map(exp_obj, server, keypair, sfn_dupes=False):
    if not exp_obj or not (exp_obj.get('files') or exp_obj.get('original_files')):
        logging.warning('Experiment %s is missing or has no files'
                        % (exp_obj.get('accession') if exp_obj else None))
        return []
    else:
        files = []
        for file_uri in exp_obj.get('original_files'):
            file_obj = common.encoded_get(urlparse.urljoin(server, file_uri), keypair=keypair)
            if file_obj.get('status') in FILE_STATUSES_TO_MAP and \
               file_obj.get('output_type') == 'reads' and \
               file_obj.get('file_format') == 'fastq' and \
               file_obj.get('replicate'):
                if file_obj.get('submitted_file_name') in filenames_in(files):
                    if sfn_dupes:
                        logging.warning('%s:%s Duplicate submitted_file_name found, but allowing duplicates.'
                                        % (exp_obj.get('accession'), file_obj.get('accession')))
                        files.extend([file_obj])
                    else:
                        logging.error('%s:%s Duplicate submitted_file_name found, skipping that file.'
                                      % (exp_obj.get('accession'), file_obj.get('accession')))
                else:
                    files.extend([file_obj])
            elif file_obj.get('output_type') == 'reads' and \
                 file_obj.get('file_format') == 'fastq' and \
                 not file_obj.get('replicate'):
                logging.error('%s: Fastq has no replicate' % (file_obj.get('accession')))
        return files
def get_possible_ctl_ta(experiment, repn, server, keypair, default_project, ta_folders, used_control_ids): exp_id = experiment['accession'] #Build a list of the possible_control experiments possible_control_experiments = [] for uri in experiment.get('possible_controls'): possible_control_experiment = common.encoded_get(server + uri, keypair) target_uri = possible_control_experiment.get('target') # For now only use controls with no target or target "Control" (i.e. not IgG) if not target_uri or target_uri.split('/')[2].startswith('Control'): possible_control_experiments.append(possible_control_experiment) logging.debug(pprint.pformat(possible_control_experiments)) try: matching_ta = next(ta for ta in [ get_rep_ta(e, repn, default_project, ta_folders) for e in possible_control_experiments ] if ta and ta['id'] not in used_control_ids) except StopIteration: logging.warning('Failed to find control rep with matching repn') matching_ta = None else: return matching_ta try: any_ta = next(ta for ta in common.flat([ get_all_tas(e, default_project, ta_folders) for e in possible_control_experiments ]) if ta and ta['id'] not in used_control_ids) except StopIteration: logging.error('Failed to find any possible control') return None else: return any_ta
def get_data(url, keypair):
    '''Makes a GET request and returns the '@graph' search results.'''
    logging.debug('Getting %s' % url)
    results = common.encoded_get(url, keypair)
    return results['@graph']
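# Example usage (a sketch; the query string is hypothetical but follows the
# search URLs used elsewhere in these scripts):
#
#   released_beds = get_data(
#       server + '/search/?type=File&file_format=bed&status=released',
#       keypair)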
def biorep_ns(file_accession, server, keypair):
    m = re.match(r'^/?(files)?/?(\w*)', file_accession)
    if m:
        acc = m.group(2)
    else:
        return
    url = urlparse.urljoin(server, '/files/%s' % (acc))
    file_object = common.encoded_get(url, keypair)
    if file_object.get('derived_from'):
        for f in file_object.get('derived_from'):
            for repnum in biorep_ns(f, server, keypair):
                yield repnum
    else:
        url = urlparse.urljoin(server, '%s' % (file_object.get('replicate')))
        replicate_object = common.encoded_get(url, keypair)
        yield replicate_object.get('biological_replicate_number')
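# Because biorep_ns is a generator and a file can descend from several fastqs
# of the same replicate, callers typically de-duplicate and sort its output,
# e.g. (as in the metadata main() below):
#
#   bio_reps = sorted(set(biorep_ns(file_accession, server, keypair)))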
def s3cp(accession, key=None):
    (AUTHID, AUTHPW, SERVER) = common.processkey(key, KEYFILE)
    keypair = (AUTHID, AUTHPW)
    url = SERVER + '/search/?type=file&accession=%s&format=json&frame=embedded&limit=all' % (accession)
    # get the file object
    response = common.encoded_get(url, keypair)
    logger.debug(response)
    # select your file
    result = response.get('@graph')
    if not result:
        logger.error('Failed to find %s at %s' % (accession, url))
        return None
    else:
        f_obj = result[0]
        logger.debug(f_obj)
    # make the URL that will get redirected - get it from the file object's href property
    encode_url = urlparse.urljoin(SERVER, f_obj.get('href'))
    logger.debug("URL: %s" % (encode_url))
    logger.debug("%s:%s" % (AUTHID, AUTHPW))
    # stream=True avoids actually downloading the file, but it evaluates the redirection
    r = requests.get(encode_url,
                     auth=(AUTHID, AUTHPW),
                     headers={'content-type': 'application/json'},
                     allow_redirects=True,
                     stream=True)
    try:
        r.raise_for_status()  # was missing the call parentheses, so failures went undetected
    except:
        logger.error('%s href does not resolve' % (f_obj.get('accession')))
    logger.debug("Response: %s", (r))
    # this is the actual S3 https URL after redirection
    s3_url = r.url
    logger.debug(s3_url)
    # release the connection
    r.close()
    # split up the url into components
    o = urlparse.urlparse(s3_url)
    # pull out the filename
    filename = os.path.basename(o.path)
    # hack together the s3 cp url (with the s3 method instead of https)
    bucket_url = S3_SERVER.rstrip('/') + o.path
    # cp the file from the bucket
    subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' % (bucket_url)),
                          stderr=subprocess.STDOUT)
    subprocess.check_call(shlex.split('ls -l %s' % (filename)))
    dx_file = dxpy.upload_local_file(filename)
    return dx_file
def main():
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.rstrip()
        logger.info('%s' % (exp_id))
        url = urlparse.urljoin(server, 'metadata/type=experiment&accession=%s/metadata.tsv' % (exp_id))
        r = requests.get(url, auth=keypair)
        try:
            r.raise_for_status()
        except:
            logger.error('%s failed to get metadata.  GET returned %s' % (exp_id, r.status_code))
            logger.debug('%s' % (r.text))
            logger.error('Skipping ...')
            continue
        reader = csv.DictReader(StringIO.StringIO(r.text), delimiter='\t')
        fieldnames = copy.copy(reader.fieldnames)
        # fieldnames.remove('Biological replicate(s)')
        # fieldnames.insert(4,'Biological replicate(s)')
        # fieldnames.remove('Biosample Age')
        # fieldnames.insert(10,'Biosample Age')
        fieldnames.append('Derived from')
        writer = csv.DictWriter(args.outfile, fieldnames, delimiter='\t')
        writer.writeheader()
        for file_metadata in reader:
            file_accession = file_metadata.get('File accession')
            url = urlparse.urljoin(server, 'files/%s' % (file_accession))
            file_object = common.encoded_get(url, keypair)
            # bio_reps = sorted(list(set(biorep_ns(file_accession, server, keypair))))
            # file_metadata['Biological replicate(s)'] = ",".join([str(n) for n in bio_reps])
            # bio_ages = sorted(list(set(biorep_ages(file_accession, server, keypair)))) or ""
            # file_metadata.update({'Biosample Age': ",".join(bio_ages)})
            if file_object.get('derived_from'):
                derived_from = ",".join([str(f.split('/')[2]) for f in file_object.get('derived_from')])
            else:
                derived_from = None
            file_metadata.update({'Derived from': derived_from})
            # print file_metadata
            writer.writerow(file_metadata)
def main():
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.rstrip()
        logger.info('%s' % (exp_id))
        url = urlparse.urljoin(server, 'metadata/type=experiment&accession=%s/metadata.tsv' % (exp_id))
        r = requests.get(url, auth=keypair)
        try:
            r.raise_for_status()
        except:
            logger.error('%s failed to get metadata.  GET returned %s' % (exp_id, r.status_code))
            logger.debug('%s' % (r.text))
            logger.error('Skipping ...')
            continue
        reader = csv.DictReader(StringIO.StringIO(r.text), delimiter='\t')
        fieldnames = copy.copy(reader.fieldnames)
        fieldnames.remove('Biological replicate(s)')
        fieldnames.insert(4, 'Biological replicate(s)')
        fieldnames.remove('Biosample Age')
        fieldnames.insert(10, 'Biosample Age')
        fieldnames.append('Derived from')
        writer = csv.DictWriter(args.outfile, fieldnames, delimiter='\t')
        writer.writeheader()
        for file_metadata in reader:
            file_accession = file_metadata.get('File accession')
            url = urlparse.urljoin(server, 'files/%s' % (file_accession))
            file_object = common.encoded_get(url, keypair)
            bio_reps = sorted(list(set(biorep_ns(file_accession, server, keypair))))
            file_metadata['Biological replicate(s)'] = ",".join([str(n) for n in bio_reps])
            bio_ages = sorted(list(set(biorep_ages(file_accession, server, keypair)))) or ""
            file_metadata.update({'Biosample Age': ",".join(bio_ages)})
            if file_object.get('derived_from'):
                derived_from = ",".join([str(f.split('/')[2]) for f in file_object.get('derived_from')])
            else:
                derived_from = None
            file_metadata.update({'Derived from': derived_from})
            # print file_metadata
            writer.writerow(file_metadata)
def get_control_id(experiment):
    # url = server + '/experiments/%s/' %(exp_id)
    # experiment = encoded_get(url, keypair)
    possible_controls = experiment.get('possible_controls')
    if not possible_controls or len(possible_controls) != 1:
        logging.error("Tried to find one possible control, found %s" % (possible_controls))
        return None
    return possible_controls[0].get('accession')
def s3_dxcp(accession, key=None):
    (AUTHID, AUTHPW, SERVER) = common.processkey(key, KEYFILE)
    keypair = (AUTHID, AUTHPW)
    url = SERVER + '/search/?type=file&accession=%s&format=json&frame=embedded&limit=all' % (accession)
    # get the file object
    response = common.encoded_get(url, keypair)
    logger.debug(response)
    # select your file
    result = response.get('@graph')
    if not result:
        logger.error('Failed to find %s at %s' % (accession, url))
        return None
    else:
        f_obj = result[0]
        logger.debug(f_obj)
    # make the URL that will get redirected - get it from the file object's href property
    encode_url = urlparse.urljoin(SERVER, f_obj.get('href'))
    logger.debug("URL: %s" % (encode_url))
    logger.debug("%s:%s" % (AUTHID, AUTHPW))
    # stream=True avoids actually downloading the file, but it evaluates the redirection
    r = requests.get(encode_url,
                     auth=(AUTHID, AUTHPW),
                     headers={'content-type': 'application/json'},
                     allow_redirects=True,
                     stream=True)
    try:
        r.raise_for_status()  # was missing the call parentheses, so failures went undetected
    except:
        logger.error('%s href does not resolve' % (f_obj.get('accession')))
    logger.debug("Response: %s", (r))
    # this is the actual S3 https URL after redirection
    s3_url = r.url
    logger.debug(s3_url)
    # release the connection
    r.close()
    # split up the url into components
    o = urlparse.urlparse(s3_url)
    # pull out the filename
    filename = os.path.basename(o.path)
    # hack together the s3 cp url (with the s3 method instead of https)
    bucket_url = S3_SERVER.rstrip('/') + o.path
    # cp the file from the bucket
    subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' % (bucket_url)),
                          stderr=subprocess.STDOUT)
    subprocess.check_call(shlex.split('ls -l %s' % (filename)))
    dx_file = dxpy.upload_local_file(filename)
    return dx_file
def biorep_ages(file_accession, server, keypair):
    m = re.match(r'^/?(files)?/?(\w*)', file_accession)
    if m:
        acc = m.group(2)
    else:
        return
    url = urlparse.urljoin(server, '/files/%s' % (acc))
    file_object = common.encoded_get(url, keypair)
    if file_object.get('derived_from'):
        for f in file_object.get('derived_from'):
            for bioage in biorep_ages(f, server, keypair):
                yield bioage
    else:
        url = urlparse.urljoin(server, '%s' % (file_object.get('replicate')))
        replicate_object = common.encoded_get(url, keypair)
        url = urlparse.urljoin(server, '%s' % (replicate_object.get('library')))
        library_object = common.encoded_get(url, keypair)
        url = urlparse.urljoin(server, '%s' % (library_object.get('biosample')))
        biosample_object = common.encoded_get(url, keypair)
        yield biosample_object.get('age_display')
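# biorep_ages follows the same recursive pattern as biorep_ns, walking
# derived_from back to each replicate's biosample; callers collapse the
# per-replicate ages the same way (as in the metadata main() above):
#
#   bio_ages = sorted(set(biorep_ages(file_accession, server, keypair)))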
def get_control_id(experiment):
    # url = server + '/experiments/%s/' %(exp_id)
    # experiment = encoded_get(url, keypair)
    possible_controls = experiment.get('possible_controls')
    if not possible_controls or len(possible_controls) != 1:
        logging.error("Tried to find one possible control, found %s" % (possible_controls))
        return None
    return possible_controls[0].get('accession')
def main():
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.infile and args.experiments:
        experiments = args.experiments
        experiments.extend([e.strip() for e in args.infile if e.strip()])
    elif args.infile:
        experiments = args.infile
    else:
        experiments = args.experiments

    for exp_id in experiments:
        uri = '/experiments/%s' % (exp_id)
        experiment = common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair)
        if experiment.get('status') == 'error':
            print experiment
            print "Error fetching %s ... skipping" % (exp_id)
            continue
        print experiment.get('accession')
        for uri in experiment['original_files']:
            url = urlparse.urljoin(server, '%s' % (uri))
            file_obj = common.encoded_get(url, keypair)
            print "%s, %s, %s, %s, %s, %s" % (
                file_obj.get('accession'), file_obj.get('file_type'),
                file_obj.get('file_format'), file_obj.get('file_format_type'),
                file_obj.get('output_type'), file_obj.get('status'))
            if file_obj.get('file_format') in ['bed', 'bigBed', 'bigWig']:
                if file_obj.get('status') != 'released' or args.force:
                    patch_payload = {'status': args.status}
                    if args.dryrun:
                        print "--dryrun: would have patched %s" % (json.dumps(patch_payload))
                    else:
                        r = requests.patch(url, auth=keypair, data=json.dumps(patch_payload),
                                           headers={'content-type': 'application/json',
                                                    'accept': 'application/json'})
                        try:
                            r.raise_for_status()
                        except:
                            print(r.text)
                            print('Patch failed: %s %s ... skipping' % (r.status_code, r.reason))
                            continue
                        else:
                            print "Patched %s" % (json.dumps(patch_payload))
def replicates_to_map(files, server, keypair, map_only_reps=[]):
    if not files:
        return []
    replicate_objects = []
    for f in files:
        replicate = common.encoded_get(urlparse.urljoin(server, f.get('replicate')), keypair)
        if replicate not in replicate_objects:
            if not map_only_reps or replicate['biological_replicate_number'] in map_only_reps:
                replicate_objects.append(replicate)
    return replicate_objects
def replicates_to_map(files, server, keypair, biorep_ns=[]):
    if not files:
        return []
    replicate_objects = []
    for f in files:
        replicate = common.encoded_get(urlparse.urljoin(server, f.get('replicate')), keypair)
        if replicate not in replicate_objects:
            if not biorep_ns or replicate['biological_replicate_number'] in biorep_ns:
                replicate_objects.append(replicate)
    return replicate_objects
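# Sketch of how files_to_map and replicates_to_map chain together in a
# mapping driver (names follow the main() further below; restricting to
# bioreps 1 and 2 here is a hypothetical choice):
#
#   files = files_to_map(experiment, server, keypair, args.no_sfn_dupes)
#   replicates = replicates_to_map(files, server, keypair, [1, 2])
#   biorep_numbers = set(rep.get('biological_replicate_number') for rep in replicates)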
def accession_mapping_analysis_files(mapping_analysis, keypair, server, dryrun, force):
    experiment_accession = get_experiment_accession(mapping_analysis)
    if not experiment_accession:
        logger.info("Missing experiment accession or rep in %s, skipping." % (mapping_analysis['name']))
        return []
    m = re.match(r'^Map (ENCSR[0-9]{3}[A-Z]{3}) rep(\d+)', mapping_analysis['name'])
    if m:
        repn = int(m.group(2))
    else:
        logger.error("Missing rep in %s, skipping." % (mapping_analysis['name']))
        return []
    logger.info("%s rep %d: accessioning mapping." % (experiment_accession, repn))
    experiment = common.encoded_get(urlparse.urljoin(server, '/experiments/%s' % (experiment_accession)), keypair)
    mapping_stages = get_mapping_stages(mapping_analysis, keypair, server, repn)
    output_files = accession_outputs(mapping_stages, experiment, keypair, server, dryrun, force)
    files_with_derived = patch_outputs(mapping_stages, keypair, server, dryrun)

    mapping_analysis_step_versions = {
        'bwa-indexing-step-v-1': [
            {
                'stages': "",
                'stage_name': "",
                'file_names': [],
                'status': 'finished',
                'qc_objects': []
            }
        ],
        'bwa-alignment-step-v-1': [
            {
                'stages': mapping_stages,
                'stage_name': 'Filter and QC*',
                'file_names': ['filtered_bam'],
                'status': 'finished',
                'qc_objects': [
                    {'chipseq_filter_quality_metric': ['filtered_bam']},
                    {'samtools_flagstats_quality_metric': ['filtered_bam']}
                ]
            }
        ]
    }
    patched_files = accession_pipeline(mapping_analysis_step_versions, keypair, server, dryrun, force)
    return patched_files
def patch_file(payload, keypair, server, dryrun):
    logger.debug('in patch_file with %s' % (pprint.pformat(payload)))
    accession = payload.pop('accession')
    url = urlparse.urljoin(server, 'files/%s' % (accession))
    if dryrun:
        logger.info("Dry run.  Would PATCH: %s with %s" % (accession, pprint.pformat(payload)))
        logger.info("Dry run.  Returning unchanged file object")
        new_file_object = common.encoded_get(urlparse.urljoin(server, '/files/%s' % (accession)), keypair)
    else:
        # r = requests.patch(url, auth=keypair, headers={'content-type': 'application/json'}, data=json.dumps(payload))
        r = common.encoded_patch(url, keypair, payload, return_response=True)
        try:
            r.raise_for_status()
        except:
            logger.error('PATCH file object failed: %s %s' % (r.status_code, r.reason))
            logger.error(r.text)
            new_file_object = None
        else:
            new_file_object = r.json()['@graph'][0]
            logger.info("Patched: %s" % (new_file_object.get('accession')))
    return new_file_object
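# A minimal sketch of the payload patch_file() expects: 'accession' is popped
# off to build the URL and the remaining keys are sent as the PATCH body.
# The accession and status here are hypothetical.
#
#   patch_file({'accession': 'ENCFF000XYZ', 'status': 'released'},
#              keypair, server, dryrun=True)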
def accession_qc_object(obj_type, obj, keypair, server, dryrun, force):
    logger.debug('in accession_qc_object with obj_type %s obj.keys() %s' % (obj_type, obj.keys()))
    logger.debug('obj[step_run] %s' % (obj.get('step_run')))
    url = urlparse.urljoin(server, '/search/?type=%s&step_run=%s' % (obj_type, obj.get('step_run')))
    logger.debug('url %s' % (url))
    r = common.encoded_get(url, keypair)
    objects = [o for o in r['@graph'] if o['status'] not in DEPRECATED]
    logger.debug('found %d qc objects of type %s' % (len(objects), obj_type))
    existing_objects = [o for o in objects if o.get('step_run') == obj['step_run']]
    if existing_objects:
        existing_object = existing_objects.pop()
    else:
        existing_object = None
    # any remaining duplicates are retired
    for object_to_delete in existing_objects:
        url = urlparse.urljoin(server, object_to_delete['@id'])
        common.encoded_patch(url, keypair, {'status': 'deleted'})
    payload = json.dumps(obj)
    if existing_object:
        url = urlparse.urljoin(server, existing_object['@id'])
        logger.debug('patching %s with %s' % (url, payload))
        # r = requests.patch(url, auth=keypair, headers={'content-type': 'application/json'}, data=payload)
        r = common.encoded_patch(url, keypair, obj, return_response=True)
    else:
        url = urlparse.urljoin(server, '/%s/' % (obj_type))
        logger.debug('posting to %s with %s' % (url, payload))
        # r = requests.post(url, auth=keypair, headers={'content-type': 'application/json'}, data=payload)
        r = common.encoded_post(url, keypair, obj, return_response=True)
    try:
        r.raise_for_status()
    except:
        logger.error('PATCH or POST failed: %s %s' % (r.status_code, r.reason))
        logger.error('url was %s' % (url))
        logger.error(r.text)
        new_qc_object = None
    else:
        new_qc_object = r.json()['@graph'][0]
    return new_qc_object
def accession_analysis_step_run(analysis_step_run_metadata, keypair, server, dryrun, force):
    url = urlparse.urljoin(server, '/analysis-step-runs/')
    if dryrun:
        logger.info("Dry run.  Would POST %s" % (analysis_step_run_metadata))
        new_object = {}
    else:
        # r = requests.post(url, auth=keypair, headers={'content-type': 'application/json'}, data=json.dumps(analysis_step_run_metadata))
        r = common.encoded_post(url, keypair, analysis_step_run_metadata, return_response=True)
        try:
            r.raise_for_status()
        except:
            if r.status_code == 409:
                # conflict: the object already exists, so fetch it by alias
                url = urlparse.urljoin(server, "/%s" % (analysis_step_run_metadata['aliases'][0]))  # assumes there's only one alias
                new_object = common.encoded_get(url, keypair)
                logger.info('Using existing analysis_step_run object %s' % (new_object.get('@id')))
            else:
                logger.warning('POST analysis_step_run object failed: %s %s' % (r.status_code, r.reason))
                logger.warning(r.text)
                new_object = {}
        else:
            new_object = r.json()['@graph'][0]
            logger.info("New analysis_step_run uuid: %s" % (new_object.get('uuid')))
    return new_object
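# The 409 handling above assumes the metadata carries at least one alias that
# identifies the already-existing object.  A hypothetical minimal payload
# (the step-version path and alias are illustrative, not from this repo):
#
#   accession_analysis_step_run(
#       {'analysis_step_version': '/analysis-step-versions/bwa-alignment-step-v-1/',
#        'aliases': ['dnanexus:job-xxxx']},
#       keypair, server, dryrun=False, force=False)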
def main():
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    experiments = []
    if args.experiments:
        experiments.extend(args.experiments)
    if args.infile:
        with open(args.infile, 'r') as fh:
            experiments.extend([e for e in fh])
    if args.control:
        control_dxhandler = resolve_dx_file(args.control)
    else:
        control_dxhandler = None

    for exp_id in experiments:
        if exp_id.startswith('#'):
            continue
        exp_id = exp_id.rstrip()
        print("Experiment %s" % (exp_id))
        experiment_url = server + '/experiments/%s/' % (exp_id)
        experiment = common.encoded_get(experiment_url, keypair)
        if experiment.get('target'):
            target_url = server + experiment.get('target')
            target = common.encoded_get(target_url, keypair)
        else:
            logging.error('Experiment has no target ... skipping')
            continue
        print("%s %s %s" % (experiment['accession'],
                            target.get('investigated_as'),
                            experiment.get('description')))
        tas = get_tas(experiment, server, keypair, args.project, args.inf, control_dxhandler)
        if not tas:
            logging.error('Failed to resolve all tagaligns for %s' % (experiment['accession']))
            continue
        if not tas.get('rep2_ta'):
            simplicate_experiment = True
            print("Simplicate experiment ta's:")
        else:
            simplicate_experiment = False
            print("Replicated experiment ta's:")
        pprint(tas)
        # sys.exit()
        # continue
        missing_tas = [key for key, value in tas.iteritems() if not value]
        if missing_tas:  # skip the whole experiment, not just this key
            logging.error('Missing %s ... skipping' % (missing_tas))
            continue
        workflow_title = '%s Peaks' % (exp_id)
        if args.tag:
            workflow_title += ' %s' % (args.tag)
        outf = args.outf
        if not outf.startswith('/') and outf != '/':
            outf = '/' + outf
        if not outf.endswith('/') and outf != '/':
            outf += '/'
        outf += '%s/peaks/' % (exp_id)
        try:
            investigated_as = target['investigated_as']
        except:
            logging.error("%s: Failed to determine target type ... skipping" % (exp_id))
            continue
        else:
            print(investigated_as)
        rep1_pe = tas['rep1_ta']['paired_end']
        if not simplicate_experiment:
            rep2_pe = tas['rep2_ta']['paired_end']
        else:
            rep2_pe = None
        if simplicate_experiment and rep1_pe is None:
            logging.error("%s: Cannot determine paired end: rep1 PE = %s ... skipping"
                          % (exp_id, rep1_pe))
            continue
        elif not simplicate_experiment and None in [rep1_pe, rep2_pe]:
            logging.error("%s: Cannot determine paired end: rep1 PE = %s, rep2 PE = %s ... skipping"
                          % (exp_id, rep1_pe, rep2_pe))
            continue
        if not simplicate_experiment and rep1_pe != rep2_pe:
            logging.error("%s: rep1 PE %s differs from rep2 PE %s ... skipping"
                          % (exp_id, rep1_pe, rep2_pe))
            continue
        if any('histone' in target_type for target_type in investigated_as):
            logging.info("%s: Found to be histone.  No blacklist will be used." % (exp_id))
            wf_target = 'histone'
            blacklist = None
        else:
            logging.info("Assumed to be tf")
            wf_target = 'tf'
            if not args.blacklist:
                if args.assembly in ASSEMBLY_METADATA:
                    blacklist = ASSEMBLY_METADATA[args.assembly]['blacklist']
                else:
                    logging.warning("%s: No blacklist for assembly %s, proceeding with no blacklist"
                                    % (exp_id, args.assembly))
                    blacklist = None
            else:
                blacklist = args.blacklist  # was never set when --blacklist was given
        if not args.gsize:
            if args.assembly in ASSEMBLY_METADATA:
                genomesize = ASSEMBLY_METADATA[args.assembly]['gsize']
            else:
                logging.error("%s: Must specify -gsize for assembly %s" % (exp_id, args.assembly))
                continue  # genomesize would be unbound below
        else:
            genomesize = args.gsize
        if not args.csizes:
            if args.assembly in ASSEMBLY_METADATA:
                chrom_sizes = ASSEMBLY_METADATA[args.assembly]['csizes']
            else:
                logging.error("%s: Must specify -csizes for assembly %s" % (exp_id, args.assembly))
                continue  # chrom_sizes would be unbound below
        else:
            chrom_sizes = args.csizes
        chip_workflow_absolute_path = os.path.dirname(os.path.realpath(__file__)) + "/chip_workflow.py"
        command_strings = [
            chip_workflow_absolute_path,
            '--nomap --yes',
            '--target %s' % (wf_target),
            '--title "%s"' % (workflow_title),
            '--outf "%s"' % (outf),
            '--rep1pe %s' % (str(rep1_pe).lower()),
            '--rep1 %s' % (tas['rep1_ta'].get('file_id')),
            '--ctl1 %s' % (tas['rep1_ta'].get('control_id')),
            '--genomesize %s --chrom_sizes "%s"' % (genomesize, chrom_sizes),
            '--spp_version %s' % (args.spp_version)
        ]
        if not simplicate_experiment:
            command_strings.extend([
                '--rep2pe %s' % (str(rep2_pe).lower()),
                '--rep2 %s' % (tas['rep2_ta'].get('file_id')),
                '--ctl2 %s' % (tas['rep2_ta'].get('control_id')),
            ])
        if args.fragment_length:
            command_strings.append('--fragment_length %s' % str(args.fragment_length))
        if blacklist:
            command_strings.append('--blacklist "%s"' % (blacklist))
        if args.debug:
            command_strings.append('--debug')
        if args.use_existing_folders:
            command_strings.append('--use_existing_folders')
        if args.accession:
            command_strings.append('--accession')
        if args.fqcheck is not None:
            command_strings.append('--fqcheck=%s' % (args.fqcheck))
        if args.skip_control is not None:
            command_strings.append('--skip_control=%s' % (args.skip_control))
        if args.force_patch is not None:
            command_strings.append('--force_patch=%s' % (args.force_patch))
        run_command = ' '.join(command_strings)
        print(run_command)
        if args.dryrun:
            logging.info('Dryrun')
        else:
            try:
                subprocess.check_call(run_command, shell=True)
            except subprocess.CalledProcessError as e:
                logging.error("%s: chip_workflow exited with non-zero code %d"
                              % (exp_id, e.returncode))
            else:
                print("%s workflow created" % (experiment['accession']))
                logging.debug("%s: patching internal_status to url %s" % (exp_id, experiment_url))
                r = common.encoded_patch(experiment_url, keypair,
                                         {'internal_status': 'processing'},
                                         return_response=True)
                try:
                    r.raise_for_status()
                except:
                    logging.warning("%s: Failed to update experiment internal_status to processing.  Skipping that update."
                                    % (exp_id))
                    logging.debug(r.text)
def main():
    global args
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.experiments:
        exp_ids = csv.reader(StringIO.StringIO('\n'.join([s.rstrip() for s in args.experiments])))
    else:
        exp_ids = csv.reader(args.infile)

    for row in exp_ids:
        if row[0].startswith('#'):
            continue
        exp_id = row[0].strip()
        if len(row) > 1:
            repns = []
            for s in row[1:]:
                repns.extend(s.split(','))
            map_only_reps = list(set([int(s) for s in repns]))
        else:
            map_only_reps = []
        outstrings = []
        encode_url = urlparse.urljoin(server, exp_id)
        experiment = common.encoded_get(encode_url, keypair)
        outstrings.append(exp_id)
        files = files_to_map(experiment, server, keypair, args.no_sfn_dupes)
        outstrings.append(str(len(files)))
        outstrings.append(str([f.get('accession') for f in files]))
        replicates = replicates_to_map(files, server, keypair, map_only_reps)
        biorep_numbers = set([rep.get('biological_replicate_number') for rep in replicates])
        in_process = False
        if files:
            for biorep_n in biorep_numbers:
                outstrings.append('rep%s' % (biorep_n))
                biorep_files = [f for f in files
                                if biorep_n in common.biorep_ns(f, server, keypair)]
                paired_files = []
                unpaired_files = []
                while biorep_files:
                    file_object = biorep_files.pop()
                    if file_object.get('paired_end') is None:
                        # group all the unpaired reads for this biorep together
                        unpaired_files.append(file_object)
                    elif file_object.get('paired_end') in ['1', '2']:
                        if file_object.get('paired_with'):
                            mate = next((f for f in biorep_files
                                         if f.get('@id') == file_object.get('paired_with')), None)
                        else:  # have to find the file that is paired with this one
                            mate = next((f for f in biorep_files
                                         if f.get('paired_with') == file_object.get('@id')), None)
                        if mate:
                            biorep_files.remove(mate)
                        else:
                            logging.warning('%s:%s could not find mate'
                                            % (experiment.get('accession'),
                                               file_object.get('accession')))
                            mate = {}
                        # if mapping as SE, ignore the mate and just map the
                        # rep1 as SE with all the other SE for this rep, if any
                        if args.force_se:
                            unpaired_files.append(next(f for f in [file_object, mate]
                                                       if f.get('paired_end') == '1'))
                        else:
                            paired_files.append((file_object, mate))
                if biorep_files:
                    logging.warning('%s: leftover file(s) %s'
                                    % (experiment.get('accession'), biorep_files))
                if paired_files:
                    pe_jobs = map_only(experiment, biorep_n, paired_files, server,
                                       keypair, args.sex_specific, args.crop_length,
                                       args.accession, args.fqcheck, args.force_patch,
                                       args.use_existing_folders, args.encoded_check)
                    in_process = True
                if unpaired_files:
                    se_jobs = map_only(experiment, biorep_n, unpaired_files, server,
                                       keypair, args.sex_specific, args.crop_length,
                                       args.accession, args.fqcheck, args.force_patch,
                                       args.use_existing_folders, args.encoded_check)
                    in_process = True
                if paired_files and pe_jobs:
                    outstrings.append('paired:%s'
                                      % ([(a.get('accession'), b.get('accession'))
                                          for (a, b) in paired_files]))
                    outstrings.append('paired jobs:%s' % ([j.get_id() for j in pe_jobs]))
                else:
                    outstrings.append('paired:%s' % (None))
                if unpaired_files and se_jobs:
                    outstrings.append('unpaired:%s' % ([f.get('accession') for f in unpaired_files]))
                    outstrings.append('unpaired jobs:%s' % ([j.get_id() for j in se_jobs]))
                else:
                    outstrings.append('unpaired:%s' % (None))
            if in_process:
                r = common.encoded_patch(encode_url, keypair,
                                         {"internal_status": "processing"},
                                         return_response=True)
                try:
                    r.raise_for_status()
                except:
                    logging.error("Tried and failed to set internal_status")
                    logging.error(r.text)
            print('\t'.join(outstrings))
        else:  # no files
            if not replicates:
                logging.warning('%s: No files and no replicates' % experiment.get('accession'))
            else:
                logging.warning('%s: No files to map' % experiment.get('accession'))
        if files and not replicates:
            logging.warning('%s: Files but no replicates' % experiment.get('accession'))
def main():
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.query:
        r = requests.get(args.query, auth=keypair,
                         headers={"content-type": "application/json",
                                  "accept": "application/json"})
        experiments = r.json()["@graph"]
        exp_ids = [e["accession"] for e in experiments]
    elif args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.strip()
        logger.info("%s" % (exp_id))
        url = urlparse.urljoin(server, "/experiments/%s" % (exp_id))
        experiment_object = common.encoded_get(url, keypair)
        original_files = [common.encoded_get(urlparse.urljoin(server, "%s" % (uri)), keypair)
                          for uri in experiment_object.get("original_files")]
        bams = [f for f in original_files
                if f.get("file_format") == "bam"
                and f.get("status") not in ["revoked", "deleted", "replaced"]]
        fastqs = [f for f in original_files
                  if f.get("file_format") == "fastq"
                  and f.get("status") not in ["revoked", "deleted", "replaced"]]
        for f in fastqs:
            f["replicate"] = common.encoded_get(urlparse.urljoin(server, "%s" % (f.get("replicate"))), keypair)
        for bam in bams:
            # materialize in case biorep_ns yields a generator, so len() works
            bioreps = list(common.biorep_ns(bam.get("accession"), server, keypair))
            if len(bioreps) != 1:
                logger.error("Expected to find 1 biorep for bam %s, found %d.  Skipping."
                             % (bam.get("accession"), len(bioreps)))
                continue
            else:
                bam_biorep = bioreps[0]
            try:
                derived_from = [common.encoded_get(urlparse.urljoin(server, "%s" % (uri)), keypair)
                                for uri in bam.get("derived_from")]
            except:
                derived_from = None
            if not derived_from:
                logger.error("bam %s is derived from nothing.  Skipping" % (bam.get("accession")))
                continue
            for f in derived_from:
                if f.get("file_format") != "fastq":
                    logger.error("bam %s appears to be derived from non-fastq %s.  Continuing with other derived_from files."
                                 % (bam.get("accession"), f.get("accession")))
                    continue
                try:
                    if common.after(f.get("date_created"), bam.get("date_created")):
                        logger.error("Date conflict.  Bam %s is derived from newer Fastq %s"
                                     % (bam.get("accession"), f.get("accession")))
                except:
                    logger.error("Cannot compare bam date %s with fastq date %s.  Continuing with other derived_from files."
                                 % (bam.get("date_created"), f.get("date_created")))
                    continue
            for f in fastqs:
                if f.get("replicate").get("biological_replicate_number") == bam_biorep:
                    if common.after(f.get("date_created"), bam.get("date_created")):
                        logger.info("bam %s is out-of-date.  fastq %s is newer"
                                    % (bam.get("accession"), f.get("accession")))
                        if re.search("control", experiment_object.get("target").lower()):
                            logger.info("WARNING, %s is a control experiment so many other experiments may be out-of-date."
                                        % (experiment_object.get("accession")))
def main():
    args = get_args()
    if args.debug:
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
        logger.setLevel(logging.DEBUG)
    else:
        # Use the default logging level.
        logging.basicConfig(format='%(levelname)s:%(message)s')
        logger.setLevel(logging.INFO)
    if args.released:
        keypair = None
        server = PUBLIC_SERVER
    else:
        authid, authpw, server = common.processkey(args.key, args.keyfile)
        keypair = (authid, authpw)
    if args.experiments:
        ids = args.experiments
    elif args.all:
        # Get metadata for all ChIP-seq Experiments.
        base_exp_query = '/search/?type=Experiment&assay_title=ChIP-seq&award.project=ENCODE&status=released'
        extended_query = '&status=submitted&status=in+progress&status=started&status=release+ready'
        exp_query = base_exp_query if args.released else (base_exp_query + extended_query)
        all_experiments = common.encoded_get(server + exp_query, keypair)['@graph']
        # Extract Experiment accessions.
        ids = [exp.get('accession') for exp in all_experiments]
    elif args.infile:
        ids = args.infile
    else:
        # Never reached because infile defaults to stdin.
        raise InputError('Must supply experiment ids in arguments or --infile.')
    # Define column names for TSV.
    fieldnames = ['date', 'analysis', 'analysis_id', 'experiment', 'target',
                  'biosample_term_name', 'biosample_type', 'replication',
                  'lab', 'rfa', 'assembly', 'Nt', 'Np', 'N1', 'N2',
                  'rescue_ratio', 'self_consistency_ratio', 'reproducibility_test',
                  'Ft', 'Fp', 'F1', 'F2', 'state', 'release', 'total_price',
                  'quality_metric_of']
    if args.create_google_sheet:
        # Force creation of temporary CSV that can be loaded into a DataFrame,
        # written to Google Sheets, then deleted.
        temp_file = 'temp_idr_%s.tsv' % (args.assembly)
        args.outfile = open(temp_file, 'w')
    writer = csv.DictWriter(args.outfile, fieldnames=fieldnames, delimiter='\t', quotechar='"')
    writer.writeheader()
    # Get metadata for all IDR output Files.
    base_idr_query = ('/search/?type=File&assembly=%s&file_format=bed'
                      '&output_type=optimal+idr+thresholded+peaks'
                      '&output_type=conservative+idr+thresholded+peaks'
                      '&output_type=pseudoreplicated+idr+thresholded+peaks'
                      '&lab.title=ENCODE+Processing+Pipeline'
                      '&lab.title=J.+Michael+Cherry,+Stanford'
                      '&status=released' % (args.assembly))
    extended_idr_query = '&status=in+progress&status=uploading&status=uploaded'
    idr_query = base_idr_query if args.released else (base_idr_query + extended_idr_query)
    all_idr_files = common.encoded_get(server + idr_query, keypair)['@graph']
    na = 'not_available'
    for (i, experiment_id) in enumerate(ids):
        if experiment_id.startswith('#'):
            continue
        experiment_id = experiment_id.rstrip()
        experiment_uri = '/experiments/%s/' % (experiment_id)
        idr_files = [f for f in all_idr_files if f['dataset'] == experiment_uri]
        idr_step_runs = set([f.get('step_run') for f in idr_files])
        if not len(idr_step_runs):
            if not args.all:
                logger.warning("%s: Found %d IDR step runs.  Skipping"
                               % (experiment_id, len(idr_step_runs)))
            continue
        idr_qc_uris = []
        assemblies = []
        for f in idr_files:
            quality_metrics = f.get('quality_metrics')
            if not len(quality_metrics) == 1:
                logger.error('%s: Expected one IDR quality metric for file %s.  Found %d.'
                             % (experiment_id, f.get('accession'), len(quality_metrics)))
            idr_qc_uris.extend(quality_metrics)
            assembly = f.get('assembly')
            if not assembly:
                logger.error('%s: File %s has no assembly'
                             % (experiment_id, f.get('accession')))
            assemblies.append(assembly)
        idr_qc_uris = set(idr_qc_uris)
        if not len(idr_qc_uris) == 1:
            logger.error('%s: Expected one unique IDR metric, found %d.  Skipping.'
                         % (experiment_id, len(idr_qc_uris)))
            continue
        assemblies = set(assemblies)
        if not len(assemblies) == 1:
            logger.error('%s: Expected one unique assembly, found %d.  Skipping.'
                         % (experiment_id, len(assemblies)))
            continue
        # Grab unique value from set.
        idr_qc_uri = next(iter(idr_qc_uris))
        assembly = next(iter(assemblies))
        # Get analysis_id from DNAnexus, create analysis_link.
        idr_step_run_uri = next(iter(idr_step_runs))
        try:
            idr_step_run = common.encoded_get(server + idr_step_run_uri, keypair)
        except Exception as e:
            print(experiment_id, e, 'Skipping.')
            continue
        try:
            dx_job_id_str = idr_step_run.get('dx_applet_details')[0].get('dx_job_id')
        except:
            logger.warning("Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id")
            logger.debug(idr_step_run)
            # Could try to pull it from alias.
            dx_job_id_str = None
        if dx_job_id_str is None:
            # Without a job id the DNAnexus lookups below would fail.
            continue
        dx_job_id = dx_job_id_str.rpartition(':')[2]
        if not args.released:
            dx_job = dxpy.DXJob(dx_job_id)
            job_desc = dx_job.describe()
            analysis_id = job_desc.get('analysis')
            logger.debug('%s' % (analysis_id))
            analysis = dxpy.DXAnalysis(analysis_id)
            desc = analysis.describe()
            project = desc.get('project')
            analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
                desc.get('project').split('-')[1], desc.get('id').split('-')[1])
        else:
            analysis_link = na
            desc = {}
        # Get IDR object.
        idr = common.encoded_get(server + idr_qc_uri, keypair)
        # Pull metrics of interest.
        idr_status = idr.get('status', na)
        if (args.released and (idr_status == na or idr_status != 'released')):
            logger.error('%s: Expected released IDR metric.  Skipping.' % idr_qc_uri)
            continue
        Np = idr.get('Np', na)
        N1 = idr.get('N1', na)
        N2 = idr.get('N2', na)
        Nt = idr.get('Nt', na)
        Fp = idr.get('Fp', na)
        F1 = idr.get('F1', na)
        F2 = idr.get('F2', na)
        Ft = idr.get('Ft', na)
        quality_metric_of = idr.get('quality_metric_of', [])
        date = idr.get('date_created', na)
        rescue_ratio = idr.get('rescue_ratio', na)
        self_consistency_ratio = idr.get('self_consistency_ratio', na)
        reproducibility_test = idr.get('reproducibility_test', na)
        # Get Experiment object.
        experiment = common.encoded_get(server + experiment_id, keypair)
        experiment_link = '%sexperiments/%s' % (server, experiment.get('accession'))
        # Get Award object.
        award = common.encoded_get(server + experiment.get('award'), keypair)
        # Grab project phase, e.g. ENCODE4.
        rfa = award.get('rfa', na)
        row = {
            'date': date,
            'analysis': analysis_link,
            'analysis_id': desc.get('id', na),
            'experiment': experiment_link,
            'target': experiment['target'].split('/')[2],
            'biosample_term_name': experiment.get('biosample_term_name'),
            'biosample_type': experiment.get('biosample_type'),
            'replication': experiment.get('replication_type'),
            'lab': experiment['lab'].split('/')[2],
            'rfa': rfa,
            'assembly': assembly,
            'Nt': Nt,
            'Np': Np,
            'N1': N1,
            'N2': N2,
            'rescue_ratio': rescue_ratio,
            'self_consistency_ratio': self_consistency_ratio,
            'reproducibility_test': reproducibility_test,
            'Ft': Ft,
            'Fp': Fp,
            'F1': F1,
            'F2': F2,
            'state': desc.get('state', na),
            'release': experiment['status'],
            'total_price': desc.get('totalPrice', na),
            'quality_metric_of': ', '.join(quality_metric_of)
        }
        writer.writerow(row)
    if args.create_google_sheet:
        args.outfile.close()
        # Load CSV data, sort.
        idr_data = pd.read_table(temp_file)
        idr_data = idr_data.replace('not_available', '')
        idr_data.date = idr_data.date.apply(lambda x: pd.to_datetime(x))
        idr_data = idr_data.sort_values(by=['lab', 'biosample_term_name', 'target', 'experiment'],
                                        ascending=[True, True, True, True])
        idr_data.date = idr_data.date.astype('str')
        idr_data = idr_data.reset_index(drop=True)
        # Read sheet title and create unique page title.
        date = datetime.now().strftime('%m_%d_%Y')
        sheet_title = (args.sheet_title if not args.released
                       else '{} Released'.format(args.sheet_title))
        page_title = '%s_IDR_FRIP_%s' % (args.assembly, date)
        # Open/create Google Sheet.
        gc = pygsheets.authorize(args.apikey)
        try:
            sh = gc.open(sheet_title)
        except pygsheets.exceptions.SpreadsheetNotFound:
            sh = gc.create(sheet_title)
        try:
            wks = sh.add_worksheet(page_title)
        except HttpError:
            wks = sh.worksheet_by_title(page_title)
        # Clear worksheet.
        wks.clear()
        # Add data from DataFrame.
        wks.set_dataframe(idr_data, copy_head=True, fit=True, start='A1')
        # Apply formatting and conditions.
        header['repeatCell']['range']['sheetId'] = wks.id
        wks.client.sh_batch_update(wks.spreadsheet.id, header)
        # Format numbers.
        for col in number_format_columns:
            num = idr_data.columns.get_loc(col)
            number_format['repeatCell']['range']['startColumnIndex'] = num
            number_format['repeatCell']['range']['endColumnIndex'] = num + 1
            number_format['repeatCell']['range']['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, number_format)
        # Resize font.
        font_size_format['repeatCell']['range']['sheetId'] = wks.id
        wks.client.sh_batch_update(wks.spreadsheet.id, font_size_format)
        # Add conditional formatting.
        for conditional in conditions:
            num = idr_data.columns.get_loc("reproducibility_test")
            conditional['addConditionalFormatRule']['rule']['ranges'][0]['startColumnIndex'] = num
            conditional['addConditionalFormatRule']['rule']['ranges'][0]['endColumnIndex'] = num + 1
            conditional['addConditionalFormatRule']['rule']['ranges'][0]['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, conditional)
        # Add notes.
        for k, v in notes_dict.items():
            num = idr_data.columns.get_loc(k)
            note['repeatCell']['range']['startColumnIndex'] = num
            note['repeatCell']['range']['endColumnIndex'] = num + 1
            note['repeatCell']['cell']['note'] = v
            note['repeatCell']['range']['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, note)
        # Optional.  Smaller column width to match original.
        for i in range(wks.cols):
            wks.adjust_column_width(i, pixel_size=38)
        # Resize tiny columns.
        tiny_columns = ['experiment', 'analysis']
        for i in [idr_data.columns.get_loc(x) for x in tiny_columns]:
            wks.adjust_column_width(i, pixel_size=25)
        # Resize medium columns.
        medium_columns = ['replication', 'assembly', 'rfa']
        for i in [idr_data.columns.get_loc(x) for x in medium_columns]:
            wks.adjust_column_width(i, pixel_size=65)
        # Resize wide columns.
        wide_columns = ['target', 'reproducibility_test', 'lab']
        for i in [idr_data.columns.get_loc(x) for x in wide_columns]:
            wks.adjust_column_width(i, pixel_size=85)
        # Remove temp file.
        os.remove(temp_file)
def main():
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.query:
        r = requests.get(args.query, auth=keypair,
                         headers={'content-type': 'application/json',
                                  'accept': 'application/json'})
        experiments = r.json()['@graph']
        exp_ids = [e['accession'] for e in experiments]
    elif args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    logger.info('Checking %d experiments' % (len(exp_ids)))
    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.strip()
        # logger.info('%s' % (exp_id))
        url = urlparse.urljoin(server, '/experiments/%s' % (exp_id))
        experiment_object = common.encoded_get(url, keypair)
        original_files = [common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair)
                          for uri in experiment_object.get('original_files')]
        bams = [f for f in original_files
                if f.get('file_format') == 'bam'
                and f.get('status') not in ['revoked', 'deleted', 'replaced']]
        fastqs = [f for f in original_files
                  if f.get('file_format') == 'fastq'
                  and f.get('status') not in ['revoked', 'deleted', 'replaced']]
        for f in fastqs:
            f['replicate'] = common.encoded_get(urlparse.urljoin(server, '%s' % (f.get('replicate'))), keypair)
        for bam in bams:
            # materialize in case biorep_ns yields a generator, so len() works
            bioreps = list(common.biorep_ns(bam.get('accession'), server, keypair))
            if len(bioreps) != 1:
                logger.error("Expected to find 1 biorep for bam %s, found %s.  Skipping."
                             % (bam.get('accession'), bioreps))
                continue
            else:
                bam_biorep = bioreps[0]
            try:
                derived_from = [common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair)
                                for uri in bam.get('derived_from')]
            except:
                derived_from = None
            if not derived_from:
                logger.error('bam %s is derived from nothing.  Skipping' % (bam.get('accession')))
                continue
            for f in derived_from:
                if f.get('output_category') == 'reference':
                    continue
                if f.get('file_format') != 'fastq':
                    logger.error("bam %s appears to be derived from non-fastq %s.  Continuing with other derived_from files."
                                 % (bam.get('accession'), f.get('accession')))
                    continue
                try:
                    if common.after(f.get('date_created'), bam.get('date_created')):
                        logger.error("Date conflict.  Bam %s is derived from newer Fastq %s"
                                     % (bam.get('accession'), f.get('accession')))
                except:
                    logger.error("Cannot compare bam date %s with fastq date %s.  Continuing with other derived_from files."
                                 % (bam.get('date_created'), f.get('date_created')))
                    continue
            for f in fastqs:
                if f.get('replicate').get('biological_replicate_number') == bam_biorep:
                    if common.after(f.get('date_created'), bam.get('date_created')):
                        logger.info("bam %s is out-of-date.  fastq %s is newer"
                                    % (bam.get('accession'), f.get('accession')))
                        if re.search('control', experiment_object.get('target').lower()):
                            logger.info("WARNING, %s is a control experiment so many other experiments may be out-of-date."
                                        % (experiment_object.get('accession')))
def main(): args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) experiments = [] if args.experiments: experiments.extend(args.experiments) if args.infile: with open(args.infile, 'r') as fh: experiments.extend([e for e in fh]) if args.control: control_dxhandler = resolve_dx_file(args.control) else: control_dxhandler = None for exp_id in experiments: if exp_id.startswith('#'): continue exp_id = exp_id.rstrip() print("Experiment %s" % (exp_id)) experiment_url = server + '/experiments/%s/' % (exp_id) experiment = common.encoded_get(experiment_url, keypair) if experiment.get('target'): target_url = server + experiment.get('target') target = common.encoded_get(target_url, keypair) else: logging.error('Experiment has no target ... skipping') continue print( "%s %s %s" % (experiment['accession'], target.get('investigated_as'), experiment.get('description'))) tas = get_tas(experiment, server, keypair, args.project, args.inf, control_dxhandler) if not tas: logging.error( 'Failed to resolve all tagaligns for %s' % (experiment['accession'])) continue if not tas.get('rep2_ta'): simplicate_experiment = True print("Simplicate experiment ta's:") else: simplicate_experiment = False print("Replicated experiment ta's:") pprint(tas) # sys.exit() # continue skip_flag = False for key, value in tas.iteritems(): if not value: logging.error('Missing %s ... skipping' % (key)) skip_flag = True if skip_flag: continue workflow_title = '%s Peaks' % (exp_id) if args.tag: workflow_title += ' %s' % (args.tag) outf = args.outf if not outf.startswith('/') and outf != '/': outf = '/'+outf if not outf.endswith('/') and outf != '/': outf += '/' outf += '%s/peaks/' % (exp_id) try: investigated_as = target['investigated_as'] except: logging.error( "%s: Failed to determine target type ... skipping" % (exp_id)) continue else: print(investigated_as) rep1_pe = tas['rep1_ta']['paired_end'] if not simplicate_experiment: rep2_pe = tas['rep2_ta']['paired_end'] else: rep2_pe = None if simplicate_experiment and rep1_pe is None: logging.error( "%s: Cannot determine paired end: rep1 PE = %s... skipping" % (exp_id, rep1_pe)) continue elif not simplicate_experiment and None in [rep1_pe, rep2_pe]: logging.error( "%s: Cannot determine paired end: rep1 PE = %s, rep2 PE = %s ... skipping" % (exp_id, rep1_pe, rep2_pe)) continue if not simplicate_experiment and rep1_pe != rep2_pe: logging.error( "%s: rep1 PE %s differs from rep2 PE %s ... skipping" % (exp_id, rep1_pe, rep2_pe)) continue if any('histone' in target_type for target_type in investigated_as): logging.info( "%s: Found to be histone. No blacklist will be used." 
% (exp_id)) wf_target = 'histone' blacklist = None else: logging.info("Assumed to be tf") wf_target = 'tf' if not args.blacklist: if args.assembly in ASSEMBLY_METADATA: blacklist = ASSEMBLY_METADATA[args.assembly]['blacklist'] else: logging.warning( "%s: No blacklist for assembly %s, proceeding with no blacklist" % (exp_id, args.assembly)) blacklist = None else: blacklist = args.blacklist if not args.gsize: if args.assembly in ASSEMBLY_METADATA: genomesize = ASSEMBLY_METADATA[args.assembly]['gsize'] else: logging.error( "%s: Must specify -gsize for assembly %s" % (exp_id, args.assembly)) continue else: genomesize = args.gsize if not args.csizes: if args.assembly in ASSEMBLY_METADATA: chrom_sizes = ASSEMBLY_METADATA[args.assembly]['csizes'] else: logging.error( "%s: Must specify -csizes for assembly %s" % (exp_id, args.assembly)) continue else: chrom_sizes = args.csizes chip_workflow_absolute_path = os.path.dirname(os.path.realpath(__file__)) + "/chip_workflow.py" command_strings = [ chip_workflow_absolute_path, '--nomap --yes', '--target %s' % (wf_target), '--title "%s"' % (workflow_title), '--outf "%s"' % (outf), '--rep1pe %s' % (str(rep1_pe).lower()), '--rep1 %s' % (tas['rep1_ta'].get('file_id')), '--ctl1 %s' % (tas['rep1_ta'].get('control_id')), '--genomesize %s --chrom_sizes "%s"' % (genomesize, chrom_sizes), '--spp_version %s' % (args.spp_version) ] if not simplicate_experiment: command_strings.extend([ '--rep2pe %s' % (str(rep2_pe).lower()), '--rep2 %s' % (tas['rep2_ta'].get('file_id')), '--ctl2 %s' % (tas['rep2_ta'].get('control_id')), ]) if args.spp_instance: command_strings.append('--spp_instance %s' % str(args.spp_instance)) if args.fragment_length: command_strings.append('--fragment_length %s' % str(args.fragment_length)) if blacklist: command_strings.append('--blacklist "%s"' % (blacklist)) if args.debug: command_strings.append('--debug') if args.use_existing_folders: command_strings.append('--use_existing_folders') if args.accession: command_strings.append('--accession') if args.fqcheck is not None: command_strings.append('--fqcheck=%s' % (args.fqcheck)) if args.skip_control is not None: command_strings.append('--skip_control=%s' % (args.skip_control)) if args.force_patch is not None: command_strings.append('--force_patch=%s' % (args.force_patch)) run_command = ' '.join(command_strings) print(run_command) if args.dryrun: logging.info('Dryrun') else: try: subprocess.check_call(run_command, shell=True) except subprocess.CalledProcessError as e: logging.error( "%s: chip_workflow exited with non-zero code %d" % (exp_id, e.returncode)) else: print("%s workflow created" % (experiment['accession'])) logging.debug( "%s: patching internal_status to url %s" % (exp_id, experiment_url)) r = common.encoded_patch( experiment_url, keypair, {'internal_status': 'processing'}, return_response=True) try: r.raise_for_status() except: logging.warning( "%s: Failed to update experiment internal_status to processing. Skipping that update." % (exp_id)) logging.debug(r.text)
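# ASSEMBLY_METADATA is defined at module scope; the code above expects one
# entry per supported assembly carrying 'blacklist', 'gsize', and 'csizes'.
# A hypothetical sketch (the blacklist path appears elsewhere in this repo;
# the gsize value and csizes path here are illustrative):
ASSEMBLY_METADATA = {
    'hg19': {
        'blacklist': 'ENCODE Reference Files:/hg19/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed.gz',
        'gsize': 'hs',  # MACS-style genome size shorthand
        'csizes': 'ENCODE Reference Files:/hg19/male.hg19.chrom.sizes',
    },
}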
def main(): args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) experiments = [] if args.experiments: experiments.extend(args.experiments) if args.infile: with open(args.infile, 'r') as fh: experiments.extend([e for e in fh]) for exp_id in experiments: if exp_id.startswith('#'): continue exp_id = exp_id.rstrip() print "Experiment %s" % (exp_id) experiment_url = server + '/experiments/%s/' % (exp_id) experiment = common.encoded_get(experiment_url, keypair) if experiment.get('target'): target_url = server + experiment.get('target') target = common.encoded_get(target_url, keypair) else: logging.error('Experiment has no target ... skipping') continue print "%s %s %s" % (experiment['accession'], target.get('investigated_as'), experiment.get('description')) # ctl_id = get_control_id(experiment) # if ctl_id: # print "Control %s" %(ctl_id) # else: # print "Found no control ... skipping %s" %(exp_id) # continue # (rep1_ta,rep1_pe), (rep2_ta,rep2_pe) = get_exp_tas(experiment, server, keypair, args.project, args.inf) # (ctl1_ta,ctl1_pe), (ctl2_ta,ctl2_pe) = get_ctl_tas(experiment, server, keypair, args.project, args.inf) tas = get_tas(experiment, server, keypair, args.project, args.inf) if not tas: logging.error('Failed to resolve all tagaligns for %s' % (experiment['accession'])) continue pprint.pprint(tas) # sys.exit() #continue skip_flag = False for key, value in tas.iteritems(): if not value: logging.error('Missing %s ... skipping' % (key)) skip_flag = True if skip_flag: continue workflow_title = '%s Peaks' % (exp_id) if args.tag: workflow_title += ' %s' % (args.tag) outf = args.outf if not outf.startswith('/') and outf != '/': outf = '/' + outf if not outf.endswith('/') and outf != '/': outf += '/' outf += '%s/peaks/' % (exp_id) try: investigated_as = target['investigated_as'] except: print "%s: Failed to determine target type ... skipping" % (exp_id) continue else: print investigated_as rep1_pe = tas['rep1_ta']['paired_end'] rep2_pe = tas['rep2_ta']['paired_end'] if None in [rep1_pe, rep2_pe]: print "%s: Cannot determine paired end: rep1 PE = %s, rep2 PE = %s ... skipping" % ( exp_id, rep1_pe, rep2_pe) continue if rep1_pe != rep2_pe: print "%s: rep1 PE %s differs from rep2 PE %s ... skipping" % ( exp_id, rep1_pe, rep2_pe) continue if any('histone' in target_type for target_type in investigated_as): print "Found to be histone. No blacklist will be used." 
IDR_default = False workflow_spinner = '~/chip-seq-pipeline/dnanexus/histone_workflow.py' blacklist = None else: print "Assumed to be tf" IDR_default = True workflow_spinner = '~/chip-seq-pipeline/dnanexus/tf_workflow.py' if args.assembly == "hg19": blacklist = "ENCODE Reference Files:/hg19/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed.gz" else: print "WARNING: No blacklist known for assembly %s, proceeding with no blacklist" % ( args.assembly) blacklist = None run_command = \ '%s --title "%s" --outf "%s" --nomap --yes ' % (workflow_spinner, workflow_title, outf) + \ '--rep1pe %s --rep2pe %s ' % (str(rep1_pe).lower(), str(rep2_pe).lower()) + \ '--rep1 %s --rep2 %s ' % (tas['rep1_ta'].get('file_id'), tas['rep2_ta'].get('file_id')) + \ '--ctl1 %s --ctl2 %s ' % (tas['rep1_ta'].get('control_id'), tas['rep2_ta'].get('control_id')) + \ '--genomesize %s --chrom_sizes "%s"' %(args.gsize, args.csizes) if blacklist: run_command += ' --blacklist "%s"' % (blacklist) if args.debug: run_command += ' --debug' if args.idr or IDR_default: run_command += ' --idr --idrversion %s' % (args.idrversion) print run_command if args.dryrun: logging.info('Dryrun') else: try: subprocess.check_call(run_command, shell=True) except subprocess.CalledProcessError as e: logging.error("%s exited with non-zero code %d" % (workflow_spinner, e.returncode)) else: print "%s workflow created" % (experiment['accession']) logging.debug("patching internal_status to url %s" % (experiment_url)) r = common.encoded_patch(experiment_url, keypair, {'internal_status': 'processing'}, return_response=True) try: r.raise_for_status() except: logging.error( "Tried but failed to update experiment internal_status to processing" ) logging.error(r.text)
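# For reference, the run_command assembled above for a hypothetical TF
# experiment looks like this (the experiment accession and DNAnexus file IDs
# are made up):
#
#   ~/chip-seq-pipeline/dnanexus/tf_workflow.py --title "ENCSR000AAA Peaks" \
#     --outf "/ENCSR000AAA/peaks/" --nomap --yes --rep1pe false --rep2pe false \
#     --rep1 file-xxxx --rep2 file-yyyy --ctl1 file-pppp --ctl2 file-qqqq \
#     --genomesize hs --chrom_sizes "hg19.chrom.sizes" \
#     --blacklist "ENCODE Reference Files:/hg19/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed.gz" \
#     --idr --idrversion 2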
def main(): args = get_args() if args.debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) if args.analysis_ids: ids = args.analysis_ids elif args.created_after: analyses = [] for state in args.state: analyses.extend( dxpy.find_analyses(name="ENCSR*", name_mode='glob', state=state, include_subjobs=True, return_handler=True, created_after="%s" % (args.created_after))) ids = [ analysis.get_id() for analysis in analyses if analysis.describe()['executableName'] == 'tf_chip_seq' or analysis.describe()['executableName'].startswith( 'ENCSR783QUL Peaks') ] elif args.infile: ids = args.infile else: #never reached because infile defaults to stdin raise InputError( "Must supply analysis id's in arguments, --infile or supply search string in --created_after" ) fieldnames = [ 'date', 'analysis', 'experiment', 'target', 'biosample_term_name', 'biosample_type', 'lab', 'rfa', 'assembly', 'Nt', 'Np', 'N1', 'N2', 'rescue_ratio', 'self_consistency_ratio', 'reproducibility_test', 'state', 'total price', 'notes' ] writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, delimiter='\t', quotechar='"') writer.writeheader() for (i, analysis_id) in enumerate(ids): if analysis_id.startswith('#'): continue analysis_id = analysis_id.rstrip() logger.debug('%s' % (analysis_id)) analysis = dxpy.DXAnalysis(analysis_id) desc = analysis.describe() project = desc.get('project') m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name']) if m: experiment_accession = m.group(1) else: logger.error("No accession in %s, skipping." % (desc['name'])) continue experiment = common.encoded_get( urlparse.urljoin(server, '/experiments/%s' % (experiment_accession)), keypair) logger.debug('ENCODEd experiment %s' % (experiment['accession'])) if args.lab and experiment['lab'].split('/')[2] not in args.lab: continue try: idr_stage = next( s['execution'] for s in desc['stages'] if s['execution']['name'] == "Final IDR peak calls") except: logging.error('Failed to find final IDR stage in %s' % (analysis_id)) else: if idr_stage[ 'state'] != 'done': #Final IDR peak calls stage not done, so loop through intermediate IDR stages to find errors Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None notes = [] #note this list contains a mis-spelled form of IDR Pooled Pseudoreplicates because until 11/13/15 the pipeline stage name was misspelled - need to be able to report on those runs idr_stage_names = [ 'IDR True Replicates', 'IDR Rep 1 Self-pseudoreplicates', 'IDR Rep 2 Self-pseudoreplicates', 'IDR Pooled Pseudoreplicates', 'IDR Pooled Pseudoeplicates' ] for stage_name in idr_stage_names: try: idr_stage = next( s['execution'] for s in desc['stages'] if s['execution']['name'] == stage_name) except StopIteration: continue except: raise if idr_stage['state'] == 'failed': try: job_log = subprocess.check_output( 'dx watch %s' % (idr_stage['id']), shell=True, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: job_log = e.output else: job_log = None if job_log: patterns = [ r'Peak files must contain at least 20 peaks post-merge' ] for p in patterns: m = re.search(p, job_log) if m: notes.append("%s: %s" % (stage_name, m.group(0))) if not notes: notes.append(idr_stage['failureMessage']) try: done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "failed") except StopIteration: done_time = "Not done or failed" except: raise else: Np = 
idr_stage['output'].get('Np') N1 = idr_stage['output'].get('N1') N2 = idr_stage['output'].get('N2') Nt = idr_stage['output'].get('Nt') rescue_ratio = idr_stage['output'].get('rescue_ratio') self_consistency_ratio = idr_stage['output'].get( 'self_consistency_ratio') reproducibility_test = idr_stage['output'].get( 'reproducibility_test') notes = "IDR Complete" try: done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "done") except StopIteration: done_time = None if done_time and not isinstance(done_time, str): date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(done_time / 1000)) elif done_time: date = done_time else: date = "Running" analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % ( desc.get('project').split('-')[1], desc.get('id').split('-')[1]) experiment_link = 'https://www.encodeproject.org/experiments/%s' % ( experiment.get('accession')) row = { 'date': date, 'analysis': analysis_link, 'experiment': experiment_link, 'target': experiment['target'].split('/')[2], 'biosample_term_name': experiment.get('biosample_term_name'), 'biosample_type': experiment.get('biosample_type'), 'lab': experiment['lab'].split('/')[2], 'rfa': common.encoded_get(server + experiment.get('award'), keypair).get('rfa'), 'assembly': args.assembly, #TODO ... derive this from the analysis 'Np': Np, 'N1': N1, 'N2': N2, 'Nt': Nt, 'rescue_ratio': rescue_ratio, 'self_consistency_ratio': self_consistency_ratio, 'reproducibility_test': reproducibility_test, 'state': desc.get('state'), 'total price': desc.get('totalPrice') } if notes: row.update({'notes': '%s' % (notes)}) else: row.update({'notes': '%s' % ('OK')}) #log = subprocess.check_output('dx watch %s' %(analysis.)) writer.writerow(row)
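# Np, Nt, N1, and N2 are read from the pipeline's IDR stage outputs; the
# ratios and the reproducibility call are computed upstream. For reference,
# the conventional ENCODE IDR definitions (a sketch, not the pipeline code):
def idr_ratios(Np, Nt, N1, N2):
    # Np: pooled-pseudoreplicate peaks; Nt: true-replicate peaks
    rescue_ratio = float(max(Np, Nt)) / min(Np, Nt)
    # N1, N2: each replicate's self-pseudoreplicate peaks
    self_consistency_ratio = float(max(N1, N2)) / min(N1, N2)
    # Both ratios under 2 conventionally passes; both at or over 2 fails
    if rescue_ratio < 2 and self_consistency_ratio < 2:
        reproducibility_test = 'pass'
    elif rescue_ratio >= 2 and self_consistency_ratio >= 2:
        reproducibility_test = 'fail'
    else:
        reproducibility_test = 'borderline'
    return rescue_ratio, self_consistency_ratio, reproducibility_test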
def analysis_files(analysis_id, keypair, server, assembly): analysis_id = analysis_id.strip() analysis = dxpy.describe(analysis_id) project = analysis.get('project') m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', analysis['executableName']) if m: experiment_accession = m.group(1) else: logger.info("No accession in %s, skipping." % (analysis['executableName'])) return experiment = common.encoded_get( urlparse.urljoin(server, '/experiments/%s' % (experiment_accession)), keypair) bams = get_rep_bams(experiment, keypair, server) rep1_bam = bams[0]['accession'] rep2_bam = bams[1]['accession'] common_metadata = { 'assembly': assembly, 'lab': 'encode-processing-pipeline', 'award': 'U41HG006992', } narrowpeak_metadata = common.merge_dicts( { 'file_format': 'bed_narrowPeak', 'file_format_specifications': ['ENCODE:narrowPeak.as'], 'output_type': 'peaks' }, common_metadata) replicated_narrowpeak_metadata = common.merge_dicts( { 'file_format': 'bed_narrowPeak', 'file_format_specifications': ['ENCODE:narrowPeak.as'], 'output_type': 'replicated peaks' }, common_metadata) gappedpeak_metadata = common.merge_dicts( { 'file_format': 'bed_gappedPeak', 'file_format_specifications': ['ENCODE:gappedPeak.as'], 'output_type': 'peaks' }, common_metadata) replicated_gappedpeak_metadata = common.merge_dicts( { 'file_format': 'bed_gappedPeak', 'file_format_specifications': ['ENCODE:gappedPeak.as'], 'output_type': 'replicated peaks' }, common_metadata) narrowpeak_bb_metadata = common.merge_dicts( { 'file_format': 'narrowPeak', 'file_format_specifications': ['ENCODE:narrowPeak.as'], 'output_type': 'peaks' }, common_metadata) replicated_narrowpeak_bb_metadata = common.merge_dicts( { 'file_format': 'narrowPeak', 'file_format_specifications': ['ENCODE:narrowPeak.as'], 'output_type': 'replicated peaks' }, common_metadata) gappedpeak_bb_metadata = common.merge_dicts( { 'file_format': 'gappedPeak', 'file_format_specifications': ['ENCODE:gappedPeak.as'], 'output_type': 'peaks' }, common_metadata) replicated_gappedpeak_bb_metadata = common.merge_dicts( { 'file_format': 'gappedPeak', 'file_format_specifications': ['ENCODE:gappedPeak.as'], 'output_type': 'replicated peaks' }, common_metadata) fc_signal_metadata = common.merge_dicts( { 'file_format': 'bigWig', 'output_type': 'fold change over control' }, common_metadata) pvalue_signal_metadata = common.merge_dicts( { 'file_format': 'bigWig', 'output_type': 'signal p-value' }, common_metadata) stage_outputs = { "ENCODE Peaks": { 'files': [ common.merge_dicts( { 'name': 'rep1_narrowpeaks', 'derived_from': [rep1_bam] }, narrowpeak_metadata), common.merge_dicts( { 'name': 'rep2_narrowpeaks', 'derived_from': [rep2_bam] }, narrowpeak_metadata), common.merge_dicts( { 'name': 'pooled_narrowpeaks', 'derived_from': [rep1_bam, rep2_bam] }, narrowpeak_metadata), common.merge_dicts( { 'name': 'rep1_narrowpeaks_bb', 'derived_from': [rep1_bam] }, narrowpeak_bb_metadata), common.merge_dicts( { 'name': 'rep2_narrowpeaks_bb', 'derived_from': [rep2_bam] }, narrowpeak_bb_metadata), common.merge_dicts( { 'name': 'pooled_narrowpeaks_bb', 'derived_from': [rep1_bam, rep2_bam] }, narrowpeak_bb_metadata), common.merge_dicts( { 'name': 'rep1_gappedpeaks', 'derived_from': [rep1_bam] }, gappedpeak_metadata), common.merge_dicts( { 'name': 'rep2_gappedpeaks', 'derived_from': [rep2_bam] }, gappedpeak_metadata), common.merge_dicts( { 'name': 'pooled_gappedpeaks', 'derived_from': [rep1_bam, rep2_bam] }, gappedpeak_metadata), common.merge_dicts( { 'name': 'rep1_gappedpeaks_bb', 'derived_from': [rep1_bam] }, 
gappedpeak_bb_metadata), common.merge_dicts( { 'name': 'rep2_gappedpeaks_bb', 'derived_from': [rep2_bam] }, gappedpeak_bb_metadata), common.merge_dicts( { 'name': 'pooled_gappedpeaks_bb', 'derived_from': [rep1_bam, rep2_bam] }, gappedpeak_bb_metadata), common.merge_dicts( { 'name': 'rep1_pvalue_signal', 'derived_from': [rep1_bam] }, pvalue_signal_metadata), common.merge_dicts( { 'name': 'rep2_pvalue_signal', 'derived_from': [rep2_bam] }, pvalue_signal_metadata), common.merge_dicts( { 'name': 'pooled_pvalue_signal', 'derived_from': [rep1_bam, rep2_bam] }, pvalue_signal_metadata), common.merge_dicts( { 'name': 'rep1_fc_signal', 'derived_from': [rep1_bam] }, fc_signal_metadata), common.merge_dicts( { 'name': 'rep2_fc_signal', 'derived_from': [rep2_bam] }, fc_signal_metadata), common.merge_dicts( { 'name': 'pooled_fc_signal', 'derived_from': [rep1_bam, rep2_bam] }, fc_signal_metadata) ], 'qc': [] }, "Overlap narrowpeaks": { 'files': [ common.merge_dicts( { 'name': 'overlapping_peaks', 'derived_from': [rep1_bam, rep2_bam] }, replicated_narrowpeak_metadata), common.merge_dicts( { 'name': 'overlapping_peaks_bb', 'derived_from': [rep1_bam, rep2_bam] }, replicated_narrowpeak_bb_metadata) ], 'qc': ['npeaks_in', 'npeaks_out', 'npeaks_rejected'] }, "Overlap gappedpeaks": { 'files': [ common.merge_dicts( { 'name': 'overlapping_peaks', 'derived_from': [rep1_bam, rep2_bam] }, replicated_gappedpeak_metadata), common.merge_dicts( { 'name': 'overlapping_peaks_bb', 'derived_from': [rep1_bam, rep2_bam] }, replicated_gappedpeak_bb_metadata) ], 'qc': ['npeaks_in', 'npeaks_out', 'npeaks_rejected'] } } experiment = common.encoded_get( urlparse.urljoin(server, '/experiments/%s' % (experiment_accession)), keypair) rep1_bam, rep2_bam = get_rep_bams(experiment, keypair, server) files = [] for (stage_name, outputs) in stage_outputs.iteritems(): stage_metadata = next(s['execution'] for s in analysis.get('stages') if s['execution']['name'] == stage_name) for static_metadata in outputs['files']: output_name = static_metadata['name'] dx = dxpy.DXFile(stage_metadata['output'][output_name], project=project) file_metadata = { 'dx': dx, 'notes': { 'dx-id': dx.get_id(), 'dx-createdBy': { 'job': stage_metadata['id'], 'executable': stage_metadata['executable'], #todo get applet ID 'user': stage_metadata['launchedBy'] }, 'qc': dict( zip(outputs['qc'], [ stage_metadata['output'][metric] for metric in outputs['qc'] ])) }, #'aliases': ['ENCODE:%s-%s' %(experiment.get('accession'), static_metadata.pop('name'))], 'dataset': experiment.get('accession'), 'file_size': dx.describe().get('size'), 'submitted_file_name': dx.get_proj_id() + ':' + '/'.join([dx.folder, dx.name]) } file_metadata.update(static_metadata) files.append(file_metadata) return files
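# `common.merge_dicts` layers the file-specific fields over the shared
# metadata defaults above. A minimal sketch of the assumed behavior (later
# dicts win on key collisions; in the calls above the key sets are disjoint,
# so order is immaterial). The real helper lives in the `common` module:
def merge_dicts(*dicts):
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged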
def main(): args = get_args() if args.debug: logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) logger.setLevel(logging.DEBUG) else: # use the default logging level logging.basicConfig(format='%(levelname)s:%(message)s') logger.setLevel(logging.INFO) authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid,authpw) if args.experiments: ids = args.experiments # elif args.created_after: # analyses = [] # for state in args.state: # analyses.extend(dxpy.find_analyses(name="ENCSR*",name_mode='glob',state=state,include_subjobs=True,return_handler=True,created_after="%s" %(args.created_after))) # ids = [analysis.get_id() for analysis in analyses if analysis.describe()['executableName'] == 'tf_chip_seq' or analysis.describe()['executableName'].startswith('ENCSR783QUL Peaks')] elif args.all: exp_query = \ "/search/?type=Experiment" + \ "&assay_title=ChIP-seq" + \ "&award.project=ENCODE" + \ "&status=released&status=submitted&status=in+progress&status=started&status=release+ready" all_experiments = common.encoded_get(server+exp_query, keypair)['@graph'] ids = [exp.get('accession') for exp in all_experiments] elif args.infile: ids = args.infile else: #never reached because infile defaults to stdin raise InputError("Must supply experiment id's in arguments or --infile") fieldnames = [ 'date','analysis','analysis id','experiment','target','biosample_term_name','biosample_type','lab','rfa','assembly', 'Nt','Np','N1','N2','rescue_ratio','self_consistency_ratio','reproducibility_test', 'state','release','total price','notes'] writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, delimiter='\t', quotechar='"') writer.writeheader() idr_query = \ "/search/?type=File" + \ "&assembly=%s" % (args.assembly) + \ "&file_format=bed" + \ "&output_type=optimal+idr+thresholded+peaks" + \ "&output_type=conservative+idr+thresholded+peaks" + \ "&lab.title=ENCODE+Processing+Pipeline" + \ "&lab.title=J.+Michael+Cherry,+Stanford" + \ "&status=in+progress&status=released&status=uploading&status=uploaded" all_idr_files = common.encoded_get(server+idr_query, keypair)['@graph'] for (i, experiment_id) in enumerate(ids): if experiment_id.startswith('#'): continue experiment_id = experiment_id.rstrip() experiment_uri = '/experiments/%s/' % (experiment_id) idr_files = \ [f for f in all_idr_files if f['dataset'] == experiment_uri] idr_step_runs = set([f.get('step_run') for f in idr_files]) if not len(idr_step_runs): if not args.all: logger.warning( "%s: Found %d IDR step runs. Skipping" % (experiment_id, len(idr_step_runs))) continue idr_qc_uris = [] assemblies = [] for f in idr_files: quality_metrics = f.get('quality_metrics') if not len(quality_metrics) == 1: logger.error( '%s: Expected one IDR quality metric for file %s. Found %d.' % (experiment_id, f.get('accession'), len(quality_metrics))) idr_qc_uris.extend(quality_metrics) assembly = f.get('assembly') if not assembly: logger.error( '%s: File %s has no assembly' % (experiment_id, f.get('accession'))) assemblies.append(assembly) idr_qc_uris = set(idr_qc_uris) if not len(idr_qc_uris) == 1: logger.error( '%s: Expected one unique IDR metric, found %d. Skipping.' % (experiment_id, len(idr_qc_uris))) continue assemblies = set(assemblies) if not len(assemblies) == 1: logger.error( '%s: Expected one unique assembly, found %d. Skipping.' 
% (experiment_id, len(assemblies))) continue assembly = next(iter(assemblies)) idr_step_run_uri = next(iter(idr_step_runs)) idr_step_run = common.encoded_get(server+idr_step_run_uri, keypair) try: dx_job_id_str = idr_step_run.get('dx_applet_details')[0].get('dx_job_id') except: logger.warning("Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id") logger.debug(idr_step_run) dx_job_id_str = None #could try to pull it from alias if not dx_job_id_str: continue dx_job_id = dx_job_id_str.rpartition(':')[2] dx_job = dxpy.DXJob(dx_job_id) job_desc = dx_job.describe() analysis_id = job_desc.get('analysis') logger.debug('%s' %(analysis_id)) analysis = dxpy.DXAnalysis(analysis_id) desc = analysis.describe() project = desc.get('project') m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name']) if m: experiment_accession = m.group(1) else: logger.error("No accession in %s, skipping." % (desc['name'])) continue if args.all: # we've already gotten all the experiment objects experiment = \ next(e for e in all_experiments if e['accession'] == experiment_accession) else: experiment = \ common.encoded_get(urlparse.urljoin( server, '/experiments/%s' % (experiment_accession)), keypair) logger.debug('ENCODEd experiment %s' % (experiment['accession'])) if args.lab and experiment['lab'].split('/')[2] not in args.lab: continue try: idr_stage = next(s['execution'] for s in desc['stages'] if s['execution']['name'] == "Final IDR peak calls") except: logger.error('Failed to find final IDR stage in %s' %(analysis_id)) else: if idr_stage['state'] != 'done': #Final IDR peak calls stage not done, so loop through intermediate IDR stages to find errors Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None notes = [] #note this list contains a mis-spelled form of IDR Pooled Pseudoreplicates because until 11/13/15 the pipeline stage name was misspelled - need to be able to report on those runs idr_stage_names = ['IDR True Replicates', 'IDR Rep 1 Self-pseudoreplicates', 'IDR Rep 2 Self-pseudoreplicates', 'IDR Pooled Pseudoreplicates', 'IDR Pooled Pseudoeplicates'] for stage_name in idr_stage_names: try: idr_stage = next(s['execution'] for s in desc['stages'] if s['execution']['name'] == stage_name) except StopIteration: continue except: raise if idr_stage['state'] == 'failed': try: job_log = subprocess.check_output('dx watch %s' %(idr_stage['id']), shell=True, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: job_log = e.output else: job_log = None if job_log: patterns = [r'Peak files must contain at least 20 peaks post-merge'] for p in patterns: m = re.search(p,job_log) if m: notes.append("%s: %s" %(stage_name,m.group(0))) if not notes: notes.append(idr_stage['failureMessage']) try: done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "failed") except StopIteration: done_time = "Not done or failed" except: raise else: Np = idr_stage['output'].get('Np') N1 = idr_stage['output'].get('N1') N2 = idr_stage['output'].get('N2') Nt = idr_stage['output'].get('Nt') rescue_ratio = idr_stage['output'].get('rescue_ratio') self_consistency_ratio = idr_stage['output'].get('self_consistency_ratio') reproducibility_test = idr_stage['output'].get('reproducibility_test') notes = "IDR Complete" try: done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "done") except StopIteration: done_time = None except: raise if done_time and not isinstance(done_time, str): date = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(done_time/1000)) elif done_time: date = done_time 
else: date = "Running" analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' %(desc.get('project').split('-')[1], desc.get('id').split('-')[1]) experiment_link = '%sexperiments/%s' %(server, experiment.get('accession')) row = { 'date': date, 'analysis': analysis_link, 'analysis id': desc.get('id'), 'experiment': experiment_link, 'target': experiment['target'].split('/')[2], 'biosample_term_name': experiment.get('biosample_term_name'), 'biosample_type': experiment.get('biosample_type'), 'lab': experiment['lab'].split('/')[2], 'rfa': common.encoded_get(server+experiment.get('award'),keypair).get('rfa'), 'assembly': assembly, 'Np': Np, 'N1': N1, 'N2': N2, 'Nt': Nt, 'rescue_ratio': rescue_ratio, 'self_consistency_ratio': self_consistency_ratio, 'reproducibility_test': reproducibility_test, 'state': desc.get('state'), 'release': experiment['status'], 'total price': desc.get('totalPrice') } if notes: row.update({'notes': '%s' %(notes)}) else: row.update({'notes': '%s' %('OK')}) #log = subprocess.check_output('dx watch %s' %(analysis.)) writer.writerow(row)
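# Sketch of the analysis_link construction above with hypothetical DNAnexus
# IDs; split('-')[1] strips the 'project-'/'analysis-' type prefixes:
desc = {'project': 'project-B5zFqkQ0XyZ1234567890abc',
        'id': 'analysis-F0ZqGv80XyZ0987654321def'}
analysis_link = ('https://platform.dnanexus.com/projects/%s/monitor/analysis/%s'
                 % (desc['project'].split('-')[1], desc['id'].split('-')[1]))
# -> .../projects/B5zFqkQ0XyZ1234567890abc/monitor/analysis/F0ZqGv80XyZ0987654321def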
def main(): global args args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid,authpw) if args.experiments: exp_ids = csv.reader(StringIO.StringIO('\n'.join([s.rstrip() for s in args.experiments]))) else: exp_ids = csv.reader(args.infile) for instring in exp_ids: exp_id = instring[0].strip() if len(instring) > 1: repns = [] for s in instring[1:]: repns.extend(s.split(',')) biorep_ns = list(set([int(s) for s in repns])) else: biorep_ns = [] outstrings = [] encode_url = urlparse.urljoin(server,exp_id) experiment = common.encoded_get(encode_url, keypair) outstrings.append(exp_id) files = files_to_map(experiment, server, keypair, args.no_sfn_dupes) outstrings.append(str(len(files))) outstrings.append(str([f.get('accession') for f in files])) replicates = replicates_to_map(files, server, keypair, biorep_ns) in_process = False if files: for biorep_n in set([rep.get('biological_replicate_number') for rep in replicates]): outstrings.append('rep%s' %(biorep_n)) biorep_files = [f for f in files if biorep_n in common.biorep_ns(f,server,keypair)] paired_files = [] unpaired_files = [] while biorep_files: file_object = biorep_files.pop() if file_object.get('paired_end') == None: # group all the unpaired reads for this biorep together unpaired_files.append(file_object) elif file_object.get('paired_end') in ['1','2']: if file_object.get('paired_with'): mate = next((f for f in biorep_files if f.get('@id') == file_object.get('paired_with')), None) else: #have to find the file that is paired with this one mate = next((f for f in biorep_files if f.get('paired_with') == file_object.get('@id')), None) if mate: biorep_files.remove(mate) else: logging.warning('%s:%s could not find mate' %(experiment.get('accession'), file_object.get('accession'))) mate = {} # if mapping as SE, ignore the mate and just map the # rep1 as SE with all the other SE for this rep, if any if args.force_se: unpaired_files.append(next( f for f in [file_object, mate] if f.get('paired_end') == '1')) else: paired_files.append((file_object, mate)) if biorep_files: logging.warning('%s: leftover file(s) %s' %(experiment.get('accession'), biorep_files)) if paired_files: pe_jobs = map_only(experiment, biorep_n, paired_files, args.key, server, keypair, args.sex_specific) in_process = True if unpaired_files: se_jobs = map_only(experiment, biorep_n, unpaired_files, args.key, server, keypair, args.sex_specific) in_process = True if paired_files and pe_jobs: outstrings.append('paired:%s' %([(a.get('accession'), b.get('accession')) for (a,b) in paired_files])) outstrings.append('paired jobs:%s' %([j.get_id() for j in pe_jobs])) else: outstrings.append('paired:%s' %(None)) if unpaired_files and se_jobs: outstrings.append('unpaired:%s' %([f.get('accession') for f in unpaired_files])) outstrings.append('unpaired jobs:%s' %([j.get_id() for j in se_jobs])) else: outstrings.append('unpaired:%s' %(None)) if in_process: r = common.encoded_patch(encode_url, keypair, {"internal_status": "processing"}, return_response=True) try: r.raise_for_status() except: logging.error("Tried and failed to set internal_status") logging.error(r.text) print '\t'.join(outstrings) else: # no files if not replicates: logging.warning('%s: No files and no replicates' %experiment.get('accession')) else: logging.warning('%s: No files to map' %experiment.get('accession')) if files and not replicates: logging.warning('%s: Files but no replicates' %experiment.get('accession'))
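# The pairing loop above can be exercised on toy file objects. This sketch
# mirrors its logic (pop a file, find its mate via paired_with in either
# direction); the accessions are made up:
files = [
    {'@id': '/files/ENCFFAAA001/', 'paired_end': '1', 'paired_with': '/files/ENCFFAAA002/'},
    {'@id': '/files/ENCFFAAA002/', 'paired_end': '2'},
    {'@id': '/files/ENCFFAAA003/', 'paired_end': None},
]
paired, unpaired = [], []
while files:
    f = files.pop()
    if f.get('paired_end') is None:
        unpaired.append(f)
    else:
        mate = next((m for m in files
                     if m.get('@id') == f.get('paired_with')
                     or m.get('paired_with') == f.get('@id')), None)
        if mate:
            files.remove(mate)
        paired.append((f, mate))
# paired -> [(ENCFFAAA002, ENCFFAAA001)]; unpaired -> [ENCFFAAA003]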
def get_possible_ctl_ta(experiment, repn, server, keypair, default_project, ta_folders, used_control_ids): # Build a list of the possible_control experiments possible_control_experiments = [] for uri in experiment.get('possible_controls'): possible_control_experiment = common.encoded_get(server + uri, keypair) target_uri = possible_control_experiment.get('target') # For now only use controls with no target or target "Control" # (i.e. not IgG) if not target_uri or target_uri.split('/')[2].startswith('Control'): possible_control_experiments.append(possible_control_experiment) elif 'control' in target_uri.split('/')[2]: logging.warning( '%s: possible control %s has target %s. Such controls are allowed but deprecated.' % (experiment.get('accession'), possible_control_experiment.get('accession'), target_uri)) possible_control_experiments.append(possible_control_experiment) else: logging.warning( '%s: possible control %s has target %s, not "Control". Skipping.' % (experiment.get('accession'), possible_control_experiment.get('accession'), target_uri)) logging.debug(pformat(possible_control_experiments)) try: matching_ta = \ next(ta for ta in [get_rep_ta(e, repn, default_project, ta_folders) for e in possible_control_experiments] if ta and ta['id'] not in used_control_ids) except StopIteration: logging.warning('Failed to find control rep with matching repn') matching_ta = None except: raise else: return matching_ta tas = [] for e in possible_control_experiments: unused_tas = [ ta for ta in get_all_tas(e, default_project, ta_folders) if ta and ta['id'] not in used_control_ids ] logging.debug('get_possible_ctl_ta: experiment %s unused_tas %s' % (e.get('accession'), unused_tas)) if unused_tas: tas.extend(unused_tas) if len(tas) > 1: logging.warning('Found multiple tas %s, returning first one' % ([ ta.get('project') + ':' + ta.get('folder') + '/' + ta.get('name') for ta in tas ])) if tas: return tas[0] else: logging.warning( 'Failed to find any possible controls that have not already been used' ) tas = [] for e in possible_control_experiments: all_tas = [ ta for ta in get_all_tas(e, default_project, ta_folders) if ta ] logging.debug('get_possible_ctl_ta: experiment %s all_tas %s' % (e.get('accession'), all_tas)) if all_tas: tas.extend(all_tas) if len(tas) > 1: logging.warning('Found multiple tas %s, returning first one' % ([ ta.get('project') + ':' + ta.get('folder') + '/' + ta.get('name') for ta in tas ])) if tas: return tas[0] else: logging.error('Failed to find any possible control_tas') return None
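# The target checks above key off the path segment of the target URI, e.g.
# (URI is illustrative):
target_uri = '/targets/Control-human/'
assert target_uri.split('/')[2] == 'Control-human'
assert target_uri.split('/')[2].startswith('Control')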
def accession_peaks_analysis_files(peaks_analysis, keypair, server, dryrun, force): # m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks',peaks_analysis['executableName']) # if m: # experiment_accession = m.group(1) # logger.info(experiment_accession) experiment_accession = get_experiment_accession(peaks_analysis) if experiment_accession: logger.info('%s: accession peaks' %(experiment_accession)) else: logger.error("No experiment accession in %s, skipping." %(peaks_analysis['executableName'])) return None #returns the experiment object experiment = common.encoded_get(urlparse.urljoin(server,'/experiments/%s' %(experiment_accession)), keypair) #returns a list with two elements: the mapping stages for [rep1,rep2] #in this context rep1,rep2 are the first and second replicates in the pipeline. They may have been accessioned #on the portal with any arbitrary biological_replicate_numbers. mapping_stages = get_peak_mapping_stages(peaks_analysis, experiment, keypair, server) if not mapping_stages: logger.error("Failed to find peak mapping stages") return None #returns a list with three elements: the mapping stages for the controls for [rep1, rep2, pooled] #the control stages for rep1 and rep2 might be the same as the pool if the experiment used pooled controls control_stages = get_control_mapping_stages(peaks_analysis, experiment, keypair, server) if not control_stages: logger.error("Failed to find control mapping stages") return None #returns the stages for peak calling peak_stages = get_peak_stages(peaks_analysis, mapping_stages, control_stages, experiment, keypair, server) if not peak_stages: logger.error("Failed to find peak stages") return None #accession all the output files output_files = [] for stages in [control_stages[0], control_stages[1], mapping_stages[0], mapping_stages[1], peak_stages]: logger.info('accessioning output') output_files.extend(accession_outputs(stages, experiment, keypair, server, dryrun, force)) #now that we have file accessions, loop again and patch derived_from files_with_derived = [] for stages in [control_stages[0], control_stages[1], mapping_stages[0], mapping_stages[1], peak_stages]: files_with_derived.extend(patch_outputs(stages, keypair, server, dryrun)) full_analysis_step_versions = { 'bwa-indexing-step-v-1' : [ { 'stages' : "", 'stage_name': "", 'file_names' : [], 'status' : 'finished', 'qc_objects': [] } ], 'bwa-alignment-step-v-1' : [ { 'stages' : control_stages[0], 'stage_name': 'Filter and QC*', 'file_names' : ['filtered_bam'], 'status' : 'finished', 'qc_objects' : [ {'chipseq_filter_quality_metric': ['filtered_bam']}, {'samtools_flagstats_quality_metric': ['filtered_bam']} ] }, { 'stages' : control_stages[1], 'stage_name': 'Filter and QC*', 'file_names' : ['filtered_bam'], 'status' : 'finished', 'qc_objects' : [ {'chipseq_filter_quality_metric': ['filtered_bam']}, {'samtools_flagstats_quality_metric': ['filtered_bam']} ] }, { 'stages' : mapping_stages[0], 'stage_name': 'Filter and QC*', 'file_names' : ['filtered_bam'], 'status' : 'finished', 'qc_objects' : [ {'chipseq_filter_quality_metric': ['filtered_bam']}, {'samtools_flagstats_quality_metric': ['filtered_bam']} ] }, { 'stages' : mapping_stages[1], 'stage_name': 'Filter and QC*', 'file_names' : ['filtered_bam'], 'status' : 'finished', 'qc_objects' : [ {'chipseq_filter_quality_metric': ['filtered_bam']}, {'samtools_flagstats_quality_metric': ['filtered_bam']} ] } ], 'histone-peak-calling-step-v-1' : [ { 'stages' : peak_stages, 'stage_name': 'ENCODE Peaks', 'file_names' : ['rep1_fc_signal', 'rep2_fc_signal', 
'pooled_fc_signal', 'rep1_pvalue_signal', 'rep2_pvalue_signal', 'pooled_pvalue_signal', 'rep1_narrowpeaks', 'rep2_narrowpeaks', 'pooled_narrowpeaks'], 'status' : 'finished', 'qc_objects': [] } ], 'histone-overlap-peaks-step-v-1' : [ { 'stages' : peak_stages, 'stage_name': 'Overlap narrowpeaks', 'file_names' : ['overlapping_peaks'], 'status' : 'finished', 'qc_objects': [] } ], 'histone-peaks-to-bigbed-step-v-1' : [ { 'stages' : peak_stages, 'stage_name': 'ENCODE Peaks', 'file_names' : ['rep1_narrowpeaks_bb', 'rep2_narrowpeaks_bb', 'pooled_narrowpeaks_bb'], 'status' : 'virtual', 'qc_objects': [] } ], 'histone-replicated-peaks-to-bigbed-step-v-1' : [ { 'stages' : peak_stages, 'stage_name': 'Overlap narrowpeaks', 'file_names' : ['overlapping_peaks_bb'], 'status' : 'virtual', 'qc_objects': [] } ] } patched_files = accession_pipeline(full_analysis_step_versions, keypair, server, dryrun, force) return patched_files
def get_mapping_stages(mapping_analysis, keypair, server, repn): logger.debug('in get_mapping_stages with mapping analysis %s and rep %s' %(mapping_analysis['id'], repn)) experiment_accession = get_experiment_accession(mapping_analysis) experiment = common.encoded_get(urlparse.urljoin(server,'/experiments/%s' %(experiment_accession)), keypair) experiment_fastqs = get_rep_fastqs(experiment, keypair, server, repn) experiment_fastq_accessions = [f.get('accession') for f in experiment_fastqs] logger.info('%s: Found accessioned experiment fastqs with accessions %s' %(experiment_accession, experiment_fastq_accessions)) mapping_stages = mapping_analysis.get('stages') input_stage = next(stage for stage in mapping_stages if stage['execution']['name'].startswith("Gather inputs")) input_fastq_accessions = input_stage['execution']['input']['reads1'] if input_stage['execution']['input']['reads2']: input_fastq_accessions.append(input_stage['execution']['input']['reads2']) fastqs = [] for acc in flat(input_fastq_accessions): fobj = common.encoded_get(urlparse.urljoin(server,'files/%s' %(acc)), keypair) # logger.debug('fobj') # logger.debug('%s' %(pprint.pprint(fobj))) fastqs.append(fobj) logger.info('Found input fastq objects with accessions %s' %([f.get('accession') for f in fastqs])) #Error if it appears we're trying to accession an out-dated analysis (i.e. one not derived from proper fastqs ... maybe some added or revoked) if cmp(sorted(flat(experiment_fastq_accessions)), sorted(flat(input_fastq_accessions))): logger.error('%s rep%d: Accessioned experiment fastqs differ from analysis. Experiment probably needs remapping' %(experiment_accession, repn)) return None filter_qc_stage = next(stage for stage in mapping_stages if stage['execution']['name'].startswith("Filter and QC")) bam = dxpy.describe(filter_qc_stage['execution']['output']['filtered_bam']) #here we get the actual DNAnexus file that was used as the reference reference_file = dxpy.describe(input_stage['execution']['output']['output_JSON']['reference_tar']) #and construct the alias to find the corresponding file at ENCODEd reference_alias = "dnanexus:" + reference_file.get('id') logger.debug('looking for reference file with alias %s' %(reference_alias)) reference = common.encoded_get(urlparse.urljoin(server,'files/%s' %(reference_alias)), keypair) if reference: logger.debug('found reference file %s' %(reference.get('accession'))) else: logger.error('failed to find reference file %s' %(reference_alias)) bam_metadata = common.merge_dicts({ 'file_format': 'bam', 'output_type': 'alignments' }, common_metadata) rep_mapping_stages = { "Filter and QC*" : { 'input_files': [ {'name': 'rep%s_fastqs' %(repn), 'derived_from': None, 'metadata': None, 'encode_object': fastqs}, {'name': 'reference', 'derived_from': None, 'metadata': None, 'encode_object': reference} ], 'output_files': [ {'name': 'filtered_bam', 'derived_from': ['rep%s_fastqs' %(repn),'reference'], 'metadata': bam_metadata} ], 'qc': [], 'stage_metadata': {} #initialized below }, "Calculate cross-correlation*": { 'input_files': [], 'output_files': [], 'qc': [], 'stage_metadata': {} } } for stage_name in rep_mapping_stages: if not stage_name.startswith('_'): rep_mapping_stages[stage_name].update({'stage_metadata': get_stage_metadata(mapping_analysis, stage_name)}) return rep_mapping_stages
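# `flat` is the helper used above to flatten possibly-nested accession lists
# before fetching and comparing them. A minimal sketch of the assumed
# behavior (the real helper may differ):
def flat(items):
    flattened = []
    for item in items:
        if isinstance(item, (list, tuple)):
            flattened.extend(flat(item))
        else:
            flattened.append(item)
    return flattened

# e.g. flat(['ENCFF000AAA', ['ENCFF000BBB']]) -> ['ENCFF000AAA', 'ENCFF000BBB']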
def accession_file(f, keypair, server, dryrun, force): #check for duplication #- if it has an ENCFF or TSTFF number in its tag, or #- if there exists an accessioned file with the same submitted_file_name that is not deleted, replaced, revoked and has the same size #- then there should be a file with the same md5. If not, warn of a mismatch between what's at DNAnexus and ENCODEd. #- If same md5, return the existing object. #- Next, check if there's already a file with the same md5. If it's deleted, replaced, revoked, then remodel it if --force=true, #- Else warn and return None #download #calculate md5 and add to f.md5sum #post file and get accession, upload credentials #upload to S3 #remove the local file (to save space) #return the ENCODEd file object logger.debug('in accession_file with f %s' %(pprint.pformat(f['submitted_file_name']))) dx = f.pop('dx') local_fname = dx.name logger.info("Downloading %s" %(local_fname)) dxpy.download_dxfile(dx.get_id(),local_fname) f.update({'md5sum': common.md5(local_fname)}) f['notes'] = json.dumps(f.get('notes')) #check to see if the md5 is already in the database url = server + '/md5:%s?format=json&frame=object' %(f.get('md5sum')) r = common.encoded_get(url, keypair, return_response=True) try: r.raise_for_status() except: if r.status_code == 404: logger.info('No md5 matches %s' %(f.get('md5sum'))) md5_exists = False else: logger.error('MD5 duplicate check. GET failed: %s %s' % (r.status_code, r.reason)) logger.error(r.text) md5_exists = None else: md5_exists = r.json() #check if an ENCODE accession number is in the list of tags, as it would be if accessioned by this script or similar scripts for tag in dx.tags: m = re.findall(r'ENCFF\d{3}\D{3}', tag) if m: logger.info('%s appears to contain ENCODE accession number in tag %s.' %(dx.get_id(),m)) accession_in_tag = True # if not force: # return else: accession_in_tag = False #TODO check here if file is deprecated and, if so, warn if md5_exists: if force: return patch_file(f, keypair, server, dryrun) else: logger.info("Returning duplicate file unchanged") return md5_exists else: logger.info('posting new file %s' %(f.get('submitted_file_name'))) logger.debug('%s' %(f)) new_file_object = post_file(f, keypair, server, dryrun) if new_file_object: creds = new_file_object['upload_credentials'] env = os.environ.copy() env.update({ 'AWS_ACCESS_KEY_ID': creds['access_key'], 'AWS_SECRET_ACCESS_KEY': creds['secret_key'], 'AWS_SECURITY_TOKEN': creds['session_token'], }) logger.info("Uploading file.") start = time.time() try: subprocess.check_call(['aws', 's3', 'cp', local_fname, creds['upload_url'], '--quiet'], env=env) except subprocess.CalledProcessError as e: # The aws command returns a non-zero exit code on error. logger.error("Upload failed with exit code %d" % e.returncode) else: end = time.time() duration = end - start logger.info("Uploaded in %.2f seconds" % duration) dx.add_tags([new_file_object.get('accession')]) try: os.remove(local_fname) except: pass return new_file_object
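# `common.md5` is assumed to return the hex md5 digest of a local file,
# computed in chunks so large peak/bam files are not read into memory at
# once. A sketch of that behavior:
import hashlib

def md5(path, chunk_size=1024 * 1024):
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()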
def get_tas(experiment, server, keypair, default_project, ta_folders): # tas = { # 'rep1_ta': { # 'file_id': "", # 'project_id': "", # 'folder': "", # 'name': "", # 'paired_end': False, # 'control_path': "", # 'enc_repn': 0 #.for each ta_folder get list of TA's in /ta_folder/bams/ENCSR... #.from this list infer repns from the paths ../bams/ENCSR.../repn* #.from this list infer the ENCFF's for the fastqs that were used #for each repn go to the experiment and find all the fastqs for that rep #if there are different fastq's in the experiment, or different reps, warn #for each fastq found in the TA filename, find its controlled_by #if any have controlled_by, all must have controlled_by else error # gather the list of controlled by and find a TA (anywhere in ta_folders) with those ENCFF's, else error #else get possible_controls and try to match the repn, else pick one (remember it) # gather the list of fastqs in the possible_controls and find (one) TA with those ENCFF's, else error exp_id = experiment['accession'] possible_files = [] for base_folder in ta_folders: if ':' in base_folder: project_name, path = base_folder.split(':') project = resolve_project(project_name) project_id = project.get_id() project_name += ":" else: project_id = default_project project_name = "" path = base_folder if not path.startswith('/'): path = '/' + path if not path.endswith('/'): path += '/' logging.debug( "Looking for TA's in %s %s %s" % (project_id, project_name, path)) for dxfile in dxpy.find_data_objects( classname='file', state='closed', folder=path + 'bams/%s/' %(exp_id), project=project_id, describe=True, recurse=True, ): desc = dxfile.get('describe') if desc.get('name').endswith(('tagAlign', 'tagAlign.gz')): possible_files.append(desc) logging.debug('Found %s possible files' %(len(possible_files))) logging.debug('%s' %([(f.get('folder'),f.get('name')) for f in possible_files])) repns = [] files_to_ignore = [] for f in possible_files: m = re.search('/rep(\d+)$',f['folder']) if m: repn = int(m.group(1)) logging.debug("Matched rep%d" %(repn)) if repn in repns: logging.warning("Ignoring additional rep%d bam, using first found" %(repn)) files_to_ignore.append(f) else: logging.debug("First time finding rep%d" %(repn)) repns.append(repn) else: logging.error("Cannot parse rep number from %s" %(f['folder'])) return None for f in files_to_ignore: possible_files.remove(f) logging.debug('Discovered repns %s' %(repns)) if len(repns) != 2: logging.error("Required to have exactly 2 reps for %s. 
Found %d: %s" %(exp_id, len(repns), repns)) return None tas = {} used_controls = [] for i, repn in enumerate(repns): encode_files = [common.encoded_get(server+'/files/%s/' %(f), keypair) for f in get_encffs(possible_files[i].get('name'))] controlled_by = common.flat([f.get('controlled_by') for f in encode_files]) if any(controlled_by): controlled_by_accessions = list(set([uri.split('/')[2] for uri in controlled_by if uri])) controlled_by_ta = get_ta_from_accessions(controlled_by_accessions, default_project, ta_folders) if controlled_by_ta: controlled_by_ta_name = controlled_by_ta.get('name') controlled_by_ta_id = controlled_by_ta.get('id') else: logging.error("%s: Could not find controlled_by_ta for accessions %s" %(experiment.get('accession'), controlled_by_accessions)) controlled_by_ta_name = None controlled_by_ta_id = None else: #evaluate possible controls controlled_by_accessions = None possible_controls = experiment.get('possible_controls') logging.warning('%s: No controlled_by for rep%d, attempting to infer from possible_controls %s' %(experiment.get('accession'), repn, possible_controls)) if not possible_controls or not any(possible_controls): logging.error('%s: Could not find controlled_by or resolve possible_controls for rep%d' %(experiment.get('accession'), repn)) controlled_by_ta_name = None controlled_by_ta_id = None else: control_ta = get_possible_ctl_ta(experiment, repn, server, keypair, default_project, ta_folders, used_controls) controlled_by_ta_name = control_ta.get('name') controlled_by_ta_id = control_ta.get('id') if controlled_by_ta_id and controlled_by_ta_id in used_controls: logging.warning('%s: Using same control %s for multiple reps' %(controlled_by_ta_id, controlled_by_ta_name)) used_controls.append(controlled_by_ta_id) #if encode repns are 1,2 then let the pipline input rep numbers (1 or 2) be the same. #Otherwise the mapping is arbitrary, but at least do it with smaller rep number first. if repn == min(repns): ta_index = 1 else: ta_index = 2 tas.update( {'rep%d_ta' %(ta_index): { 'file_id': possible_files[i].get('id'), 'project_id': possible_files[i].get('project'), 'folder': possible_files[i].get('folder'), 'file_name': possible_files[i].get('name'), 'enc_fqs': get_encffs(possible_files[i].get('name')), 'controlled_by': controlled_by_accessions, 'controlled_by_name': controlled_by_ta_name, 'control_id': controlled_by_ta_id, 'enc_repn': repn, 'paired_end': is_paired_end(possible_files[i]) } } ) return tas
def main(): global args args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) if args.experiments: exp_ids = csv.reader( StringIO.StringIO('\n'.join([s.rstrip() for s in args.experiments]))) else: exp_ids = csv.reader(args.infile) for instring in exp_ids: exp_id = instring[0].strip() if len(instring) > 1: repns = [] for s in instring[1:]: repns.extend(s.split(',')) biorep_ns = list(set([int(s) for s in repns])) else: biorep_ns = [] outstrings = [] encode_url = urlparse.urljoin(server, exp_id) experiment = common.encoded_get(encode_url, keypair) outstrings.append(exp_id) files = files_to_map(experiment, server, keypair, args.sfn_dupes) outstrings.append(str(len(files))) outstrings.append(str([f.get('accession') for f in files])) replicates = replicates_to_map(files, server, keypair, biorep_ns) if files: for biorep_n in set( [rep.get('biological_replicate_number') for rep in replicates]): outstrings.append('rep%s' % (biorep_n)) biorep_files = [ f for f in files if biorep_n in common.biorep_ns(f, server, keypair) ] paired_files = [] unpaired_files = [] while biorep_files: file_object = biorep_files.pop() if file_object.get( 'paired_end' ) == None: # group all the unpaired reads for this biorep together unpaired_files.append(file_object) elif file_object.get('paired_end') in ['1', '2']: if file_object.get('paired_with'): mate = next((f for f in biorep_files if f.get( '@id') == file_object.get('paired_with')), None) else: #have to find the file that is paired with this one mate = next((f for f in biorep_files if f.get( 'paired_with') == file_object.get('@id')), None) if mate: biorep_files.remove(mate) else: logging.warning('%s:%s could not find mate' % (experiment.get('accession'), file_object.get('accession'))) mate = {} paired_files.append((file_object, mate)) if biorep_files: logging.warning( '%s: leftover file(s) %s' % (experiment.get('accession'), biorep_files)) if paired_files: pe_jobs = map_only(experiment, biorep_n, paired_files, args.key, server, keypair) if unpaired_files: se_jobs = map_only(experiment, biorep_n, unpaired_files, args.key, server, keypair) if paired_files and pe_jobs: outstrings.append( 'paired:%s' % ([(a.get('accession'), b.get('accession')) for (a, b) in paired_files])) outstrings.append('paired jobs:%s' % ([j.get_id() for j in pe_jobs])) else: outstrings.append('paired:%s' % (None)) if unpaired_files and se_jobs: outstrings.append( 'unpaired:%s' % ([f.get('accession') for f in unpaired_files])) outstrings.append('unpaired jobs:%s' % ([j.get_id() for j in se_jobs])) else: outstrings.append('unpaired:%s' % (None)) print '\t'.join(outstrings) else: # no files if not replicates: logging.warning('%s: No files and no replicates' % experiment.get('accession')) else: logging.warning('%s: No files to map' % experiment.get('accession')) if files and not replicates: logging.warning('%s: Files but no replicates' % experiment.get('accession'))
def main():
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    experiments = []
    if args.experiments:
        experiments.extend(args.experiments)
    if args.infile:
        with open(args.infile, 'r') as fh:
            experiments.extend([e for e in fh])

    for exp_id in experiments:
        if exp_id.startswith('#'):
            continue
        exp_id = exp_id.rstrip()
        print "Experiment %s" % (exp_id)
        experiment_url = server + '/experiments/%s/' % (exp_id)
        experiment = common.encoded_get(experiment_url, keypair)
        if experiment.get('target'):
            target_url = server + experiment.get('target')
            target = common.encoded_get(target_url, keypair)
        else:
            logging.error('Experiment has no target ... skipping')
            continue
        print "%s %s %s" % (experiment['accession'],
                            target.get('investigated_as'),
                            experiment.get('description'))
        # ctl_id = get_control_id(experiment)
        # if ctl_id:
        #     print "Control %s" % (ctl_id)
        # else:
        #     print "Found no control ... skipping %s" % (exp_id)
        #     continue
        # (rep1_ta, rep1_pe), (rep2_ta, rep2_pe) = get_exp_tas(experiment, server, keypair, args.project, args.inf)
        # (ctl1_ta, ctl1_pe), (ctl2_ta, ctl2_pe) = get_ctl_tas(experiment, server, keypair, args.project, args.inf)
        tas = get_tas(experiment, server, keypair, args.project, args.inf)
        if not tas:
            logging.error('Failed to resolve all tagaligns for %s'
                          % (experiment['accession']))
            continue
        pprint.pprint(tas)

        skip_flag = False
        for key, value in tas.iteritems():
            if not value:
                logging.error('Missing %s ... skipping' % (key))
                skip_flag = True
        if skip_flag:
            continue

        workflow_title = '%s Peaks' % (exp_id)
        if args.tag:
            workflow_title += ' %s' % (args.tag)
        outf = args.outf
        if not outf.startswith('/') and outf != '/':
            outf = '/' + outf
        if not outf.endswith('/') and outf != '/':
            outf += '/'
        outf += '%s/peaks/' % (exp_id)

        try:
            investigated_as = target['investigated_as']
        except KeyError:
            print "%s: Failed to determine target type ... skipping" % (exp_id)
            continue
        else:
            print investigated_as

        rep1_pe = tas['rep1_ta']['paired_end']
        rep2_pe = tas['rep2_ta']['paired_end']
        if None in [rep1_pe, rep2_pe]:
            print "%s: Cannot determine paired end: rep1 PE = %s, rep2 PE = %s ... skipping" % (exp_id, rep1_pe, rep2_pe)
            continue
        if rep1_pe != rep2_pe:
            print "%s: rep1 PE %s differs from rep2 PE %s ... skipping" % (exp_id, rep1_pe, rep2_pe)
            continue

        if any('histone' in target_type for target_type in investigated_as):
            print "Found to be histone.  No blacklist will be used."
            IDR_default = False
            workflow_spinner = '~/chip-seq-pipeline/dnanexus/histone_workflow.py'
            blacklist = None
        else:
            print "Assumed to be tf"
            IDR_default = True
            workflow_spinner = '~/chip-seq-pipeline/dnanexus/tf_workflow.py'
            if args.assembly == "hg19":
                blacklist = "ENCODE Reference Files:/hg19/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed.gz"
            else:
                print "WARNING: No blacklist known for assembly %s, proceeding with no blacklist" % (args.assembly)
                blacklist = None

        run_command = \
            '%s --title "%s" --outf "%s" --nomap --yes ' % (workflow_spinner, workflow_title, outf) + \
            '--rep1pe %s --rep2pe %s ' % (str(rep1_pe).lower(), str(rep2_pe).lower()) + \
            '--rep1 %s --rep2 %s ' % (tas['rep1_ta'].get('file_id'), tas['rep2_ta'].get('file_id')) + \
            '--ctl1 %s --ctl2 %s ' % (tas['rep1_ta'].get('control_id'), tas['rep2_ta'].get('control_id')) + \
            '--genomesize %s --chrom_sizes "%s"' % (args.gsize, args.csizes)
        if blacklist:
            run_command += ' --blacklist "%s"' % (blacklist)
        if args.debug:
            run_command += ' --debug'
        if args.idr or IDR_default:
            run_command += ' --idr --idrversion %s' % (args.idrversion)

        print run_command
        if args.dryrun:
            logging.info('Dryrun')
        else:
            try:
                subprocess.check_call(run_command, shell=True)
            except subprocess.CalledProcessError as e:
                logging.error("%s exited with non-zero code %d"
                              % (workflow_spinner, e.returncode))
            else:
                print "%s workflow created" % (experiment['accession'])
                logging.debug("patching internal_status to url %s" % (experiment_url))
                r = common.encoded_patch(experiment_url, keypair,
                                         {'internal_status': 'processing'},
                                         return_response=True)
                try:
                    r.raise_for_status()
                except Exception:
                    logging.error("Tried but failed to update experiment internal_status to processing")
                    logging.error(r.text)
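# For illustration only: with hypothetical inputs, the run_command assembled
# above for a TF experiment on hg19 would look roughly like this (the
# experiment accession, file IDs, control IDs, genome size, and chrom.sizes
# path are all made up):
#
#   ~/chip-seq-pipeline/dnanexus/tf_workflow.py \
#       --title "ENCSR000XXX Peaks" --outf "/ENCSR000XXX/peaks/" --nomap --yes \
#       --rep1pe true --rep2pe true \
#       --rep1 file-xxxx --rep2 file-yyyy \
#       --ctl1 file-aaaa --ctl2 file-bbbb \
#       --genomesize hs --chrom_sizes "ENCODE Reference Files:/hg19/male.hg19.chrom.sizes" \
#       --blacklist "ENCODE Reference Files:/hg19/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed.gz" \
#       --idr --idrversion 2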
def get_tas(experiment, server, keypair, default_project, ta_folders):
    # The returned structure looks like:
    # tas = {
    #     'rep1_ta': {
    #         'file_id': "",
    #         'project_id': "",
    #         'folder': "",
    #         'name': "",
    #         'paired_end': False,
    #         'control_path': "",
    #         'enc_repn': 0
    #     },
    #     ...
    # }
    # Strategy:
    # - for each ta_folder get the list of TA's in /ta_folder/bams/ENCSR...
    # - from this list infer repns from the paths ../bams/ENCSR.../repn*
    # - from this list infer the ENCFF's for the fastqs that were used
    # - for each repn go to the experiment and find all the fastqs for that rep
    # - if there are different fastq's in the experiment, or different reps, warn
    # - for each fastq found in the TA filename, find its controlled_by
    # - if any have controlled_by, all must have controlled_by, else error;
    #   gather the list of controlled_by and find a TA (anywhere in ta_folders)
    #   with those ENCFF's, else error
    # - else get possible_controls and try to match the repn, else pick one
    #   (remember it); gather the list of fastqs in the possible_controls and
    #   find (one) TA with those ENCFF's, else error
    exp_id = experiment['accession']
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project_id = project.get_id()
            project_name += ":"
        else:
            project_id = default_project
            project_name = ""
            path = base_folder
        if not path.startswith('/'):
            path = '/' + path
        if not path.endswith('/'):
            path += '/'
        logging.debug("Looking for TA's in %s %s %s"
                      % (project_id, project_name, path))
        for dxfile in dxpy.find_data_objects(
                classname='file',
                state='closed',
                folder=path + 'bams/%s/' % (exp_id),
                project=project_id,
                describe=True,
                recurse=True):
            desc = dxfile.get('describe')
            if desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    logging.debug('Found %s possible files' % (len(possible_files)))
    logging.debug('%s' % ([(f.get('folder'), f.get('name'))
                           for f in possible_files]))

    repns = []
    files_to_ignore = []
    for f in possible_files:
        m = re.search(r'/rep(\d+)$', f['folder'])
        if m:
            repn = int(m.group(1))
            logging.debug("Matched rep%d" % (repn))
            if repn in repns:
                logging.warning("Ignoring additional rep%d bam, using first found"
                                % (repn))
                files_to_ignore.append(f)
            else:
                logging.debug("First time finding rep%d" % (repn))
                repns.append(repn)
        else:
            logging.error("Cannot parse rep number from %s" % (f['folder']))
            return None
    for f in files_to_ignore:
        possible_files.remove(f)

    logging.debug('Discovered repns %s' % (repns))
    if len(repns) != 2:
        logging.error("Required to have exactly 2 reps for %s.  Found %d: %s"
                      % (exp_id, len(repns), repns))
        return None

    tas = {}
    used_controls = []
    for i, repn in enumerate(repns):
        encode_files = [common.encoded_get(server + '/files/%s/' % (f), keypair)
                        for f in get_encffs(possible_files[i].get('name'))]
        controlled_by = common.flat([f.get('controlled_by')
                                     for f in encode_files])
        if any(controlled_by):
            controlled_by_accessions = list(
                set([uri.split('/')[2] for uri in controlled_by if uri]))
            controlled_by_ta = get_ta_from_accessions(
                controlled_by_accessions, default_project, ta_folders)
            if controlled_by_ta:
                controlled_by_ta_name = controlled_by_ta.get('name')
                controlled_by_ta_id = controlled_by_ta.get('id')
            else:
                logging.error("%s: Could not find controlled_by_ta for accessions %s"
                              % (experiment.get('accession'),
                                 controlled_by_accessions))
                controlled_by_ta_name = None
                controlled_by_ta_id = None
        else:
            # evaluate possible controls
            controlled_by_accessions = None
            possible_controls = experiment.get('possible_controls')
            logging.warning('%s: No controlled_by for rep%d, attempting to infer from possible_controls %s'
                            % (experiment.get('accession'), repn,
                               possible_controls))
            if not possible_controls or not any(possible_controls):
                logging.error('%s: Could not find controlled_by or resolve possible_controls for rep%d'
                              % (experiment.get('accession'), repn))
                controlled_by_ta_name = None
                controlled_by_ta_id = None
            else:
                control_ta = get_possible_ctl_ta(experiment, repn, server,
                                                 keypair, default_project,
                                                 ta_folders, used_controls)
                if control_ta:
                    controlled_by_ta_name = control_ta.get('name')
                    controlled_by_ta_id = control_ta.get('id')
                else:
                    # get_possible_ctl_ta can return None; don't blow up here
                    controlled_by_ta_name = None
                    controlled_by_ta_id = None
        if controlled_by_ta_id and controlled_by_ta_id in used_controls:
            logging.warning('%s: Using same control %s for multiple reps'
                            % (controlled_by_ta_id, controlled_by_ta_name))
        used_controls.append(controlled_by_ta_id)
        # If the ENCODE repns are 1,2 then let the pipeline input rep numbers
        # (1 or 2) be the same.  Otherwise the mapping is arbitrary, but at
        # least do it with the smaller rep number first.
        if repn == min(repns):
            ta_index = 1
        else:
            ta_index = 2
        tas.update({
            'rep%d_ta' % (ta_index): {
                'file_id': possible_files[i].get('id'),
                'project_id': possible_files[i].get('project'),
                'folder': possible_files[i].get('folder'),
                'file_name': possible_files[i].get('name'),
                'enc_fqs': get_encffs(possible_files[i].get('name')),
                'controlled_by': controlled_by_accessions,
                'controlled_by_name': controlled_by_ta_name,
                'control_id': controlled_by_ta_id,
                'enc_repn': repn,
                'paired_end': is_paired_end(possible_files[i])
            }
        })
    return tas
#!/usr/bin/env python2

import common
import pprint

DEPRECATED_STATUSES = ['deleted', 'revoked', 'replaced']

authid, authpw, server = common.processkey()
# server = "https://test.encodedcc.org"
# authid = "JQYGP4PB"
# authpw = "pfk2f3f3stivzbct"
keypair = (authid, authpw)

experiments = common.encoded_get(
    'https://www.encodeproject.org/search/?'
    'type=Experiment&'
    'award.project=ENCODE',
    keypair)['@graph']
print "Got %d experiments" % (len(experiments))

all_GRCh38_bams = common.encoded_get(
    'https://www.encodeproject.org/search/?'
    'type=File&'
    'file_format=bam&'
    'assembly=GRCh38',
    keypair)['@graph']
print "Got %d bams" % (len(all_GRCh38_bams))

assay_titles = {}
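# The script is truncated above at assay_titles = {}.  A minimal sketch of one
# way these results could be used -- counting GRCh38 bams per experiment assay
# title -- follows; the properties assumed here (File.dataset, Experiment.
# assay_title, status) are standard ENCODE metadata fields, but this is
# illustrative, not the original continuation.
experiments_by_id = dict((e['@id'], e) for e in experiments)
for bam in all_GRCh38_bams:
    exp = experiments_by_id.get(bam.get('dataset'))
    if not exp or exp.get('status') in DEPRECATED_STATUSES:
        continue
    title = exp.get('assay_title', 'unknown')
    assay_titles[title] = assay_titles.get(title, 0) + 1
pprint.pprint(assay_titles)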
def get_possible_ctl_ta(experiment, repn, server, keypair, default_project,
                        ta_folders, used_control_ids):
    # Build a list of the possible_control experiments
    possible_control_experiments = []
    for uri in experiment.get('possible_controls'):
        possible_control_experiment = common.encoded_get(server + uri, keypair)
        target_uri = possible_control_experiment.get('target')
        # For now only use controls with no target or target "Control"
        # (i.e. not IgG)
        if not target_uri or target_uri.split('/')[2].startswith('Control'):
            possible_control_experiments.append(possible_control_experiment)
        elif 'control' in target_uri.split('/')[2]:
            logging.warning(
                '%s: possible control %s has target %s.  Such controls are allowed but deprecated.'
                % (experiment.get('accession'),
                   possible_control_experiment.get('accession'),
                   target_uri))
            possible_control_experiments.append(possible_control_experiment)
        else:
            logging.warning(
                '%s: possible control %s has target %s, not "Control".  Skipping.'
                % (experiment.get('accession'),
                   possible_control_experiment.get('accession'),
                   target_uri))
    logging.debug(pformat(possible_control_experiments))

    # First preference: an unused control TA whose rep number matches this rep's.
    try:
        matching_ta = next(
            ta for ta in [get_rep_ta(e, repn, default_project, ta_folders)
                          for e in possible_control_experiments]
            if ta and ta['id'] not in used_control_ids)
    except StopIteration:
        logging.warning('Failed to find control rep with matching repn')
    else:
        return matching_ta

    # Second preference: any control TA that has not already been used.
    tas = []
    for e in possible_control_experiments:
        unused_tas = [ta for ta in get_all_tas(e, default_project, ta_folders)
                      if ta and ta['id'] not in used_control_ids]
        logging.debug('get_possible_ctl_ta: experiment %s unused_tas %s'
                      % (e.get('accession'), unused_tas))
        if unused_tas:
            tas.extend(unused_tas)
    if len(tas) > 1:
        logging.warning('Found multiple tas %s, returning first one'
                        % ([ta.get('project') + ':' + ta.get('folder') + '/' + ta.get('name')
                            for ta in tas]))
    if tas:
        return tas[0]

    # Last resort: re-use a control TA even if it has already been used.
    logging.warning('Failed to find any possible controls that have not already been used')
    tas = []
    for e in possible_control_experiments:
        all_tas = [ta for ta in get_all_tas(e, default_project, ta_folders) if ta]
        logging.debug('get_possible_ctl_ta: experiment %s all_tas %s'
                      % (e.get('accession'), all_tas))
        if all_tas:
            tas.extend(all_tas)
    if len(tas) > 1:
        logging.warning('Found multiple tas %s, returning first one'
                        % ([ta.get('project') + ':' + ta.get('folder') + '/' + ta.get('name')
                            for ta in tas]))
    if tas:
        return tas[0]

    logging.error('Failed to find any possible control_tas')
    return None
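# To summarize the selection order implemented above (illustrative call; the
# project ID and folder are hypothetical):
#
#   ta = get_possible_ctl_ta(experiment, 1, server, keypair,
#                            'project-xxxx', ['/ChIP-seq/'],
#                            used_control_ids=[])
#
# 1. a control TA from the matching rep of a possible_control, not yet used;
# 2. failing that, any unused control TA from any possible_control;
# 3. failing that, any control TA at all, even if already used;
# 4. failing that, None (callers such as get_tas must handle this).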