def main(outfn, assembly, debug, key, keyfile, dryrun, force,
         analysis_ids=None, infile=None, project=None):

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    if infile is not None:
        infile = dxpy.DXFile(infile)
        dxpy.download_dxfile(infile.get_id(), "infile")
        ids = open("infile", 'r')
    elif analysis_ids is not None:
        ids = analysis_ids
    else:
        logger.error("Must supply one of --infile or a list of one or more analysis-ids")
        return

    authid, authpw, server = common.processkey(key, keyfile)
    keypair = (authid, authpw)

    for (i, analysis_id) in enumerate(ids):
        logger.info('%s' % (analysis_id))
        accessioned_files = accession_analysis(analysis_id, keypair, server, assembly, dryrun, force)
        print accessioned_files

    common.touch(outfn)
    outfile = dxpy.upload_local_file(outfn)
    output = {}
    output["outfile"] = dxpy.dxlink(outfile)
    return output
def s3cp(accession, key=None):

    (AUTHID, AUTHPW, SERVER) = common.processkey(key, KEYFILE)
    keypair = (AUTHID, AUTHPW)

    url = SERVER + '/search/?type=file&accession=%s&format=json&frame=embedded&limit=all' % (accession)

    # get the file object
    response = common.encoded_get(url, keypair)
    logger.debug(response)

    # select your file
    result = response.get('@graph')
    if not result:
        logger.error('Failed to find %s at %s' % (accession, url))
        return None
    else:
        f_obj = result[0]
        logger.debug(f_obj)

    # make the URL that will get redirected - get it from the file object's href property
    encode_url = urlparse.urljoin(SERVER, f_obj.get('href'))
    logger.debug("URL: %s" % (encode_url))
    logger.debug("%s:%s" % (AUTHID, AUTHPW))

    # stream=True avoids actually downloading the file, but it evaluates the redirection
    r = requests.get(encode_url,
                     auth=(AUTHID, AUTHPW),
                     headers={'content-type': 'application/json'},
                     allow_redirects=True,
                     stream=True)
    try:
        r.raise_for_status()
    except:
        logger.error('%s href does not resolve' % (f_obj.get('accession')))
    logger.debug("Response: %s", (r))

    # this is the actual S3 https URL after redirection
    s3_url = r.url
    logger.debug(s3_url)

    # release the connection
    r.close()

    # split up the url into components
    o = urlparse.urlparse(s3_url)

    # pull out the filename
    filename = os.path.basename(o.path)

    # hack together the s3 cp url (with the s3 method instead of https)
    bucket_url = S3_SERVER.rstrip('/') + o.path

    # cp the file from the bucket
    subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' % (bucket_url)),
                          stderr=subprocess.STDOUT)
    subprocess.check_call(shlex.split('ls -l %s' % (filename)))

    dx_file = dxpy.upload_local_file(filename)

    return dx_file
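# Illustrative sketch (not part of the original script): the core trick in s3cp is that an
# authenticated GET against the portal's href redirects to a signed S3 https URL, and
# stream=True lets requests follow the redirect without pulling the file body. The helper
# name and the s3_server default below are placeholders, not values taken from this repo.
import os
import urlparse
import requests

def resolve_s3_path(href, authid, authpw, s3_server='s3://example-bucket/'):
    """Follow the portal redirect for href and return (filename, s3 cp url)."""
    r = requests.get(href, auth=(authid, authpw), allow_redirects=True, stream=True)
    r.raise_for_status()
    final_url = r.url   # the signed S3 https URL after redirection
    r.close()           # release the connection without downloading the body
    path = urlparse.urlparse(final_url).path
    return os.path.basename(path), s3_server.rstrip('/') + path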
def main():
    args = get_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    else:
        ids = args.infile

    formats = ['bed_narrowPeak', 'bed_gappedPeak']
    fieldnames = ['file', 'analysis', 'experiment', 'replicates', 'output_name',
                  'file_format', 'output_type', 'target', 'biosample_term_name',
                  'biosample_term_id', 'biosample_type', 'biosample_life_stage',
                  'biosample_age', 'biosample_organism']
    writer = csv.DictWriter(args.outfile, fieldnames, delimiter='\t')
    writer.writeheader()

    for (i, analysis_id) in enumerate(ids):
        analysis_id = analysis_id.rstrip()
        logger.info('%s' % (analysis_id))
        try:
            files = analysis_files(analysis_id, keypair, server, args.assembly)
        except:
            logger.error('%s error finding analysis_files. Check experiment metadata.' % (analysis_id))
            continue  # without this, a failed lookup falls through and reuses stale or undefined 'files'
        for f in [f_obj for f_obj in files if f_obj.get('file_format') in formats]:
            fid = f['dx'].get_id()
            local_path = os.path.join(args.outdir, fid)
            if not os.path.isfile(local_path):
                if not os.path.exists(args.outdir):
                    os.makedirs(args.outdir)
                dxpy.download_dxfile(fid, local_path)
            replicates = []
            for derived_from in f['derived_from']:
                rep_ns = common.biorep_ns(derived_from, server, keypair)
                for r in rep_ns:
                    replicates.append(r)
            experiment = common.encoded_get(
                urlparse.urljoin(server, '/experiments/%s' % (f['dataset'])), keypair)
            rep = common.encoded_get(
                urlparse.urljoin(server, experiment['replicates'][0]), keypair)
            lib = common.encoded_get(urlparse.urljoin(server, rep['library']), keypair)
            biosample = common.encoded_get(urlparse.urljoin(server, lib['biosample']), keypair)
            writer.writerow({
                'file': fid,
                'analysis': analysis_id,
                'experiment': experiment.get('accession'),
                'replicates': replicates,
                'output_name': f.get('name'),
                'file_format': f.get('file_format'),
                'output_type': f.get('output_type'),
                'target': experiment.get('target'),
                'biosample_term_name': experiment.get('biosample_term_name'),
                'biosample_term_id': experiment.get('biosample_term_id'),
                'biosample_type': experiment.get('biosample_type'),
                'biosample_life_stage': biosample.get('life_stage'),
                'biosample_age': biosample.get('age'),
                'biosample_organism': biosample.get('organism')})
def main():
    args = get_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.rstrip()
        logger.info('%s' % (exp_id))
        url = urlparse.urljoin(server, 'metadata/type=experiment&accession=%s/metadata.tsv' % (exp_id))
        r = requests.get(url, auth=keypair)
        try:
            r.raise_for_status()
        except:
            logger.error('%s failed to get metadata. GET returned %s' % (exp_id, r.status_code))
            logger.debug('%s' % (r.text))
            logger.error('Skipping ...')
            continue
        reader = csv.DictReader(StringIO.StringIO(r.text), delimiter='\t')
        fieldnames = copy.copy(reader.fieldnames)
        fieldnames.remove('Biological replicate(s)')
        fieldnames.insert(4, 'Biological replicate(s)')
        fieldnames.remove('Biosample Age')
        fieldnames.insert(10, 'Biosample Age')
        fieldnames.append('Derived from')
        writer = csv.DictWriter(args.outfile, fieldnames, delimiter='\t')
        writer.writeheader()
        for file_metadata in reader:
            file_accession = file_metadata.get('File accession')
            url = urlparse.urljoin(server, 'files/%s' % (file_accession))
            file_object = common.encoded_get(url, keypair)
            bio_reps = sorted(list(set(biorep_ns(file_accession, server, keypair))))
            file_metadata['Biological replicate(s)'] = ",".join([str(n) for n in bio_reps])
            bio_ages = sorted(list(set(biorep_ages(file_accession, server, keypair)))) or ""
            file_metadata.update({'Biosample Age': ",".join(bio_ages)})
            if file_object.get('derived_from'):
                derived_from = ",".join([str(f.split('/')[2]) for f in file_object.get('derived_from')])
            else:
                derived_from = None
            file_metadata.update({'Derived from': derived_from})
            # print file_metadata
            writer.writerow(file_metadata)
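# Minimal sketch of the column-reordering pattern used above, on a toy TSV; the column names
# and values here are examples only. csv.DictReader exposes the original header order via
# .fieldnames, and handing a re-ordered or extended list to csv.DictWriter is enough to move
# or add columns while the row dicts themselves stay unchanged.
import csv
import copy
import StringIO

tsv = "File accession\tBiosample Age\tLab\nENCFF000AAA\t10 weeks\tsome-lab\n"
reader = csv.DictReader(StringIO.StringIO(tsv), delimiter='\t')
fieldnames = copy.copy(reader.fieldnames)
fieldnames.remove('Biosample Age')
fieldnames.insert(2, 'Biosample Age')   # move an existing column
fieldnames.append('Derived from')       # add a new column at the end
out = StringIO.StringIO()
writer = csv.DictWriter(out, fieldnames, delimiter='\t')
writer.writeheader()
for row in reader:
    row['Derived from'] = ''            # fill the new column per row
    writer.writerow(row)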
def main():
    args = get_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.rstrip()
        logger.info('%s' % (exp_id))
        url = urlparse.urljoin(server, 'metadata/type=experiment&accession=%s/metadata.tsv' % (exp_id))
        r = requests.get(url, auth=keypair)
        try:
            r.raise_for_status()
        except:
            logger.error('%s failed to get metadata. GET returned %s' % (exp_id, r.status_code))
            logger.debug('%s' % (r.text))
            logger.error('Skipping ...')
            continue
        reader = csv.DictReader(StringIO.StringIO(r.text), delimiter='\t')
        fieldnames = copy.copy(reader.fieldnames)
        # fieldnames.remove('Biological replicate(s)')
        # fieldnames.insert(4, 'Biological replicate(s)')
        # fieldnames.remove('Biosample Age')
        # fieldnames.insert(10, 'Biosample Age')
        fieldnames.append('Derived from')
        writer = csv.DictWriter(args.outfile, fieldnames, delimiter='\t')
        writer.writeheader()
        for file_metadata in reader:
            file_accession = file_metadata.get('File accession')
            url = urlparse.urljoin(server, 'files/%s' % (file_accession))
            file_object = common.encoded_get(url, keypair)
            # bio_reps = sorted(list(set(biorep_ns(file_accession, server, keypair))))
            # file_metadata['Biological replicate(s)'] = ",".join([str(n) for n in bio_reps])
            # bio_ages = sorted(list(set(biorep_ages(file_accession, server, keypair)))) or ""
            # file_metadata.update({'Biosample Age': ",".join(bio_ages)})
            if file_object.get('derived_from'):
                derived_from = ",".join([str(f.split('/')[2]) for f in file_object.get('derived_from')])
            else:
                derived_from = None
            file_metadata.update({'Derived from': derived_from})
            # print file_metadata
            writer.writerow(file_metadata)
def main(**kwargs):
    dxpy.download_folder(DCC_CREDENTIALS_PROJECT, '.', folder=DCC_CREDENTIALS_FOLDER)

    if 'key' in kwargs:
        key = '-'.join([dxpy.api.system_whoami()['id'], kwargs.pop('key')])
    else:
        key = dxpy.api.system_whoami()['id']
    key_tuple = common.processkey(key, KEYFILE)
    if not key_tuple:
        logger.error("Key %s is not found in the keyfile %s" % (key, KEYFILE))
        raise PortalCredentialsError("Supply a valid keypair ID")
    authid, authpw, server = key_tuple
    if 'url' in kwargs:
        server = kwargs.pop('url')
    keypair = (authid, authpw)

    tokens = ['python3 checkfiles.py']
    for k, v in kwargs.iteritems():
        if isinstance(v, bool):
            if v:
                tokens.append("--" + k.replace('_', '-'))
            continue
        if isinstance(v, str) or isinstance(v, unicode) or isinstance(v, int):
            tokens.append(' '.join(["--" + k.replace('_', '-'), str(v)]))

    if 'dx_file' in kwargs:
        dxfile = dxpy.DXFile(kwargs.get('dx_file'))
        local_file = dxpy.download_dxfile(dxfile, dxfile.name)
        tokens.append("--local-file %s" % (dxfile.name))

    # this is just to get a command string to print that has no secrets
    tokens_safe = deepcopy(tokens)
    tokens_safe.append("--username %s --password %s" % ("." * len(authid), "." * len(authpw)))
    tokens_safe.append(server)
    logger.info(' '.join(tokens_safe))

    tokens.append("--username %s --password %s" % (authid, authpw))
    # this needs to be the last token
    tokens.append(server)

    checkfiles_command = ' '.join(tokens)
    subprocess.check_call(shlex.split(checkfiles_command))

    output = {}
    outfilename = kwargs.get('out')
    errfilename = kwargs.get('err')
    if outfilename:
        out = dxpy.upload_local_file(outfilename)
        output.update({'out': dxpy.dxlink(out)})
    if errfilename:
        err = dxpy.upload_local_file(errfilename)
        output.update({'err': dxpy.dxlink(err)})
    return output
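# Sketch of the kwargs-to-CLI-flag translation used above, shown in isolation; the flag names
# in the usage note are invented for the example. Booleans become bare --flags when true,
# other simple values become "--flag value" pairs, and underscores turn into hyphens.
def kwargs_to_tokens(prog, **kwargs):
    tokens = [prog]
    for k, v in sorted(kwargs.items()):
        flag = "--" + k.replace('_', '-')
        if isinstance(v, bool):
            if v:
                tokens.append(flag)
        elif isinstance(v, (str, int)):
            tokens.append('%s %s' % (flag, v))
    return tokens

# e.g. kwargs_to_tokens('python3 checkfiles.py', include_unexpired_upload=True, processes=4)
# -> ['python3 checkfiles.py', '--include-unexpired-upload', '--processes 4']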
def main():
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)
    project = resolve_project(args.project)

    SRR_files = dxpy.find_data_objects(
        name="SRR???????_?.fastq.gz", name_mode='glob',
        classname='file', recurse=True, return_handler=True,
        folder=args.folder, project=args.project)
    for srr_dxfile in SRR_files:
        m = re.search('(SRR.{7})_(\d)', srr_dxfile.name)
        if m:
            srr_basename = m.group(1)
            end_num = m.group(2)
        else:
            assert m
        srr_encfiles = common.encoded_get('/'.join([
            server,
            'search/?type=File&external_accession=%s&status!=deleted&status!=replaced&status!=revoked'
            % (srr_basename)]), keypair)['@graph']
        if not srr_encfiles:
            logging.error('%s object not found at ENCODE. Skipping.' % (srr_basename))
            continue
        elif len(srr_encfiles) > 1:
            logging.error('%s multiple matching objects found at ENCODE. Skipping.' % (srr_basename))
            continue
        else:
            srr_encfile = srr_encfiles[0]
        # experiment = common.encoded_get('/'.join([server, srr_encfile.get('dataset')]), keypair)
        # replicate = common.encoded_get('/'.join([server, srr_encfile.get('replicate')]), keypair)
        # biorep_n = replicate.get('biological_replicate_number')
        all_fastqs = common.encoded_get('/'.join([
            server,
            'search/?type=File&file_format=fastq&derived_from=/files/%s/&status!=deleted&status!=revoked&status!=replaced'
            % (srr_basename)]), keypair)['@graph']
        if not all_fastqs:
            print("%s: no fastq(s) found. Skipping." % (srr_dxfile.name))
            continue
        if end_num == '1':
            fastqs = [f for f in all_fastqs
                      if f.get('run_type') == 'single-ended' or f.get('paired_end') == end_num]
        elif end_num in ['2', '3']:
            fastqs = [f for f in all_fastqs
                      if f.get('run_type') == 'paired-ended' and f.get('paired_end') == '2']
        if not fastqs:
            print("%s: no fastq(s) found for paired_end %s. Skipping" % (srr_basename, end_num))
            continue
        elif len(fastqs) > 1:
            print("%s: ambiguous matches to %s. Skipping" % (srr_basename, [f.get('accession') for f in fastqs]))
            continue
        else:
            fastq = fastqs[0]
            newname = '%s.fastq.gz' % (fastq.get('accession'))
            if args.dry_run:
                print('dry_run: Could rename %s to %s' % (srr_dxfile.name, newname))
            else:
                srr_dxfile.set_properties({'srr_filename': srr_dxfile.name})
                srr_dxfile.rename(newname)
                print('%s renamed to %s' % (srr_dxfile.name, newname))
def s3_dxcp(accession, key=None):

    (AUTHID, AUTHPW, SERVER) = common.processkey(key, KEYFILE)
    keypair = (AUTHID, AUTHPW)

    url = SERVER + '/search/?type=file&accession=%s&format=json&frame=embedded&limit=all' % (accession)

    # get the file object
    response = common.encoded_get(url, keypair)
    logger.debug(response)

    # select your file
    result = response.get('@graph')
    if not result:
        logger.error('Failed to find %s at %s' % (accession, url))
        return None
    else:
        f_obj = result[0]
        logger.debug(f_obj)

    # make the URL that will get redirected - get it from the file object's href property
    encode_url = urlparse.urljoin(SERVER, f_obj.get('href'))
    logger.debug("URL: %s" % (encode_url))
    logger.debug("%s:%s" % (AUTHID, AUTHPW))

    # stream=True avoids actually downloading the file, but it evaluates the redirection
    r = requests.get(encode_url,
                     auth=(AUTHID, AUTHPW),
                     headers={'content-type': 'application/json'},
                     allow_redirects=True,
                     stream=True)
    try:
        r.raise_for_status()
    except:
        logger.error('%s href does not resolve' % (f_obj.get('accession')))
    logger.debug("Response: %s", (r))

    # this is the actual S3 https URL after redirection
    s3_url = r.url
    logger.debug(s3_url)

    # release the connection
    r.close()

    # split up the url into components
    o = urlparse.urlparse(s3_url)

    # pull out the filename
    filename = os.path.basename(o.path)

    # hack together the s3 cp url (with the s3 method instead of https)
    bucket_url = S3_SERVER.rstrip('/') + o.path

    # cp the file from the bucket
    subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' % (bucket_url)),
                          stderr=subprocess.STDOUT)
    subprocess.check_call(shlex.split('ls -l %s' % (filename)))

    dx_file = dxpy.upload_local_file(filename)

    return dx_file
def main():
    args = get_args()
    logging.basicConfig(level=args.log_level)
    authid, authpw, base_url = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)
    experiment_data, file_data = get_experiments_and_files(
        base_url, keypair, args.report_type, args.assembly)
    references_data = get_references_data(base_url, keypair, args.report_type)
    build_rows = get_row_builder(args.report_type)
    rows = build_rows(experiment_data, file_data, references_data,
                      args.report_type, base_url, args)
    df = pd.DataFrame(rows)
    df = format_dataframe(df, args.report_type, base_url, args.output_type)
    outputter = get_outputter(args.output_type)
    outputter(df, args)
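# get_row_builder and get_outputter are not shown in this snippet; a common way to implement
# that kind of lookup is a plain dispatch dict, sketched here with hypothetical entries (the
# real helpers in this repo may well differ).
def get_outputter(output_type):
    # hypothetical implementation for illustration only
    def to_tsv(df, args):
        df.to_csv(args.outfile, sep='\t', index=False)
    def to_json(df, args):
        args.outfile.write(df.to_json(orient='records'))
    outputters = {'tsv': to_tsv, 'json': to_json}
    return outputters[output_type]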
def main():
    args = get_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    else:
        ids = args.infile

    for (i, analysis_id) in enumerate(ids):
        logger.info('%s' % (analysis_id))
        accessioned_files = accession_analysis(
            analysis_id, keypair, server, args.assembly, args.dryrun, args.force)
def main():
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.infile and args.experiments:
        experiments = args.experiments
        experiments.extend([e.strip() for e in args.infile if e.strip()])
    elif args.infile:
        experiments = args.infile
    else:
        experiments = args.experiments

    for exp_id in experiments:
        uri = '/experiments/%s' % (exp_id)
        experiment = common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair)
        if experiment.get('status') == 'error':
            print experiment
            print "Error fetching %s ... skipping" % (exp_id)
            continue
        print experiment.get('accession')
        for uri in experiment['original_files']:
            url = urlparse.urljoin(server, '%s' % (uri))
            file_obj = common.encoded_get(url, keypair)
            print "%s, %s, %s, %s, %s, %s" % (
                file_obj.get('accession'), file_obj.get('file_type'),
                file_obj.get('file_format'), file_obj.get('file_format_type'),
                file_obj.get('output_type'), file_obj.get('status'))
            if file_obj.get('file_format') in ['bed', 'bigBed', 'bigWig']:
                if file_obj.get('status') != 'released' or args.force:
                    patch_payload = {'status': args.status}
                    if args.dryrun:
                        print "--dryrun: would have patched %s" % (json.dumps(patch_payload))
                    else:
                        r = requests.patch(
                            url, auth=keypair, data=json.dumps(patch_payload),
                            headers={'content-type': 'application/json',
                                     'accept': 'application/json'})
                        try:
                            r.raise_for_status()
                        except:
                            print(r.text)
                            print('Patch failed: %s %s ... skipping' % (r.status_code, r.reason))
                            continue
                        else:
                            print "Patched %s" % (json.dumps(patch_payload))
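# Sketch of the PATCH call made above, pulled out into a helper for clarity; the accession and
# status value in the usage note are placeholders. The portal accepts JSON PATCH bodies with
# basic auth, and raise_for_status() is what surfaces a failed update.
import json
import requests

def patch_status(server, keypair, file_accession, status):
    url = server + '/files/%s/' % (file_accession)
    r = requests.patch(
        url, auth=keypair, data=json.dumps({'status': status}),
        headers={'content-type': 'application/json', 'accept': 'application/json'})
    r.raise_for_status()
    return r.json()

# e.g. patch_status(server, keypair, 'ENCFF000XXX', 'released')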
def main(reads1, reads2, crop_length, reference_tar,
         bwa_aln_params, bwa_version, samtools_version,
         keyfile, debug, key=None):

    # reads1 and reads2 are expected to be arrays of file identifiers
    # identifiers can be DNAnexus files or ENCODE file accession numbers
    # For SE, reads2 is empty
    # For PE, len(reads1) = len(reads2)
    # Multiple PE pairs or SE files are just catted before mapping
    # Error on mixed SE/PE - although this can be implemented as just a
    # "" entry at that position in reads2 array
    # TODO: Add option to down-sample mixed PE/SE to SE

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # fetch the credentials from the DCC Credentials project
    dxpy.download_folder(
        DCC_CREDENTIALS_PROJECT, '.', folder=DCC_CREDENTIALS_FOLDER)
    if not key or key in ['www', 'submit', 'production']:
        key = dxpy.api.system_whoami()['id']
    elif key == 'test':
        key = dxpy.api.system_whoami()['id'] + "-test"
    key_tuple = common.processkey(key, keyfile)
    assert key_tuple, "ERROR: Key %s is not found in the keyfile %s" % (key, keyfile)
    authid, authpw, server = key_tuple
    keypair = (authid, authpw)

    logger.info("reads1: %s" % (reads1))
    logger.info("reads2: %s" % (reads2))

    if reads2:
        paired_end = True
        assert len(reads1) == len(reads2), \
            "Paired-end and unequal numbers of read1 and read2 identifiers: %s %s" % (reads1, reads2)
    else:
        paired_end = False

    reads1_files = [resolve_file(read, server, keypair) for read in reads1]
    if paired_end:
        reads2_files = [resolve_file(read, server, keypair) for read in reads2]
    else:
        reads2_files = []

    # pooling multiple fastqs
    if len(reads1_files) > 1:
        reads1_file = pooled(reads1_files)
    else:
        reads1_file = reads1_files[0]
    if len(reads2_files) > 1:
        reads2_file = pooled(reads2_files)
    elif len(reads2_files) == 1:
        reads2_file = reads2_files[0]
    else:
        reads2_file = None

    reference_tar_file = resolve_file(reference_tar, server, keypair)

    logger.info('Resolved reads1 to %s', reads1_file)
    if reads2_file:
        logger.info('Resolved reads2 to %s', reads2_file)
    logger.info('Resolved reference_tar to %s', reference_tar_file)

    output = {
        "reads1": reads1_file,
        "reference_tar": reference_tar_file,
        "crop_length": crop_length,
        "bwa_aln_params": bwa_aln_params,
        "bwa_version": bwa_version,
        "samtools_version": samtools_version,
        "debug": debug
    }
    if reads2_file:
        output.update({"reads2": reads2_file})

    logger.info('Exiting with output: %s' % (output))
    return output
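# The pooled() helper is not shown here; per the comments above, multiple fastqs are simply
# catted together before mapping (concatenated gzip members are themselves a valid gzip
# stream). A minimal local sketch of that idea, with a made-up function and output name:
import subprocess

def pooled_local(fastq_paths, outfn='pooled.fastq.gz'):
    with open(outfn, 'wb') as out:
        # cat the gzipped fastqs end-to-end into one gzip stream
        subprocess.check_call(['cat'] + list(fastq_paths), stdout=out)
    return outfn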
def main(): args = get_args() if args.debug: logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) logger.setLevel(logging.DEBUG) else: # use the defaulf logging level logging.basicConfig(format='%(levelname)s:%(message)s') logger.setLevel(logging.INFO) authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid,authpw) if args.experiments: ids = args.experiments # elif args.created_after: # analyses = [] # for state in args.state: # analyses.extend(dxpy.find_analyses(name="ENCSR*",name_mode='glob',state=state,include_subjobs=True,return_handler=True,created_after="%s" %(args.created_after))) # ids = [analysis.get_id() for analysis in analyses if analysis.describe()['executableName'] == 'tf_chip_seq' or analysis.describe()['executableName'].startswith('ENCSR783QUL Peaks')] elif args.all: exp_query = \ "/search/?type=Experiment" + \ "&assay_title=ChIP-seq" + \ "&award.project=ENCODE" + \ "&status=released&status=submitted&status=in+progress&status=started&status=release+ready" all_experiments = common.encoded_get(server+exp_query, keypair)['@graph'] ids = [exp.get('accession') for exp in all_experiments] elif args.infile: ids = args.infile else: #never reached because inile defaults to stdin raise InputError("Must supply experiment id's in arguments or --infile") fieldnames = [ 'date','analysis','analysis id','experiment','target','biosample_term_name','biosample_type','lab','rfa','assembly', 'Nt','Np','N1','N2','rescue_ratio','self_consistency_ratio','reproducibility_test', 'state','release','total price','notes'] writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, delimiter='\t', quotechar='"') writer.writeheader() idr_query = \ "/search/?type=File" + \ "&assembly=%s" % (args.assembly) + \ "&file_format=bed" + \ "&output_type=optimal+idr+thresholded+peaks" + \ "&output_type=conservative+idr+thresholded+peaks" + \ "&lab.title=ENCODE+Processing+Pipeline" + \ "&lab.title=J.+Michael+Cherry,+Stanford" + \ "&status=in+progress&status=released&status=uploading&status=uploaded" all_idr_files = common.encoded_get(server+idr_query, keypair)['@graph'] for (i, experiment_id) in enumerate(ids): if experiment_id.startswith('#'): continue experiment_id = experiment_id.rstrip() experiment_uri = '/experiments/%s/' % (experiment_id) idr_files = \ [f for f in all_idr_files if f['dataset'] == experiment_uri] idr_step_runs = set([f.get('step_run') for f in idr_files]) if not len(idr_step_runs): if not args.all: logger.warning( "%s: Found %d IDR step runs. Skipping" % (experiment_id, len(idr_step_runs))) continue idr_qc_uris = [] assemblies = [] for f in idr_files: quality_metrics = f.get('quality_metrics') if not len(quality_metrics) == 1: logger.error( '%s: Expected one IDR quality metric for file %s. Found %d.' % (experiment_id, f.get('accession'), len(quality_metrics))) idr_qc_uris.extend(quality_metrics) assembly = f.get('assembly') if not assembly: logger.error( '%s: File %s has no assembly' % (experiment_id, f.get('accession'))) assemblies.append(assembly) idr_qc_uris = set(idr_qc_uris) if not len(idr_qc_uris) == 1: logger.error( '%s: Expected one unique IDR metric, found %d. Skipping.' % (experiment_id, len(idr_qc_uris))) continue assemblies = set(assemblies) if not len(assemblies) == 1: logger.error( '%s: Expected one unique assembly, found %d. Skipping.' 
% (experiment_id, len(assemblies))) continue assembly = next(iter(assemblies)) idr_step_run_uri = next(iter(idr_step_runs)) idr_step_run = common.encoded_get(server+idr_step_run_uri, keypair) try: dx_job_id_str = idr_step_run.get('dx_applet_details')[0].get('dx_job_id') except: logger.warning("Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id") logger.debug(idr_step_run) dx_job_id_str = None #could try to pull it from alias dx_job_id = dx_job_id_str.rpartition(':')[2] dx_job = dxpy.DXJob(dx_job_id) job_desc = dx_job.describe() analysis_id = job_desc.get('analysis') logger.debug('%s' %(analysis_id)) analysis = dxpy.DXAnalysis(analysis_id) desc = analysis.describe() project = desc.get('project') m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name']) if m: experiment_accession = m.group(1) else: logger.error("No accession in %s, skipping." % (desc['name'])) continue if args.all: # we've already gotten all the experiment objects experiment = \ next(e for e in all_experiments if e['accession'] == experiment_accession) else: experiment = \ common.encoded_get(urlparse.urljoin( server, '/experiments/%s' % (experiment_accession)), keypair) logger.debug('ENCODEd experiment %s' % (experiment['accession'])) if args.lab and experiment['lab'].split('/')[2] not in args.lab: continue try: idr_stage = next(s['execution'] for s in desc['stages'] if s['execution']['name'] == "Final IDR peak calls") except: logger.error('Failed to find final IDR stage in %s' %(analysis_id)) else: if idr_stage['state'] != 'done': #Final IDR peak calls stage not done, so loop through intermediate IDR stages to find errors Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None notes = [] #note this list contains a mis-spelled form of IDR Pooled Pseudoreplicates because until 11/13/15 the pipeline stage name was misspelled - need to be able to report on those runs idr_stage_names = ['IDR True Replicates', 'IDR Rep 1 Self-pseudoreplicates', 'IDR Rep 2 Self-pseudoreplicates', 'IDR Pooled Pseudoreplicates', 'IDR Pooled Pseudoeplicates'] for stage_name in idr_stage_names: try: idr_stage = next(s['execution'] for s in desc['stages'] if s['execution']['name'] == stage_name) except StopIteration: continue except: raise if idr_stage['state'] == 'failed': try: job_log = subprocess.check_output('dx watch %s' %(idr_stage['id']), shell=True, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: job_log = e.output else: job_log = None if job_log: patterns = [r'Peak files must contain at least 20 peaks post-merge'] for p in patterns: m = re.search(p,job_log) if m: notes.append("%s: %s" %(stage_name,m.group(0))) if not notes: notes.append(idr_stage['failureMessage']) try: done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "failed") except StopIteration: done_time = "Not done or failed" except: raise else: Np = idr_stage['output'].get('Np') N1 = idr_stage['output'].get('N1') N2 = idr_stage['output'].get('N2') Nt = idr_stage['output'].get('Nt') rescue_ratio = idr_stage['output'].get('rescue_ratio') self_consistency_ratio = idr_stage['output'].get('self_consistency_ratio') reproducibility_test = idr_stage['output'].get('reproducibility_test') notes = "IDR Complete" try: done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "done") except StopIteration: done_time = None except: raise if done_time: date = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(done_time/1000)) 
else: date = "Running" analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' %(desc.get('project').split('-')[1], desc.get('id').split('-')[1]) experiment_link = '%sexperiments/%s' %(server, experiment.get('accession')) row = { 'date': date, 'analysis': analysis_link, 'analysis id': desc.get('id'), 'experiment': experiment_link, 'target': experiment['target'].split('/')[2], 'biosample_term_name': experiment.get('biosample_term_name'), 'biosample_type': experiment.get('biosample_type'), 'lab': experiment['lab'].split('/')[2], 'rfa': common.encoded_get(server+experiment.get('award'),keypair).get('rfa'), 'assembly': assembly, 'Np': Np, 'N1': N1, 'N2': N2, 'Nt': Nt, 'rescue_ratio': rescue_ratio, 'self_consistency_ratio': self_consistency_ratio, 'reproducibility_test': reproducibility_test, 'state': desc.get('state'), 'release': experiment['status'], 'total price': desc.get('totalPrice') } if notes: row.update({'notes': '%s' %(notes)}) else: row.update({'notes': '%s' %('OK')}) #log = subprocess.check_output('dx watch %s' %(analysis.)) writer.writerow(row)
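# Sketch of the stage lookup used repeatedly above: a DNAnexus analysis describe() carries a
# 'stages' list, and each stage's 'execution' holds the job name, state and outputs. The
# analysis id in the usage note is a placeholder.
import dxpy

def get_stage_execution(analysis_id, stage_name):
    desc = dxpy.DXAnalysis(analysis_id).describe()
    return next(
        (s['execution'] for s in desc['stages'] if s['execution']['name'] == stage_name),
        None)

# e.g. idr_stage = get_stage_execution('analysis-xxxx', 'Final IDR peak calls')
# if idr_stage and idr_stage['state'] == 'done': print idr_stage['output'].get('Nt')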
def main(): global args args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid,authpw) if args.experiments: exp_ids = csv.reader(StringIO.StringIO('\n'.join([s.rstrip() for s in args.experiments]))) else: exp_ids = csv.reader(args.infile) for instring in exp_ids: exp_id = instring[0].strip() if len(instring) > 1: repns = [] for s in instring[1:]: repns.extend(s.split(',')) biorep_ns = list(set([int(s) for s in repns])) else: biorep_ns = [] outstrings = [] encode_url = urlparse.urljoin(server,exp_id) experiment = common.encoded_get(encode_url, keypair) outstrings.append(exp_id) files = files_to_map(experiment, server, keypair, args.sfn_dupes) outstrings.append(str(len(files))) outstrings.append(str([f.get('accession') for f in files])) replicates = replicates_to_map(files, server, keypair, biorep_ns) if files: for biorep_n in set([rep.get('biological_replicate_number') for rep in replicates]): outstrings.append('rep%s' %(biorep_n)) biorep_files = [f for f in files if biorep_n in common.biorep_ns(f,server,keypair)] paired_files = [] unpaired_files = [] while biorep_files: file_object = biorep_files.pop() if file_object.get('paired_end') == None: # group all the unpaired reads for this biorep together unpaired_files.append(file_object) elif file_object.get('paired_end') in ['1','2']: if file_object.get('paired_with'): mate = next((f for f in biorep_files if f.get('@id') == file_object.get('paired_with')), None) else: #have to find the file that is paired with this one mate = next((f for f in biorep_files if f.get('paired_with') == file_object.get('@id')), None) if mate: biorep_files.remove(mate) else: logging.warning('%s:%s could not find mate' %(experiment.get('accession'), file_object.get('accession'))) mate = {} paired_files.append((file_object,mate)) if biorep_files: logging.warning('%s: leftover file(s) %s' %(experiment.get('accession'), biorep_files)) if paired_files: pe_jobs = map_only(experiment, biorep_n, paired_files, args.key, server, keypair) if unpaired_files: se_jobs = map_only(experiment, biorep_n, unpaired_files, args.key, server, keypair) if paired_files and pe_jobs: outstrings.append('paired:%s' %([(a.get('accession'), b.get('accession')) for (a,b) in paired_files])) outstrings.append('paired jobs:%s' %([j.get_id() for j in pe_jobs])) else: outstrings.append('paired:%s' %(None)) if unpaired_files and se_jobs: outstrings.append('unpaired:%s' %([f.get('accession') for f in unpaired_files])) outstrings.append('unpaired jobs:%s' %([j.get_id() for j in se_jobs])) else: outstrings.append('unpaired:%s' %(None)) print '\t'.join(outstrings) else: # no files if not replicates: logging.warning('%s: No files and no replicates' %experiment.get('accession')) else: logging.warning('%s: No files to map' %experiment.get('accession')) if files and not replicates: logging.warning('%s: Files but no replicates' %experiment.get('accession'))
#!/usr/bin/env python2

import common
import pprint

DEPRECATED_STATUSES = ['deleted', 'revoked', 'replaced']

authid, authpw, server = common.processkey()
# server = "https://test.encodedcc.org"
# authid = "JQYGP4PB"
# authpw = "pfk2f3f3stivzbct"
keypair = (authid, authpw)

experiments = common.encoded_get(
    'https://www.encodeproject.org/search/?'
    'type=Experiment&'
    'award.project=ENCODE', keypair)['@graph']
print "Got %d experiments" % (len(experiments))

all_GRCh38_bams = common.encoded_get(
    'https://www.encodeproject.org/search/?'
    'type=File&'
    'file_format=bam&'
    'assembly=GRCh38', keypair)['@graph']
print "Got %d bams" % (len(all_GRCh38_bams))

assay_titles = {}
for exp in experiments:
def main():
    args = get_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.query:
        r = requests.get(args.query, auth=keypair,
                         headers={'content-type': 'application/json',
                                  'accept': 'application/json'})
        experiments = r.json()['@graph']
        exp_ids = [e['accession'] for e in experiments]
    elif args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    logger.info('Checking %d experiments' % (len(exp_ids)))
    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.strip()
        # logger.info('%s' % (exp_id))
        url = urlparse.urljoin(server, '/experiments/%s' % (exp_id))
        experiment_object = common.encoded_get(url, keypair)
        original_files = [common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair)
                          for uri in experiment_object.get('original_files')]
        bams = [f for f in original_files
                if f.get('file_format') == 'bam'
                and f.get('status') not in ['revoked', 'deleted', 'replaced']]
        fastqs = [f for f in original_files
                  if f.get('file_format') == 'fastq'
                  and f.get('status') not in ['revoked', 'deleted', 'replaced']]
        for f in fastqs:
            f['replicate'] = common.encoded_get(urlparse.urljoin(server, '%s' % (f.get('replicate'))), keypair)
        for bam in bams:
            bioreps = common.biorep_ns(bam.get('accession'), server, keypair)
            if len(bioreps) != 1:
                logger.error("Expected to find 1 biorep for bam %s, found %s. Skipping." % (bam.get('accession'), bioreps))
                continue
            else:
                bam_biorep = bioreps[0]
            try:
                derived_from = [common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair)
                                for uri in bam.get('derived_from')]
            except:
                derived_from = None
            if not derived_from:
                logger.error('bam %s is derived from nothing. Skipping' % (bam.get('accession')))
                continue
            for f in derived_from:
                if f.get('output_category') == 'reference':
                    continue
                if f.get('file_format') != 'fastq':
                    logger.error("bam %s appears to be derived from non-fastq %s. Continuing with other derived_from files." % (bam.get('accession'), f.get('accession')))
                    continue
                try:
                    if common.after(f.get('date_created'), bam.get('date_created')):
                        logger.error("Date conflict. Bam %s is derived from newer Fastq %s" % (bam.get('accession'), f.get('accession')))
                except:
                    logger.error("Cannot compare bam date %s with fastq date %s. Continuing with other derived_from files." % (bam.get('date_created'), f.get('date_created')))
                    continue
            for f in fastqs:
                if f.get('replicate').get('biological_replicate_number') == bam_biorep:
                    if common.after(f.get('date_created'), bam.get('date_created')):
                        logger.info("bam %s is out-of-date. fastq %s is newer" % (bam.get('accession'), f.get('accession')))
                        if re.search('control', experiment_object.get('target').lower()):
                            logger.info("WARNING, %s is a control experiment so many other experiments may be out-of-date." % (experiment_object.get('accession')))
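# common.after() is not shown in this snippet; the checks above only need a "was date_a created
# after date_b" predicate over the portal's ISO-8601 date_created strings. A possible standalone
# equivalent is sketched below (an assumption about its behaviour, not this repo's code).
import dateutil.parser

def after(date_a, date_b):
    """True if ISO-8601 timestamp date_a is later than date_b (assumed semantics)."""
    return dateutil.parser.parse(date_a) > dateutil.parser.parse(date_b)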
def main(): args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) experiments = [] if args.experiments: experiments.extend(args.experiments) if args.infile: with open(args.infile, 'r') as fh: experiments.extend([e for e in fh]) if args.control: control_dxhandler = resolve_dx_file(args.control) else: control_dxhandler = None for exp_id in experiments: if exp_id.startswith('#'): continue exp_id = exp_id.rstrip() print("Experiment %s" % (exp_id)) experiment_url = server + '/experiments/%s/' % (exp_id) experiment = common.encoded_get(experiment_url, keypair) if experiment.get('target'): target_url = server + experiment.get('target') target = common.encoded_get(target_url, keypair) else: logging.error('Experiment has no target ... skipping') continue print("%s %s %s" % (experiment['accession'], target.get('investigated_as'), experiment.get('description'))) tas = get_tas(experiment, server, keypair, args.project, args.inf, control_dxhandler) if not tas: logging.error('Failed to resolve all tagaligns for %s' % (experiment['accession'])) continue if not tas.get('rep2_ta'): simplicate_experiment = True print("Simplicate experiment ta's:") else: simplicate_experiment = False print("Replicated experiment ta's:") pprint(tas) # sys.exit() # continue for key, value in tas.iteritems(): if not value: logging.error('Missing %s ... skipping' % (key)) continue workflow_title = '%s Peaks' % (exp_id) if args.tag: workflow_title += ' %s' % (args.tag) outf = args.outf if not outf.startswith('/') and outf != '/': outf = '/' + outf if not outf.endswith('/') and outf != '/': outf += '/' outf += '%s/peaks/' % (exp_id) try: investigated_as = target['investigated_as'] except: logging.error("%s: Failed to determine target type ... skipping" % (exp_id)) continue else: print(investigated_as) rep1_pe = tas['rep1_ta']['paired_end'] if not simplicate_experiment: rep2_pe = tas['rep2_ta']['paired_end'] else: rep2_pe = None if simplicate_experiment and rep1_pe is None: logging.error( "%s: Cannot determine paired end: rep1 PE = %s... skipping" % (exp_id, rep1_pe)) continue elif not simplicate_experiment and None in [rep1_pe, rep2_pe]: logging.error( "%s: Cannot determine paired end: rep1 PE = %s, rep2 PE = %s ... skipping" % (exp_id, rep1_pe, rep2_pe)) continue if not simplicate_experiment and rep1_pe != rep2_pe: logging.error( "%s: rep1 PE %s differs from rep2 PE %s ... skipping" % (exp_id, rep1_pe, rep2_pe)) continue if any('histone' in target_type for target_type in investigated_as): logging.info( "%s: Found to be histone. No blacklist will be used." 
% (exp_id)) wf_target = 'histone' blacklist = None else: logging.info("Assumed to be tf") wf_target = 'tf' if not args.blacklist: if args.assembly in ASSEMBLY_METADATA: blacklist = ASSEMBLY_METADATA[args.assembly]['blacklist'] else: logging.warning( "%s: No blacklist for assembly %s, proceeding with no blacklist" % (exp_id, args.assembly)) blacklist = None if not args.gsize: if args.assembly in ASSEMBLY_METADATA: genomesize = ASSEMBLY_METADATA[args.assembly]['gsize'] else: logging.error("%s: Must specify -gsize for assembly %s" % (exp_id, args.assembly)) else: genomesize = args.gsize if not args.csizes: if args.assembly in ASSEMBLY_METADATA: chrom_sizes = ASSEMBLY_METADATA[args.assembly]['csizes'] else: logging.error("%s: Must specify -csizes for assembly %s" % (exp_id, args.assembly)) else: chrom_sizes = args.csizes chip_workflow_absolute_path = os.path.dirname( os.path.realpath(__file__)) + "/chip_workflow.py" command_strings = [ chip_workflow_absolute_path, '--nomap --yes', '--target %s' % (wf_target), '--title "%s"' % (workflow_title), '--outf "%s"' % (outf), '--rep1pe %s' % (str(rep1_pe).lower()), '--rep1 %s' % (tas['rep1_ta'].get('file_id')), '--ctl1 %s' % (tas['rep1_ta'].get('control_id')), '--genomesize %s --chrom_sizes "%s"' % (genomesize, chrom_sizes), '--spp_version %s' % (args.spp_version) ] if not simplicate_experiment: command_strings.extend([ '--rep2pe %s' % (str(rep2_pe).lower()), '--rep2 %s' % (tas['rep2_ta'].get('file_id')), '--ctl2 %s' % (tas['rep2_ta'].get('control_id')), ]) if args.fragment_length: command_strings.append('--fragment_length %s' % str(args.fragment_length)) if blacklist: command_strings.append('--blacklist "%s"' % (blacklist)) if args.debug: command_strings.append('--debug') if args.use_existing_folders: command_strings.append('--use_existing_folders') if args.accession: command_strings.append('--accession') if args.fqcheck is not None: command_strings.append('--fqcheck=%s' % (args.fqcheck)) if args.skip_control is not None: command_strings.append('--skip_control=%s' % (args.skip_control)) if args.force_patch is not None: command_strings.append('--force_patch=%s' % (args.force_patch)) run_command = ' '.join(command_strings) print(run_command) if args.dryrun: logging.info('Dryrun') else: try: subprocess.check_call(run_command, shell=True) except subprocess.CalledProcessError as e: logging.error( "%s: chip_workflow exited with non-zero code %d" % (exp_id, e.returncode)) else: print("%s workflow created" % (experiment['accession'])) logging.debug("%s: patching internal_status to url %s" % (exp_id, experiment_url)) r = common.encoded_patch(experiment_url, keypair, {'internal_status': 'processing'}, return_response=True) try: r.raise_for_status() except: logging.warning( "%s: Failed to update experiment internal_status to processing. Skipping that update." % (exp_id)) logging.debug(r.text)
def main(): args = get_args() if args.debug: logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) logger.setLevel(logging.DEBUG) else: # Use the default logging level. logging.basicConfig(format='%(levelname)s:%(message)s') logger.setLevel(logging.INFO) if args.released: keypair = None server = PUBLIC_SERVER else: authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) if args.experiments: ids = args.experiments elif args.all: # Get metadata for all ChIP-seq Experiments. base_exp_query = '/search/?type=Experiment&assay_title=ChIP-seq&award.project=ENCODE&status=released' extended_query = '&status=submitted&status=in+progress&status=started&status=release+ready' exp_query = base_exp_query if args.released else (base_exp_query + extended_query) all_experiments = common.encoded_get(server + exp_query, keypair)['@graph'] # Extract Experiment accessions. ids = [exp.get('accession') for exp in all_experiments] elif args.infile: ids = args.infile else: # Never reached because infile defaults to stdin. raise InputError('Must supply experiment ids' ' in arguments or --infile.') # Define column names for TSV. fieldnames = [ 'date', 'analysis', 'analysis_id', 'experiment', 'target', 'biosample_term_name', 'biosample_type', 'replication', 'lab', 'rfa', 'assembly', 'Nt', 'Np', 'N1', 'N2', 'rescue_ratio', 'self_consistency_ratio', 'reproducibility_test', 'Ft', 'Fp', 'F1', 'F2', 'state', 'release', 'total_price', 'quality_metric_of' ] if args.create_google_sheet: # Force creation of temporary CSV that can be loaded into a DataFrame, # written to Google Sheets, then deleted. temp_file = 'temp_idr_%s.tsv' % (args.assembly) args.outfile = open(temp_file, 'w') writer = csv.DictWriter(args.outfile, fieldnames=fieldnames, delimiter='\t', quotechar='"') writer.writeheader() # Get metadata for all IDR output Files. base_idr_query = ('/search/?type=File&assembly=%s&file_format=bed' '&output_type=optimal+idr+thresholded+peaks' '&output_type=conservative+idr+thresholded+peaks' '&output_type=pseudoreplicated+idr+thresholded+peaks' '&lab.title=ENCODE+Processing+Pipeline' '&lab.title=J.+Michael+Cherry,+Stanford' '&status=released' % (args.assembly)) extended_idr_query = '&status=in+progress&status=uploading&status=uploaded' idr_query = base_idr_query if args.released else (base_idr_query + extended_idr_query) all_idr_files = common.encoded_get(server + idr_query, keypair)['@graph'] na = 'not_available' for (i, experiment_id) in enumerate(ids): if experiment_id.startswith('#'): continue experiment_id = experiment_id.rstrip() experiment_uri = '/experiments/%s/' % (experiment_id) idr_files = \ [f for f in all_idr_files if f['dataset'] == experiment_uri] idr_step_runs = set([f.get('step_run') for f in idr_files]) if not len(idr_step_runs): if not args.all: logger.warning("%s: Found %d IDR step runs. Skipping" % (experiment_id, len(idr_step_runs))) continue idr_qc_uris = [] assemblies = [] for f in idr_files: quality_metrics = f.get('quality_metrics') if not len(quality_metrics) == 1: logger.error( '%s: Expected one IDR quality metric for file %s.' ' Found %d.' % (experiment_id, f.get('accession'), len(quality_metrics))) idr_qc_uris.extend(quality_metrics) assembly = f.get('assembly') if not assembly: logger.error('%s: File %s has no assembly' % (experiment_id, f.get('accession'))) assemblies.append(assembly) idr_qc_uris = set(idr_qc_uris) if not len(idr_qc_uris) == 1: logger.error('%s: Expected one unique IDR metric,' ' found %d. Skipping.' 
% (experiment_id, len(idr_qc_uris))) continue assemblies = set(assemblies) if not len(assemblies) == 1: logger.error('%s: Expected one unique assembly, found %d.' ' Skipping.' % (experiment_id, len(assemblies))) continue # Grab unique value from set. idr_qc_uri = next(iter(idr_qc_uris)) assembly = next(iter(assemblies)) # Get analysis_id from DNAnexus, create analysis_link. idr_step_run_uri = next(iter(idr_step_runs)) try: idr_step_run = common.encoded_get(server + idr_step_run_uri, keypair) except Exception as e: print(experiment_id, e, 'Skipping.') continue try: dx_job_id_str = idr_step_run.get('dx_applet_details')[0].get( 'dx_job_id') except: logger.warning( "Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id" ) logger.debug(idr_step_run) # Could try to pull it from alias. dx_job_id_str = None dx_job_id = dx_job_id_str.rpartition(':')[2] if not args.released: dx_job = dxpy.DXJob(dx_job_id) job_desc = dx_job.describe() analysis_id = job_desc.get('analysis') logger.debug('%s' % (analysis_id)) analysis = dxpy.DXAnalysis(analysis_id) desc = analysis.describe() project = desc.get('project') analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % ( desc.get('project').split('-')[1], desc.get('id').split('-')[1]) else: analysis_link = na desc = {} # Get IDR object. idr = common.encoded_get(server + idr_qc_uri, keypair) # Pull metrics of interest. idr_status = idr.get('status', na) if (args.released and (idr_status == na or idr_status != 'released')): logger.error('%s: Expected released IDR metric. Skipping.' % idr_qc_uris) continue Np = idr.get('Np', na) N1 = idr.get('N1', na) N2 = idr.get('N2', na) Nt = idr.get('Nt', na) Fp = idr.get('Fp', na) F1 = idr.get('F1', na) F2 = idr.get('F2', na) Ft = idr.get('Ft', na) quality_metric_of = idr.get('quality_metric_of', []) date = idr.get('date_created', na) rescue_ratio = idr.get('rescue_ratio', na) self_consistency_ratio = idr.get('self_consistency_ratio', na) reproducibility_test = idr.get('reproducibility_test', na) # Get Experiment object. experiment = common.encoded_get(server + experiment_id, keypair) experiment_link = '%sexperiments/%s' % (server, experiment.get('accession')) # Get Award object. award = common.encoded_get(server + experiment.get('award'), keypair) # Grab project phase, e.g. ENCODE4. rfa = award.get('rfa', na) row = { 'date': date, 'analysis': analysis_link, 'analysis_id': desc.get('id', na), 'experiment': experiment_link, 'target': experiment['target'].split('/')[2], 'biosample_term_name': experiment.get('biosample_term_name'), 'biosample_type': experiment.get('biosample_type'), 'replication': experiment.get('replication_type'), 'lab': experiment['lab'].split('/')[2], 'rfa': rfa, 'assembly': assembly, 'Nt': Nt, 'Np': Np, 'N1': N1, 'N2': N2, 'rescue_ratio': rescue_ratio, 'self_consistency_ratio': self_consistency_ratio, 'reproducibility_test': reproducibility_test, 'Ft': Ft, 'Fp': Fp, 'F1': F1, 'F2': F2, 'state': desc.get('state', na), 'release': experiment['status'], 'total_price': desc.get('totalPrice', na), 'quality_metric_of': ', '.join(quality_metric_of) } writer.writerow(row) if args.create_google_sheet: args.outfile.close() # Load CSV data, sort. 
idr_data = pd.read_table(temp_file) idr_data = idr_data.replace('not_available', '') idr_data.date = idr_data.date.apply(lambda x: pd.to_datetime(x)) idr_data = idr_data.sort_values( by=['lab', 'biosample_term_name', 'target', 'experiment'], ascending=[True, True, True, True]) idr_data.date = idr_data.date.astype('str') idr_data = idr_data.reset_index(drop=True) # Read sheet title and create unique page title. date = datetime.now().strftime('%m_%d_%Y') sheet_title = (args.sheet_title if not args.released else '{} Released'.format(args.sheet_title)) page_title = '%s_IDR_FRIP_%s' % (args.assembly, date) # Open/create Google Sheet. gc = pygsheets.authorize(args.apikey) try: sh = gc.open(sheet_title) except pygsheets.exceptions.SpreadsheetNotFound: sh = gc.create(sheet_title) try: wks = sh.add_worksheet(page_title) except HttpError: wks = sh.worksheet_by_title(page_title) # Clear worksheet. wks.clear() # Add data from DataFrame. wks.set_dataframe(idr_data, copy_head=True, fit=True, start='A1') # Apply formatting and conditions. header['repeatCell']['range']['sheetId'] = wks.id wks.client.sh_batch_update(wks.spreadsheet.id, header) # Format numbers. for col in number_format_columns: num = idr_data.columns.get_loc(col) number_format['repeatCell']['range']['startColumnIndex'] = num number_format['repeatCell']['range']['endColumnIndex'] = num + 1 number_format['repeatCell']['range']['sheetId'] = wks.id wks.client.sh_batch_update(wks.spreadsheet.id, number_format) # Resize font. font_size_format['repeatCell']['range']['sheetId'] = wks.id wks.client.sh_batch_update(wks.spreadsheet.id, font_size_format) # Add conditional formatting. for conditional in conditions: num = idr_data.columns.get_loc("reproducibility_test") conditional['addConditionalFormatRule']['rule']['ranges'][0][ 'startColumnIndex'] = num conditional['addConditionalFormatRule']['rule']['ranges'][0][ 'endColumnIndex'] = num + 1 conditional['addConditionalFormatRule']['rule']['ranges'][0][ 'sheetId'] = wks.id wks.client.sh_batch_update(wks.spreadsheet.id, conditional) for k, v in notes_dict.items(): num = idr_data.columns.get_loc(k) note['repeatCell']['range']['startColumnIndex'] = num note['repeatCell']['range']['endColumnIndex'] = num + 1 note['repeatCell']['cell']['note'] = v note['repeatCell']['range']['sheetId'] = wks.id wks.client.sh_batch_update(wks.spreadsheet.id, note) # Optional. Smaller column width to match original. for i in range(wks.cols): wks.adjust_column_width(i, pixel_size=38) # Resize tiny columns. tiny_columns = ['experiment', 'analysis'] for i in [idr_data.columns.get_loc(x) for x in tiny_columns]: wks.adjust_column_width(i, pixel_size=25) # Resize medium columns. medium_columns = ['replication', 'assembly', 'rfa'] for i in [idr_data.columns.get_loc(x) for x in medium_columns]: wks.adjust_column_width(i, pixel_size=65) # Resize wide columns. wide_columns = ['target', 'reproducibility_test', 'lab'] for i in [idr_data.columns.get_loc(x) for x in wide_columns]: wks.adjust_column_width(i, pixel_size=85) # Remove temp file. os.remove(temp_file)
def main(): global args args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid,authpw) if args.experiments: exp_ids = csv.reader(StringIO.StringIO('\n'.join([s.rstrip() for s in args.experiments]))) else: exp_ids = csv.reader(args.infile) for instring in exp_ids: exp_id = instring[0].strip() if len(instring) > 1: repns = [] for s in instring[1:]: repns.extend(s.split(',')) biorep_ns = list(set([int(s) for s in repns])) else: biorep_ns = [] outstrings = [] encode_url = urlparse.urljoin(server,exp_id) experiment = common.encoded_get(encode_url, keypair) outstrings.append(exp_id) files = files_to_map(experiment, server, keypair, args.no_sfn_dupes) outstrings.append(str(len(files))) outstrings.append(str([f.get('accession') for f in files])) replicates = replicates_to_map(files, server, keypair, biorep_ns) in_process = False if files: for biorep_n in set([rep.get('biological_replicate_number') for rep in replicates]): outstrings.append('rep%s' %(biorep_n)) biorep_files = [f for f in files if biorep_n in common.biorep_ns(f,server,keypair)] paired_files = [] unpaired_files = [] while biorep_files: file_object = biorep_files.pop() if file_object.get('paired_end') == None: # group all the unpaired reads for this biorep together unpaired_files.append(file_object) elif file_object.get('paired_end') in ['1','2']: if file_object.get('paired_with'): mate = next((f for f in biorep_files if f.get('@id') == file_object.get('paired_with')), None) else: #have to find the file that is paired with this one mate = next((f for f in biorep_files if f.get('paired_with') == file_object.get('@id')), None) if mate: biorep_files.remove(mate) else: logging.warning('%s:%s could not find mate' %(experiment.get('accession'), file_object.get('accession'))) mate = {} # if mapping as SE, ignore the mate and just map the # rep1 as SE with all the other SE for this rep, if any if args.force_se: unpaired_files.append(next( f for f in [file_object, mate] if f.get('paired_end') == '1')) else: paired_files.append((file_object, mate)) if biorep_files: logging.warning('%s: leftover file(s) %s' %(experiment.get('accession'), biorep_files)) if paired_files: pe_jobs = map_only(experiment, biorep_n, paired_files, args.key, server, keypair, args.sex_specific) in_process = True if unpaired_files: se_jobs = map_only(experiment, biorep_n, unpaired_files, args.key, server, keypair, args.sex_specific) in_process = True if paired_files and pe_jobs: outstrings.append('paired:%s' %([(a.get('accession'), b.get('accession')) for (a,b) in paired_files])) outstrings.append('paired jobs:%s' %([j.get_id() for j in pe_jobs])) else: outstrings.append('paired:%s' %(None)) if unpaired_files and se_jobs: outstrings.append('unpaired:%s' %([f.get('accession') for f in unpaired_files])) outstrings.append('unpaired jobs:%s' %([j.get_id() for j in se_jobs])) else: outstrings.append('unpaired:%s' %(None)) if in_process: r = common.encoded_patch(encode_url, keypair, {"internal_status": "processing"}, return_response=True) try: r.raise_for_status() except: logging.error("Tried and failed to set internal_status") logging.error(r.text) print '\t'.join(outstrings) else: # no files if not replicates: logging.warning('%s: No files and no replicates' %experiment.get('accession')) else: logging.warning('%s: No files to map' %experiment.get('accession')) if files and not replicates: logging.warning('%s: Files but no replicates' %experiment.get('accession'))
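# The read-pairing loop above (also present in the earlier variant of this script) can be read
# as a small standalone routine: pop a file, and if it is one end of a pair, find its mate
# either through its own paired_with pointer or through the mate that points back at it. A
# simplified sketch, ignoring the logging and single-end handling:
def pair_up(files):
    files = list(files)
    pairs = []
    while files:
        f = files.pop()
        if f.get('paired_with'):
            mate = next((m for m in files if m.get('@id') == f.get('paired_with')), None)
        else:
            mate = next((m for m in files if m.get('paired_with') == f.get('@id')), None)
        if mate:
            files.remove(mate)
        pairs.append((f, mate))
    return pairs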
def main(): args = get_args() if args.debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) if args.query: r = requests.get( args.query, auth=keypair, headers={"content-type": "application/json", "accept": "application/json"} ) experiments = r.json()["@graph"] exp_ids = [e["accession"] for e in experiments] elif args.experiments: exp_ids = args.experiments else: exp_ids = args.infile for (i, exp_id) in enumerate(exp_ids): exp_id = exp_id.strip() logger.info("%s" % (exp_id)) url = urlparse.urljoin(server, "/experiments/%s" % (exp_id)) experiment_object = common.encoded_get(url, keypair) original_files = [ common.encoded_get(urlparse.urljoin(server, "%s" % (uri)), keypair) for uri in experiment_object.get("original_files") ] bams = [ f for f in original_files if f.get("file_format") == "bam" and f.get("status") not in ["revoked", "deleted", "replaced"] ] fastqs = [ f for f in original_files if f.get("file_format") == "fastq" and f.get("status") not in ["revoked", "deleted", "replaced"] ] beds = [ f for f in original_files if f.get("file_format") == "bed" and f.get("status") not in ["revoked", "deleted", "replaced"] ] bigBeds = [ f for f in original_files if f.get("file_format") == "bigBed" and f.get("status") not in ["revoked", "deleted", "replaced"] ] for f in beds + bigBeds: notes = json.loads(f.get("notes")) f["job"] = dxpy.describe(notes["dx-createdBy"]["job"]) job = dxpy.describe(notes["dx-createdBy"]["job"]) output_names = [ output_name for output_name, value in job["output"].iteritems() if dxpy.is_dxlink(value) and value["$dnanexus_link"] == notes["dx-id"] ] assert len(output_names) == 1 f["output_name"] = output_names[0] f["dxid"] = notes["dx-id"] for bb in bigBeds: print bb["accession"] notes = json.loads(bb.get("notes")) job = dxpy.describe(notes["dx-createdBy"]["job"]) output_name = bb["output_name"] assert output_name.endswith("_bb") print output_name bed_output_name = output_name.rpartition("_bb")[0] print bed_output_name bed_dxid = job["output"][bed_output_name]["$dnanexus_link"] print bed_dxid possible_beds = [ bed["accession"] for bed in beds if bed.get("notes") and json.loads(bed["notes"])["dx-id"] == bed_dxid ] print possible_beds assert len(possible_beds) == 1 print possible_beds[0] if not args.dryrun: url = urlparse.urljoin(server, "/files/%s/" % (bb["accession"])) payload = {"derived_from": [possible_beds[0]]} print url print payload r = requests.patch( url, auth=keypair, data=json.dumps(payload), headers={"content-type": "application/json", "accept": "application/json"}, ) try: r.raise_for_status() except: print r.text overlapping_peaks_beds = [b for b in beds if b.get("output_name") == "overlapping_peaks"] assert len(overlapping_peaks_beds) == 1 overlapping_peaks_bed = overlapping_peaks_beds[0] job = overlapping_peaks_bed["job"] derived_from_dxids = [ job["input"][input_name]["$dnanexus_link"] for input_name in job["input"].keys() if input_name in ["rep1_peaks", "rep2_peaks", "pooled_peaks"] ] print derived_from_dxids derived_from_accessions = [bed["accession"] for bed in beds if bed["dxid"] in derived_from_dxids] print derived_from_accessions if not args.dryrun: url = urlparse.urljoin(server, "/files/%s/" % (overlapping_peaks_bed["accession"])) payload = {"derived_from": derived_from_accessions} print url print payload r = requests.patch( url, auth=keypair, data=json.dumps(payload), headers={"content-type": "application/json", "accept": 
"application/json"}, ) try: r.raise_for_status() except: print r.text
def main(): args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) project = resolve_project(args.project) SRR_files = dxpy.find_data_objects(name="SRR???????_?.fastq.gz", name_mode='glob', classname='file', recurse=True, return_handler=True, folder=args.folder, project=args.project) for srr_dxfile in SRR_files: m = re.search('(SRR.{7})_(\d)', srr_dxfile.name) if m: srr_basename = m.group(1) end_num = m.group(2) else: assert m srr_encfiles = common.encoded_get( '/'.join([ server, 'search/?type=File&external_accession=%s&status!=deleted&status!=replaced&status!=revoked' % (srr_basename) ]), keypair)['@graph'] if not srr_encfiles: logging.error('%s object not found at ENCODE. Skipping.' % (srr_basename)) continue elif len(srr_encfiles) > 1: logging.error( '%s multiple matching objects found at ENCODE. Skipping.' % (srr_basename)) continue else: srr_encfile = srr_encfiles[0] # experiment = common.encoded_get('/'.join([server, srr_encfile.get('dataset')]), keypair) # replicate = common.encoded_get('/'.join([server, srr_encfile.get('replicate')]), keypair) # biorep_n = replicate.get('biological_replicate_number') all_fastqs = common.encoded_get( '/'.join([ server, 'search/?type=File&file_format=fastq&derived_from=/files/%s/&status!=deleted&status!=revoked&status!=replaced' % (srr_basename) ]), keypair)['@graph'] if not all_fastqs: print("%s: no fastq(s) found. Skipping." % (srr_dxfile.name)) continue if end_num == '1': fastqs = [ f for f in all_fastqs if f.get('run_type') == 'single-ended' or f.get('paired_end') == end_num ] elif end_num in ['2', '3']: fastqs = [ f for f in all_fastqs if f.get('run_type') == 'paired-ended' and f.get('paired_end') == '2' ] if not fastqs: print("%s: no fastq(s) found for paired_end %s. Skipping" % (srr_basename, end_num)) continue elif len(fastqs) > 1: print("%s: ambiguous matches to %s. Skipping" % (srr_basename, [f.get('accession') for f in fastqs])) continue else: fastq = fastqs[0] newname = '%s.fastq.gz' % (fastq.get('accession')) if args.dry_run: print('dry_run: Could rename %s to %s' % (srr_dxfile.name, newname)) else: srr_dxfile.set_properties({'srr_filename': srr_dxfile.name}) srr_dxfile.rename(newname) print('%s renamed to %s' % (srr_dxfile.name, newname))
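# Illustrative sketch (not part of the original script): the filename parse the
# SRR-renaming loop above relies on. Names matching the glob
# SRR???????_?.fastq.gz split into the SRA run accession and the read-end
# number; anything else returns None.
import re

def parse_srr_name(filename):
    m = re.search(r'(SRR.{7})_(\d)', filename)
    if m:
        return (m.group(1), m.group(2))
    return None

# e.g. parse_srr_name('SRR1234567_1.fastq.gz') -> ('SRR1234567', '1')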
def main(): args = get_args() if args.debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) if args.experiments: ids = args.experiments # elif args.created_after: # analyses = [] # for state in args.state: # analyses.extend(dxpy.find_analyses(name="ENCSR*",name_mode='glob',state=state,include_subjobs=True,return_handler=True,created_after="%s" %(args.created_after))) # ids = [analysis.get_id() for analysis in analyses if analysis.describe()['executableName'] == 'tf_chip_seq' or analysis.describe()['executableName'].startswith('ENCSR783QUL Peaks')] elif args.all: exp_query = \ "/search/?type=Experiment" + \ "&assay_title=ChIP-seq" + \ "&award.project=ENCODE" + \ "&status=released&status=submitted&status=in+progress&status=started&status=release+ready" all_experiments = common.encoded_get(server + exp_query, keypair)['@graph'] ids = [exp.get('accession') for exp in all_experiments] elif args.infile: ids = args.infile else: #never reached because inile defaults to stdin raise InputError( "Must supply experiment id's in arguments or --infile") fieldnames = [ 'date', 'analysis', 'analysis id', 'experiment', 'target', 'biosample_term_name', 'biosample_type', 'lab', 'rfa', 'assembly', 'Nt', 'Np', 'N1', 'N2', 'rescue_ratio', 'self_consistency_ratio', 'reproducibility_test', 'state', 'release', 'total price', 'notes' ] writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, delimiter='\t', quotechar='"') writer.writeheader() idr_query = \ "/search/?type=File" + \ "&file_format=bed" + \ "&output_type=optimal+idr+thresholded+peaks" + \ "&output_type=conservative+idr+thresholded+peaks" + \ "&lab.title=ENCODE+Processing+Pipeline" + \ "&lab.title=J.+Michael+Cherry,+Stanford" + \ "&status=in+progress&status=released&status=uploading&status=uploaded" all_idr_files = common.encoded_get(server + idr_query, keypair)['@graph'] for (i, experiment_id) in enumerate(ids): if experiment_id.startswith('#'): continue experiment_id = experiment_id.rstrip() experiment_uri = '/experiments/%s/' % (experiment_id) idr_files = \ [f for f in all_idr_files if f['dataset'] == experiment_uri] idr_step_runs = set([f.get('step_run') for f in idr_files]) if not len(idr_step_runs) == 1: if not args.all: logger.warning( "%s: Expected one IDR step run. Found %d. Skipping" % (experiment_id, len(idr_step_runs))) continue idr_qc_uris = [] assemblies = [] for f in idr_files: quality_metrics = f.get('quality_metrics') if not len(quality_metrics) == 1: logger.error( '%s: Expected one IDR quality metric for file %s. Found %d.' % (experiment_id, f.get('accession'), len(quality_metrics))) idr_qc_uris.extend(quality_metrics) assembly = f.get('assembly') if not assembly: logger.error('%s: File %s has no assembly' % (experiment_id, f.get('accession'))) assemblies.append(assembly) idr_qc_uris = set(idr_qc_uris) if not len(idr_qc_uris) == 1: logger.error( '%s: Expected one unique IDR metric, found %d. Skipping.' % (experiment_id, len(idr_qc_uris))) continue assemblies = set(assemblies) if not len(assemblies) == 1: logger.error( '%s: Expected one unique assembly, found %d. Skipping.' 
% (experiment_id, len(assemblies))) continue assembly = next(iter(assemblies)) idr_step_run_uri = next(iter(idr_step_runs)) idr_step_run = common.encoded_get(server + idr_step_run_uri, keypair) try: dx_job_id_str = idr_step_run.get('dx_applet_details')[0].get( 'dx_job_id') except: logger.warning( "Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id" ) logger.debug(idr_step_run) dx_job_id_str = None #could try to pull it from alias dx_job_id = dx_job_id_str.rpartition(':')[2] dx_job = dxpy.DXJob(dx_job_id) job_desc = dx_job.describe() analysis_id = job_desc.get('analysis') logger.debug('%s' % (analysis_id)) analysis = dxpy.DXAnalysis(analysis_id) desc = analysis.describe() project = desc.get('project') m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name']) if m: experiment_accession = m.group(1) else: logger.error("No accession in %s, skipping." % (desc['name'])) continue if args.all: # we've already gotten all the experiment objects experiment = \ next(e for e in all_experiments if e['accession'] == experiment_accession) else: experiment = \ common.encoded_get(urlparse.urljoin( server, '/experiments/%s' % (experiment_accession)), keypair) logger.debug('ENCODEd experiment %s' % (experiment['accession'])) if args.lab and experiment['lab'].split('/')[2] not in args.lab: continue try: idr_stage = next( s['execution'] for s in desc['stages'] if s['execution']['name'] == "Final IDR peak calls") except: logging.error('Failed to find final IDR stage in %s' % (analysis_id)) else: if idr_stage[ 'state'] != 'done': #Final IDR peak calls stage not done, so loop through intermediate IDR stages to find errors Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None notes = [] #note this list contains a mis-spelled form of IDR Pooled Pseudoreplicates because until 11/13/15 the pipeline stage name was misspelled - need to be able to report on those runs idr_stage_names = [ 'IDR True Replicates', 'IDR Rep 1 Self-pseudoreplicates', 'IDR Rep 2 Self-pseudoreplicates', 'IDR Pooled Pseudoreplicates', 'IDR Pooled Pseudoeplicates' ] for stage_name in idr_stage_names: try: idr_stage = next( s['execution'] for s in desc['stages'] if s['execution']['name'] == stage_name) except StopIteration: continue except: raise if idr_stage['state'] == 'failed': try: job_log = subprocess.check_output( 'dx watch %s' % (idr_stage['id']), shell=True, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: job_log = e.output else: job_log = None if job_log: patterns = [ r'Peak files must contain at least 20 peaks post-merge' ] for p in patterns: m = re.search(p, job_log) if m: notes.append("%s: %s" % (stage_name, m.group(0))) if not notes: notes.append(idr_stage['failureMessage']) try: done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "failed") except StopIteration: done_time = "Not done or failed" except: raise else: Np = idr_stage['output'].get('Np') N1 = idr_stage['output'].get('N1') N2 = idr_stage['output'].get('N2') Nt = idr_stage['output'].get('Nt') rescue_ratio = idr_stage['output'].get('rescue_ratio') self_consistency_ratio = idr_stage['output'].get( 'self_consistency_ratio') reproducibility_test = idr_stage['output'].get( 'reproducibility_test') notes = "IDR Complete" done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "done") if done_time: date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(done_time / 1000)) else: date = "Running" 
analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % ( desc.get('project').split('-')[1], desc.get('id').split('-')[1]) experiment_link = 'https://www.encodeproject.org/experiments/%s' % ( experiment.get('accession')) row = { 'date': date, 'analysis': analysis_link, 'analysis id': desc.get('id'), 'experiment': experiment_link, 'target': experiment['target'].split('/')[2], 'biosample_term_name': experiment.get('biosample_term_name'), 'biosample_type': experiment.get('biosample_type'), 'lab': experiment['lab'].split('/')[2], 'rfa': common.encoded_get(server + experiment.get('award'), keypair).get('rfa'), 'assembly': assembly, 'Np': Np, 'N1': N1, 'N2': N2, 'Nt': Nt, 'rescue_ratio': rescue_ratio, 'self_consistency_ratio': self_consistency_ratio, 'reproducibility_test': reproducibility_test, 'state': desc.get('state'), 'release': experiment['status'], 'total price': desc.get('totalPrice') } if notes: row.update({'notes': '%s' % (notes)}) else: row.update({'notes': '%s' % ('OK')}) #log = subprocess.check_output('dx watch %s' %(analysis.)) writer.writerow(row)
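# Hedged sketch of the monitor-link construction used in the report rows above.
# DNAnexus describe() returns ids like "project-XXXX" and "analysis-XXXX"; the
# platform URL wants only the hash after the dash.
def dx_analysis_link(desc):
    return 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
        desc['project'].split('-')[1], desc['id'].split('-')[1])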
def main(reads1, reads2, crop_length, reference_tar, bwa_aln_params, bwa_version, samtools_version, keyfile, debug, key=None): # reads1 and reads2 are expected to be an arrays of file identifiers # indentifiers can be DNAnexus files or ENCODE file accession numbers # For SE, reads2 is empty # For PE, len(reads1) = len(reads2) # Multiple PE pairs or SE files are just catted before mapping # Error on mixed SE/PE - although this can be implemented as just a # "" entry at that position in reads2 array # TODO: Add option to down-sample mixed PE/SE to SE if debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) # fetch the credentials from the DCC Credentials project dxpy.download_folder(DCC_CREDENTIALS_PROJECT, '.', folder=DCC_CREDENTIALS_FOLDER) if not key or key in ['www', 'submit', 'production']: key = dxpy.api.system_whoami()['id'] elif key == 'test': key = dxpy.api.system_whoami()['id'] + "-test" key_tuple = common.processkey(key, keyfile) assert key_tuple, "ERROR: Key %s is not found in the keyfile %s" % ( key, keyfile) authid, authpw, server = key_tuple keypair = (authid, authpw) logger.info("reads1: %s" % (reads1)) logger.info("reads2: %s" % (reads2)) if reads2: paired_end = True assert len(reads1) == len( reads2 ), "Paired-end and unequal numbers of read1 and read2 identifiers: %s %s" % ( reads1, reads2) else: paired_end = False reads1_files = [resolve_file(read, server, keypair) for read in reads1] if paired_end: reads2_files = [resolve_file(read, server, keypair) for read in reads2] else: reads2_files = [] # pooling multiple fastqs if len(reads1_files) > 1: reads1_file = pooled(reads1_files) else: reads1_file = reads1_files[0] if len(reads2_files) > 1: reads2_file = pooled(reads2_files) elif len(reads2_files) == 1: reads2_file = reads2_files[0] else: reads2_file = None reference_tar_file = resolve_file(reference_tar, server, keypair) logger.info('Resolved reads1 to %s', reads1_file) if reads2_file: logger.info('Resolved reads2 to %s', reads2_file) logger.info('Resolved reference_tar to %s', reference_tar_file) output = { "reads1": reads1_file, "reference_tar": reference_tar_file, "crop_length": crop_length, "bwa_aln_params": bwa_aln_params, "bwa_version": bwa_version, "samtools_version": samtools_version, "debug": debug } if reads2_file: output.update({"reads2": reads2_file}) logger.info('Exiting with output: %s' % (output)) return output
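# Sketch of the read-selection rule above, assuming the applet's own pooled()
# helper is in scope: multiple fastqs for one end are pooled into a single
# file, a single fastq passes through unchanged, and an empty list (the SE
# case for reads2) yields None.
def select_reads(files):
    if not files:
        return None
    if len(files) > 1:
        return pooled(files)
    return files[0]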
def main(): args = get_args() if args.debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid,authpw) if args.analysis_ids: ids = args.analysis_ids elif args.created_after: analyses = dxpy.find_analyses(name="ENCSR*",name_mode='glob',state='done',include_subjobs=True,return_handler=True,created_after="%s" %(args.created_after)) ids = [analysis.get_id() for analysis in analyses if analysis.describe()['executableName'] == 'tf_chip_seq'] elif args.infile: ids = args.infile else: #never reached because inile defaults to stdin raise InputError("Must supply analysis id's in arguments, --infile or supply search string in --created_after") fieldnames = [ 'date','analysis','experiment','target','biosample_term_name','biosample_type','lab','rfa','assembly', 'Nt','Np','N1','N2','rescue_ratio','self_consistency_ratio','reproducibility_test', 'state','total price','notes'] writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, delimiter='\t', quotechar='"') writer.writeheader() for (i, analysis_id) in enumerate(ids): if analysis_id.startswith('#'): continue analysis_id = analysis_id.rstrip() logger.debug('%s' %(analysis_id)) analysis = dxpy.DXAnalysis(analysis_id) desc = analysis.describe() project = desc.get('project') m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks',desc['name']) if m: experiment_accession = m.group(1) else: logger.error("No accession in %s, skipping." %(desc['name'])) continue experiment = common.encoded_get(urlparse.urljoin(server,'/experiments/%s' %(experiment_accession)), keypair) logger.debug('ENCODEd experiment %s' %(experiment['accession'])) try: idr_stage = next(s['execution'] for s in desc['stages'] if s['execution']['name'] == "Final IDR peak calls") except: logging.error('Failed to find IDR stage in %s' %(analysis_id)) else: Np = idr_stage['output'].get('Np') N1 = idr_stage['output'].get('N1') N2 = idr_stage['output'].get('N2') Nt = idr_stage['output'].get('Nt') rescue_ratio = idr_stage['output'].get('rescue_ratio') self_consistency_ratio = idr_stage['output'].get('self_consistency_ratio') reproducibility_test = idr_stage['output'].get('reproducibility_test') done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "done") row = { 'date': time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(done_time/1000)), 'analysis': analysis.get_id(), 'experiment': experiment.get('accession'), 'target': experiment['target'].split('/')[2], 'biosample_term_name': experiment.get('biosample_term_name'), 'biosample_type': experiment.get('biosample_type'), 'lab': experiment['lab'].split('/')[2], 'rfa': common.encoded_get(server+experiment.get('award'),keypair).get('rfa'), 'assembly': args.assembly, #TODO ... 
derive this from the analysis 'Np': Np, 'N1': N1, 'N2': N2, 'Nt': Nt, 'rescue_ratio': rescue_ratio, 'self_consistency_ratio': self_consistency_ratio, 'reproducibility_test': reproducibility_test, 'state': desc.get('state'), 'total price': desc.get('totalPrice') } notes = [] # if int(np_stage.get('output').get('npeaks_in')) - int(np_stage.get('output').get('npeaks_out')) != int(np_stage.get('output').get('npeaks_rejected')): # notes.append("in-out!=rej delta=%i" %(int(np_stage.get('output').get('npeaks_in')) - int(np_stage.get('output').get('npeaks_out')))) # else: # notes.append("in-out=rej OK") # bb_check_notes = [] # for stage in [np_stage, gp_stage]: # bb_dxf = dxpy.DXFile(stage['output']['overlapping_peaks_bb']) # if int(bb_dxf.describe()['size']) < 200000: # bb_check_notes.append("%s bb size=%i" %(stage['name'], int(bb_dxf.describe()['size']))) # if not bb_check_notes: # notes.append("bb check OK") # else: # notes.append(bb_check_notes) if notes: row.update({'notes': '%s' %(notes)}) else: row.update({'notes': '%s' %('OK')}) #log = subprocess.check_output('dx watch %s' %(analysis.)) writer.writerow(row)
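# Sketch of the timestamp handling in the reports above: DNAnexus
# stateTransitions report 'setAt' in epoch milliseconds, so divide by 1000
# before formatting with time.localtime/strftime.
import time

def format_done_time(done_time_ms):
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(done_time_ms / 1000))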
def main(): args = get_args() if args.debug: logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) logger.setLevel(logging.DEBUG) else: # Use the default logging level. logging.basicConfig(format='%(levelname)s:%(message)s') logger.setLevel(logging.INFO) if args.released: keypair = None server = PUBLIC_SERVER else: authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) if args.experiments: ids = args.experiments elif args.all: # Get metadata for all ChIP-seq Experiments. base_exp_query = '/search/?type=Experiment&assay_title=ChIP-seq&award.project=ENCODE&status=released' extended_query = '&status=submitted&status=in+progress&status=started&status=release+ready' exp_query = base_exp_query if args.released else (base_exp_query + extended_query) all_experiments = common.encoded_get(server + exp_query, keypair)['@graph'] # Extract Experiment accessions. ids = [exp.get('accession') for exp in all_experiments] elif args.infile: ids = args.infile else: # Never reached because infile defaults to stdin. raise InputError('Must supply experiment ids' ' in arguments or --infile.') # Define column names for TSV. fieldnames = ['date', 'analysis', 'analysis_id', 'experiment', 'target', 'biosample_term_name', 'biosample_type', 'replication', 'lab', 'rfa', 'assembly', 'Nt', 'Np', 'N1', 'N2', 'rescue_ratio', 'self_consistency_ratio', 'reproducibility_test', 'Ft', 'Fp', 'F1', 'F2', 'state', 'release', 'total_price', 'quality_metric_of'] if args.create_google_sheet: # Force creation of temporary CSV that can be loaded into a DataFrame, # written to Google Sheets, then deleted. temp_file = 'temp_idr_%s.tsv' % (args.assembly) args.outfile = open(temp_file, 'w') writer = csv.DictWriter(args.outfile, fieldnames=fieldnames, delimiter='\t', quotechar='"') writer.writeheader() # Get metadata for all IDR output Files. base_idr_query = ( '/search/?type=File&assembly=%s&file_format=bed' '&output_type=optimal+idr+thresholded+peaks' '&output_type=conservative+idr+thresholded+peaks' '&output_type=pseudoreplicated+idr+thresholded+peaks' '&lab.title=ENCODE+Processing+Pipeline' '&lab.title=J.+Michael+Cherry,+Stanford' '&status=released' % (args.assembly) ) extended_idr_query = '&status=in+progress&status=uploading&status=uploaded' idr_query = base_idr_query if args.released else (base_idr_query + extended_idr_query) all_idr_files = common.encoded_get(server + idr_query, keypair)['@graph'] na = 'not_available' for (i, experiment_id) in enumerate(ids): if experiment_id.startswith('#'): continue experiment_id = experiment_id.rstrip() experiment_uri = '/experiments/%s/' % (experiment_id) idr_files = \ [f for f in all_idr_files if f['dataset'] == experiment_uri] idr_step_runs = set([f.get('step_run') for f in idr_files]) if not len(idr_step_runs): if not args.all: logger.warning( "%s: Found %d IDR step runs. Skipping" % (experiment_id, len(idr_step_runs))) continue idr_qc_uris = [] assemblies = [] for f in idr_files: quality_metrics = f.get('quality_metrics') if not len(quality_metrics) == 1: logger.error('%s: Expected one IDR quality metric for file %s.' ' Found %d.' % (experiment_id, f.get('accession'), len(quality_metrics))) idr_qc_uris.extend(quality_metrics) assembly = f.get('assembly') if not assembly: logger.error('%s: File %s has no assembly' % (experiment_id, f.get('accession'))) assemblies.append(assembly) idr_qc_uris = set(idr_qc_uris) if not len(idr_qc_uris) == 1: logger.error('%s: Expected one unique IDR metric,' ' found %d. Skipping.' 
% (experiment_id, len(idr_qc_uris))) continue assemblies = set(assemblies) if not len(assemblies) == 1: logger.error('%s: Expected one unique assembly, found %d.' ' Skipping.' % (experiment_id, len(assemblies))) continue # Grab unique value from set. idr_qc_uri = next(iter(idr_qc_uris)) assembly = next(iter(assemblies)) # Get analysis_id from DNAnexus, create analysis_link. idr_step_run_uri = next(iter(idr_step_runs)) try: idr_step_run = common.encoded_get(server + idr_step_run_uri, keypair) except Exception as e: print(experiment_id, e, 'Skipping.') continue try: dx_job_id_str = idr_step_run.get('dx_applet_details')[ 0].get('dx_job_id') except: logger.warning( "Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id") logger.debug(idr_step_run) # Could try to pull it from alias. dx_job_id_str = None dx_job_id = dx_job_id_str.rpartition(':')[2] if not args.released: dx_job = dxpy.DXJob(dx_job_id) job_desc = dx_job.describe() analysis_id = job_desc.get('analysis') logger.debug('%s' % (analysis_id)) analysis = dxpy.DXAnalysis(analysis_id) desc = analysis.describe() project = desc.get('project') analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % ( desc.get('project').split('-')[1], desc.get('id').split('-')[1]) else: analysis_link = na desc = {} # Get IDR object. idr = common.encoded_get(server + idr_qc_uri, keypair) # Pull metrics of interest. idr_status = idr.get('status', na) if (args.released and (idr_status == na or idr_status != 'released')): logger.error('%s: Expected released IDR metric. Skipping.' % idr_qc_uris) continue Np = idr.get('Np', na) N1 = idr.get('N1', na) N2 = idr.get('N2', na) Nt = idr.get('Nt', na) Fp = idr.get('Fp', na) F1 = idr.get('F1', na) F2 = idr.get('F2', na) Ft = idr.get('Ft', na) quality_metric_of = idr.get('quality_metric_of', []) date = idr.get('date_created', na) rescue_ratio = idr.get('rescue_ratio', na) self_consistency_ratio = idr.get('self_consistency_ratio', na) reproducibility_test = idr.get('reproducibility_test', na) # Get Experiment object. experiment = common.encoded_get(server + experiment_id, keypair) experiment_link = '%sexperiments/%s' % (server, experiment.get('accession')) # Get Award object. award = common.encoded_get(server + experiment.get('award'), keypair) # Grab project phase, e.g. ENCODE4. rfa = award.get('rfa', na) row = {'date': date, 'analysis': analysis_link, 'analysis_id': desc.get('id', na), 'experiment': experiment_link, 'target': experiment['target'].split('/')[2], 'biosample_term_name': experiment.get('biosample_term_name'), 'biosample_type': experiment.get('biosample_type'), 'replication': experiment.get('replication_type'), 'lab': experiment['lab'].split('/')[2], 'rfa': rfa, 'assembly': assembly, 'Nt': Nt, 'Np': Np, 'N1': N1, 'N2': N2, 'rescue_ratio': rescue_ratio, 'self_consistency_ratio': self_consistency_ratio, 'reproducibility_test': reproducibility_test, 'Ft': Ft, 'Fp': Fp, 'F1': F1, 'F2': F2, 'state': desc.get('state', na), 'release': experiment['status'], 'total_price': desc.get('totalPrice', na), 'quality_metric_of': ', '.join(quality_metric_of) } writer.writerow(row) if args.create_google_sheet: args.outfile.close() # Load CSV data, sort. 
idr_data = pd.read_table(temp_file) idr_data = idr_data.replace('not_available', '') idr_data.date = idr_data.date.apply(lambda x: pd.to_datetime(x)) idr_data = idr_data.sort_values( by=['lab', 'biosample_term_name', 'target', 'experiment'], ascending=[True, True, True, True]) idr_data.date = idr_data.date.astype('str') idr_data = idr_data.reset_index(drop=True) # Read sheet title and create unique page title. date = datetime.now().strftime('%m_%d_%Y') sheet_title = ( args.sheet_title if not args.released else '{} Released'.format(args.sheet_title) ) page_title = '%s_IDR_FRIP_%s' % (args.assembly, date) # Open/create Google Sheet. gc = pygsheets.authorize(args.apikey) try: sh = gc.open(sheet_title) except pygsheets.exceptions.SpreadsheetNotFound: sh = gc.create(sheet_title) try: wks = sh.add_worksheet(page_title) except HttpError: wks = sh.worksheet_by_title(page_title) # Clear worksheet. wks.clear() # Add data from DataFrame. wks.set_dataframe(idr_data, copy_head=True, fit=True, start='A1') # Apply formatting and conditions. header['repeatCell']['range']['sheetId'] = wks.id wks.client.sh_batch_update(wks.spreadsheet.id, header) # Format numbers. for col in number_format_columns: num = idr_data.columns.get_loc(col) number_format['repeatCell']['range']['startColumnIndex'] = num number_format['repeatCell']['range']['endColumnIndex'] = num + 1 number_format['repeatCell']['range']['sheetId'] = wks.id wks.client.sh_batch_update(wks.spreadsheet.id, number_format) # Resize font. font_size_format['repeatCell']['range']['sheetId'] = wks.id wks.client.sh_batch_update(wks.spreadsheet.id, font_size_format) # Add conditional formatting. for conditional in conditions: num = idr_data.columns.get_loc("reproducibility_test") conditional['addConditionalFormatRule']['rule']['ranges'][0]['startColumnIndex'] = num conditional['addConditionalFormatRule']['rule']['ranges'][0]['endColumnIndex'] = num + 1 conditional['addConditionalFormatRule']['rule']['ranges'][0]['sheetId'] = wks.id wks.client.sh_batch_update(wks.spreadsheet.id, conditional) for k, v in notes_dict.items(): num = idr_data.columns.get_loc(k) note['repeatCell']['range']['startColumnIndex'] = num note['repeatCell']['range']['endColumnIndex'] = num + 1 note['repeatCell']['cell']['note'] = v note['repeatCell']['range']['sheetId'] = wks.id wks.client.sh_batch_update(wks.spreadsheet.id, note) # Optional. Smaller column width to match original. for i in range(wks.cols): wks.adjust_column_width(i, pixel_size=38) # Resize tiny columns. tiny_columns = ['experiment', 'analysis'] for i in [idr_data.columns.get_loc(x) for x in tiny_columns]: wks.adjust_column_width(i, pixel_size=25) # Resize medium columns. medium_columns = ['replication', 'assembly', 'rfa'] for i in [idr_data.columns.get_loc(x) for x in medium_columns]: wks.adjust_column_width(i, pixel_size=65) # Resize wide columns. wide_columns = ['target', 'reproducibility_test', 'lab'] for i in [idr_data.columns.get_loc(x) for x in wide_columns]: wks.adjust_column_width(i, pixel_size=85) # Remove temp file. os.remove(temp_file)
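# Sketch of the open-or-create pattern used above for the Google Sheet
# (pygsheets): open the spreadsheet by title and create it if it does not
# exist. The worksheet page uses the analogous add_worksheet /
# worksheet_by_title fallback shown in the function above.
import pygsheets

def open_or_create_sheet(gc, sheet_title):
    try:
        return gc.open(sheet_title)
    except pygsheets.exceptions.SpreadsheetNotFound:
        return gc.create(sheet_title)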
def main(): args = get_args() if args.debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid,authpw) if args.analysis_ids: ids = args.analysis_ids elif args.created_after: analyses = [] for state in args.state: analyses.extend(dxpy.find_analyses(name="ENCSR*",name_mode='glob',state=state,include_subjobs=True,return_handler=True,created_after="%s" %(args.created_after))) ids = [analysis.get_id() for analysis in analyses if analysis.describe()['executableName'] == 'tf_chip_seq' or analysis.describe()['executableName'].startswith('ENCSR783QUL Peaks')] elif args.infile: ids = args.infile else: #never reached because inile defaults to stdin raise InputError("Must supply analysis id's in arguments, --infile or supply search string in --created_after") fieldnames = [ 'date','analysis','experiment','target','biosample_term_name','biosample_type','lab','rfa','assembly', 'Nt','Np','N1','N2','rescue_ratio','self_consistency_ratio','reproducibility_test', 'state','total price','notes'] writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, delimiter='\t', quotechar='"') writer.writeheader() for (i, analysis_id) in enumerate(ids): if analysis_id.startswith('#'): continue analysis_id = analysis_id.rstrip() logger.debug('%s' %(analysis_id)) analysis = dxpy.DXAnalysis(analysis_id) desc = analysis.describe() project = desc.get('project') m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks',desc['name']) if m: experiment_accession = m.group(1) else: logger.error("No accession in %s, skipping." %(desc['name'])) continue experiment = common.encoded_get(urlparse.urljoin(server,'/experiments/%s' %(experiment_accession)), keypair) logger.debug('ENCODEd experiment %s' %(experiment['accession'])) if args.lab and experiment['lab'].split('/')[2] not in args.lab: continue try: idr_stage = next(s['execution'] for s in desc['stages'] if s['execution']['name'] == "Final IDR peak calls") except: logging.error('Failed to find final IDR stage in %s' %(analysis_id)) else: if idr_stage['state'] != 'done': #Final IDR peak calls stage not done, so loop through intermediate IDR stages to find errors Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None notes = [] #note this list contains a mis-spelled form of IDR Pooled Pseudoreplicates because until 11/13/15 the pipeline stage name was misspelled - need to be able to report on those runs idr_stage_names = ['IDR True Replicates', 'IDR Rep 1 Self-pseudoreplicates', 'IDR Rep 2 Self-pseudoreplicates', 'IDR Pooled Pseudoreplicates', 'IDR Pooled Pseudoeplicates'] for stage_name in idr_stage_names: try: idr_stage = next(s['execution'] for s in desc['stages'] if s['execution']['name'] == stage_name) except StopIteration: continue except: raise if idr_stage['state'] == 'failed': try: job_log = subprocess.check_output('dx watch %s' %(idr_stage['id']), shell=True, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: job_log = e.output else: job_log = None if job_log: patterns = [r'Peak files must contain at least 20 peaks post-merge'] for p in patterns: m = re.search(p,job_log) if m: notes.append("%s: %s" %(stage_name,m.group(0))) if not notes: notes.append(idr_stage['failureMessage']) try: done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "failed") except StopIteration: done_time = "Not done or failed" except: raise else: Np = idr_stage['output'].get('Np') N1 = 
idr_stage['output'].get('N1') N2 = idr_stage['output'].get('N2') Nt = idr_stage['output'].get('Nt') rescue_ratio = idr_stage['output'].get('rescue_ratio') self_consistency_ratio = idr_stage['output'].get('self_consistency_ratio') reproducibility_test = idr_stage['output'].get('reproducibility_test') notes = "IDR Complete" done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "done") if done_time: date = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(done_time/1000)) else: date = "Running" analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' %(desc.get('project').split('-')[1], desc.get('id').split('-')[1]) experiment_link = 'https://www.encodeproject.org/experiments/%s' %(experiment.get('accession')) row = { 'date': date, 'analysis': analysis_link, 'experiment': experiment_link, 'target': experiment['target'].split('/')[2], 'biosample_term_name': experiment.get('biosample_term_name'), 'biosample_type': experiment.get('biosample_type'), 'lab': experiment['lab'].split('/')[2], 'rfa': common.encoded_get(server+experiment.get('award'),keypair).get('rfa'), 'assembly': args.assembly, #TODO ... derive this from the analysis 'Np': Np, 'N1': N1, 'N2': N2, 'Nt': Nt, 'rescue_ratio': rescue_ratio, 'self_consistency_ratio': self_consistency_ratio, 'reproducibility_test': reproducibility_test, 'state': desc.get('state'), 'total price': desc.get('totalPrice') } if notes: row.update({'notes': '%s' %(notes)}) else: row.update({'notes': '%s' %('OK')}) #log = subprocess.check_output('dx watch %s' %(analysis.)) writer.writerow(row)
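# Sketch of the failure triage used in the report above: pull the job log for
# a failed IDR stage with `dx watch` (stderr folded into the output) and scan
# it for known error patterns; the caller falls back to the stage's
# failureMessage when nothing matches.
import re
import subprocess

def scrape_job_log(stage_id, patterns):
    try:
        job_log = subprocess.check_output('dx watch %s' % (stage_id),
                                          shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        job_log = e.output
    return [p for p in patterns if job_log and re.search(p, job_log)]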
def main(): args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) experiments = [] if args.experiments: experiments.extend(args.experiments) if args.infile: with open(args.infile, 'r') as fh: experiments.extend([e for e in fh]) if args.control: control_dxhandler = resolve_dx_file(args.control) else: control_dxhandler = None for exp_id in experiments: if exp_id.startswith('#'): continue exp_id = exp_id.rstrip() print("Experiment %s" % (exp_id)) experiment_url = server + '/experiments/%s/' % (exp_id) experiment = common.encoded_get(experiment_url, keypair) if experiment.get('target'): target_url = server + experiment.get('target') target = common.encoded_get(target_url, keypair) else: logging.error('Experiment has no target ... skipping') continue print( "%s %s %s" % (experiment['accession'], target.get('investigated_as'), experiment.get('description'))) tas = get_tas(experiment, server, keypair, args.project, args.inf, control_dxhandler) if not tas: logging.error( 'Failed to resolve all tagaligns for %s' % (experiment['accession'])) continue if not tas.get('rep2_ta'): simplicate_experiment = True print("Simplicate experiment ta's:") else: simplicate_experiment = False print("Replicated experiment ta's:") pprint(tas) # sys.exit() # continue for key, value in tas.iteritems(): if not value: logging.error('Missing %s ... skipping' % (key)) continue workflow_title = '%s Peaks' % (exp_id) if args.tag: workflow_title += ' %s' % (args.tag) outf = args.outf if not outf.startswith('/') and outf != '/': outf = '/'+outf if not outf.endswith('/') and outf != '/': outf += '/' outf += '%s/peaks/' % (exp_id) try: investigated_as = target['investigated_as'] except: logging.error( "%s: Failed to determine target type ... skipping" % (exp_id)) continue else: print(investigated_as) rep1_pe = tas['rep1_ta']['paired_end'] if not simplicate_experiment: rep2_pe = tas['rep2_ta']['paired_end'] else: rep2_pe = None if simplicate_experiment and rep1_pe is None: logging.error( "%s: Cannot determine paired end: rep1 PE = %s... skipping" % (exp_id, rep1_pe)) continue elif not simplicate_experiment and None in [rep1_pe, rep2_pe]: logging.error( "%s: Cannot determine paired end: rep1 PE = %s, rep2 PE = %s ... skipping" % (exp_id, rep1_pe, rep2_pe)) continue if not simplicate_experiment and rep1_pe != rep2_pe: logging.error( "%s: rep1 PE %s differs from rep2 PE %s ... skipping" % (exp_id, rep1_pe, rep2_pe)) continue if any('histone' in target_type for target_type in investigated_as): logging.info( "%s: Found to be histone. No blacklist will be used." 
% (exp_id)) wf_target = 'histone' blacklist = None else: logging.info("Assumed to be tf") wf_target = 'tf' if not args.blacklist: if args.assembly in ASSEMBLY_METADATA: blacklist = ASSEMBLY_METADATA[args.assembly]['blacklist'] else: logging.warning( "%s: No blacklist for assembly %s, proceeding with no blacklist" % (exp_id, args.assembly)) blacklist = None if not args.gsize: if args.assembly in ASSEMBLY_METADATA: genomesize = ASSEMBLY_METADATA[args.assembly]['gsize'] else: logging.error( "%s: Must specify -gsize for assembly %s" % (exp_id, args.assembly)) else: genomesize = args.gsize if not args.csizes: if args.assembly in ASSEMBLY_METADATA: chrom_sizes = ASSEMBLY_METADATA[args.assembly]['csizes'] else: logging.error( "%s: Must specify -csizes for assembly %s" % (exp_id, args.assembly)) else: chrom_sizes = args.csizes chip_workflow_absolute_path = os.path.dirname(os.path.realpath(__file__)) + "/chip_workflow.py" command_strings = [ chip_workflow_absolute_path, '--nomap --yes', '--target %s' % (wf_target), '--title "%s"' % (workflow_title), '--outf "%s"' % (outf), '--rep1pe %s' % (str(rep1_pe).lower()), '--rep1 %s' % (tas['rep1_ta'].get('file_id')), '--ctl1 %s' % (tas['rep1_ta'].get('control_id')), '--genomesize %s --chrom_sizes "%s"' % (genomesize, chrom_sizes), '--spp_version %s' % (args.spp_version) ] if not simplicate_experiment: command_strings.extend([ '--rep2pe %s' % (str(rep2_pe).lower()), '--rep2 %s' % (tas['rep2_ta'].get('file_id')), '--ctl2 %s' % (tas['rep2_ta'].get('control_id')), ]) if args.spp_instance: command_strings.append('--spp_instance %s' % str(args.spp_instance)) if args.fragment_length: command_strings.append('--fragment_length %s' % str(args.fragment_length)) if blacklist: command_strings.append('--blacklist "%s"' % (blacklist)) if args.debug: command_strings.append('--debug') if args.use_existing_folders: command_strings.append('--use_existing_folders') if args.accession: command_strings.append('--accession') if args.fqcheck is not None: command_strings.append('--fqcheck=%s' % (args.fqcheck)) if args.skip_control is not None: command_strings.append('--skip_control=%s' % (args.skip_control)) if args.force_patch is not None: command_strings.append('--force_patch=%s' % (args.force_patch)) run_command = ' '.join(command_strings) print(run_command) if args.dryrun: logging.info('Dryrun') else: try: subprocess.check_call(run_command, shell=True) except subprocess.CalledProcessError as e: logging.error( "%s: chip_workflow exited with non-zero code %d" % (exp_id, e.returncode)) else: print("%s workflow created" % (experiment['accession'])) logging.debug( "%s: patching internal_status to url %s" % (exp_id, experiment_url)) r = common.encoded_patch( experiment_url, keypair, {'internal_status': 'processing'}, return_response=True) try: r.raise_for_status() except: logging.warning( "%s: Failed to update experiment internal_status to processing. Skipping that update." % (exp_id)) logging.debug(r.text)
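# Sketch of the per-assembly defaulting used above. ASSEMBLY_METADATA is the
# module-level table this script assumes (a dict of dicts with keys like
# 'blacklist', 'gsize', 'csizes'); returning None lets the caller decide
# whether a missing value is fatal (gsize/csizes) or only a warning (blacklist).
def assembly_default(assembly, key):
    return ASSEMBLY_METADATA.get(assembly, {}).get(key)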
def main(): args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid,authpw) experiments = [] if args.experiments: experiments.extend(args.experiments) if args.infile: with open(args.infile,'r') as fh: experiments.extend([e for e in fh]) for exp_id in experiments: if exp_id.startswith('#'): continue exp_id = exp_id.rstrip() print "Experiment %s" %(exp_id) experiment_url = server + '/experiments/%s/' %(exp_id) experiment = common.encoded_get(experiment_url, keypair) if experiment.get('target'): target_url = server + experiment.get('target') target = common.encoded_get(target_url, keypair) else: logging.error('Experiment has no target ... skipping') continue print "%s %s %s" %(experiment['accession'], target.get('investigated_as'), experiment.get('description')) # ctl_id = get_control_id(experiment) # if ctl_id: # print "Control %s" %(ctl_id) # else: # print "Found no control ... skipping %s" %(exp_id) # continue # (rep1_ta,rep1_pe), (rep2_ta,rep2_pe) = get_exp_tas(experiment, server, keypair, args.project, args.inf) # (ctl1_ta,ctl1_pe), (ctl2_ta,ctl2_pe) = get_ctl_tas(experiment, server, keypair, args.project, args.inf) tas = get_tas(experiment, server, keypair, args.project, args.inf) if not tas: logging.error('Failed to resolve all tagaligns for %s' %(experiment['accession'])) continue pprint.pprint(tas) # sys.exit() #continue skip_flag = False for key,value in tas.iteritems(): if not value: logging.error('Missing %s ... skipping' %(key)) skip_flag = True if skip_flag: continue workflow_title = '%s Peaks' %(exp_id) if args.tag: workflow_title += ' %s' %(args.tag) outf = args.outf if not outf.startswith('/') and outf != '/': outf = '/'+outf if not outf.endswith('/') and outf != '/': outf += '/' outf += '%s/peaks/' %(exp_id) try: investigated_as = target['investigated_as'] except: print "%s: Failed to determine target type ... skipping" %(exp_id) continue else: print investigated_as rep1_pe = tas['rep1_ta']['paired_end'] rep2_pe = tas['rep2_ta']['paired_end'] if None in [rep1_pe, rep2_pe]: print "%s: Cannot determine paired end: rep1 PE = %s, rep2 PE = %s ... skipping" % ( exp_id, rep1_pe, rep2_pe) continue if rep1_pe != rep2_pe: print "%s: rep1 PE %s differs from rep2 PE %s ... skipping" % ( exp_id, rep1_pe, rep2_pe) continue if any('histone' in target_type for target_type in investigated_as): print "Found to be histone. No blacklist will be used." 
IDR_default = False workflow_spinner = '~/chip-seq-pipeline/dnanexus/histone_workflow.py' blacklist = None else: print "Assumed to be tf" IDR_default = True workflow_spinner = '~/chip-seq-pipeline/dnanexus/tf_workflow.py' if args.assembly == "hg19": blacklist = "ENCODE Reference Files:/hg19/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed.gz" else: print "WARNING: No blacklist known for assembly %s, proceeding with no blacklist" %(args.assembly) blacklist = None run_command = \ '%s --title "%s" --outf "%s" --nomap --yes ' % (workflow_spinner, workflow_title, outf) + \ '--rep1pe %s --rep2pe %s ' % (str(rep1_pe).lower(), str(rep2_pe).lower()) + \ '--rep1 %s --rep2 %s ' % (tas['rep1_ta'].get('file_id'), tas['rep2_ta'].get('file_id')) + \ '--ctl1 %s --ctl2 %s ' % (tas['rep1_ta'].get('control_id'), tas['rep2_ta'].get('control_id')) + \ '--genomesize %s --chrom_sizes "%s"' %(args.gsize, args.csizes) if blacklist: run_command += ' --blacklist "%s"' %(blacklist) if args.debug: run_command += ' --debug' if args.idr or IDR_default: run_command += ' --idr --idrversion %s' %(args.idrversion) print run_command if args.dryrun: logging.info('Dryrun') else: try: subprocess.check_call(run_command, shell=True) except subprocess.CalledProcessError as e: logging.error("%s exited with non-zero code %d" %(workflow_spinner, e.returncode)) else: print "%s workflow created" %(experiment['accession']) logging.debug("patching internal_status to url %s" %(experiment_url)) r = common.encoded_patch(experiment_url, keypair, {'internal_status':'processing'}, return_response=True) try: r.raise_for_status() except: logging.error("Tried but failed to update experiment internal_status to processing") logging.error(r.text)
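# Sketch of the output-folder normalization shared by these launchers: force a
# leading and trailing slash on --outf, then append the per-experiment peaks
# subfolder.
def peaks_folder(outf, exp_id):
    if not outf.startswith('/') and outf != '/':
        outf = '/' + outf
    if not outf.endswith('/') and outf != '/':
        outf += '/'
    return outf + '%s/peaks/' % (exp_id)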
def main(): global args args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) if args.experiments: exp_ids = csv.reader( StringIO.StringIO('\n'.join([s.rstrip() for s in args.experiments]))) else: exp_ids = csv.reader(args.infile) for row in exp_ids: if row[0].startswith('#'): continue exp_id = row[0].strip() if len(row) > 1: repns = [] for s in row[1:]: repns.extend(s.split(',')) map_only_reps = list(set([int(s) for s in repns])) else: map_only_reps = [] outstrings = [] encode_url = urlparse.urljoin(server, exp_id) experiment = common.encoded_get(encode_url, keypair) outstrings.append(exp_id) files = files_to_map(experiment, server, keypair, args.no_sfn_dupes) outstrings.append(str(len(files))) outstrings.append(str([f.get('accession') for f in files])) replicates = replicates_to_map(files, server, keypair, map_only_reps) biorep_numbers = \ set([rep.get('biological_replicate_number') for rep in replicates]) in_process = False if files: for biorep_n in biorep_numbers: outstrings.append('rep%s' % (biorep_n)) biorep_files = [ f for f in files if biorep_n in common.biorep_ns(f, server, keypair) ] paired_files = [] unpaired_files = [] while biorep_files: file_object = biorep_files.pop() if file_object.get( 'paired_end' ) == None: # group all the unpaired reads for this biorep together unpaired_files.append(file_object) elif file_object.get('paired_end') in ['1', '2']: if file_object.get('paired_with'): mate = next((f for f in biorep_files if f.get( '@id') == file_object.get('paired_with')), None) else: #have to find the file that is paired with this one mate = next((f for f in biorep_files if f.get( 'paired_with') == file_object.get('@id')), None) if mate: biorep_files.remove(mate) else: logging.warning('%s:%s could not find mate' % (experiment.get('accession'), file_object.get('accession'))) mate = {} # if mapping as SE, ignore the mate and just map the # rep1 as SE with all the other SE for this rep, if any if args.force_se: unpaired_files.append( next(f for f in [file_object, mate] if f.get('paired_end') == '1')) else: paired_files.append((file_object, mate)) if biorep_files: logging.warning( '%s: leftover file(s) %s' % (experiment.get('accession'), biorep_files)) if paired_files: pe_jobs = \ map_only(experiment, biorep_n, paired_files, server, keypair, args.sex_specific, args.crop_length, args.accession, args.fqcheck, args.force_patch, args.use_existing_folders, args.encoded_check) in_process = True if unpaired_files: se_jobs = \ map_only(experiment, biorep_n, unpaired_files, server, keypair, args.sex_specific, args.crop_length, args.accession, args.fqcheck, args.force_patch, args.use_existing_folders, args.encoded_check) in_process = True if paired_files and pe_jobs: outstrings.append( 'paired:%s' % ([(a.get('accession'), b.get('accession')) for (a, b) in paired_files])) outstrings.append('paired jobs:%s' % ([j.get_id() for j in pe_jobs])) else: outstrings.append('paired:%s' % (None)) if unpaired_files and se_jobs: outstrings.append( 'unpaired:%s' % ([f.get('accession') for f in unpaired_files])) outstrings.append('unpaired jobs:%s' % ([j.get_id() for j in se_jobs])) else: outstrings.append('unpaired:%s' % (None)) if in_process: r = common.encoded_patch(encode_url, keypair, {"internal_status": "processing"}, return_response=True) try: r.raise_for_status() except: logging.error("Tried and failed to set internal_status") logging.error(r.text) print('\t'.join(outstrings)) else: # no files if not replicates: logging.warning('%s: No 
files and no replicates' % experiment.get('accession')) else: logging.warning('%s: No files to map' % experiment.get('accession')) if files and not replicates: logging.warning('%s: Files but no replicates' % experiment.get('accession'))
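# Condensed sketch of the mate-matching loop above: pop each file off the
# biorep's list, pair read 1 with read 2 via paired_with (looked up in either
# direction), and collect anything without a paired_end as unpaired. The real
# loop also logs missing mates and can force SE mapping; that is omitted here.
def split_pairs(files):
    files = list(files)
    paired, unpaired = [], []
    while files:
        f = files.pop()
        if f.get('paired_end') not in ['1', '2']:
            unpaired.append(f)
            continue
        mate = next((m for m in files
                     if m.get('@id') == f.get('paired_with')
                     or m.get('paired_with') == f.get('@id')), None)
        if mate:
            files.remove(mate)
        paired.append((f, mate or {}))
    return paired, unpaired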
def main(): global args args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) if args.experiments: exp_ids = csv.reader( StringIO.StringIO('\n'.join([s.rstrip() for s in args.experiments]))) else: exp_ids = csv.reader(args.infile) for instring in exp_ids: exp_id = instring[0].strip() if len(instring) > 1: repns = [] for s in instring[1:]: repns.extend(s.split(',')) biorep_ns = list(set([int(s) for s in repns])) else: biorep_ns = [] outstrings = [] encode_url = urlparse.urljoin(server, exp_id) experiment = common.encoded_get(encode_url, keypair) outstrings.append(exp_id) files = files_to_map(experiment, server, keypair, args.sfn_dupes) outstrings.append(str(len(files))) outstrings.append(str([f.get('accession') for f in files])) replicates = replicates_to_map(files, server, keypair, biorep_ns) if files: for biorep_n in set( [rep.get('biological_replicate_number') for rep in replicates]): outstrings.append('rep%s' % (biorep_n)) biorep_files = [ f for f in files if biorep_n in common.biorep_ns(f, server, keypair) ] paired_files = [] unpaired_files = [] while biorep_files: file_object = biorep_files.pop() if file_object.get( 'paired_end' ) == None: # group all the unpaired reads for this biorep together unpaired_files.append(file_object) elif file_object.get('paired_end') in ['1', '2']: if file_object.get('paired_with'): mate = next((f for f in biorep_files if f.get( '@id') == file_object.get('paired_with')), None) else: #have to find the file that is paired with this one mate = next((f for f in biorep_files if f.get( 'paired_with') == file_object.get('@id')), None) if mate: biorep_files.remove(mate) else: logging.warning('%s:%s could not find mate' % (experiment.get('accession'), file_object.get('accession'))) mate = {} paired_files.append((file_object, mate)) if biorep_files: logging.warning( '%s: leftover file(s) %s' % (experiment.get('accession'), biorep_files)) if paired_files: pe_jobs = map_only(experiment, biorep_n, paired_files, args.key, server, keypair) if unpaired_files: se_jobs = map_only(experiment, biorep_n, unpaired_files, args.key, server, keypair) if paired_files and pe_jobs: outstrings.append( 'paired:%s' % ([(a.get('accession'), b.get('accession')) for (a, b) in paired_files])) outstrings.append('paired jobs:%s' % ([j.get_id() for j in pe_jobs])) else: outstrings.append('paired:%s' % (None)) if unpaired_files and se_jobs: outstrings.append( 'unpaired:%s' % ([f.get('accession') for f in unpaired_files])) outstrings.append('unpaired jobs:%s' % ([j.get_id() for j in se_jobs])) else: outstrings.append('unpaired:%s' % (None)) print '\t'.join(outstrings) else: # no files if not replicates: logging.warning('%s: No files and no replicates' % experiment.get('accession')) else: logging.warning('%s: No files to map' % experiment.get('accession')) if files and not replicates: logging.warning('%s: Files but no replicates' % experiment.get('accession'))
def main(): args = get_args() authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid,authpw) for exp_id in args.infile: if exp_id.startswith('#'): continue exp_id = exp_id.rstrip() print "Experiment %s" %(exp_id) url = server + '/experiments/%s/' %(exp_id) experiment = common.encoded_get(url, keypair) if experiment.get('target'): url = server + experiment.get('target') target = common.encoded_get(url, keypair) else: logging.error('Experiment has no target ... skipping') continue print "%s %s %s" %(experiment['accession'], target.get('investigated_as'), experiment.get('description')) # ctl_id = get_control_id(experiment) # if ctl_id: # print "Control %s" %(ctl_id) # else: # print "Found no control ... skipping %s" %(exp_id) # continue # (rep1_ta,rep1_pe), (rep2_ta,rep2_pe) = get_exp_tas(experiment, server, keypair, args.project, args.inf) # (ctl1_ta,ctl1_pe), (ctl2_ta,ctl2_pe) = get_ctl_tas(experiment, server, keypair, args.project, args.inf) tas = get_tas(experiment, server, keypair, args.project, args.inf) if not tas: logging.error('Failed to resolve all tagaligns for %s' %(experiment['accession'])) continue pprint.pprint(tas) # sys.exit() #continue skip_flag = False for key,value in tas.iteritems(): if not value: logging.error('Missing %s ... skipping' %(key)) skip_flag = True if skip_flag: continue workflow_name = '%s Peaks' %(exp_id) if args.tag: workflow_name += ' %s' %(args.tag) outf = args.outf if not outf.startswith('/') and outf != '/': outf = '/'+outf if not outf.endswith('/') and outf != '/': outf += '/' outf += '%s/peaks/' %(exp_id) try: investigated_as = target['investigated_as'] except: print "Failed to determine target type ... skipping %s" %(exp_id) continue else: print investigated_as if any('histone' in target_type for target_type in investigated_as): print "Found to be histone" workflow_spinner = '~/chip-seq-pipeline/dnanexus/histone_workflow.py' else: print "Assumed to be tf" workflow_spinner = '~/chip-seq-pipeline/dnanexus/tf_workflow.py' run_command = \ '%s --name "%s" --outf "%s" --nomap --yes ' %(workflow_spinner, workflow_name, outf) + \ '--rep1pe false --rep2pe false ' + \ '--rep1 %s --rep2 %s ' %(tas['rep1_ta'].get('file_id'), tas['rep2_ta'].get('file_id')) + \ '--ctl1 %s --ctl2 %s ' %(tas['rep1_ta'].get('control_id'), tas['rep2_ta'].get('control_id')) + \ '--genomesize %s --chrom_sizes "%s" ' %(args.gsize, args.csizes) + \ '--blacklist "ENCODE Reference Files:/hg19/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed.gz"' if args.debug: run_command += ' --debug' if args.idr: run_command += ' --idr --idrversion %s' %(args.idrversion) print run_command if args.dryrun: logging.info('Dryrun') else: try: subprocess.check_call(run_command, shell=True) except subprocess.CalledProcessError as e: logging.error("%s exited with non-zero code %d" %(workflow_spinner, e.returncode)) else: print "%s workflow created" %(experiment['accession'])
def main(): args = get_args() if args.debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) if args.query: r = requests.get( args.query, auth=keypair, headers={"content-type": "application/json", "accept": "application/json"} ) experiments = r.json()["@graph"] exp_ids = [e["accession"] for e in experiments] elif args.experiments: exp_ids = args.experiments else: exp_ids = args.infile for (i, exp_id) in enumerate(exp_ids): exp_id = exp_id.strip() logger.info("%s" % (exp_id)) url = urlparse.urljoin(server, "/experiments/%s" % (exp_id)) experiment_object = common.encoded_get(url, keypair) original_files = [ common.encoded_get(urlparse.urljoin(server, "%s" % (uri)), keypair) for uri in experiment_object.get("original_files") ] bams = [ f for f in original_files if f.get("file_format") == "bam" and f.get("status") not in ["revoked", "deleted", "replaced"] ] fastqs = [ f for f in original_files if f.get("file_format") == "fastq" and f.get("status") not in ["revoked", "deleted", "replaced"] ] for f in fastqs: f["replicate"] = common.encoded_get(urlparse.urljoin(server, "%s" % (f.get("replicate"))), keypair) for bam in bams: bioreps = common.biorep_ns(bam.get("accession"), server, keypair) if len(bioreps) != 1: logger.error( "Expected to find 1 biorep for bam %s, found %d. Skipping." % (bam.get("accession"), len(bioreps)) ) continue else: bam_biorep = bioreps[0] try: derived_from = [ common.encoded_get(urlparse.urljoin(server, "%s" % (uri)), keypair) for uri in bam.get("derived_from") ] except: derived_from = None if not derived_from: logger.error("bam %s is derived from nothing. Skipping" % (bam.get("accession"))) continue for f in derived_from: if f.get("file_format") != "fastq": logger.error( "bam %s appears to be derived from non-fastq %s. Continuing with other derived_from files." % (bam.get("accession"), f.get("accession")) ) continue try: if common.after(f.get("date_created"), bam.get("date_created")): logger.error( "Date conflict. Bam %s is derived from newer Fastq %s" % (bam.get("accession"), f.get("accession")) ) except: logger.error( "Cannot compare bam date %s with fastq date %s. Continuing with other derived_from files." % (bam.get("date_created"), f.get("date_created")) ) continue for f in fastqs: if f.get("replicate").get("biological_replicate_number") == bam_biorep: if common.after(f.get("date_created"), bam.get("date_created")): logger.info( "bam %s is out-of-date. fastq %s is newer" % (bam.get("accession"), f.get("accession")) ) if re.search("control", experiment_object.get("target").lower()): logger.info( "WARNING, %s is a control experiment so many other experiments may be out-of-date." % (experiment_object.get("accession")) )
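# Sketch of the staleness test above: a bam is out-of-date when any fastq from
# the same biological replicate was created after it. Assumes common.after(a, b)
# returns True when date a is later than date b, as it is used throughout, and
# that each fastq's 'replicate' has already been expanded to its embedded object.
def bam_is_stale(bam, fastqs, bam_biorep):
    return any(
        common.after(f.get('date_created'), bam.get('date_created'))
        for f in fastqs
        if f.get('replicate', {}).get('biological_replicate_number') == bam_biorep)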
def main(outfn, assembly, debug, key, keyfile, dryrun, force, pipeline, analysis_ids=None, infile=None, project=None):
    if debug:
        logger.info('setting logger level to logging.DEBUG')
        logger.setLevel(logging.DEBUG)
    else:
        logger.info('setting logger level to logging.INFO')
        logger.setLevel(logging.INFO)
    if infile is not None:
        infile = dxpy.DXFile(infile)
        dxpy.download_dxfile(infile.get_id(), "infile")
        ids = open("infile", 'r')
    elif analysis_ids is not None:
        ids = analysis_ids
    else:
        logger.error("Must supply one of --infile or a list of one or more analysis-ids")
        return
    authid, authpw, server = common.processkey(key, keyfile)
    keypair = (authid, authpw)
    common_metadata.update({'assembly': assembly})
    with open(outfn, 'w') as fh:
        if dryrun:
            fh.write('---DRYRUN: No files have been modified---\n')
        fieldnames = ['analysis', 'experiment', 'assembly', 'dx_pipeline', 'files', 'error']
        output_writer = csv.DictWriter(fh, fieldnames, delimiter='\t')
        output_writer.writeheader()
        for (i, analysis_id) in enumerate(ids):
            logger.debug('Processing analysis %s' % (analysis_id))
            analysis = dxpy.describe(analysis_id.strip())
            experiment = get_experiment_accession(analysis)
            output = {
                'analysis': analysis_id,
                'experiment': experiment,
                'assembly': assembly
            }
            logger.info('Accessioning analysis name %s executableName %s' % (analysis.get('name'), analysis.get('executableName')))
            if analysis.get('name') == 'histone_chip_seq':
                output.update({'dx_pipeline': 'histone_chip_seq'})
                try:
                    accessioned_files = accession_peaks_analysis_files(analysis, keypair, server, dryrun, force)
                except:
                    accessioned_files = None
                    output.update({'error': sys.exc_info()[0]})  # record the exception type in the report
                else:
                    output.update({'error': ""})
            elif analysis.get('executableName') == 'ENCODE mapping pipeline':
                output.update({'dx_pipeline': 'ENCODE mapping pipeline'})
                try:
                    accessioned_files = accession_mapping_analysis_files(analysis, keypair, server, dryrun, force)
                except:
                    accessioned_files = None
                    output.update({'error': sys.exc_info()[0]})
                else:
                    output.update({'error': ""})
            else:
                logger.error('unrecognized analysis pattern %s %s ... skipping.' % (analysis.get('name'), analysis.get('executableName')))
                output.update({'dx_pipeline': 'unrecognized'})
                accessioned_files = None
                output.update({'error': 'unrecognized analysis pattern %s %s' % (analysis.get('name'), analysis.get('executableName'))})
            file_accessions = [f.get('accession') for f in (accessioned_files or [])]
            logger.info("Accessioned: %s" % (file_accessions))
            output.update({'files': file_accessions})
            output_writer.writerow(output)
    common.touch(outfn)
    outfile = dxpy.upload_local_file(outfn)
    output = {}
    output["outfile"] = dxpy.dxlink(outfile)
    return output
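# The main() above takes keyword inputs and returns a dict of dxlinks, i.e. it
# is written as a DNAnexus applet entry point. The applet boilerplate is not
# shown in this section; it typically looks like the sketch below (illustrative
# only, not copied from this code).
import dxpy

@dxpy.entry_point('main')
def main(**applet_inputs):
    # accessioning logic as in the function above; returns {'outfile': dxlink}
    pass

dxpy.run()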
def main(): args = get_args() if args.debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) if args.analysis_ids: ids = args.analysis_ids elif args.created_after: analyses = [] for state in args.state: analyses.extend( dxpy.find_analyses( name="ENCSR*", name_mode="glob", state=state, include_subjobs=True, return_handler=True, created_after="%s" % (args.created_after), ) ) ids = [ analysis.get_id() for analysis in analyses if analysis.describe()["executableName"] == "tf_chip_seq" or analysis.describe()["executableName"].startswith("ENCSR783QUL Peaks") ] elif args.infile: ids = args.infile else: # never reached because inile defaults to stdin raise InputError("Must supply analysis id's in arguments, --infile or supply search string in --created_after") fieldnames = [ "name", "date", "analysis", "experiment", "target", "biosample_term_name", "biosample_type", "lab", "rfa", "assembly", "Nt", "Np", "N1", "N2", "rescue_ratio", "self_consistency_ratio", "reproducibility_test", "state", "release", "total price", "notes", ] writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, delimiter="\t", quotechar='"') writer.writeheader() for (i, analysis_id) in enumerate(ids): if analysis_id.startswith("#"): continue analysis_id = analysis_id.rstrip() logger.debug("%s" % (analysis_id)) analysis = dxpy.DXAnalysis(analysis_id) desc = analysis.describe() project = desc.get("project") m = re.match("^(ENCSR[0-9]{3}[A-Z]{3}) Peaks", desc["name"]) if m: experiment_accession = m.group(1) else: logger.error("No accession in %s, skipping." % (desc["name"])) continue experiment = common.encoded_get(urlparse.urljoin(server, "/experiments/%s" % (experiment_accession)), keypair) logger.debug("ENCODEd experiment %s" % (experiment["accession"])) if args.lab and experiment["lab"].split("/")[2] not in args.lab: continue try: idr_stage = next(s["execution"] for s in desc["stages"] if s["execution"]["name"] == "Final IDR peak calls") except: logging.error("Failed to find final IDR stage in %s" % (analysis_id)) else: if ( idr_stage["state"] != "done" ): # Final IDR peak calls stage not done, so loop through intermediate IDR stages to find errors Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None notes = [] # note this list contains a mis-spelled form of IDR Pooled Pseudoreplicates because until 11/13/15 the pipeline stage name was misspelled - need to be able to report on those runs idr_stage_names = [ "IDR True Replicates", "IDR Rep 1 Self-pseudoreplicates", "IDR Rep 2 Self-pseudoreplicates", "IDR Pooled Pseudoreplicates", "IDR Pooled Pseudoeplicates", ] for stage_name in idr_stage_names: try: idr_stage = next(s["execution"] for s in desc["stages"] if s["execution"]["name"] == stage_name) except StopIteration: continue except: raise if idr_stage["state"] == "failed": try: job_log = subprocess.check_output( "dx watch %s" % (idr_stage["id"]), shell=True, stderr=subprocess.STDOUT ) except subprocess.CalledProcessError as e: job_log = e.output else: job_log = None if job_log: patterns = [r"Peak files must contain at least 20 peaks post-merge"] for p in patterns: m = re.search(p, job_log) if m: notes.append("%s: %s" % (stage_name, m.group(0))) if not notes: notes.append(idr_stage["failureMessage"]) try: done_time = next( transition["setAt"] for transition in desc["stateTransitions"] if transition["newState"] == "failed" ) except StopIteration: done_time = "Not done or failed" 
except: raise else: Np = idr_stage["output"].get("Np") N1 = idr_stage["output"].get("N1") N2 = idr_stage["output"].get("N2") Nt = idr_stage["output"].get("Nt") rescue_ratio = idr_stage["output"].get("rescue_ratio") self_consistency_ratio = idr_stage["output"].get("self_consistency_ratio") reproducibility_test = idr_stage["output"].get("reproducibility_test") notes = "IDR Complete" done_time = next( transition["setAt"] for transition in desc["stateTransitions"] if transition["newState"] == "done" ) if done_time: date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(done_time / 1000)) else: date = "Running" analysis_link = "https://platform.dnanexus.com/projects/%s/monitor/analysis/%s" % ( desc.get("project").split("-")[1], desc.get("id").split("-")[1], ) experiment_link = "https://www.encodeproject.org/experiments/%s" % (experiment.get("accession")) row = { "name": desc.get("name"), "date": date, "analysis": analysis_link, "experiment": experiment_link, "target": experiment["target"].split("/")[2], "biosample_term_name": experiment.get("biosample_term_name"), "biosample_type": experiment.get("biosample_type"), "lab": experiment["lab"].split("/")[2], "rfa": common.encoded_get(server + experiment.get("award"), keypair).get("rfa"), "assembly": args.assembly, # TODO ... derive this from the analysis "Np": Np, "N1": N1, "N2": N2, "Nt": Nt, "rescue_ratio": rescue_ratio, "self_consistency_ratio": self_consistency_ratio, "reproducibility_test": reproducibility_test, "state": desc.get("state"), "release": experiment["status"], "total price": desc.get("totalPrice"), } if notes: row.update({"notes": "%s" % (notes)}) else: row.update({"notes": "%s" % ("OK")}) # log = subprocess.check_output('dx watch %s' %(analysis.)) writer.writerow(row)
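# For reference when reading the report columns above: Nt is the peak count
# from true replicates, Np from pooled pseudoreplicates, and N1/N2 from each
# replicate's self-pseudoreplicates. The sketch below shows how the two ratios
# and the reproducibility test are conventionally derived from those counts
# under the ENCODE guidelines; the pipeline computes these upstream, and the
# cutoff of 2 here is the conventional threshold, shown for illustration only.
def idr_reproducibility(Nt, Np, N1, N2):
    # assumes all four peak counts are positive
    rescue_ratio = float(max(Np, Nt)) / min(Np, Nt)
    self_consistency_ratio = float(max(N1, N2)) / min(N1, N2)
    if rescue_ratio < 2 and self_consistency_ratio < 2:
        reproducibility_test = 'pass'
    elif rescue_ratio >= 2 and self_consistency_ratio >= 2:
        reproducibility_test = 'fail'
    else:
        reproducibility_test = 'borderline'
    return rescue_ratio, self_consistency_ratio, reproducibility_test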
def main():
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)
    experiments = []
    if args.experiments:
        experiments.extend(args.experiments)
    if args.infile:
        with open(args.infile, 'r') as fh:
            experiments.extend([e for e in fh])
    for exp_id in experiments:
        if exp_id.startswith('#'):
            continue
        exp_id = exp_id.rstrip()
        print "Experiment %s" % (exp_id)
        experiment_url = server + '/experiments/%s/' % (exp_id)
        experiment = common.encoded_get(experiment_url, keypair)
        if experiment.get('target'):
            target_url = server + experiment.get('target')
            target = common.encoded_get(target_url, keypair)
        else:
            logging.error('Experiment has no target ... skipping')
            continue
        print "%s %s %s" % (experiment['accession'], target.get('investigated_as'), experiment.get('description'))
        # ctl_id = get_control_id(experiment)
        # if ctl_id:
        #     print "Control %s" % (ctl_id)
        # else:
        #     print "Found no control ... skipping %s" % (exp_id)
        #     continue
        # (rep1_ta, rep1_pe), (rep2_ta, rep2_pe) = get_exp_tas(experiment, server, keypair, args.project, args.inf)
        # (ctl1_ta, ctl1_pe), (ctl2_ta, ctl2_pe) = get_ctl_tas(experiment, server, keypair, args.project, args.inf)
        tas = get_tas(experiment, server, keypair, args.project, args.inf)
        if not tas:
            logging.error('Failed to resolve all tagaligns for %s' % (experiment['accession']))
            continue
        pprint.pprint(tas)
        # sys.exit()
        # continue
        skip_flag = False
        for key, value in tas.iteritems():
            if not value:
                logging.error('Missing %s ... skipping' % (key))
                skip_flag = True
        if skip_flag:
            continue
        workflow_title = '%s Peaks' % (exp_id)
        if args.tag:
            workflow_title += ' %s' % (args.tag)
        outf = args.outf
        if not outf.startswith('/') and outf != '/':
            outf = '/' + outf
        if not outf.endswith('/') and outf != '/':
            outf += '/'
        outf += '%s/peaks/' % (exp_id)
        try:
            investigated_as = target['investigated_as']
        except:
            print "%s: Failed to determine target type ... skipping" % (exp_id)
            continue
        else:
            print investigated_as
        rep1_pe = tas['rep1_ta']['paired_end']
        rep2_pe = tas['rep2_ta']['paired_end']
        if None in [rep1_pe, rep2_pe]:
            print "%s: Cannot determine paired end: rep1 PE = %s, rep2 PE = %s ... skipping" % (exp_id, rep1_pe, rep2_pe)
            continue
        if rep1_pe != rep2_pe:
            print "%s: rep1 PE %s differs from rep2 PE %s ... skipping" % (exp_id, rep1_pe, rep2_pe)
            continue
        if any('histone' in target_type for target_type in investigated_as):
            print "Found to be histone. No blacklist will be used."
            IDR_default = False
            workflow_spinner = '~/chip-seq-pipeline/dnanexus/histone_workflow.py'
            blacklist = None
        else:
            print "Assumed to be tf"
            IDR_default = True
            workflow_spinner = '~/chip-seq-pipeline/dnanexus/tf_workflow.py'
            if args.assembly == "hg19":
                blacklist = "ENCODE Reference Files:/hg19/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed.gz"
            else:
                print "WARNING: No blacklist known for assembly %s, proceeding with no blacklist" % (args.assembly)
                blacklist = None
        run_command = \
            '%s --title "%s" --outf "%s" --nomap --yes ' % (workflow_spinner, workflow_title, outf) + \
            '--rep1pe %s --rep2pe %s ' % (str(rep1_pe).lower(), str(rep2_pe).lower()) + \
            '--rep1 %s --rep2 %s ' % (tas['rep1_ta'].get('file_id'), tas['rep2_ta'].get('file_id')) + \
            '--ctl1 %s --ctl2 %s ' % (tas['rep1_ta'].get('control_id'), tas['rep2_ta'].get('control_id')) + \
            '--genomesize %s --chrom_sizes "%s"' % (args.gsize, args.csizes)
        if blacklist:
            run_command += ' --blacklist "%s"' % (blacklist)
        if args.debug:
            run_command += ' --debug'
        if args.idr or IDR_default:
            run_command += ' --idr --idrversion %s' % (args.idrversion)
        print run_command
        if args.dryrun:
            logging.info('Dryrun')
        else:
            try:
                subprocess.check_call(run_command, shell=True)
            except subprocess.CalledProcessError as e:
                logging.error("%s exited with non-zero code %d" % (workflow_spinner, e.returncode))
            else:
                print "%s workflow created" % (experiment['accession'])
                logging.debug("patching internal_status to url %s" % (experiment_url))
                r = common.encoded_patch(experiment_url, keypair, {'internal_status': 'processing'}, return_response=True)
                try:
                    r.raise_for_status()
                except:
                    logging.error("Tried but failed to update experiment internal_status to processing")
                    logging.error(r.text)
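# For illustration only: with hypothetical accession and file ids, the
# run_command assembled above resembles the shell invocation below. Flags come
# from the string built in the code; every value shown is made up.
#
#   ~/chip-seq-pipeline/dnanexus/tf_workflow.py --title "ENCSR000XYZ Peaks" \
#       --outf "/ENCSR000XYZ/peaks/" --nomap --yes \
#       --rep1pe false --rep2pe false \
#       --rep1 file-rep1ta --rep2 file-rep2ta \
#       --ctl1 file-ctl1ta --ctl2 file-ctl2ta \
#       --genomesize hs --chrom_sizes "<chrom.sizes reference>" \
#       --blacklist "ENCODE Reference Files:/hg19/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed.gz" \
#       --idr --idrversion 2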
def main(): args = get_args() if args.debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) authid, authpw, server = common.processkey(args.key, args.keyfile) keypair = (authid, authpw) if args.query: r = requests.get(args.query, auth=keypair, headers={ 'content-type': 'application/json', 'accept': 'application/json' }) experiments = r.json()['@graph'] exp_ids = [e['accession'] for e in experiments] elif args.experiments: exp_ids = args.experiments else: exp_ids = args.infile logger.info('Checking %d experiments' % (len(exp_ids))) for (i, exp_id) in enumerate(exp_ids): exp_id = exp_id.strip() #logger.info('%s' %(exp_id)) url = urlparse.urljoin(server, '/experiments/%s' % (exp_id)) experiment_object = common.encoded_get(url, keypair) original_files = [ common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair) for uri in experiment_object.get('original_files') ] bams = [ f for f in original_files if f.get('file_format') == 'bam' and f.get('status') not in ['revoked', 'deleted', 'replaced'] ] fastqs = [ f for f in original_files if f.get('file_format') == 'fastq' and f.get('status') not in ['revoked', 'deleted', 'replaced'] ] for f in fastqs: f['replicate'] = common.encoded_get( urlparse.urljoin(server, '%s' % (f.get('replicate'))), keypair) for bam in bams: bioreps = common.biorep_ns(bam.get('accession'), server, keypair) if len(bioreps) != 1: logger.error( "Expected to find 1 biorep for bam %s, found %s. Skipping." % (bam.get('accession'), bioreps)) continue else: bam_biorep = bioreps[0] try: derived_from = [ common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair) for uri in bam.get('derived_from') ] except: derived_from = None if not derived_from: logger.error('bam %s is derived from nothing. Skipping' % (bam.get('accession'))) continue for f in derived_from: if f.get('output_category') == 'reference': continue if f.get('file_format') != 'fastq': logger.error( "bam %s appears to be derived from non-fastq %s. Continuing with other derived_from files." % (bam.get('accession'), f.get('accession'))) continue try: if common.after(f.get('date_created'), bam.get('date_created')): logger.error( "Date conflict. Bam %s is derived from newer Fastq %s" % (bam.get('accession'), f.get('accession'))) except: logger.error( "Cannot compare bam date %s with fastq date %s. Continuing with other derived_from files." % (bam.get('date_created'), f.get('date_created'))) continue for f in fastqs: if f.get('replicate').get( 'biological_replicate_number') == bam_biorep: if common.after(f.get('date_created'), bam.get('date_created')): logger.info( "bam %s is out-of-date. fastq %s is newer" % (bam.get('accession'), f.get('accession'))) if re.search('control', experiment_object.get('target').lower()): logger.info( "WARNING, %s is a control experiment so many other experiments may be out-of-date." % (experiment_object.get('accession')))
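# The --query path above expects a complete ENCODE search URL whose JSON
# response lists the matching experiments under '@graph'. For example
# (illustrative filters; any valid experiment search works):
# example_query = ('https://www.encodeproject.org/search/'
#                  '?type=Experiment&assay_term_name=ChIP-seq'
#                  '&status=released&format=json&limit=all')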