def main(outfn, assembly, debug, key, keyfile, dryrun, force, analysis_ids=None, infile=None, project=None):

	if debug:
		logger.setLevel(logging.DEBUG)
	else:
		logger.setLevel(logging.INFO)

	if infile is not None:
		infile = dxpy.DXFile(infile)
		dxpy.download_dxfile(infile.get_id(), "infile")
		ids = open("infile",'r')
	elif analysis_ids is not None:
		ids = analysis_ids
	else:
		logger.error("Must supply one of --infile or a list of one or more analysis-ids")
		return

	authid, authpw, server = common.processkey(key, keyfile)
	keypair = (authid,authpw)
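	# Descriptive note (not in the original): common.processkey looks the named
	# keypair up in the keyfile and returns (authid, authpw, server); the
	# (authid, authpw) tuple is what requests expects for HTTP basic auth
	# against the ENCODE portal.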

	for (i, analysis_id) in enumerate(ids):
		logger.info('%s' %(analysis_id))
		accessioned_files = accession_analysis(analysis_id, keypair, server, assembly, dryrun, force)

	print accessioned_files
	common.touch(outfn)
	outfile = dxpy.upload_local_file(outfn)

	output = {}
	output["outfile"] = dxpy.dxlink(outfile)

	return output
Example #2
def s3cp(accession, key=None):

    (AUTHID, AUTHPW, SERVER) = common.processkey(key, KEYFILE)
    keypair = (AUTHID, AUTHPW)

    url = SERVER + '/search/?type=file&accession=%s&format=json&frame=embedded&limit=all' % (
        accession)
    #get the file object
    response = common.encoded_get(url, keypair)
    logger.debug(response)

    #select your file
    result = response.get('@graph')
    if not result:
        logger.error('Failed to find %s at %s' % (accession, url))
        return None
    else:
        f_obj = result[0]
        logger.debug(f_obj)

    #make the URL that will get redirected - get it from the file object's href property
    encode_url = urlparse.urljoin(SERVER, f_obj.get('href'))
    logger.debug("URL: %s" % (encode_url))
    logger.debug("%s:%s" % (AUTHID, AUTHPW))
    #stream=True avoids actually downloading the file, but it evaluates the redirection
    r = requests.get(encode_url,
                     auth=(AUTHID, AUTHPW),
                     headers={'content-type': 'application/json'},
                     allow_redirects=True,
                     stream=True)
    try:
        r.raise_for_status()
    except:
        logger.error('%s href does not resolve' % (f_obj.get('accession')))
    logger.debug("Response: %s", (r))

    #this is the actual S3 https URL after redirection
    s3_url = r.url
    logger.debug(s3_url)

    #release the connection
    r.close()

    #split up the url into components
    o = urlparse.urlparse(s3_url)

    #pull out the filename
    filename = os.path.basename(o.path)

    #hack together the s3 cp url (with the s3 method instead of https)
    bucket_url = S3_SERVER.rstrip('/') + o.path

    #cp the file from the bucket
    subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' % (bucket_url)),
                          stderr=subprocess.STDOUT)
    subprocess.check_call(shlex.split('ls -l %s' % (filename)))

    dx_file = dxpy.upload_local_file(filename)

    return dx_file
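# A minimal usage sketch (not part of the original example): given an ENCODE
# file accession and a keypair name resolvable by common.processkey, s3cp
# copies the underlying S3 object into the working directory and uploads it to
# DNAnexus.  The accession and key name below are placeholders.
#
#   dx_file = s3cp('ENCFF000XXX', key='www')
#   print dx_file.get_id()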
def main():

	args = get_args()
	if args.debug:
		logger.setLevel(logging.DEBUG)
	else:
		logger.setLevel(logging.INFO)

	authid, authpw, server = common.processkey(args.key, args.keyfile)
	keypair = (authid,authpw)

	if args.analysis_ids:
		ids = args.analysis_ids
	else:
		ids = args.infile

	formats = ['bed_narrowPeak', 'bed_gappedPeak']
	fieldnames = ['file','analysis','experiment','replicates','output_name','file_format','output_type','target','biosample_term_name','biosample_term_id','biosample_type','biosample_life_stage','biosample_age','biosample_organism']
	writer = csv.DictWriter(args.outfile, fieldnames, delimiter='\t')
	writer.writeheader()
	for (i, analysis_id) in enumerate(ids):
		analysis_id = analysis_id.rstrip()
		logger.info('%s' %(analysis_id))
		try:
			files = analysis_files(analysis_id, keypair, server, args.assembly)
		except:
			logger.error('%s error finding analysis_files.  Check experiment metadata.' %(analysis_id))
			continue
		for f in [f_obj for f_obj in files if f_obj.get('file_format') in formats]:
			fid = f['dx'].get_id()
			local_path = os.path.join(args.outdir,fid)
			if not os.path.isfile(local_path):
				if not os.path.exists(args.outdir):
					os.makedirs(args.outdir)
				dxpy.download_dxfile(fid, local_path)
			replicates = []
			for derived_from in f['derived_from']:
				rep_ns = common.biorep_ns(derived_from, server, keypair)
				for r in rep_ns:
					replicates.append(r)
			experiment = common.encoded_get(urlparse.urljoin(server,'/experiments/%s' %(f['dataset'])), keypair)
			rep = common.encoded_get(urlparse.urljoin(server, experiment['replicates'][0]), keypair)
			lib = common.encoded_get(urlparse.urljoin(server, rep['library']), keypair)
			biosample = common.encoded_get(urlparse.urljoin(server, lib['biosample']), keypair)
			writer.writerow({
				'file': fid,
				'analysis': analysis_id,
				'experiment': experiment.get('accession'),
				'replicates': replicates,
				'output_name': f.get('name'),
				'file_format': f.get('file_format'),
				'output_type': f.get('output_type'),
				'target': experiment.get('target'),
				'biosample_term_name': experiment.get('biosample_term_name'),
				'biosample_term_id': experiment.get('biosample_term_id'),
				'biosample_type': experiment.get('biosample_type'),
				'biosample_life_stage': biosample.get('life_stage'),
				'biosample_age': biosample.get('age'),
				'biosample_organism': biosample.get('organism')})
Example #4
def main():

	args = get_args()
	if args.debug:
		logger.setLevel(logging.DEBUG)
	else:
		logger.setLevel(logging.INFO)

	authid, authpw, server = common.processkey(args.key, args.keyfile)
	keypair = (authid,authpw)

	if args.experiments:
		exp_ids = args.experiments
	else:
		exp_ids = args.infile

	for (i, exp_id) in enumerate(exp_ids):
		exp_id = exp_id.rstrip()
		logger.info('%s' %(exp_id))
		url = urlparse.urljoin(server, 'metadata/type=experiment&accession=%s/metadata.tsv' %(exp_id))
		r = requests.get(url, auth=keypair)
		try:
			r.raise_for_status()
		except:
			logger.error('%s failed to get metadata.  GET returned %s' %(exp_id, r.status_code))
			logger.debug('%s' %(r.text))
			logger.error('Skipping ...')
			continue

		reader = csv.DictReader(StringIO.StringIO(r.text), delimiter='\t')
		fieldnames = copy.copy(reader.fieldnames)
		fieldnames.remove('Biological replicate(s)')
		fieldnames.insert(4,'Biological replicate(s)')
		fieldnames.remove('Biosample Age')
		fieldnames.insert(10,'Biosample Age')
		fieldnames.append('Derived from')
		writer = csv.DictWriter(args.outfile,fieldnames, delimiter='\t')
		writer.writeheader()
		for file_metadata in reader:
			file_accession = file_metadata.get('File accession')
			url = urlparse.urljoin(server, 'files/%s' %(file_accession))
			file_object = common.encoded_get(url, keypair)
			
			bio_reps = sorted(list(set(biorep_ns(file_accession, server, keypair))))
			file_metadata['Biological replicate(s)'] = ",".join([str(n) for n in bio_reps])

			bio_ages = sorted(list(set(biorep_ages(file_accession, server, keypair)))) or ""
			file_metadata.update({'Biosample Age': ",".join(bio_ages)})
			
			if file_object.get('derived_from'):
				derived_from = ",".join([str(f.split('/')[2]) for f in file_object.get('derived_from')])
			else:
				derived_from = None
			file_metadata.update({'Derived from': derived_from})

			#print file_metadata
			writer.writerow(file_metadata)
def main():

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid,authpw)

    if args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.rstrip()
        logger.info('%s' %(exp_id))
        url = urlparse.urljoin(server, 'metadata/type=experiment&accession=%s/metadata.tsv' %(exp_id))
        r = requests.get(url, auth=keypair)
        try:
            r.raise_for_status()
        except:
            logger.error('%s failed to get metadata.  GET returned %s' %(exp_id, r.status_code))
            logger.debug('%s' %(r.text))
            logger.error('Skipping ...')
            continue

        reader = csv.DictReader(StringIO.StringIO(r.text), delimiter='\t')
        fieldnames = copy.copy(reader.fieldnames)
        # fieldnames.remove('Biological replicate(s)')
        # fieldnames.insert(4,'Biological replicate(s)')
        # fieldnames.remove('Biosample Age')
        # fieldnames.insert(10,'Biosample Age')
        fieldnames.append('Derived from')
        writer = csv.DictWriter(args.outfile,fieldnames, delimiter='\t')
        writer.writeheader()
        for file_metadata in reader:
            file_accession = file_metadata.get('File accession')
            url = urlparse.urljoin(server, 'files/%s' %(file_accession))
            file_object = common.encoded_get(url, keypair)
            
            # bio_reps = sorted(list(set(biorep_ns(file_accession, server, keypair))))
            # file_metadata['Biological replicate(s)'] = ",".join([str(n) for n in bio_reps])

            # bio_ages = sorted(list(set(biorep_ages(file_accession, server, keypair)))) or ""
            # file_metadata.update({'Biosample Age': ",".join(bio_ages)})
            
            if file_object.get('derived_from'):
                derived_from = ",".join([str(f.split('/')[2]) for f in file_object.get('derived_from')])
            else:
                derived_from = None
            file_metadata.update({'Derived from': derived_from})

            #print file_metadata
            writer.writerow(file_metadata)
Example #6
def main(**kwargs):

    dxpy.download_folder(DCC_CREDENTIALS_PROJECT,
                         '.',
                         folder=DCC_CREDENTIALS_FOLDER)
    if 'key' in kwargs:
        key = '-'.join([dxpy.api.system_whoami()['id'], kwargs.pop('key')])
    else:
        key = dxpy.api.system_whoami()['id']
    key_tuple = common.processkey(key, KEYFILE)
    if not key_tuple:
        logger.error("Key %s is not found in the keyfile %s" % (key, KEYFILE))
        raise PortalCredentialsError("Supply a valid keypair ID")
    authid, authpw, server = key_tuple
    if 'url' in kwargs:
        server = kwargs.pop('url')
    keypair = (authid, authpw)

    tokens = ['python3 checkfiles.py']
    for k, v in kwargs.iteritems():
        if isinstance(v, bool):
            if v:
                tokens.append("--" + k.replace('_', '-'))
            continue
        if isinstance(v, str) or isinstance(v, unicode) or isinstance(v, int):
            tokens.append(' '.join(["--" + k.replace('_', '-'), str(v)]))
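    # For illustration (hypothetical values, not from the source): kwargs of
    # {'out': 'report.txt', 'dry_run': True} would contribute the tokens
    # '--out report.txt' and '--dry-run'.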

    if 'dx_file' in kwargs:
        dxfile = dxpy.DXFile(kwargs.get('dx_file'))
        local_file = dxpy.download_dxfile(dxfile, dxfile.name)
        tokens.append("--local-file %s" % (dxfile.name))

    # this is just to get a command string to print that has no secrets
    tokens_safe = deepcopy(tokens)
    tokens_safe.append("--username %s --password %s" %
                       ("." * len(authid), "." * len(authpw)))
    tokens_safe.append(server)
    logger.info(' '.join(tokens_safe))

    tokens.append("--username %s --password %s" % (authid, authpw))
    # this needs to be the last token
    tokens.append(server)

    checkfiles_command = ' '.join(tokens)
    subprocess.check_call(shlex.split(checkfiles_command))

    output = {}
    outfilename = kwargs.get('out')
    errfilename = kwargs.get('err')
    if outfilename:
        out = dxpy.upload_local_file(outfilename)
        output.update({'out': dxpy.dxlink(out)})
    if errfilename:
        err = dxpy.upload_local_file(errfilename)
        output.update({'err': dxpy.dxlink(err)})

    return output
def main():
    args = get_args()

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)
    project = resolve_project(args.project)
    SRR_files = dxpy.find_data_objects(
                    name="SRR???????_?.fastq.gz", name_mode='glob',
                    classname='file', recurse=True, return_handler=True,
                    folder=args.folder, project=args.project)
    for srr_dxfile in SRR_files:
        m = re.search('(SRR.{7})_(\d)', srr_dxfile.name)
        if m:
            srr_basename = m.group(1)
            end_num = m.group(2)
        else:
            assert m
        srr_encfiles = common.encoded_get('/'.join([server,'search/?type=File&external_accession=%s&status!=deleted&status!=replaced&status!=revoked' % (srr_basename)]), keypair)['@graph']
        if not srr_encfiles:
            logging.error('%s object not found at ENCODE.  Skipping.' % (srr_basename))
            continue
        elif len(srr_encfiles) > 1:
            logging.error('%s multiple matching objects found at ENCODE.  Skipping.' % (srr_basename))
            continue
        else:
            srr_encfile = srr_encfiles[0]
        # experiment = common.encoded_get('/'.join([server, srr_encfile.get('dataset')]), keypair)
        # replicate = common.encoded_get('/'.join([server, srr_encfile.get('replicate')]), keypair)
        # biorep_n = replicate.get('biological_replicate_number')
        all_fastqs = common.encoded_get('/'.join([
            server,
            'search/?type=File&file_format=fastq&derived_from=/files/%s/&status!=deleted&status!=revoked&status!=replaced' % (srr_basename)
        ]), keypair)['@graph']
        if not all_fastqs:
            print("%s: no fastq(s) found.  Skipping." % (srr_dxfile.name))
            continue
        if end_num == '1':
            fastqs = [f for f in all_fastqs if f.get('run_type') == 'single-ended' or f.get('paired_end') == end_num]
        elif end_num in ['2', '3']:
            fastqs = [f for f in all_fastqs if f.get('run_type') == 'paired-ended' and f.get('paired_end') == '2']
        if not fastqs:
            print("%s: no fastq(s) found for paired_end %s.  Skipping" % (srr_basename, end_num))
            continue
        elif len(fastqs) > 1:
            print("%s: ambiguous matches to %s.  Skipping" % (srr_basename, [f.get('accession') for f in fastqs]))
            continue
        else:
            fastq = fastqs[0]
            newname = '%s.fastq.gz' % (fastq.get('accession'))
            if args.dry_run:
                print('dry_run: Could rename %s to %s' % (srr_dxfile.name, newname))
            else:
                srr_dxfile.set_properties({'srr_filename': srr_dxfile.name})
                srr_dxfile.rename(newname)
                print('%s renamed to %s' % (srr_dxfile.name, newname))
Example #8
def s3_dxcp(accession, key=None):

    (AUTHID,AUTHPW,SERVER) = common.processkey(key,KEYFILE)
    keypair = (AUTHID,AUTHPW)

    url = SERVER + '/search/?type=file&accession=%s&format=json&frame=embedded&limit=all' %(accession)
    #get the file object
    response = common.encoded_get(url, keypair)
    logger.debug(response)

    #select your file
    result = response.get('@graph')
    if not result:
        logger.error('Failed to find %s at %s' %(accession, url))
        return None
    else:
        f_obj = result[0]
        logger.debug(f_obj)

    #make the URL that will get redirected - get it from the file object's href property
    encode_url = urlparse.urljoin(SERVER,f_obj.get('href'))
    logger.debug("URL: %s" %(encode_url))
    logger.debug("%s:%s" %(AUTHID, AUTHPW))
    #stream=True avoids actually downloading the file, but it evaluates the redirection
    r = requests.get(encode_url, auth=(AUTHID,AUTHPW), headers={'content-type': 'application/json'}, allow_redirects=True, stream=True)
    try:
        r.raise_for_status()
    except:
        logger.error('%s href does not resolve' %(f_obj.get('accession')))
    logger.debug("Response: %s", (r))

    #this is the actual S3 https URL after redirection
    s3_url = r.url
    logger.debug(s3_url)

    #release the connection
    r.close()

    #split up the url into components
    o = urlparse.urlparse(s3_url)

    #pull out the filename
    filename = os.path.basename(o.path)

    #hack together the s3 cp url (with the s3 method instead of https)
    bucket_url = S3_SERVER.rstrip('/') + o.path

    #cp the file from the bucket
    subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' %(bucket_url)), stderr=subprocess.STDOUT)
    subprocess.check_call(shlex.split('ls -l %s' %(filename)))

    dx_file = dxpy.upload_local_file(filename)

    return dx_file
Example #9
def main():
    args = get_args()
    logging.basicConfig(level=args.log_level)
    authid, authpw, base_url = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)
    experiment_data, file_data = get_experiments_and_files(
        base_url, keypair, args.report_type, args.assembly)
    references_data = get_references_data(base_url, keypair, args.report_type)
    build_rows = get_row_builder(args.report_type)
    rows = build_rows(experiment_data, file_data, references_data,
                      args.report_type, base_url, args)
    df = pd.DataFrame(rows)
    df = format_dataframe(df, args.report_type, base_url, args.output_type)
    outputter = get_outputter(args.output_type)
    outputter(df, args)
def main(outfn,
         assembly,
         debug,
         key,
         keyfile,
         dryrun,
         force,
         analysis_ids=None,
         infile=None,
         project=None):

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    if infile is not None:
        infile = dxpy.DXFile(infile)
        dxpy.download_dxfile(infile.get_id(), "infile")
        ids = open("infile", 'r')
    elif analysis_ids is not None:
        ids = analysis_ids
    else:
        logger.error(
            "Must supply one of --infile or a list of one or more analysis-ids"
        )
        return

    authid, authpw, server = common.processkey(key, keyfile)
    keypair = (authid, authpw)

    for (i, analysis_id) in enumerate(ids):
        logger.info('%s' % (analysis_id))
        accessioned_files = accession_analysis(analysis_id, keypair, server,
                                               assembly, dryrun, force)

    print accessioned_files
    common.touch(outfn)
    outfile = dxpy.upload_local_file(outfn)

    output = {}
    output["outfile"] = dxpy.dxlink(outfile)

    return output
def main():

	args = get_args()
	if args.debug:
		logger.setLevel(logging.DEBUG)
	else:
		logger.setLevel(logging.INFO)

	authid, authpw, server = common.processkey(args.key, args.keyfile)
	keypair = (authid,authpw)

	if args.analysis_ids:
		ids = args.analysis_ids
	else:
		ids = args.infile

	for (i, analysis_id) in enumerate(ids):
		logger.info('%s' %(analysis_id))
		accessioned_files = accession_analysis(analysis_id, keypair, server, args.assembly, args.dryrun, args.force)
def main():
	args = get_args()
	authid, authpw, server = common.processkey(args.key, args.keyfile)
	keypair = (authid,authpw)

	if args.infile and args.experiments:
		experiments = args.experiments
		experiments.extend([e.strip() for e in args.infile if e.strip()])
	elif args.infile:
		experiments = args.infile
	else:
		experiments = args.experiments

	for exp_id in experiments:
		uri = '/experiments/%s' %(exp_id)
		experiment = common.encoded_get(urlparse.urljoin(server,'%s' %(uri)), keypair)
		if experiment.get('status') == 'error':
			print experiment
			print "Error fetching %s ... skipping" %(exp_id)
			continue

		print experiment.get('accession')
		for uri in experiment['original_files']:
			url = urlparse.urljoin(server,'%s' %(uri))
			file_obj = common.encoded_get(url, keypair)
			print "%s, %s, %s, %s, %s, %s" %(file_obj.get('accession'),file_obj.get('file_type'),file_obj.get('file_format'),file_obj.get('file_format_type'),file_obj.get('output_type'),file_obj.get('status'))
			if file_obj.get('file_format') in ['bed', 'bigBed', 'bigWig']:
				if file_obj.get('status') != 'released' or args.force:
					patch_payload = {'status': args.status}
					if args.dryrun:
						print "--dryrun:  would have patched %s" %(json.dumps(patch_payload))
					else:
						r = requests.patch(url, auth=keypair, data=json.dumps(patch_payload), headers={'content-type': 'application/json', 'accept': 'application/json'})
						try:
							r.raise_for_status()
						except:
							print(r.text)
							print('Patch failed: %s %s ... skipping' % (r.status_code, r.reason))
							continue
						else:
							print "Patched %s" %(json.dumps(patch_payload))
def main(reads1, reads2, crop_length, reference_tar,
         bwa_aln_params, bwa_version, samtools_version,
         keyfile, debug, key=None):

    # reads1 and reads2 are expected to be arrays of file identifiers
    # identifiers can be DNAnexus files or ENCODE file accession numbers
    # For SE, reads2 is empty
    # For PE, len(reads1) = len(reads2)
    # Multiple PE pairs or SE files are just catted before mapping
    # Error on mixed SE/PE - although this can be implemented as just a
    # "" entry at that position in reads2 array
    # TODO: Add option to down-sample mixed PE/SE to SE
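    # Illustrative shapes only (accessions/file-ids below are placeholders,
    # not from the original source):
    #   SE: reads1 = ["ENCFF000AAA"]                          reads2 = []
    #   PE: reads1 = ["ENCFF000AAA", "file-B0000000000000"]   reads2 = ["ENCFF000BBB", "file-B1111111111111"]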

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # fetch the credentials from the DCC Credentials project
    dxpy.download_folder(
        DCC_CREDENTIALS_PROJECT, '.', folder=DCC_CREDENTIALS_FOLDER)

    if not key or key in ['www', 'submit', 'production']:
        key = dxpy.api.system_whoami()['id']
    elif key == 'test':
        key = dxpy.api.system_whoami()['id'] + "-test"

    key_tuple = common.processkey(key, keyfile)
    assert key_tuple, "ERROR: Key %s is not found in the keyfile %s" % (key, keyfile)
    authid, authpw, server = key_tuple
    keypair = (authid, authpw)

    logger.info("reads1: %s" % (reads1))
    logger.info("reads2: %s" % (reads2))

    if reads2:
        paired_end = True
        assert len(reads1) == len(reads2), "Paired-end and unequal numbers of read1 and read2 identifiers: %s %s" % (reads1, reads2)
    else:
        paired_end = False

    reads1_files = [resolve_file(read, server, keypair) for read in reads1]

    if paired_end:
        reads2_files = [resolve_file(read, server, keypair) for read in reads2]
    else:
        reads2_files = []

    # pooling multiple fastqs
    if len(reads1_files) > 1:
        reads1_file = pooled(reads1_files)
    else:
        reads1_file = reads1_files[0]

    if len(reads2_files) > 1:
        reads2_file = pooled(reads2_files)
    elif len(reads2_files) == 1:
        reads2_file = reads2_files[0]
    else:
        reads2_file = None

    reference_tar_file = resolve_file(reference_tar, server, keypair)

    logger.info('Resolved reads1 to %s', reads1_file)
    if reads2_file:
        logger.info('Resolved reads2 to %s', reads2_file)
    logger.info('Resolved reference_tar to %s', reference_tar_file)

    output = {
        "reads1": reads1_file,
        "reference_tar": reference_tar_file,
        "crop_length": crop_length,
        "bwa_aln_params": bwa_aln_params,
        "bwa_version": bwa_version,
        "samtools_version": samtools_version,
        "debug": debug
    }
    if reads2_file:
        output.update({"reads2": reads2_file})

    logger.info('Exiting with output: %s' % (output))

    return output
Example #14
def main():

    args = get_args()

    if args.debug:
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
        logger.setLevel(logging.DEBUG)
    else:  # use the default logging level
        logging.basicConfig(format='%(levelname)s:%(message)s')
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid,authpw)

    if args.experiments:
        ids = args.experiments
    # elif args.created_after:
    #   analyses = []
    #   for state in args.state:
    #       analyses.extend(dxpy.find_analyses(name="ENCSR*",name_mode='glob',state=state,include_subjobs=True,return_handler=True,created_after="%s" %(args.created_after)))
    #   ids = [analysis.get_id() for analysis in analyses if analysis.describe()['executableName'] == 'tf_chip_seq' or analysis.describe()['executableName'].startswith('ENCSR783QUL Peaks')]
    elif args.all:
        exp_query = \
            "/search/?type=Experiment" + \
            "&assay_title=ChIP-seq" + \
            "&award.project=ENCODE" + \
            "&status=released&status=submitted&status=in+progress&status=started&status=release+ready"
        all_experiments = common.encoded_get(server+exp_query, keypair)['@graph']
        ids = [exp.get('accession') for exp in all_experiments]
    elif args.infile:
        ids = args.infile
    else:
        # never reached because infile defaults to stdin
        raise InputError("Must supply experiment ids in arguments or --infile")

    fieldnames = [  'date','analysis','analysis id','experiment','target','biosample_term_name','biosample_type','lab','rfa','assembly',
                    'Nt','Np','N1','N2','rescue_ratio','self_consistency_ratio','reproducibility_test',
                    'state','release','total price','notes']
    writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, delimiter='\t', quotechar='"')
    writer.writeheader()

    idr_query = \
        "/search/?type=File" + \
        "&assembly=%s" % (args.assembly) + \
        "&file_format=bed" + \
        "&output_type=optimal+idr+thresholded+peaks" + \
        "&output_type=conservative+idr+thresholded+peaks" + \
        "&lab.title=ENCODE+Processing+Pipeline" + \
        "&lab.title=J.+Michael+Cherry,+Stanford" + \
        "&status=in+progress&status=released&status=uploading&status=uploaded"
    all_idr_files = common.encoded_get(server+idr_query, keypair)['@graph']

    for (i, experiment_id) in enumerate(ids):
        if experiment_id.startswith('#'):
            continue
        experiment_id = experiment_id.rstrip()
        experiment_uri = '/experiments/%s/' % (experiment_id)
        idr_files = \
            [f for f in all_idr_files if f['dataset'] == experiment_uri]
        idr_step_runs = set([f.get('step_run') for f in idr_files])
        if not len(idr_step_runs):
            if not args.all:
                logger.warning(
                    "%s: Found %d IDR step runs.  Skipping"
                    % (experiment_id, len(idr_step_runs)))
            continue

        idr_qc_uris = []
        assemblies = []
        for f in idr_files:
            quality_metrics = f.get('quality_metrics')
            if not len(quality_metrics) == 1:
                logger.error(
                    '%s: Expected one IDR quality metric for file %s. Found %d.'
                    % (experiment_id, f.get('accession'), len(quality_metrics)))
            idr_qc_uris.extend(quality_metrics)
            assembly = f.get('assembly')
            if not assembly:
                logger.error(
                    '%s: File %s has no assembly'
                    % (experiment_id, f.get('accession')))
            assemblies.append(assembly)
        idr_qc_uris = set(idr_qc_uris)
        if not len(idr_qc_uris) == 1:
            logger.error(
                '%s: Expected one unique IDR metric, found %d. Skipping.'
                % (experiment_id, len(idr_qc_uris)))
            continue
        assemblies = set(assemblies)
        if not len(assemblies) == 1:
            logger.error(
                '%s: Expected one unique assembly, found %d. Skipping.'
                % (experiment_id, len(assemblies)))
            continue
        assembly = next(iter(assemblies))

        idr_step_run_uri = next(iter(idr_step_runs))
        idr_step_run = common.encoded_get(server+idr_step_run_uri, keypair)
        try:
            dx_job_id_str = idr_step_run.get('dx_applet_details')[0].get('dx_job_id')
        except:
            logger.warning("Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id")
            logger.debug(idr_step_run)
            dx_job_id_str = None #could try to pull it from alias
        if not dx_job_id_str:
            logger.warning('%s: Missing dx_job_id for IDR step run.  Skipping.' % (experiment_id))
            continue
        dx_job_id = dx_job_id_str.rpartition(':')[2]
        dx_job = dxpy.DXJob(dx_job_id)
        job_desc = dx_job.describe()
        analysis_id = job_desc.get('analysis')

        logger.debug('%s' %(analysis_id))
        analysis = dxpy.DXAnalysis(analysis_id)
        desc = analysis.describe()
        project = desc.get('project')

        m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name'])
        if m:
            experiment_accession = m.group(1)
        else:
            logger.error("No accession in %s, skipping." % (desc['name']))
            continue

        if args.all:  # we've already gotten all the experiment objects
            experiment = \
                next(e for e in all_experiments
                     if e['accession'] == experiment_accession)
        else:
            experiment = \
                common.encoded_get(urlparse.urljoin(
                    server,
                    '/experiments/%s' % (experiment_accession)), keypair)
        logger.debug('ENCODEd experiment %s' % (experiment['accession']))
        if args.lab and experiment['lab'].split('/')[2] not in args.lab:
            continue



        try:
            idr_stage = next(s['execution'] for s in desc['stages'] if s['execution']['name'] == "Final IDR peak calls")
        except:
            logger.error('Failed to find final IDR stage in %s' %(analysis_id))
        else:
            if idr_stage['state'] != 'done': #Final IDR peak calls stage not done, so loop through intermediate IDR stages to find errors
                Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None
                notes = []
                #note this list contains a mis-spelled form of IDR Pooled Pseudoreplicates because until 11/13/15 the pipeline stage name was misspelled - need to be able to report on those runs
                idr_stage_names = ['IDR True Replicates', 'IDR Rep 1 Self-pseudoreplicates', 'IDR Rep 2 Self-pseudoreplicates', 'IDR Pooled Pseudoreplicates', 'IDR Pooled Pseudoeplicates']
                for stage_name in idr_stage_names:
                    try:
                        idr_stage = next(s['execution'] for s in desc['stages'] if s['execution']['name'] == stage_name)
                    except StopIteration:
                        continue
                    except:
                        raise
                    if idr_stage['state'] == 'failed':
                        try:
                            job_log = subprocess.check_output('dx watch %s' %(idr_stage['id']), shell=True, stderr=subprocess.STDOUT)
                        except subprocess.CalledProcessError as e:
                            job_log = e.output
                        else:
                            job_log = None
                        if job_log:
                            patterns = [r'Peak files must contain at least 20 peaks post-merge']
                            for p in patterns:
                                m = re.search(p,job_log)
                                if m:
                                    notes.append("%s: %s" %(stage_name,m.group(0)))
                        if not notes:
                            notes.append(idr_stage['failureMessage'])
                try:
                    done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "failed")
                except StopIteration:
                    done_time = "Not done or failed"
                except:
                    raise
            else:
                Np = idr_stage['output'].get('Np')
                N1 = idr_stage['output'].get('N1')
                N2 = idr_stage['output'].get('N2')
                Nt = idr_stage['output'].get('Nt')
                rescue_ratio = idr_stage['output'].get('rescue_ratio')
                self_consistency_ratio = idr_stage['output'].get('self_consistency_ratio')
                reproducibility_test = idr_stage['output'].get('reproducibility_test')
                notes = "IDR Complete"
                try:
                    done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "done")
                except StopIteration:
                    done_time = None
                except:
                    raise

        if done_time:
            date = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(done_time/1000))
        else:
            date = "Running"
        analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' %(desc.get('project').split('-')[1], desc.get('id').split('-')[1])
        experiment_link = '%sexperiments/%s' %(server, experiment.get('accession'))
        row = {
            'date': date,
            'analysis':     analysis_link,
            'analysis id':  desc.get('id'),
            'experiment':   experiment_link,
            'target':       experiment['target'].split('/')[2],
            'biosample_term_name':  experiment.get('biosample_term_name'),
            'biosample_type':   experiment.get('biosample_type'),
            'lab':          experiment['lab'].split('/')[2],
            'rfa':          common.encoded_get(server+experiment.get('award'),keypair).get('rfa'),
            'assembly':     assembly,
            'Np':           Np,
            'N1':           N1,
            'N2':           N2,
            'Nt':           Nt,
            'rescue_ratio': rescue_ratio,
            'self_consistency_ratio': self_consistency_ratio,
            'reproducibility_test': reproducibility_test,
            'state':        desc.get('state'),
            'release':      experiment['status'],
            'total price':  desc.get('totalPrice')
        }

        if notes:
            row.update({'notes': '%s' %(notes)})
        else:
            row.update({'notes': '%s' %('OK')})
        #log = subprocess.check_output('dx watch %s' %(analysis.))
        writer.writerow(row)
Example #15
def main():
	global args
	args = get_args()

	authid, authpw, server = common.processkey(args.key, args.keyfile)
	keypair = (authid,authpw)

	if args.experiments:
		exp_ids = csv.reader(StringIO.StringIO('\n'.join([s.rstrip() for s in args.experiments])))
	else:
		exp_ids = csv.reader(args.infile)

	for instring in exp_ids:
		exp_id = instring[0].strip()
		if len(instring) > 1:
			repns = []
			for s in instring[1:]:
				repns.extend(s.split(','))
			biorep_ns = list(set([int(s) for s in repns]))
		else:
			biorep_ns = []
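		# e.g. a (hypothetical) input row ['ENCSR000AAA', '1,2'] gives
		# exp_id 'ENCSR000AAA' and biorep_ns containing 1 and 2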
		outstrings = []
		encode_url = urlparse.urljoin(server,exp_id)
		experiment = common.encoded_get(encode_url, keypair)
		outstrings.append(exp_id)
		files = files_to_map(experiment, server, keypair, args.sfn_dupes)
		outstrings.append(str(len(files)))
		outstrings.append(str([f.get('accession') for f in files]))
		replicates = replicates_to_map(files, server, keypair, biorep_ns)

		if files:
			for biorep_n in set([rep.get('biological_replicate_number') for rep in replicates]):
				outstrings.append('rep%s' %(biorep_n))
				biorep_files = [f for f in files if biorep_n in common.biorep_ns(f,server,keypair)]
				paired_files = []
				unpaired_files = []
				while biorep_files:
					file_object = biorep_files.pop()
					if file_object.get('paired_end') == None: # group all the unpaired reads for this biorep together
						unpaired_files.append(file_object)
					elif file_object.get('paired_end') in ['1','2']:
						if file_object.get('paired_with'):
							mate = next((f for f in biorep_files if f.get('@id') == file_object.get('paired_with')), None)
						else: #have to find the file that is paired with this one
							mate = next((f for f in biorep_files if f.get('paired_with') == file_object.get('@id')), None)
						if mate:
							biorep_files.remove(mate)
						else:
							logging.warning('%s:%s could not find mate' %(experiment.get('accession'), file_object.get('accession')))
							mate = {}
						paired_files.append((file_object,mate))
				if biorep_files:
					logging.warning('%s: leftover file(s) %s' %(experiment.get('accession'), biorep_files))
				if paired_files:
					pe_jobs = map_only(experiment, biorep_n, paired_files, args.key, server, keypair)
				if unpaired_files:
					se_jobs = map_only(experiment, biorep_n, unpaired_files, args.key, server, keypair)
				if paired_files and pe_jobs:
					outstrings.append('paired:%s' %([(a.get('accession'), b.get('accession')) for (a,b) in paired_files]))
					outstrings.append('paired jobs:%s' %([j.get_id() for j in pe_jobs]))
				else:
					outstrings.append('paired:%s' %(None))
				if unpaired_files and se_jobs:
					outstrings.append('unpaired:%s' %([f.get('accession') for f in unpaired_files]))
					outstrings.append('unpaired jobs:%s' %([j.get_id() for j in se_jobs]))
				else:
					outstrings.append('unpaired:%s' %(None))

			print '\t'.join(outstrings)
		else: # no files
			if not replicates:
				logging.warning('%s: No files and no replicates' %experiment.get('accession'))
			else:
				logging.warning('%s: No files to map' %experiment.get('accession'))
		if files and not replicates:
			logging.warning('%s: Files but no replicates' %experiment.get('accession'))
Example #16
#!/usr/bin/env python2

import common
import pprint

DEPRECATED_STATUSES = ['deleted', 'revoked', 'replaced']

authid, authpw, server = common.processkey()
# server = "https://test.encodedcc.org"
# authid = "JQYGP4PB"
# authpw = "pfk2f3f3stivzbct"
keypair = (authid, authpw)

experiments = common.encoded_get(
    'https://www.encodeproject.org/search/?'
    'type=Experiment&'
    'award.project=ENCODE', keypair)['@graph']

print "Got %d experiments" % (len(experiments))

all_GRCh38_bams = common.encoded_get(
    'https://www.encodeproject.org/search/?'
    'type=File&'
    'file_format=bam&'
    'assembly=GRCh38', keypair)['@graph']

print "Got %d bams" % (len(all_GRCh38_bams))

assay_titles = {}

for exp in experiments:
def main():

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid,authpw)

    if args.query:
        r = requests.get(args.query, auth=keypair, headers={'content-type': 'application/json', 'accept': 'application/json'})
        experiments = r.json()['@graph']
        exp_ids = [e['accession'] for e in experiments]
    elif args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    logger.info('Checking %d experiments' % (len(exp_ids)))
    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.strip()
        #logger.info('%s' %(exp_id))

        url = urlparse.urljoin(server, '/experiments/%s' %(exp_id))
        experiment_object = common.encoded_get(url, keypair)
        original_files = [common.encoded_get(urlparse.urljoin(server,'%s' %(uri)), keypair) for uri in experiment_object.get('original_files')]
        bams = [f for f in original_files if f.get('file_format') == 'bam' and f.get('status') not in ['revoked','deleted','replaced']]
        fastqs = [f for f in original_files if f.get('file_format') == 'fastq' and f.get('status') not in ['revoked','deleted','replaced']]
        for f in fastqs:
            f['replicate'] = common.encoded_get(urlparse.urljoin(server,'%s' %(f.get('replicate'))), keypair)
        for bam in bams:
            bioreps = common.biorep_ns(bam.get('accession'),server,keypair)
            if len(bioreps) != 1:
                logger.error("Expected to find 1 biorep for bam %s, found %s.  Skipping." %(bam.get('accession'), bioreps))
                continue
            else:
                bam_biorep = bioreps[0]
            try:
                derived_from = [common.encoded_get(urlparse.urljoin(server,'%s' %(uri)), keypair) for uri in bam.get('derived_from')]
            except:
                derived_from = None
            if not derived_from:
                logger.error('bam %s is derived from nothing. Skipping' %(bam.get('accession')))
                continue
            for f in derived_from:
                if f.get('output_category') == 'reference':
                    continue
                if f.get('file_format') != 'fastq':
                    logger.error("bam %s appears to be derived from non-fastq %s. Continuing with other derived_from files." %(bam.get('accession'), f.get('accession')))
                    continue
                try:
                    if common.after(f.get('date_created'), bam.get('date_created')):
                        logger.error("Date conflict. Bam %s is derived from newer Fastq %s" %(bam.get('accession'), f.get('accession')))
                except:
                    logger.error("Cannot compare bam date %s with fastq date %s. Continuing with other derived_from files." %(bam.get('date_created'), f.get('date_created')))
                    continue
            for f in fastqs:
                if f.get('replicate').get('biological_replicate_number') == bam_biorep:
                    if common.after(f.get('date_created'), bam.get('date_created')):
                        logger.info("bam %s is out-of-date.  fastq %s is newer" %(bam.get('accession'), f.get('accession')))
                        if re.search('control',experiment_object.get('target').lower()):
                            logger.info("WARNING, %s is a control experiment so many other experiments may be out-of-date." %(experiment_object.get('accession')))
Example #18
def main():
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    experiments = []
    if args.experiments:
        experiments.extend(args.experiments)
    if args.infile:
        with open(args.infile, 'r') as fh:
            experiments.extend([e for e in fh])

    if args.control:
        control_dxhandler = resolve_dx_file(args.control)
    else:
        control_dxhandler = None

    for exp_id in experiments:
        if exp_id.startswith('#'):
            continue
        exp_id = exp_id.rstrip()
        print("Experiment %s" % (exp_id))
        experiment_url = server + '/experiments/%s/' % (exp_id)
        experiment = common.encoded_get(experiment_url, keypair)
        if experiment.get('target'):
            target_url = server + experiment.get('target')
            target = common.encoded_get(target_url, keypair)
        else:
            logging.error('Experiment has no target ... skipping')
            continue

        print("%s %s %s" %
              (experiment['accession'], target.get('investigated_as'),
               experiment.get('description')))

        tas = get_tas(experiment, server, keypair, args.project, args.inf,
                      control_dxhandler)
        if not tas:
            logging.error('Failed to resolve all tagaligns for %s' %
                          (experiment['accession']))
            continue
        if not tas.get('rep2_ta'):
            simplicate_experiment = True
            print("Simplicate experiment ta's:")
        else:
            simplicate_experiment = False
            print("Replicated experiment ta's:")
        pprint(tas)
        # sys.exit()
        # continue

        for key, value in tas.iteritems():
            if not value:
                logging.error('Missing %s ... skipping' % (key))
                continue

        workflow_title = '%s Peaks' % (exp_id)
        if args.tag:
            workflow_title += ' %s' % (args.tag)
        outf = args.outf

        if not outf.startswith('/') and outf != '/':
            outf = '/' + outf
        if not outf.endswith('/') and outf != '/':
            outf += '/'
        outf += '%s/peaks/' % (exp_id)

        try:
            investigated_as = target['investigated_as']
        except:
            logging.error("%s: Failed to determine target type ... skipping" %
                          (exp_id))
            continue
        else:
            print(investigated_as)

        rep1_pe = tas['rep1_ta']['paired_end']
        if not simplicate_experiment:
            rep2_pe = tas['rep2_ta']['paired_end']
        else:
            rep2_pe = None

        if simplicate_experiment and rep1_pe is None:
            logging.error(
                "%s: Cannot determine paired end: rep1 PE = %s... skipping" %
                (exp_id, rep1_pe))
            continue
        elif not simplicate_experiment and None in [rep1_pe, rep2_pe]:
            logging.error(
                "%s: Cannot determine paired end: rep1 PE = %s, rep2 PE = %s ... skipping"
                % (exp_id, rep1_pe, rep2_pe))
            continue

        if not simplicate_experiment and rep1_pe != rep2_pe:
            logging.error(
                "%s: rep1 PE %s differs from rep2 PE %s ... skipping" %
                (exp_id, rep1_pe, rep2_pe))
            continue

        if any('histone' in target_type for target_type in investigated_as):
            logging.info(
                "%s: Found to be histone.  No blacklist will be used." %
                (exp_id))
            wf_target = 'histone'
            blacklist = None
        else:
            logging.info("Assumed to be tf")
            wf_target = 'tf'
            if not args.blacklist:
                if args.assembly in ASSEMBLY_METADATA:
                    blacklist = ASSEMBLY_METADATA[args.assembly]['blacklist']
                else:
                    logging.warning(
                        "%s: No blacklist for assembly %s, proceeding with no blacklist"
                        % (exp_id, args.assembly))
                    blacklist = None

        if not args.gsize:
            if args.assembly in ASSEMBLY_METADATA:
                genomesize = ASSEMBLY_METADATA[args.assembly]['gsize']
            else:
                logging.error("%s: Must specify -gsize for assembly %s" %
                              (exp_id, args.assembly))
        else:
            genomesize = args.gsize

        if not args.csizes:
            if args.assembly in ASSEMBLY_METADATA:
                chrom_sizes = ASSEMBLY_METADATA[args.assembly]['csizes']
            else:
                logging.error("%s: Must specify -csizes for assembly %s" %
                              (exp_id, args.assembly))
        else:
            chrom_sizes = args.csizes
        chip_workflow_absolute_path = os.path.dirname(
            os.path.realpath(__file__)) + "/chip_workflow.py"
        command_strings = [
            chip_workflow_absolute_path, '--nomap --yes',
            '--target %s' % (wf_target),
            '--title "%s"' % (workflow_title),
            '--outf "%s"' % (outf),
            '--rep1pe %s' % (str(rep1_pe).lower()),
            '--rep1 %s' % (tas['rep1_ta'].get('file_id')),
            '--ctl1 %s' % (tas['rep1_ta'].get('control_id')),
            '--genomesize %s --chrom_sizes "%s"' % (genomesize, chrom_sizes),
            '--spp_version %s' % (args.spp_version)
        ]
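        # The assembled command (values here are illustrative placeholders,
        # not from the source) resembles:
        #   chip_workflow.py --nomap --yes --target tf --title "ENCSR000AAA Peaks" --outf "/ENCSR000AAA/peaks/" ...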

        if not simplicate_experiment:
            command_strings.extend([
                '--rep2pe %s' % (str(rep2_pe).lower()),
                '--rep2 %s' % (tas['rep2_ta'].get('file_id')),
                '--ctl2 %s' % (tas['rep2_ta'].get('control_id')),
            ])

        if args.fragment_length:
            command_strings.append('--fragment_length %s' %
                                   str(args.fragment_length))

        if blacklist:
            command_strings.append('--blacklist "%s"' % (blacklist))
        if args.debug:
            command_strings.append('--debug')
        if args.use_existing_folders:
            command_strings.append('--use_existing_folders')
        if args.accession:
            command_strings.append('--accession')
            if args.fqcheck is not None:
                command_strings.append('--fqcheck=%s' % (args.fqcheck))
            if args.skip_control is not None:
                command_strings.append('--skip_control=%s' %
                                       (args.skip_control))
            if args.force_patch is not None:
                command_strings.append('--force_patch=%s' % (args.force_patch))
        run_command = ' '.join(command_strings)
        print(run_command)

        if args.dryrun:
            logging.info('Dryrun')
        else:
            try:
                subprocess.check_call(run_command, shell=True)
            except subprocess.CalledProcessError as e:
                logging.error(
                    "%s: chip_workflow exited with non-zero code %d" %
                    (exp_id, e.returncode))
            else:
                print("%s workflow created" % (experiment['accession']))
                logging.debug("%s: patching internal_status to url %s" %
                              (exp_id, experiment_url))
                r = common.encoded_patch(experiment_url,
                                         keypair,
                                         {'internal_status': 'processing'},
                                         return_response=True)
                try:
                    r.raise_for_status()
                except:
                    logging.warning(
                        "%s: Failed to update experiment internal_status to processing. Skipping that update."
                        % (exp_id))
                    logging.debug(r.text)
#!/usr/bin/env python2

import common
import pprint

DEPRECATED_STATUSES = ['deleted', 'revoked', 'replaced']

authid, authpw, server = common.processkey()
# server = "https://test.encodedcc.org"
# authid = "JQYGP4PB"
# authpw = "pfk2f3f3stivzbct"
keypair = (authid, authpw)

experiments = common.encoded_get(
    'https://www.encodeproject.org/search/?'
    'type=Experiment&'
    'award.project=ENCODE',
    keypair)['@graph']

print "Got %d experiments" % (len(experiments))

all_GRCh38_bams = common.encoded_get(
    'https://www.encodeproject.org/search/?'
    'type=File&'
    'file_format=bam&'
    'assembly=GRCh38',
    keypair)['@graph']

print "Got %d bams" % (len(all_GRCh38_bams))

assay_titles = {}
def main():
    args = get_args()
    if args.debug:
        logging.basicConfig(format='%(levelname)s:%(message)s',
                            level=logging.DEBUG)
        logger.setLevel(logging.DEBUG)
    else:
        # Use the default logging level.
        logging.basicConfig(format='%(levelname)s:%(message)s')
        logger.setLevel(logging.INFO)
    if args.released:
        keypair = None
        server = PUBLIC_SERVER
    else:
        authid, authpw, server = common.processkey(args.key, args.keyfile)
        keypair = (authid, authpw)
    if args.experiments:
        ids = args.experiments
    elif args.all:
        # Get metadata for all ChIP-seq Experiments.
        base_exp_query = '/search/?type=Experiment&assay_title=ChIP-seq&award.project=ENCODE&status=released'
        extended_query = '&status=submitted&status=in+progress&status=started&status=release+ready'
        exp_query = base_exp_query if args.released else (base_exp_query +
                                                          extended_query)
        all_experiments = common.encoded_get(server + exp_query,
                                             keypair)['@graph']
        # Extract Experiment accessions.
        ids = [exp.get('accession') for exp in all_experiments]
    elif args.infile:
        ids = args.infile
    else:
        # Never reached because infile defaults to stdin.
        raise InputError('Must supply experiment ids'
                         ' in arguments or --infile.')
    # Define column names for TSV.
    fieldnames = [
        'date', 'analysis', 'analysis_id', 'experiment', 'target',
        'biosample_term_name', 'biosample_type', 'replication', 'lab', 'rfa',
        'assembly', 'Nt', 'Np', 'N1', 'N2', 'rescue_ratio',
        'self_consistency_ratio', 'reproducibility_test', 'Ft', 'Fp', 'F1',
        'F2', 'state', 'release', 'total_price', 'quality_metric_of'
    ]
    if args.create_google_sheet:
        # Force creation of temporary CSV that can be loaded into a DataFrame,
        # written to Google Sheets, then deleted.
        temp_file = 'temp_idr_%s.tsv' % (args.assembly)
        args.outfile = open(temp_file, 'w')
    writer = csv.DictWriter(args.outfile,
                            fieldnames=fieldnames,
                            delimiter='\t',
                            quotechar='"')
    writer.writeheader()
    # Get metadata for all IDR output Files.
    base_idr_query = ('/search/?type=File&assembly=%s&file_format=bed'
                      '&output_type=optimal+idr+thresholded+peaks'
                      '&output_type=conservative+idr+thresholded+peaks'
                      '&output_type=pseudoreplicated+idr+thresholded+peaks'
                      '&lab.title=ENCODE+Processing+Pipeline'
                      '&lab.title=J.+Michael+Cherry,+Stanford'
                      '&status=released' % (args.assembly))
    extended_idr_query = '&status=in+progress&status=uploading&status=uploaded'
    idr_query = base_idr_query if args.released else (base_idr_query +
                                                      extended_idr_query)
    all_idr_files = common.encoded_get(server + idr_query, keypair)['@graph']
    na = 'not_available'
    for (i, experiment_id) in enumerate(ids):
        if experiment_id.startswith('#'):
            continue
        experiment_id = experiment_id.rstrip()
        experiment_uri = '/experiments/%s/' % (experiment_id)
        idr_files = \
            [f for f in all_idr_files if f['dataset'] == experiment_uri]
        idr_step_runs = set([f.get('step_run') for f in idr_files])
        if not len(idr_step_runs):
            if not args.all:
                logger.warning("%s: Found %d IDR step runs. Skipping" %
                               (experiment_id, len(idr_step_runs)))
            continue
        idr_qc_uris = []
        assemblies = []
        for f in idr_files:
            quality_metrics = f.get('quality_metrics')
            if not len(quality_metrics) == 1:
                logger.error(
                    '%s: Expected one IDR quality metric for file %s.'
                    ' Found %d.' %
                    (experiment_id, f.get('accession'), len(quality_metrics)))
            idr_qc_uris.extend(quality_metrics)
            assembly = f.get('assembly')
            if not assembly:
                logger.error('%s: File %s has no assembly' %
                             (experiment_id, f.get('accession')))
            assemblies.append(assembly)
        idr_qc_uris = set(idr_qc_uris)
        if not len(idr_qc_uris) == 1:
            logger.error('%s: Expected one unique IDR metric,'
                         ' found %d. Skipping.' %
                         (experiment_id, len(idr_qc_uris)))
            continue
        assemblies = set(assemblies)
        if not len(assemblies) == 1:
            logger.error('%s: Expected one unique assembly, found %d.'
                         ' Skipping.' % (experiment_id, len(assemblies)))
            continue
        # Grab unique value from set.
        idr_qc_uri = next(iter(idr_qc_uris))
        assembly = next(iter(assemblies))
        # Get analysis_id from DNAnexus, create analysis_link.
        idr_step_run_uri = next(iter(idr_step_runs))
        try:
            idr_step_run = common.encoded_get(server + idr_step_run_uri,
                                              keypair)
        except Exception as e:
            print(experiment_id, e, 'Skipping.')
            continue
        try:
            dx_job_id_str = idr_step_run.get('dx_applet_details')[0].get(
                'dx_job_id')
        except:
            logger.warning(
                "Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id"
            )
            logger.debug(idr_step_run)
            # Could try to pull it from alias.
            dx_job_id_str = None
        if not dx_job_id_str:
            logger.warning('%s: Missing dx_job_id for IDR step run.  Skipping.' % (experiment_id))
            continue
        dx_job_id = dx_job_id_str.rpartition(':')[2]
        if not args.released:
            dx_job = dxpy.DXJob(dx_job_id)
            job_desc = dx_job.describe()
            analysis_id = job_desc.get('analysis')
            logger.debug('%s' % (analysis_id))
            analysis = dxpy.DXAnalysis(analysis_id)
            desc = analysis.describe()
            project = desc.get('project')
            analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
                desc.get('project').split('-')[1],
                desc.get('id').split('-')[1])
        else:
            analysis_link = na
            desc = {}

        # Get IDR object.
        idr = common.encoded_get(server + idr_qc_uri, keypair)
        # Pull metrics of interest.
        idr_status = idr.get('status', na)
        if (args.released and (idr_status == na or idr_status != 'released')):
            logger.error('%s: Expected released IDR metric. Skipping.' %
                         idr_qc_uris)
            continue
        Np = idr.get('Np', na)
        N1 = idr.get('N1', na)
        N2 = idr.get('N2', na)
        Nt = idr.get('Nt', na)
        Fp = idr.get('Fp', na)
        F1 = idr.get('F1', na)
        F2 = idr.get('F2', na)
        Ft = idr.get('Ft', na)
        quality_metric_of = idr.get('quality_metric_of', [])
        date = idr.get('date_created', na)
        rescue_ratio = idr.get('rescue_ratio', na)
        self_consistency_ratio = idr.get('self_consistency_ratio', na)
        reproducibility_test = idr.get('reproducibility_test', na)
        # Get Experiment object.
        experiment = common.encoded_get(server + experiment_id, keypair)
        experiment_link = '%sexperiments/%s' % (server,
                                                experiment.get('accession'))
        # Get Award object.
        award = common.encoded_get(server + experiment.get('award'), keypair)
        # Grab project phase, e.g. ENCODE4.
        rfa = award.get('rfa', na)
        row = {
            'date': date,
            'analysis': analysis_link,
            'analysis_id': desc.get('id', na),
            'experiment': experiment_link,
            'target': experiment['target'].split('/')[2],
            'biosample_term_name': experiment.get('biosample_term_name'),
            'biosample_type': experiment.get('biosample_type'),
            'replication': experiment.get('replication_type'),
            'lab': experiment['lab'].split('/')[2],
            'rfa': rfa,
            'assembly': assembly,
            'Nt': Nt,
            'Np': Np,
            'N1': N1,
            'N2': N2,
            'rescue_ratio': rescue_ratio,
            'self_consistency_ratio': self_consistency_ratio,
            'reproducibility_test': reproducibility_test,
            'Ft': Ft,
            'Fp': Fp,
            'F1': F1,
            'F2': F2,
            'state': desc.get('state', na),
            'release': experiment['status'],
            'total_price': desc.get('totalPrice', na),
            'quality_metric_of': ', '.join(quality_metric_of)
        }
        writer.writerow(row)
    if args.create_google_sheet:
        args.outfile.close()
        # Load CSV data, sort.
        idr_data = pd.read_table(temp_file)
        idr_data = idr_data.replace('not_available', '')
        idr_data.date = idr_data.date.apply(lambda x: pd.to_datetime(x))
        idr_data = idr_data.sort_values(
            by=['lab', 'biosample_term_name', 'target', 'experiment'],
            ascending=[True, True, True, True])
        idr_data.date = idr_data.date.astype('str')
        idr_data = idr_data.reset_index(drop=True)
        # Read sheet title and create unique page title.
        date = datetime.now().strftime('%m_%d_%Y')
        sheet_title = (args.sheet_title if not args.released else
                       '{} Released'.format(args.sheet_title))
        page_title = '%s_IDR_FRIP_%s' % (args.assembly, date)
        # Open/create Google Sheet.
        gc = pygsheets.authorize(args.apikey)
        try:
            sh = gc.open(sheet_title)
        except pygsheets.exceptions.SpreadsheetNotFound:
            sh = gc.create(sheet_title)
        try:
            wks = sh.add_worksheet(page_title)
        except HttpError:
            wks = sh.worksheet_by_title(page_title)
        # Clear worksheet.
        wks.clear()
        # Add data from DataFrame.
        wks.set_dataframe(idr_data, copy_head=True, fit=True, start='A1')
        # Apply formatting and conditions.
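        # 'header', 'number_format', 'font_size_format', 'conditions', and 'note' are assumed to be module-level
        # Sheets batchUpdate request templates defined elsewhere; each gets its sheetId (and column range) filled in
        # before being sent.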
        header['repeatCell']['range']['sheetId'] = wks.id
        wks.client.sh_batch_update(wks.spreadsheet.id, header)
        # Format numbers.
        for col in number_format_columns:
            num = idr_data.columns.get_loc(col)
            number_format['repeatCell']['range']['startColumnIndex'] = num
            number_format['repeatCell']['range']['endColumnIndex'] = num + 1
            number_format['repeatCell']['range']['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, number_format)
        # Resize font.
        font_size_format['repeatCell']['range']['sheetId'] = wks.id
        wks.client.sh_batch_update(wks.spreadsheet.id, font_size_format)
        # Add conditional formatting.
        for conditional in conditions:
            num = idr_data.columns.get_loc("reproducibility_test")
            conditional['addConditionalFormatRule']['rule']['ranges'][0][
                'startColumnIndex'] = num
            conditional['addConditionalFormatRule']['rule']['ranges'][0][
                'endColumnIndex'] = num + 1
            conditional['addConditionalFormatRule']['rule']['ranges'][0][
                'sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, conditional)
        for k, v in notes_dict.items():
            num = idr_data.columns.get_loc(k)
            note['repeatCell']['range']['startColumnIndex'] = num
            note['repeatCell']['range']['endColumnIndex'] = num + 1
            note['repeatCell']['cell']['note'] = v
            note['repeatCell']['range']['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, note)
        # Optional. Smaller column width to match original.
        for i in range(wks.cols):
            wks.adjust_column_width(i, pixel_size=38)
        # Resize tiny columns.
        tiny_columns = ['experiment', 'analysis']
        for i in [idr_data.columns.get_loc(x) for x in tiny_columns]:
            wks.adjust_column_width(i, pixel_size=25)
        # Resize medium columns.
        medium_columns = ['replication', 'assembly', 'rfa']
        for i in [idr_data.columns.get_loc(x) for x in medium_columns]:
            wks.adjust_column_width(i, pixel_size=65)
        # Resize wide columns.
        wide_columns = ['target', 'reproducibility_test', 'lab']
        for i in [idr_data.columns.get_loc(x) for x in wide_columns]:
            wks.adjust_column_width(i, pixel_size=85)
        # Remove temp file.
        os.remove(temp_file)
Example #21
def main():
    global args
    args = get_args()

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid,authpw)

    if args.experiments:
        exp_ids = csv.reader(StringIO.StringIO('\n'.join([s.rstrip() for s in args.experiments])))
    else:
        exp_ids = csv.reader(args.infile)

    for instring in exp_ids:
        exp_id = instring[0].strip()
        if len(instring) > 1:
            repns = []
            for s in instring[1:]:
                repns.extend(s.split(','))
            biorep_ns = list(set([int(s) for s in repns]))
        else:
            biorep_ns = []
        outstrings = []
        encode_url = urlparse.urljoin(server,exp_id)
        experiment = common.encoded_get(encode_url, keypair)
        outstrings.append(exp_id)
        files = files_to_map(experiment, server, keypair, args.no_sfn_dupes)
        outstrings.append(str(len(files)))
        outstrings.append(str([f.get('accession') for f in files]))
        replicates = replicates_to_map(files, server, keypair, biorep_ns)
        in_process = False
        if files:
            for biorep_n in set([rep.get('biological_replicate_number') for rep in replicates]):
                outstrings.append('rep%s' %(biorep_n))
                biorep_files = [f for f in files if biorep_n in common.biorep_ns(f,server,keypair)]
                paired_files = []
                unpaired_files = []
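                # Pair up PE fastqs: pop a file, find its mate either via its own 'paired_with' link or via another
                # file that points back at it, remove the mate from the pool, and collect the pair; unpaired (SE)
                # reads are pooled separately.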
                while biorep_files:
                    file_object = biorep_files.pop()
                    if file_object.get('paired_end') == None: # group all the unpaired reads for this biorep together
                        unpaired_files.append(file_object)
                    elif file_object.get('paired_end') in ['1','2']:
                        if file_object.get('paired_with'):
                            mate = next((f for f in biorep_files if f.get('@id') == file_object.get('paired_with')), None)
                        else: #have to find the file that is paired with this one
                            mate = next((f for f in biorep_files if f.get('paired_with') == file_object.get('@id')), None)
                        if mate:
                            biorep_files.remove(mate)
                        else:
                            logging.warning('%s:%s could not find mate' %(experiment.get('accession'), file_object.get('accession')))
                            mate = {}

                        # if mapping as SE, ignore the mate and just map the
                        # rep1 as SE with all the other SE for this rep, if any
                        if args.force_se:
                            unpaired_files.append(next(
                                f for f in [file_object, mate]
                                if f.get('paired_end') == '1'))
                        else:
                            paired_files.append((file_object, mate))

                if biorep_files:
                    logging.warning('%s: leftover file(s) %s' %(experiment.get('accession'), biorep_files))
                if paired_files:
                    pe_jobs = map_only(experiment, biorep_n, paired_files, args.key, server, keypair, args.sex_specific)
                    in_process = True
                if unpaired_files:
                    se_jobs = map_only(experiment, biorep_n, unpaired_files, args.key, server, keypair, args.sex_specific)
                    in_process = True
                if paired_files and pe_jobs:
                    outstrings.append('paired:%s' %([(a.get('accession'), b.get('accession')) for (a,b) in paired_files]))
                    outstrings.append('paired jobs:%s' %([j.get_id() for j in pe_jobs]))
                else:
                    outstrings.append('paired:%s' %(None))
                if unpaired_files and se_jobs:
                    outstrings.append('unpaired:%s' %([f.get('accession') for f in unpaired_files]))
                    outstrings.append('unpaired jobs:%s' %([j.get_id() for j in se_jobs]))
                else:
                    outstrings.append('unpaired:%s' %(None))
            if in_process:
                r = common.encoded_patch(encode_url, keypair, {"internal_status": "processing"}, return_response=True)
                try:
                    r.raise_for_status()
                except:
                    logging.error("Tried and failed to set internal_status")
                    logging.error(r.text)
            print '\t'.join(outstrings)
        else: # no files
            if not replicates:
                logging.warning('%s: No files and no replicates' %experiment.get('accession'))
            else:
                logging.warning('%s: No files to map' %experiment.get('accession'))
        if files and not replicates:
            logging.warning('%s: Files but no replicates' %experiment.get('accession'))
def main():

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    else:
        ids = args.infile

    formats = ['bed_narrowPeak', 'bed_gappedPeak']
    fieldnames = [
        'file', 'analysis', 'experiment', 'replicates', 'output_name',
        'file_format', 'output_type', 'target', 'biosample_term_name',
        'biosample_term_id', 'biosample_type', 'biosample_life_stage',
        'biosample_age', 'biosample_organism'
    ]
    writer = csv.DictWriter(args.outfile, fieldnames, delimiter='\t')
    writer.writeheader()
    for (i, analysis_id) in enumerate(ids):
        analysis_id = analysis_id.rstrip()
        logger.info('%s' % (analysis_id))
        try:
            files = analysis_files(analysis_id, keypair, server, args.assembly)
        except:
            logger.error(
                '%s error finding analysis_files.  Check experiment metadata.'
                % (analysis_id))
            continue
        for f in [
                f_obj for f_obj in files if f_obj.get('file_format') in formats
        ]:
            fid = f['dx'].get_id()
            local_path = os.path.join(args.outdir, fid)
            if not os.path.isfile(local_path):
                if not os.path.exists(args.outdir):
                    os.makedirs(args.outdir)
                dxpy.download_dxfile(fid, local_path)
            replicates = []
            for derived_from in f['derived_from']:
                rep_ns = common.biorep_ns(derived_from, server, keypair)
                for r in rep_ns:
                    replicates.append(r)
            experiment = common.encoded_get(
                urlparse.urljoin(server, '/experiments/%s' % (f['dataset'])),
                keypair)
            rep = common.encoded_get(
                urlparse.urljoin(server, experiment['replicates'][0]), keypair)
            lib = common.encoded_get(urlparse.urljoin(server, rep['library']),
                                     keypair)
            biosample = common.encoded_get(
                urlparse.urljoin(server, lib['biosample']), keypair)
            writer.writerow({
                'file':
                fid,
                'analysis':
                analysis_id,
                'experiment':
                experiment.get('accession'),
                'replicates':
                replicates,
                'output_name':
                f.get('name'),
                'file_format':
                f.get('file_format'),
                'output_type':
                f.get('output_type'),
                'target':
                experiment.get('target'),
                'biosample_term_name':
                experiment.get('biosample_term_name'),
                'biosample_term_id':
                experiment.get('biosample_term_id'),
                'biosample_type':
                experiment.get('biosample_type'),
                'biosample_life_stage':
                biosample.get('life_stage'),
                'biosample_age':
                biosample.get('age'),
                'biosample_organism':
                biosample.get('organism')
            })
def main():

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.query:
        r = requests.get(
            args.query, auth=keypair, headers={"content-type": "application/json", "accept": "application/json"}
        )
        experiments = r.json()["@graph"]
        exp_ids = [e["accession"] for e in experiments]
    elif args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.strip()
        logger.info("%s" % (exp_id))
        url = urlparse.urljoin(server, "/experiments/%s" % (exp_id))
        experiment_object = common.encoded_get(url, keypair)
        original_files = [
            common.encoded_get(urlparse.urljoin(server, "%s" % (uri)), keypair)
            for uri in experiment_object.get("original_files")
        ]
        bams = [
            f
            for f in original_files
            if f.get("file_format") == "bam" and f.get("status") not in ["revoked", "deleted", "replaced"]
        ]
        fastqs = [
            f
            for f in original_files
            if f.get("file_format") == "fastq" and f.get("status") not in ["revoked", "deleted", "replaced"]
        ]
        beds = [
            f
            for f in original_files
            if f.get("file_format") == "bed" and f.get("status") not in ["revoked", "deleted", "replaced"]
        ]
        bigBeds = [
            f
            for f in original_files
            if f.get("file_format") == "bigBed" and f.get("status") not in ["revoked", "deleted", "replaced"]
        ]
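        # For each bed/bigBed, look up the DNAnexus job that created it (recorded in the file's 'notes') and record
        # which of that job's outputs corresponds to this file's dx-id.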
        for f in beds + bigBeds:
            notes = json.loads(f.get("notes"))
            f["job"] = dxpy.describe(notes["dx-createdBy"]["job"])
            job = dxpy.describe(notes["dx-createdBy"]["job"])
            output_names = [
                output_name
                for output_name, value in job["output"].iteritems()
                if dxpy.is_dxlink(value) and value["$dnanexus_link"] == notes["dx-id"]
            ]
            assert len(output_names) == 1
            f["output_name"] = output_names[0]
            f["dxid"] = notes["dx-id"]
        for bb in bigBeds:
            print bb["accession"]
            notes = json.loads(bb.get("notes"))
            job = dxpy.describe(notes["dx-createdBy"]["job"])
            output_name = bb["output_name"]
            assert output_name.endswith("_bb")
            print output_name
            bed_output_name = output_name.rpartition("_bb")[0]
            print bed_output_name
            bed_dxid = job["output"][bed_output_name]["$dnanexus_link"]
            print bed_dxid
            possible_beds = [
                bed["accession"] for bed in beds if bed.get("notes") and json.loads(bed["notes"])["dx-id"] == bed_dxid
            ]
            print possible_beds
            assert len(possible_beds) == 1
            print possible_beds[0]
            if not args.dryrun:
                url = urlparse.urljoin(server, "/files/%s/" % (bb["accession"]))
                payload = {"derived_from": [possible_beds[0]]}
                print url
                print payload
                r = requests.patch(
                    url,
                    auth=keypair,
                    data=json.dumps(payload),
                    headers={"content-type": "application/json", "accept": "application/json"},
                )
                try:
                    r.raise_for_status()
                except:
                    print r.text
        overlapping_peaks_beds = [b for b in beds if b.get("output_name") == "overlapping_peaks"]
        assert len(overlapping_peaks_beds) == 1
        overlapping_peaks_bed = overlapping_peaks_beds[0]
        job = overlapping_peaks_bed["job"]
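        # The overlapping_peaks bed derives from the rep1/rep2/pooled peak inputs of the job that produced it.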
        derived_from_dxids = [
            job["input"][input_name]["$dnanexus_link"]
            for input_name in job["input"].keys()
            if input_name in ["rep1_peaks", "rep2_peaks", "pooled_peaks"]
        ]
        print derived_from_dxids
        derived_from_accessions = [bed["accession"] for bed in beds if bed["dxid"] in derived_from_dxids]
        print derived_from_accessions
        if not args.dryrun:
            url = urlparse.urljoin(server, "/files/%s/" % (overlapping_peaks_bed["accession"]))
            payload = {"derived_from": derived_from_accessions}
            print url
            print payload
            r = requests.patch(
                url,
                auth=keypair,
                data=json.dumps(payload),
                headers={"content-type": "application/json", "accept": "application/json"},
            )
            try:
                r.raise_for_status()
            except:
                print r.text
def main():
    args = get_args()

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)
    project = resolve_project(args.project)
    SRR_files = dxpy.find_data_objects(name="SRR???????_?.fastq.gz",
                                       name_mode='glob',
                                       classname='file',
                                       recurse=True,
                                       return_handler=True,
                                       folder=args.folder,
                                       project=args.project)
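    # The glob matches names like SRR1234567_1.fastq.gz: an SRR accession followed by the read-end number.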
    for srr_dxfile in SRR_files:
        m = re.search('(SRR.{7})_(\d)', srr_dxfile.name)
        if m:
            srr_basename = m.group(1)
            end_num = m.group(2)
        else:
            assert m
        srr_encfiles = common.encoded_get(
            '/'.join([
                server,
                'search/?type=File&external_accession=%s&status!=deleted&status!=replaced&status!=revoked'
                % (srr_basename)
            ]), keypair)['@graph']
        if not srr_encfiles:
            logging.error('%s object not found at ENCODE.  Skipping.' %
                          (srr_basename))
            continue
        elif len(srr_encfiles) > 1:
            logging.error(
                '%s multiple matching objects found at ENCODE.  Skipping.' %
                (srr_basename))
            continue
        else:
            srr_encfile = srr_encfiles[0]
        # experiment = common.encoded_get('/'.join([server, srr_encfile.get('dataset')]), keypair)
        # replicate = common.encoded_get('/'.join([server, srr_encfile.get('replicate')]), keypair)
        # biorep_n = replicate.get('biological_replicate_number')
        all_fastqs = common.encoded_get(
            '/'.join([
                server,
                'search/?type=File&file_format=fastq&derived_from=/files/%s/&status!=deleted&status!=revoked&status!=replaced'
                % (srr_basename)
            ]), keypair)['@graph']
        if not all_fastqs:
            print("%s: no fastq(s) found.  Skipping." % (srr_dxfile.name))
            continue
        if end_num == '1':
            fastqs = [
                f for f in all_fastqs if f.get('run_type') == 'single-ended'
                or f.get('paired_end') == end_num
            ]
        elif end_num in ['2', '3']:
            fastqs = [
                f for f in all_fastqs if f.get('run_type') == 'paired-ended'
                and f.get('paired_end') == '2'
            ]
        if not fastqs:
            print("%s: no fastq(s) found for paired_end %s.  Skipping" %
                  (srr_basename, end_num))
            continue
        elif len(fastqs) > 1:
            print("%s: ambiguous matches to %s.  Skipping" %
                  (srr_basename, [f.get('accession') for f in fastqs]))
            continue
        else:
            fastq = fastqs[0]
            newname = '%s.fastq.gz' % (fastq.get('accession'))
            if args.dry_run:
                print('dry_run: Could rename %s to %s' %
                      (srr_dxfile.name, newname))
            else:
                srr_dxfile.set_properties({'srr_filename': srr_dxfile.name})
                srr_dxfile.rename(newname)
                print('%s renamed to %s' % (srr_dxfile.name, newname))
def main():

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.experiments:
        ids = args.experiments
    # elif args.created_after:
    #   analyses = []
    #   for state in args.state:
    #       analyses.extend(dxpy.find_analyses(name="ENCSR*",name_mode='glob',state=state,include_subjobs=True,return_handler=True,created_after="%s" %(args.created_after)))
    #   ids = [analysis.get_id() for analysis in analyses if analysis.describe()['executableName'] == 'tf_chip_seq' or analysis.describe()['executableName'].startswith('ENCSR783QUL Peaks')]
    elif args.all:
        exp_query = \
            "/search/?type=Experiment" + \
            "&assay_title=ChIP-seq" + \
            "&award.project=ENCODE" + \
            "&status=released&status=submitted&status=in+progress&status=started&status=release+ready"
        all_experiments = common.encoded_get(server + exp_query,
                                             keypair)['@graph']
        ids = [exp.get('accession') for exp in all_experiments]
    elif args.infile:
        ids = args.infile
    else:
        # Never reached because infile defaults to stdin.
        raise InputError(
            "Must supply experiment id's in arguments or --infile")

    fieldnames = [
        'date', 'analysis', 'analysis id', 'experiment', 'target',
        'biosample_term_name', 'biosample_type', 'lab', 'rfa', 'assembly',
        'Nt', 'Np', 'N1', 'N2', 'rescue_ratio', 'self_consistency_ratio',
        'reproducibility_test', 'state', 'release', 'total price', 'notes'
    ]
    writer = csv.DictWriter(sys.stdout,
                            fieldnames=fieldnames,
                            delimiter='\t',
                            quotechar='"')
    writer.writeheader()

    idr_query = \
        "/search/?type=File" + \
        "&file_format=bed" + \
        "&output_type=optimal+idr+thresholded+peaks" + \
        "&output_type=conservative+idr+thresholded+peaks" + \
        "&lab.title=ENCODE+Processing+Pipeline" + \
        "&lab.title=J.+Michael+Cherry,+Stanford" + \
        "&status=in+progress&status=released&status=uploading&status=uploaded"
    all_idr_files = common.encoded_get(server + idr_query, keypair)['@graph']

    for (i, experiment_id) in enumerate(ids):
        if experiment_id.startswith('#'):
            continue
        experiment_id = experiment_id.rstrip()
        experiment_uri = '/experiments/%s/' % (experiment_id)
        idr_files = \
            [f for f in all_idr_files if f['dataset'] == experiment_uri]
        idr_step_runs = set([f.get('step_run') for f in idr_files])
        if not len(idr_step_runs) == 1:
            if not args.all:
                logger.warning(
                    "%s: Expected one IDR step run. Found %d.  Skipping" %
                    (experiment_id, len(idr_step_runs)))
            continue

        idr_qc_uris = []
        assemblies = []
        for f in idr_files:
            quality_metrics = f.get('quality_metrics')
            if not len(quality_metrics) == 1:
                logger.error(
                    '%s: Expected one IDR quality metric for file %s. Found %d.'
                    %
                    (experiment_id, f.get('accession'), len(quality_metrics)))
            idr_qc_uris.extend(quality_metrics)
            assembly = f.get('assembly')
            if not assembly:
                logger.error('%s: File %s has no assembly' %
                             (experiment_id, f.get('accession')))
            assemblies.append(assembly)
        idr_qc_uris = set(idr_qc_uris)
        if not len(idr_qc_uris) == 1:
            logger.error(
                '%s: Expected one unique IDR metric, found %d. Skipping.' %
                (experiment_id, len(idr_qc_uris)))
            continue
        assemblies = set(assemblies)
        if not len(assemblies) == 1:
            logger.error(
                '%s: Expected one unique assembly, found %d. Skipping.' %
                (experiment_id, len(assemblies)))
            continue
        assembly = next(iter(assemblies))

        idr_step_run_uri = next(iter(idr_step_runs))
        idr_step_run = common.encoded_get(server + idr_step_run_uri, keypair)
        try:
            dx_job_id_str = idr_step_run.get('dx_applet_details')[0].get(
                'dx_job_id')
        except:
            logger.warning(
                "Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id"
            )
            logger.debug(idr_step_run)
            dx_job_id_str = None  #could try to pull it from alias
        dx_job_id = dx_job_id_str.rpartition(':')[2]
        dx_job = dxpy.DXJob(dx_job_id)
        job_desc = dx_job.describe()
        analysis_id = job_desc.get('analysis')

        logger.debug('%s' % (analysis_id))
        analysis = dxpy.DXAnalysis(analysis_id)
        desc = analysis.describe()
        project = desc.get('project')

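        # Analysis names are expected to look like 'ENCSR123ABC Peaks ...'; pull the experiment accession out of the name.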
        m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name'])
        if m:
            experiment_accession = m.group(1)
        else:
            logger.error("No accession in %s, skipping." % (desc['name']))
            continue

        if args.all:  # we've already gotten all the experiment objects
            experiment = \
                next(e for e in all_experiments
                     if e['accession'] == experiment_accession)
        else:
            experiment = \
                common.encoded_get(urlparse.urljoin(
                    server,
                    '/experiments/%s' % (experiment_accession)), keypair)
        logger.debug('ENCODEd experiment %s' % (experiment['accession']))
        if args.lab and experiment['lab'].split('/')[2] not in args.lab:
            continue

        try:
            idr_stage = next(
                s['execution'] for s in desc['stages']
                if s['execution']['name'] == "Final IDR peak calls")
        except:
            logging.error('Failed to find final IDR stage in %s' %
                          (analysis_id))
        else:
            if idr_stage[
                    'state'] != 'done':  #Final IDR peak calls stage not done, so loop through intermediate IDR stages to find errors
                Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None
                notes = []
                #note this list contains a mis-spelled form of IDR Pooled Pseudoreplicates because until 11/13/15 the pipeline stage name was misspelled - need to be able to report on those runs
                idr_stage_names = [
                    'IDR True Replicates', 'IDR Rep 1 Self-pseudoreplicates',
                    'IDR Rep 2 Self-pseudoreplicates',
                    'IDR Pooled Pseudoreplicates', 'IDR Pooled Pseudoeplicates'
                ]
                for stage_name in idr_stage_names:
                    try:
                        idr_stage = next(
                            s['execution'] for s in desc['stages']
                            if s['execution']['name'] == stage_name)
                    except StopIteration:
                        continue
                    except:
                        raise
                    if idr_stage['state'] == 'failed':
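                        # Fetch the failed stage's job log via 'dx watch'; only output captured from a non-zero exit
                        # is kept and scanned for known failure patterns below.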
                        try:
                            job_log = subprocess.check_output(
                                'dx watch %s' % (idr_stage['id']),
                                shell=True,
                                stderr=subprocess.STDOUT)
                        except subprocess.CalledProcessError as e:
                            job_log = e.output
                        else:
                            job_log = None
                        if job_log:
                            patterns = [
                                r'Peak files must contain at least 20 peaks post-merge'
                            ]
                            for p in patterns:
                                m = re.search(p, job_log)
                                if m:
                                    notes.append("%s: %s" %
                                                 (stage_name, m.group(0)))
                        if not notes:
                            notes.append(idr_stage['failureMessage'])
                try:
                    done_time = next(transition['setAt']
                                     for transition in desc['stateTransitions']
                                     if transition['newState'] == "failed")
                except StopIteration:
                    done_time = "Not done or failed"
                except:
                    raise
            else:
                Np = idr_stage['output'].get('Np')
                N1 = idr_stage['output'].get('N1')
                N2 = idr_stage['output'].get('N2')
                Nt = idr_stage['output'].get('Nt')
                rescue_ratio = idr_stage['output'].get('rescue_ratio')
                self_consistency_ratio = idr_stage['output'].get(
                    'self_consistency_ratio')
                reproducibility_test = idr_stage['output'].get(
                    'reproducibility_test')
                notes = "IDR Complete"
                done_time = next(transition['setAt']
                                 for transition in desc['stateTransitions']
                                 if transition['newState'] == "done")

        if done_time:
            date = time.strftime("%Y-%m-%d %H:%M:%S",
                                 time.localtime(done_time / 1000))
        else:
            date = "Running"
        analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
            desc.get('project').split('-')[1], desc.get('id').split('-')[1])
        experiment_link = 'https://www.encodeproject.org/experiments/%s' % (
            experiment.get('accession'))
        row = {
            'date':
            date,
            'analysis':
            analysis_link,
            'analysis id':
            desc.get('id'),
            'experiment':
            experiment_link,
            'target':
            experiment['target'].split('/')[2],
            'biosample_term_name':
            experiment.get('biosample_term_name'),
            'biosample_type':
            experiment.get('biosample_type'),
            'lab':
            experiment['lab'].split('/')[2],
            'rfa':
            common.encoded_get(server + experiment.get('award'),
                               keypair).get('rfa'),
            'assembly':
            assembly,
            'Np':
            Np,
            'N1':
            N1,
            'N2':
            N2,
            'Nt':
            Nt,
            'rescue_ratio':
            rescue_ratio,
            'self_consistency_ratio':
            self_consistency_ratio,
            'reproducibility_test':
            reproducibility_test,
            'state':
            desc.get('state'),
            'release':
            experiment['status'],
            'total price':
            desc.get('totalPrice')
        }

        if notes:
            row.update({'notes': '%s' % (notes)})
        else:
            row.update({'notes': '%s' % ('OK')})
        #log = subprocess.check_output('dx watch %s' %(analysis.))
        writer.writerow(row)
Example #26
def main(reads1,
         reads2,
         crop_length,
         reference_tar,
         bwa_aln_params,
         bwa_version,
         samtools_version,
         keyfile,
         debug,
         key=None):

    # reads1 and reads2 are expected to be arrays of file identifiers
    # identifiers can be DNAnexus files or ENCODE file accession numbers
    # For SE, reads2 is empty
    # For PE, len(reads1) = len(reads2)
    # Multiple PE pairs or SE files are just catted before mapping
    # Error on mixed SE/PE - although this can be implemented as just a
    # "" entry at that position in reads2 array
    # TODO: Add option to down-sample mixed PE/SE to SE
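    # For example (hypothetical accessions / DNAnexus file IDs):
    #   SE: reads1=['ENCFF000AAA', 'ENCFF000BBB'], reads2=[]
    #   PE: reads1=['ENCFF000AAA'], reads2=['ENCFF000BBB']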

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # fetch the credentials from the DCC Credentials project
    dxpy.download_folder(DCC_CREDENTIALS_PROJECT,
                         '.',
                         folder=DCC_CREDENTIALS_FOLDER)

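    # Pick which credentials key to use: default to the current DNAnexus user ID, or the user ID plus a '-test'
    # suffix when targeting the test server.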
    if not key or key in ['www', 'submit', 'production']:
        key = dxpy.api.system_whoami()['id']
    elif key == 'test':
        key = dxpy.api.system_whoami()['id'] + "-test"

    key_tuple = common.processkey(key, keyfile)
    assert key_tuple, "ERROR: Key %s is not found in the keyfile %s" % (
        key, keyfile)
    authid, authpw, server = key_tuple
    keypair = (authid, authpw)

    logger.info("reads1: %s" % (reads1))
    logger.info("reads2: %s" % (reads2))

    if reads2:
        paired_end = True
        assert len(reads1) == len(
            reads2
        ), "Paired-end and unequal numbers of read1 and read2 identifiers: %s %s" % (
            reads1, reads2)
    else:
        paired_end = False

    reads1_files = [resolve_file(read, server, keypair) for read in reads1]

    if paired_end:
        reads2_files = [resolve_file(read, server, keypair) for read in reads2]
    else:
        reads2_files = []

    # pooling multiple fastqs
    if len(reads1_files) > 1:
        reads1_file = pooled(reads1_files)
    else:
        reads1_file = reads1_files[0]

    if len(reads2_files) > 1:
        reads2_file = pooled(reads2_files)
    elif len(reads2_files) == 1:
        reads2_file = reads2_files[0]
    else:
        reads2_file = None

    reference_tar_file = resolve_file(reference_tar, server, keypair)

    logger.info('Resolved reads1 to %s', reads1_file)
    if reads2_file:
        logger.info('Resolved reads2 to %s', reads2_file)
    logger.info('Resolved reference_tar to %s', reference_tar_file)

    output = {
        "reads1": reads1_file,
        "reference_tar": reference_tar_file,
        "crop_length": crop_length,
        "bwa_aln_params": bwa_aln_params,
        "bwa_version": bwa_version,
        "samtools_version": samtools_version,
        "debug": debug
    }
    if reads2_file:
        output.update({"reads2": reads2_file})

    logger.info('Exiting with output: %s' % (output))

    return output
Example #27
def main():

	args = get_args()
	if args.debug:
		logger.setLevel(logging.DEBUG)
	else:
		logger.setLevel(logging.INFO)

	authid, authpw, server = common.processkey(args.key, args.keyfile)
	keypair = (authid,authpw)

	if args.analysis_ids:
		ids = args.analysis_ids
	elif args.created_after:
		analyses = dxpy.find_analyses(name="ENCSR*",name_mode='glob',state='done',include_subjobs=True,return_handler=True,created_after="%s" %(args.created_after))
		ids = [analysis.get_id() for analysis in analyses if analysis.describe()['executableName'] == 'tf_chip_seq']
	elif args.infile:
		ids = args.infile
	else:
		#never reached because infile defaults to stdin
		raise InputError("Must supply analysis id's in arguments, --infile or supply search string in --created_after")

	fieldnames = [	'date','analysis','experiment','target','biosample_term_name','biosample_type','lab','rfa','assembly',
					'Nt','Np','N1','N2','rescue_ratio','self_consistency_ratio','reproducibility_test',
					'state','total price','notes']
	writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, delimiter='\t', quotechar='"')
	writer.writeheader()

	for (i, analysis_id) in enumerate(ids):
		if analysis_id.startswith('#'):
			continue
		analysis_id = analysis_id.rstrip()
		logger.debug('%s' %(analysis_id))
		analysis = dxpy.DXAnalysis(analysis_id)
		desc = analysis.describe()
		project = desc.get('project')

		m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks',desc['name'])
		if m:
			experiment_accession = m.group(1)
		else:
			logger.error("No accession in %s, skipping." %(desc['name']))
			continue

		experiment = common.encoded_get(urlparse.urljoin(server,'/experiments/%s' %(experiment_accession)), keypair)
		logger.debug('ENCODEd experiment %s' %(experiment['accession']))
		try:
			idr_stage = next(s['execution'] for s in desc['stages'] if s['execution']['name'] == "Final IDR peak calls")
		except:
			logging.error('Failed to find IDR stage in %s' %(analysis_id))
		else:
			Np = idr_stage['output'].get('Np')
			N1 = idr_stage['output'].get('N1')
			N2 = idr_stage['output'].get('N2')
			Nt = idr_stage['output'].get('Nt')
			rescue_ratio = idr_stage['output'].get('rescue_ratio')
			self_consistency_ratio = idr_stage['output'].get('self_consistency_ratio')
			reproducibility_test = idr_stage['output'].get('reproducibility_test')

		done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "done")

		row = {
			'date': time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(done_time/1000)),
			'analysis':		analysis.get_id(),
			'experiment': 	experiment.get('accession'),
			'target':		experiment['target'].split('/')[2],
			'biosample_term_name':	experiment.get('biosample_term_name'),
			'biosample_type':	experiment.get('biosample_type'),
			'lab':			experiment['lab'].split('/')[2],
			'rfa':			common.encoded_get(server+experiment.get('award'),keypair).get('rfa'),
			'assembly':		args.assembly, #TODO ... derive this from the analysis
			'Np':			Np,
			'N1':			N1,
			'N2':			N2,
			'Nt':			Nt,
			'rescue_ratio':	rescue_ratio,
			'self_consistency_ratio': self_consistency_ratio,
			'reproducibility_test': reproducibility_test,
			'state': 		desc.get('state'),
			'total price': 	desc.get('totalPrice')
		}
		notes = []

		# if int(np_stage.get('output').get('npeaks_in')) - int(np_stage.get('output').get('npeaks_out')) != int(np_stage.get('output').get('npeaks_rejected')):
		# 	notes.append("in-out!=rej delta=%i" %(int(np_stage.get('output').get('npeaks_in')) - int(np_stage.get('output').get('npeaks_out'))))
		# else:
		# 	notes.append("in-out=rej OK")

		# bb_check_notes = []
		# for stage in [np_stage, gp_stage]:
		# 	bb_dxf = dxpy.DXFile(stage['output']['overlapping_peaks_bb'])
		# 	if int(bb_dxf.describe()['size']) < 200000:
		# 		bb_check_notes.append("%s bb size=%i" %(stage['name'], int(bb_dxf.describe()['size'])))
		# if not bb_check_notes:
		# 	notes.append("bb check OK")
		# else:
		# 	notes.append(bb_check_notes)

		if notes:
			row.update({'notes': '%s' %(notes)})
		else:
			row.update({'notes': '%s' %('OK')})
		#log = subprocess.check_output('dx watch %s' %(analysis.))
		writer.writerow(row)
def main():
    args = get_args()
    if args.debug:
        logging.basicConfig(format='%(levelname)s:%(message)s',
                            level=logging.DEBUG)
        logger.setLevel(logging.DEBUG)
    else:
        # Use the default logging level.
        logging.basicConfig(format='%(levelname)s:%(message)s')
        logger.setLevel(logging.INFO)
    if args.released:
        keypair = None
        server = PUBLIC_SERVER
    else:
        authid, authpw, server = common.processkey(args.key, args.keyfile)
        keypair = (authid, authpw)
    if args.experiments:
        ids = args.experiments
    elif args.all:
        # Get metadata for all ChIP-seq Experiments.
        base_exp_query = '/search/?type=Experiment&assay_title=ChIP-seq&award.project=ENCODE&status=released'
        extended_query = '&status=submitted&status=in+progress&status=started&status=release+ready'
        exp_query = base_exp_query if args.released else (base_exp_query + extended_query)
        all_experiments = common.encoded_get(server + exp_query,
                                             keypair)['@graph']
        # Extract Experiment accessions.
        ids = [exp.get('accession') for exp in all_experiments]
    elif args.infile:
        ids = args.infile
    else:
        # Never reached because infile defaults to stdin.
        raise InputError('Must supply experiment ids'
                         ' in arguments or --infile.')
    # Define column names for TSV.
    fieldnames = ['date',
                  'analysis',
                  'analysis_id',
                  'experiment',
                  'target',
                  'biosample_term_name',
                  'biosample_type',
                  'replication',
                  'lab',
                  'rfa',
                  'assembly',
                  'Nt',
                  'Np',
                  'N1',
                  'N2',
                  'rescue_ratio',
                  'self_consistency_ratio',
                  'reproducibility_test',
                  'Ft',
                  'Fp',
                  'F1',
                  'F2',
                  'state',
                  'release',
                  'total_price',
                  'quality_metric_of']
    if args.create_google_sheet:
        # Force creation of temporary CSV that can be loaded into a DataFrame,
        # written to Google Sheets, then deleted.
        temp_file = 'temp_idr_%s.tsv' % (args.assembly)
        args.outfile = open(temp_file, 'w')
    writer = csv.DictWriter(args.outfile,
                            fieldnames=fieldnames,
                            delimiter='\t',
                            quotechar='"')
    writer.writeheader()
    # Get metadata for all IDR output Files.
    base_idr_query = (
        '/search/?type=File&assembly=%s&file_format=bed'
        '&output_type=optimal+idr+thresholded+peaks'
        '&output_type=conservative+idr+thresholded+peaks'
        '&output_type=pseudoreplicated+idr+thresholded+peaks'
        '&lab.title=ENCODE+Processing+Pipeline'
        '&lab.title=J.+Michael+Cherry,+Stanford'
        '&status=released' % (args.assembly)
    )
    extended_idr_query = '&status=in+progress&status=uploading&status=uploaded'
    idr_query = base_idr_query if args.released else (base_idr_query + extended_idr_query)
    all_idr_files = common.encoded_get(server + idr_query, keypair)['@graph']
    na = 'not_available'
    for (i, experiment_id) in enumerate(ids):
        if experiment_id.startswith('#'):
            continue
        experiment_id = experiment_id.rstrip()
        experiment_uri = '/experiments/%s/' % (experiment_id)
        idr_files = \
            [f for f in all_idr_files if f['dataset'] == experiment_uri]
        idr_step_runs = set([f.get('step_run') for f in idr_files])
        if not len(idr_step_runs):
            if not args.all:
                logger.warning(
                    "%s: Found %d IDR step runs. Skipping"
                    % (experiment_id, len(idr_step_runs)))
            continue
        idr_qc_uris = []
        assemblies = []
        for f in idr_files:
            quality_metrics = f.get('quality_metrics')
            if not len(quality_metrics) == 1:
                logger.error('%s: Expected one IDR quality metric for file %s.'
                             ' Found %d.' % (experiment_id,
                                             f.get('accession'),
                                             len(quality_metrics)))
            idr_qc_uris.extend(quality_metrics)
            assembly = f.get('assembly')
            if not assembly:
                logger.error('%s: File %s has no assembly'
                             % (experiment_id, f.get('accession')))
            assemblies.append(assembly)
        idr_qc_uris = set(idr_qc_uris)
        if not len(idr_qc_uris) == 1:
            logger.error('%s: Expected one unique IDR metric,'
                         ' found %d. Skipping.' % (experiment_id,
                                                   len(idr_qc_uris)))
            continue
        assemblies = set(assemblies)
        if not len(assemblies) == 1:
            logger.error('%s: Expected one unique assembly, found %d.'
                         ' Skipping.' % (experiment_id, len(assemblies)))
            continue
        # Grab unique value from set.
        idr_qc_uri = next(iter(idr_qc_uris))
        assembly = next(iter(assemblies))
        # Get analysis_id from DNAnexus, create analysis_link.
        idr_step_run_uri = next(iter(idr_step_runs))
        try:
            idr_step_run = common.encoded_get(server + idr_step_run_uri, keypair)
        except Exception as e:
            print(experiment_id, e, 'Skipping.')
            continue
        try:
            dx_job_id_str = idr_step_run.get('dx_applet_details')[
                0].get('dx_job_id')
        except:
            logger.warning(
                "Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id")
            logger.debug(idr_step_run)
            # Could try to pull it from alias.
            dx_job_id_str = None
        dx_job_id = dx_job_id_str.rpartition(':')[2]
        if not args.released:
            dx_job = dxpy.DXJob(dx_job_id)
            job_desc = dx_job.describe()
            analysis_id = job_desc.get('analysis')
            logger.debug('%s' % (analysis_id))
            analysis = dxpy.DXAnalysis(analysis_id)
            desc = analysis.describe()
            project = desc.get('project')
            analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
                desc.get('project').split('-')[1], desc.get('id').split('-')[1])
        else:
            analysis_link = na
            desc = {}
            
        # Get IDR object.
        idr = common.encoded_get(server + idr_qc_uri,
                                 keypair)
        # Pull metrics of interest.
        idr_status = idr.get('status', na)
        if (args.released and (idr_status == na or idr_status != 'released')):
            logger.error('%s: Expected released IDR metric. Skipping.' % idr_qc_uris)
            continue
        Np = idr.get('Np', na)
        N1 = idr.get('N1', na)
        N2 = idr.get('N2', na)
        Nt = idr.get('Nt', na)
        Fp = idr.get('Fp', na)
        F1 = idr.get('F1', na)
        F2 = idr.get('F2', na)
        Ft = idr.get('Ft', na)
        quality_metric_of = idr.get('quality_metric_of', [])
        date = idr.get('date_created', na)
        rescue_ratio = idr.get('rescue_ratio', na)
        self_consistency_ratio = idr.get('self_consistency_ratio', na)
        reproducibility_test = idr.get('reproducibility_test', na)
        # Get Experiment object.
        experiment = common.encoded_get(server + experiment_id,
                                        keypair)
        experiment_link = '%sexperiments/%s' % (server,
                                                experiment.get('accession'))
        # Get Award object.
        award = common.encoded_get(server + experiment.get('award'), keypair)
        # Grab project phase, e.g. ENCODE4.
        rfa = award.get('rfa', na)
        row = {'date': date,
               'analysis': analysis_link,
               'analysis_id': desc.get('id', na),
               'experiment': experiment_link,
               'target': experiment['target'].split('/')[2],
               'biosample_term_name': experiment.get('biosample_term_name'),
               'biosample_type': experiment.get('biosample_type'),
               'replication': experiment.get('replication_type'),
               'lab': experiment['lab'].split('/')[2],
               'rfa': rfa,
               'assembly': assembly,
               'Nt': Nt,
               'Np': Np,
               'N1': N1,
               'N2': N2,
               'rescue_ratio': rescue_ratio,
               'self_consistency_ratio': self_consistency_ratio,
               'reproducibility_test': reproducibility_test,
               'Ft': Ft,
               'Fp': Fp,
               'F1': F1,
               'F2': F2,
               'state': desc.get('state', na),
               'release': experiment['status'],
               'total_price':  desc.get('totalPrice', na),
               'quality_metric_of': ', '.join(quality_metric_of)
               }
        writer.writerow(row)
    if args.create_google_sheet:
        args.outfile.close()
        # Load CSV data, sort.
        idr_data = pd.read_table(temp_file)
        idr_data = idr_data.replace('not_available', '')
        idr_data.date = idr_data.date.apply(lambda x: pd.to_datetime(x))
        idr_data = idr_data.sort_values(
            by=['lab', 'biosample_term_name', 'target', 'experiment'],
            ascending=[True, True, True, True])
        idr_data.date = idr_data.date.astype('str')
        idr_data = idr_data.reset_index(drop=True)
        # Read sheet title and create unique page title.
        date = datetime.now().strftime('%m_%d_%Y')
        sheet_title = (
            args.sheet_title if not args.released
            else '{} Released'.format(args.sheet_title)
        )
        page_title = '%s_IDR_FRIP_%s' % (args.assembly, date)
        # Open/create Google Sheet.
        gc = pygsheets.authorize(args.apikey)
        try:
            sh = gc.open(sheet_title)
        except pygsheets.exceptions.SpreadsheetNotFound:
            sh = gc.create(sheet_title)
        try:
            wks = sh.add_worksheet(page_title)
        except HttpError:
            wks = sh.worksheet_by_title(page_title)
        # Clear worksheet.
        wks.clear()
        # Add data from DataFrame.
        wks.set_dataframe(idr_data, copy_head=True, fit=True, start='A1')
        # Apply formatting and conditions.
        header['repeatCell']['range']['sheetId'] = wks.id
        wks.client.sh_batch_update(wks.spreadsheet.id, header)
        # Format numbers.
        for col in number_format_columns:
            num = idr_data.columns.get_loc(col)
            number_format['repeatCell']['range']['startColumnIndex'] = num
            number_format['repeatCell']['range']['endColumnIndex'] = num + 1
            number_format['repeatCell']['range']['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, number_format)
        # Resize font.
        font_size_format['repeatCell']['range']['sheetId'] = wks.id
        wks.client.sh_batch_update(wks.spreadsheet.id, font_size_format)
        # Add conditional formatting.
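        # 'conditions' is assumed to be a list of addConditionalFormatRule request templates defined elsewhere;
        # each one is pointed at the reproducibility_test column of this worksheet before being sent.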
        for conditional in conditions:
            num = idr_data.columns.get_loc("reproducibility_test")
            conditional['addConditionalFormatRule']['rule']['ranges'][0]['startColumnIndex'] = num
            conditional['addConditionalFormatRule']['rule']['ranges'][0]['endColumnIndex'] = num + 1
            conditional['addConditionalFormatRule']['rule']['ranges'][0]['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, conditional)
        for k, v in notes_dict.items():
            num = idr_data.columns.get_loc(k)
            note['repeatCell']['range']['startColumnIndex'] = num
            note['repeatCell']['range']['endColumnIndex'] = num + 1
            note['repeatCell']['cell']['note'] = v
            note['repeatCell']['range']['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, note)
        # Optional. Smaller column width to match original.
        for i in range(wks.cols):
            wks.adjust_column_width(i, pixel_size=38)
        # Resize tiny columns.
        tiny_columns = ['experiment',
                        'analysis']
        for i in [idr_data.columns.get_loc(x) for x in tiny_columns]:
            wks.adjust_column_width(i, pixel_size=25)
        # Resize medium columns.
        medium_columns = ['replication',
                          'assembly',
                          'rfa']
        for i in [idr_data.columns.get_loc(x) for x in medium_columns]:
            wks.adjust_column_width(i, pixel_size=65)
        # Resize wide columns.
        wide_columns = ['target',
                        'reproducibility_test',
                        'lab']
        for i in [idr_data.columns.get_loc(x) for x in wide_columns]:
            wks.adjust_column_width(i, pixel_size=85)
        # Remove temp file.
        os.remove(temp_file)
def main():

	args = get_args()
	if args.debug:
		logger.setLevel(logging.DEBUG)
	else:
		logger.setLevel(logging.INFO)

	authid, authpw, server = common.processkey(args.key, args.keyfile)
	keypair = (authid,authpw)

	if args.analysis_ids:
		ids = args.analysis_ids
	elif args.created_after:
		analyses = []
		for state in args.state:
			analyses.extend(dxpy.find_analyses(name="ENCSR*",name_mode='glob',state=state,include_subjobs=True,return_handler=True,created_after="%s" %(args.created_after)))
		ids = [analysis.get_id() for analysis in analyses if analysis.describe()['executableName'] == 'tf_chip_seq' or analysis.describe()['executableName'].startswith('ENCSR783QUL Peaks')]
	elif args.infile:
		ids = args.infile
	else:
		#never reached because infile defaults to stdin
		raise InputError("Must supply analysis id's in arguments, --infile or supply search string in --created_after")

	fieldnames = [	'date','analysis','experiment','target','biosample_term_name','biosample_type','lab','rfa','assembly',
					'Nt','Np','N1','N2','rescue_ratio','self_consistency_ratio','reproducibility_test',
					'state','total price','notes']
	writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, delimiter='\t', quotechar='"')
	writer.writeheader()

	for (i, analysis_id) in enumerate(ids):
		if analysis_id.startswith('#'):
			continue
		analysis_id = analysis_id.rstrip()
		logger.debug('%s' %(analysis_id))
		analysis = dxpy.DXAnalysis(analysis_id)
		desc = analysis.describe()
		project = desc.get('project')

		m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks',desc['name'])
		if m:
			experiment_accession = m.group(1)
		else:
			logger.error("No accession in %s, skipping." %(desc['name']))
			continue

		experiment = common.encoded_get(urlparse.urljoin(server,'/experiments/%s' %(experiment_accession)), keypair)
		logger.debug('ENCODEd experiment %s' %(experiment['accession']))
		if args.lab and experiment['lab'].split('/')[2] not in args.lab:
			continue
		try:
			idr_stage = next(s['execution'] for s in desc['stages'] if s['execution']['name'] == "Final IDR peak calls")
		except:
			logging.error('Failed to find final IDR stage in %s' %(analysis_id))
		else:
			if idr_stage['state'] != 'done': #Final IDR peak calls stage not done, so loop through intermediate IDR stages to find errors
				Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None
				notes = []
				#note this list contains a mis-spelled form of IDR Pooled Pseudoreplicates because until 11/13/15 the pipeline stage name was misspelled - need to be able to report on those runs
				idr_stage_names = ['IDR True Replicates', 'IDR Rep 1 Self-pseudoreplicates', 'IDR Rep 2 Self-pseudoreplicates', 'IDR Pooled Pseudoreplicates', 'IDR Pooled Pseudoeplicates']
				for stage_name in idr_stage_names:
					try:
						idr_stage = next(s['execution'] for s in desc['stages'] if s['execution']['name'] == stage_name)
					except StopIteration:
						continue
					except:
						raise
					if idr_stage['state'] == 'failed':
						try:
							job_log = subprocess.check_output('dx watch %s' %(idr_stage['id']), shell=True, stderr=subprocess.STDOUT)
						except subprocess.CalledProcessError as e:
							job_log = e.output
						else:
							job_log = None
						if job_log:
							patterns = [r'Peak files must contain at least 20 peaks post-merge']
							for p in patterns:
								m = re.search(p,job_log)
								if m:
									notes.append("%s: %s" %(stage_name,m.group(0)))
						if not notes:
							notes.append(idr_stage['failureMessage'])
				try:
					done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "failed")
				except StopIteration:
					done_time = "Not done or failed"
				except:
					raise
			else:
				Np = idr_stage['output'].get('Np')
				N1 = idr_stage['output'].get('N1')
				N2 = idr_stage['output'].get('N2')
				Nt = idr_stage['output'].get('Nt')
				rescue_ratio = idr_stage['output'].get('rescue_ratio')
				self_consistency_ratio = idr_stage['output'].get('self_consistency_ratio')
				reproducibility_test = idr_stage['output'].get('reproducibility_test')
				notes = "IDR Complete"
				done_time = next(transition['setAt'] for transition in desc['stateTransitions'] if transition['newState'] == "done")

		if isinstance(done_time, (int, float)):
			date = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(done_time/1000))
		elif done_time:
			date = done_time #e.g. "Not done or failed"
		else:
			date = "Running"
		analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' %(desc.get('project').split('-')[1], desc.get('id').split('-')[1])
		experiment_link = 'https://www.encodeproject.org/experiments/%s' %(experiment.get('accession'))
		row = {
			'date': date,
			'analysis':		analysis_link,
			'experiment': 	experiment_link,
			'target':		experiment['target'].split('/')[2],
			'biosample_term_name':	experiment.get('biosample_term_name'),
			'biosample_type':	experiment.get('biosample_type'),
			'lab':			experiment['lab'].split('/')[2],
			'rfa':			common.encoded_get(server+experiment.get('award'),keypair).get('rfa'),
			'assembly':		args.assembly, #TODO ... derive this from the analysis
			'Np':			Np,
			'N1':			N1,
			'N2':			N2,
			'Nt':			Nt,
			'rescue_ratio':	rescue_ratio,
			'self_consistency_ratio': self_consistency_ratio,
			'reproducibility_test': reproducibility_test,
			'state': 		desc.get('state'),
			'total price': 	desc.get('totalPrice')
		}

		if notes:
			row.update({'notes': '%s' %(notes)})
		else:
			row.update({'notes': '%s' %('OK')})
		#log = subprocess.check_output('dx watch %s' %(analysis.))
		writer.writerow(row)
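
# A small sketch of the timestamp handling used in the report above: DNAnexus
# stateTransitions carry 'setAt' in milliseconds since the epoch, so the value
# is divided by 1000 before time.localtime.  The helper below is hypothetical;
# it simply wraps the same lookup-and-format step with a guard for analyses
# that never reached the requested state.
import time

def transition_date(desc, new_state, fmt="%Y-%m-%d %H:%M:%S"):
    """desc: result of DXAnalysis.describe(); new_state: e.g. 'done' or 'failed'."""
    for transition in desc.get('stateTransitions', []):
        if transition.get('newState') == new_state:
            return time.strftime(fmt, time.localtime(transition['setAt'] / 1000))
    return None  # the analysis never entered that state
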
def main():
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    experiments = []
    if args.experiments:
        experiments.extend(args.experiments)
    if args.infile:
        with open(args.infile, 'r') as fh:
            experiments.extend([e for e in fh])

    if args.control:
        control_dxhandler = resolve_dx_file(args.control)
    else:
        control_dxhandler = None

    for exp_id in experiments:
        if exp_id.startswith('#'):
            continue
        exp_id = exp_id.rstrip()
        print("Experiment %s" % (exp_id))
        experiment_url = server + '/experiments/%s/' % (exp_id)
        experiment = common.encoded_get(experiment_url, keypair)
        if experiment.get('target'):
            target_url = server + experiment.get('target')
            target = common.encoded_get(target_url, keypair)
        else:
            logging.error('Experiment has no target ... skipping')
            continue

        print(
            "%s %s %s"
            % (experiment['accession'], target.get('investigated_as'),
               experiment.get('description')))

        tas = get_tas(experiment, server, keypair, args.project, args.inf, control_dxhandler)
        if not tas:
            logging.error(
                'Failed to resolve all tagaligns for %s'
                % (experiment['accession']))
            continue
        if not tas.get('rep2_ta'):
            simplicate_experiment = True
            print("Simplicate experiment ta's:")
        else:
            simplicate_experiment = False
            print("Replicated experiment ta's:")
        pprint(tas)
        # sys.exit()
        # continue

        skip_flag = False
        for key, value in tas.iteritems():
            if not value:
                logging.error('Missing %s ... skipping' % (key))
                skip_flag = True
        if skip_flag:
            continue

        workflow_title = '%s Peaks' % (exp_id)
        if args.tag:
            workflow_title += ' %s' % (args.tag)
        outf = args.outf

        if not outf.startswith('/') and outf != '/':
            outf = '/'+outf
        if not outf.endswith('/') and outf != '/':
            outf += '/'
        outf += '%s/peaks/' % (exp_id)

        try:
            investigated_as = target['investigated_as']
        except:
            logging.error(
                "%s: Failed to determine target type ... skipping" % (exp_id))
            continue
        else:
            print(investigated_as)

        rep1_pe = tas['rep1_ta']['paired_end']
        if not simplicate_experiment:
            rep2_pe = tas['rep2_ta']['paired_end']
        else:
            rep2_pe = None

        if simplicate_experiment and rep1_pe is None:
            logging.error(
                "%s: Cannot determine paired end: rep1 PE = %s... skipping"
                % (exp_id, rep1_pe))
            continue
        elif not simplicate_experiment and None in [rep1_pe, rep2_pe]:
            logging.error(
                "%s: Cannot determine paired end: rep1 PE = %s, rep2 PE = %s ... skipping"
                % (exp_id, rep1_pe, rep2_pe))
            continue

        if not simplicate_experiment and rep1_pe != rep2_pe:
            logging.error(
                "%s: rep1 PE %s differs from rep2 PE %s ... skipping"
                % (exp_id, rep1_pe, rep2_pe))
            continue

        if any('histone' in target_type for target_type in investigated_as):
            logging.info(
                "%s: Found to be histone.  No blacklist will be used."
                % (exp_id))
            wf_target = 'histone'
            blacklist = None
        else:
            logging.info("Assumed to be tf")
            wf_target = 'tf'
            if args.blacklist:
                blacklist = args.blacklist
            elif args.assembly in ASSEMBLY_METADATA:
                blacklist = ASSEMBLY_METADATA[args.assembly]['blacklist']
            else:
                logging.warning(
                    "%s: No blacklist for assembly %s, proceeding with no blacklist"
                    % (exp_id, args.assembly))
                blacklist = None

        if not args.gsize:
            if args.assembly in ASSEMBLY_METADATA:
                genomesize = ASSEMBLY_METADATA[args.assembly]['gsize']
            else:
                logging.error(
                    "%s: Must specify -gsize for assembly %s"
                    % (exp_id, args.assembly))
                continue
        else:
            genomesize = args.gsize

        if not args.csizes:
            if args.assembly in ASSEMBLY_METADATA:
                chrom_sizes = ASSEMBLY_METADATA[args.assembly]['csizes']
            else:
                logging.error(
                    "%s: Must specify -csizes for assembly %s"
                    % (exp_id, args.assembly))
                continue
        else:
            chrom_sizes = args.csizes
        chip_workflow_absolute_path = os.path.dirname(os.path.realpath(__file__)) + "/chip_workflow.py"
        command_strings = [
            chip_workflow_absolute_path,
            '--nomap --yes',
            '--target %s' % (wf_target),
            '--title "%s"' % (workflow_title),
            '--outf "%s"' % (outf),
            '--rep1pe %s' % (str(rep1_pe).lower()),
            '--rep1 %s' % (tas['rep1_ta'].get('file_id')),
            '--ctl1 %s' % (tas['rep1_ta'].get('control_id')),
            '--genomesize %s --chrom_sizes "%s"' % (genomesize, chrom_sizes),
            '--spp_version %s' % (args.spp_version)
        ]

        if not simplicate_experiment:
            command_strings.extend([
                '--rep2pe %s' % (str(rep2_pe).lower()),
                '--rep2 %s' % (tas['rep2_ta'].get('file_id')),
                '--ctl2 %s' % (tas['rep2_ta'].get('control_id')),
            ])
        if args.spp_instance:
            command_strings.append('--spp_instance %s' % str(args.spp_instance))
        if args.fragment_length:
            command_strings.append('--fragment_length %s' % str(args.fragment_length))
        if blacklist:
            command_strings.append('--blacklist "%s"' % (blacklist))
        if args.debug:
            command_strings.append('--debug')
        if args.use_existing_folders:
            command_strings.append('--use_existing_folders')
        if args.accession:
            command_strings.append('--accession')
            if args.fqcheck is not None:
                command_strings.append('--fqcheck=%s' % (args.fqcheck))
            if args.skip_control is not None:
                command_strings.append('--skip_control=%s' % (args.skip_control))
            if args.force_patch is not None:
                command_strings.append('--force_patch=%s' % (args.force_patch))
        run_command = ' '.join(command_strings)
        print(run_command)

        if args.dryrun:
            logging.info('Dryrun')
        else:
            try:
                subprocess.check_call(run_command, shell=True)
            except subprocess.CalledProcessError as e:
                logging.error(
                    "%s: chip_workflow exited with non-zero code %d"
                    % (exp_id, e.returncode))
            else:
                print("%s workflow created" % (experiment['accession']))
                logging.debug(
                    "%s: patching internal_status to url %s"
                    % (exp_id, experiment_url))
                r = common.encoded_patch(
                    experiment_url, keypair, {'internal_status': 'processing'},
                    return_response=True)
                try:
                    r.raise_for_status()
                except:
                    logging.warning(
                        "%s: Failed to update experiment internal_status to processing. Skipping that update."
                        % (exp_id))
                    logging.debug(r.text)
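
# resolve_dx_file() is called above but defined elsewhere in this repo.  A
# minimal sketch of what it plausibly does, assuming the --control argument is
# either a DNAnexus file ID like "file-xxxx" or a file name to search for; the
# real helper's behaviour may differ.
import dxpy

def resolve_dx_file(identifier, project=None):
    if identifier.startswith('file-'):
        return dxpy.DXFile(identifier)
    # fall back to a name-based search; returns None if nothing matches
    return dxpy.find_one_data_object(
        classname='file',
        name=identifier,
        project=project,
        return_handler=True,
        zero_ok=True)
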
def main():
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid,authpw)

    experiments = []
    if args.experiments:
        experiments.extend(args.experiments)
    if args.infile:
        with open(args.infile,'r') as fh:
            experiments.extend([e for e in fh])

    for exp_id in experiments:
        if exp_id.startswith('#'):
            continue
        exp_id = exp_id.rstrip()
        print "Experiment %s" %(exp_id)
        experiment_url = server + '/experiments/%s/' %(exp_id)
        experiment = common.encoded_get(experiment_url, keypair)
        if experiment.get('target'):
            target_url = server + experiment.get('target')
            target = common.encoded_get(target_url, keypair)
        else:
            logging.error('Experiment has no target ... skipping')
            continue

        print "%s %s %s" %(experiment['accession'], target.get('investigated_as'), experiment.get('description'))
        # ctl_id = get_control_id(experiment)
        # if ctl_id:
        #   print "Control %s" %(ctl_id)
        # else:
        #   print "Found no control ... skipping %s" %(exp_id)
        #   continue
        # (rep1_ta,rep1_pe), (rep2_ta,rep2_pe) = get_exp_tas(experiment, server, keypair, args.project, args.inf)
        # (ctl1_ta,ctl1_pe), (ctl2_ta,ctl2_pe) = get_ctl_tas(experiment, server, keypair, args.project, args.inf)

        tas = get_tas(experiment, server, keypair, args.project, args.inf)
        if not tas:
            logging.error('Failed to resolve all tagaligns for %s' %(experiment['accession']))
            continue

        pprint.pprint(tas)
        # sys.exit()
        #continue

        skip_flag = False
        for key,value in tas.iteritems():
            if not value:
                logging.error('Missing %s ... skipping' %(key))
                skip_flag = True
        if skip_flag:
            continue

        workflow_title = '%s Peaks' %(exp_id)
        if args.tag:
            workflow_title += ' %s' %(args.tag)
        outf = args.outf

        if not outf.startswith('/') and outf != '/':
            outf = '/'+outf
        if not outf.endswith('/') and outf != '/':
            outf += '/'
        outf += '%s/peaks/' %(exp_id)

        try:
            investigated_as = target['investigated_as']
        except:
            print "%s: Failed to determine target type ... skipping" %(exp_id)
            continue
        else:
            print investigated_as

        rep1_pe = tas['rep1_ta']['paired_end']
        rep2_pe = tas['rep2_ta']['paired_end']

        if None in [rep1_pe, rep2_pe]:
            print "%s: Cannot determine paired end: rep1 PE = %s, rep2 PE = %s ... skipping" % (
                exp_id,
                rep1_pe,
                rep2_pe)
            continue
        if rep1_pe != rep2_pe:
            print "%s: rep1 PE %s differs from rep2 PE %s ... skipping" % (
                exp_id,
                rep1_pe,
                rep2_pe)
            continue

        if any('histone' in target_type for target_type in investigated_as):
            print "Found to be histone.  No blacklist will be used."
            IDR_default = False
            workflow_spinner = '~/chip-seq-pipeline/dnanexus/histone_workflow.py'
            blacklist = None
        else:
            print "Assumed to be tf"
            IDR_default = True
            workflow_spinner = '~/chip-seq-pipeline/dnanexus/tf_workflow.py'
            if args.assembly == "hg19":
                blacklist = "ENCODE Reference Files:/hg19/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed.gz"
            else:
                print "WARNING: No blacklist known for assembly %s, proceeding with no blacklist" %(args.assembly)
                blacklist = None

        run_command = \
            '%s --title "%s" --outf "%s" --nomap --yes ' % (workflow_spinner, workflow_title, outf) + \
            '--rep1pe %s --rep2pe %s ' % (str(rep1_pe).lower(), str(rep2_pe).lower()) + \
            '--rep1 %s --rep2 %s ' % (tas['rep1_ta'].get('file_id'), tas['rep2_ta'].get('file_id')) + \
            '--ctl1 %s --ctl2 %s ' % (tas['rep1_ta'].get('control_id'), tas['rep2_ta'].get('control_id')) + \
            '--genomesize %s --chrom_sizes "%s"' %(args.gsize, args.csizes)
        if blacklist:
            run_command += ' --blacklist "%s"' %(blacklist)
        if args.debug:
            run_command += ' --debug'
        if args.idr or IDR_default:
            run_command += ' --idr --idrversion %s' %(args.idrversion)

        print run_command
        if args.dryrun:
            logging.info('Dryrun')
        else:
            try:
                subprocess.check_call(run_command, shell=True)
            except subprocess.CalledProcessError as e:
                logging.error("%s exited with non-zero code %d" %(workflow_spinner, e.returncode))
            else:
                print "%s workflow created" %(experiment['accession'])
                logging.debug("patching internal_status to url %s" %(experiment_url))
                r = common.encoded_patch(experiment_url, keypair, {'internal_status':'processing'}, return_response=True)
                try:
                    r.raise_for_status()
                except:
                    logging.error("Tried but failed to update experiment internal_status to processing")
                    logging.error(r.text)
Example #32
def main():
    global args
    args = get_args()

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.experiments:
        exp_ids = csv.reader(
            StringIO.StringIO('\n'.join([s.rstrip()
                                         for s in args.experiments])))
    else:
        exp_ids = csv.reader(args.infile)

    for row in exp_ids:
        if row[0].startswith('#'):
            continue
        exp_id = row[0].strip()
        if len(row) > 1:
            repns = []
            for s in row[1:]:
                repns.extend(s.split(','))
            map_only_reps = list(set([int(s) for s in repns]))
        else:
            map_only_reps = []
        outstrings = []
        encode_url = urlparse.urljoin(server, exp_id)
        experiment = common.encoded_get(encode_url, keypair)
        outstrings.append(exp_id)
        files = files_to_map(experiment, server, keypair, args.no_sfn_dupes)
        outstrings.append(str(len(files)))
        outstrings.append(str([f.get('accession') for f in files]))
        replicates = replicates_to_map(files, server, keypair, map_only_reps)
        biorep_numbers = \
            set([rep.get('biological_replicate_number') for rep in replicates])
        in_process = False
        if files:
            for biorep_n in biorep_numbers:
                outstrings.append('rep%s' % (biorep_n))
                biorep_files = [
                    f for f in files
                    if biorep_n in common.biorep_ns(f, server, keypair)
                ]
                paired_files = []
                unpaired_files = []
                while biorep_files:
                    file_object = biorep_files.pop()
                    if file_object.get(
                            'paired_end'
                    ) == None:  # group all the unpaired reads for this biorep together
                        unpaired_files.append(file_object)
                    elif file_object.get('paired_end') in ['1', '2']:
                        if file_object.get('paired_with'):
                            mate = next((f for f in biorep_files if f.get(
                                '@id') == file_object.get('paired_with')),
                                        None)
                        else:  #have to find the file that is paired with this one
                            mate = next((f for f in biorep_files if f.get(
                                'paired_with') == file_object.get('@id')),
                                        None)
                        if mate:
                            biorep_files.remove(mate)
                        else:
                            logging.warning('%s:%s could not find mate' %
                                            (experiment.get('accession'),
                                             file_object.get('accession')))
                            mate = {}

                        # if mapping as SE, ignore the mate and just map the
                        # rep1 as SE with all the other SE for this rep, if any
                        if args.force_se:
                            unpaired_files.append(
                                next(f for f in [file_object, mate]
                                     if f.get('paired_end') == '1'))
                        else:
                            paired_files.append((file_object, mate))

                if biorep_files:
                    logging.warning(
                        '%s: leftover file(s) %s' %
                        (experiment.get('accession'), biorep_files))
                if paired_files:
                    pe_jobs = \
                        map_only(experiment, biorep_n, paired_files,
                                 server, keypair, args.sex_specific,
                                 args.crop_length, args.accession,
                                 args.fqcheck, args.force_patch,
                                 args.use_existing_folders, args.encoded_check)
                    in_process = True
                if unpaired_files:
                    se_jobs = \
                        map_only(experiment, biorep_n, unpaired_files,
                                 server, keypair, args.sex_specific,
                                 args.crop_length, args.accession,
                                 args.fqcheck, args.force_patch,
                                 args.use_existing_folders, args.encoded_check)
                    in_process = True
                if paired_files and pe_jobs:
                    outstrings.append(
                        'paired:%s' %
                        ([(a.get('accession'), b.get('accession'))
                          for (a, b) in paired_files]))
                    outstrings.append('paired jobs:%s' %
                                      ([j.get_id() for j in pe_jobs]))
                else:
                    outstrings.append('paired:%s' % (None))
                if unpaired_files and se_jobs:
                    outstrings.append(
                        'unpaired:%s' %
                        ([f.get('accession') for f in unpaired_files]))
                    outstrings.append('unpaired jobs:%s' %
                                      ([j.get_id() for j in se_jobs]))
                else:
                    outstrings.append('unpaired:%s' % (None))
            if in_process:
                r = common.encoded_patch(encode_url,
                                         keypair,
                                         {"internal_status": "processing"},
                                         return_response=True)
                try:
                    r.raise_for_status()
                except:
                    logging.error("Tried and failed to set internal_status")
                    logging.error(r.text)
            print('\t'.join(outstrings))
        else:  # no files
            if not replicates:
                logging.warning('%s: No files and no replicates' %
                                experiment.get('accession'))
            else:
                logging.warning('%s: No files to map' %
                                experiment.get('accession'))
        if files and not replicates:
            logging.warning('%s: Files but no replicates' %
                            experiment.get('accession'))
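
# A standalone sketch of the mate-matching loop above: given the files for one
# biological replicate, split them into (read1, read2) pairs and a list of
# unpaired files.  Pairing follows the same rule as the code above - a file
# either names its mate in 'paired_with' or is named by its mate - but this
# helper is illustrative only (it treats a missing mate as unpaired rather than
# pairing with an empty dict) and does not call back to the ENCODE server.
def split_pairs(biorep_files):
    biorep_files = list(biorep_files)  # work on a copy
    paired, unpaired = [], []
    while biorep_files:
        f = biorep_files.pop()
        if f.get('paired_end') not in ['1', '2']:
            unpaired.append(f)
            continue
        mate = next(
            (m for m in biorep_files
             if m.get('@id') == f.get('paired_with')
             or m.get('paired_with') == f.get('@id')),
            None)
        if mate:
            biorep_files.remove(mate)
            paired.append((f, mate))
        else:
            unpaired.append(f)  # mate missing; treat as single-ended here
    return paired, unpaired
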
Example #33
def main():
    global args
    args = get_args()

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.experiments:
        exp_ids = csv.reader(
            StringIO.StringIO('\n'.join([s.rstrip()
                                         for s in args.experiments])))
    else:
        exp_ids = csv.reader(args.infile)

    for instring in exp_ids:
        exp_id = instring[0].strip()
        if len(instring) > 1:
            repns = []
            for s in instring[1:]:
                repns.extend(s.split(','))
            biorep_ns = list(set([int(s) for s in repns]))
        else:
            biorep_ns = []
        outstrings = []
        encode_url = urlparse.urljoin(server, exp_id)
        experiment = common.encoded_get(encode_url, keypair)
        outstrings.append(exp_id)
        files = files_to_map(experiment, server, keypair, args.sfn_dupes)
        outstrings.append(str(len(files)))
        outstrings.append(str([f.get('accession') for f in files]))
        replicates = replicates_to_map(files, server, keypair, biorep_ns)

        if files:
            for biorep_n in set(
                [rep.get('biological_replicate_number')
                 for rep in replicates]):
                outstrings.append('rep%s' % (biorep_n))
                biorep_files = [
                    f for f in files
                    if biorep_n in common.biorep_ns(f, server, keypair)
                ]
                paired_files = []
                unpaired_files = []
                while biorep_files:
                    file_object = biorep_files.pop()
                    if file_object.get(
                            'paired_end'
                    ) == None:  # group all the unpaired reads for this biorep together
                        unpaired_files.append(file_object)
                    elif file_object.get('paired_end') in ['1', '2']:
                        if file_object.get('paired_with'):
                            mate = next((f for f in biorep_files if f.get(
                                '@id') == file_object.get('paired_with')),
                                        None)
                        else:  #have to find the file that is paired with this one
                            mate = next((f for f in biorep_files if f.get(
                                'paired_with') == file_object.get('@id')),
                                        None)
                        if mate:
                            biorep_files.remove(mate)
                        else:
                            logging.warning('%s:%s could not find mate' %
                                            (experiment.get('accession'),
                                             file_object.get('accession')))
                            mate = {}
                        paired_files.append((file_object, mate))
                if biorep_files:
                    logging.warning(
                        '%s: leftover file(s) %s' %
                        (experiment.get('accession'), biorep_files))
                if paired_files:
                    pe_jobs = map_only(experiment, biorep_n, paired_files,
                                       args.key, server, keypair)
                if unpaired_files:
                    se_jobs = map_only(experiment, biorep_n, unpaired_files,
                                       args.key, server, keypair)
                if paired_files and pe_jobs:
                    outstrings.append(
                        'paired:%s' %
                        ([(a.get('accession'), b.get('accession'))
                          for (a, b) in paired_files]))
                    outstrings.append('paired jobs:%s' %
                                      ([j.get_id() for j in pe_jobs]))
                else:
                    outstrings.append('paired:%s' % (None))
                if unpaired_files and se_jobs:
                    outstrings.append(
                        'unpaired:%s' %
                        ([f.get('accession') for f in unpaired_files]))
                    outstrings.append('unpaired jobs:%s' %
                                      ([j.get_id() for j in se_jobs]))
                else:
                    outstrings.append('unpaired:%s' % (None))

            print '\t'.join(outstrings)
        else:  # no files
            if not replicates:
                logging.warning('%s: No files and no replicates' %
                                experiment.get('accession'))
            else:
                logging.warning('%s: No files to map' %
                                experiment.get('accession'))
        if files and not replicates:
            logging.warning('%s: Files but no replicates' %
                            experiment.get('accession'))
def main():
	args = get_args()
	authid, authpw, server = common.processkey(args.key, args.keyfile)
	keypair = (authid,authpw)

	for exp_id in args.infile:
		if exp_id.startswith('#'):
			continue
		exp_id = exp_id.rstrip()
		print "Experiment %s" %(exp_id)
		url = server + '/experiments/%s/' %(exp_id)
		experiment = common.encoded_get(url, keypair)
		if experiment.get('target'):
			url = server + experiment.get('target')
			target = common.encoded_get(url, keypair)
		else:
			logging.error('Experiment has no target ... skipping')
			continue

		print "%s %s %s" %(experiment['accession'], target.get('investigated_as'), experiment.get('description'))
		# ctl_id = get_control_id(experiment)
		# if ctl_id:
		# 	print "Control %s" %(ctl_id)
		# else:
		# 	print "Found no control ... skipping %s" %(exp_id)
		# 	continue
		# (rep1_ta,rep1_pe), (rep2_ta,rep2_pe) = get_exp_tas(experiment, server, keypair, args.project, args.inf)
		# (ctl1_ta,ctl1_pe), (ctl2_ta,ctl2_pe) = get_ctl_tas(experiment, server, keypair, args.project, args.inf)

		tas = get_tas(experiment, server, keypair, args.project, args.inf)
		if not tas:
			logging.error('Failed to resolve all tagaligns for %s' %(experiment['accession']))
			continue

		pprint.pprint(tas)
		# sys.exit()
		#continue

		skip_flag = False
		for key,value in tas.iteritems():
			if not value:
				logging.error('Missing %s ... skipping' %(key))
				skip_flag = True
		if skip_flag:
			continue

		workflow_name = '%s Peaks' %(exp_id)
		if args.tag:
			workflow_name += ' %s' %(args.tag)
		outf = args.outf

		if not outf.startswith('/') and outf != '/':
			outf = '/'+outf
		if not outf.endswith('/') and outf != '/':
			outf += '/'
		outf += '%s/peaks/' %(exp_id)
		try:
			investigated_as = target['investigated_as']
		except:
			print "Failed to determine target type ... skipping %s" %(exp_id)
			continue
		else:
			print investigated_as
		if any('histone' in target_type for target_type in investigated_as):
			print "Found to be histone"
			workflow_spinner = '~/chip-seq-pipeline/dnanexus/histone_workflow.py'
		else:
			print "Assumed to be tf"
			workflow_spinner = '~/chip-seq-pipeline/dnanexus/tf_workflow.py'
		run_command = \
			'%s --name "%s" --outf "%s" --nomap --yes ' %(workflow_spinner, workflow_name, outf) + \
			'--rep1pe false --rep2pe false ' + \
			'--rep1 %s --rep2 %s ' %(tas['rep1_ta'].get('file_id'), tas['rep2_ta'].get('file_id')) + \
			'--ctl1 %s --ctl2 %s ' %(tas['rep1_ta'].get('control_id'), tas['rep2_ta'].get('control_id')) + \
			'--genomesize %s --chrom_sizes "%s" ' %(args.gsize, args.csizes) + \
			'--blacklist "ENCODE Reference Files:/hg19/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed.gz"'
		if args.debug:
			run_command += ' --debug'
		if args.idr:
			run_command += ' --idr --idrversion %s' %(args.idrversion)

		print run_command
		if args.dryrun:
			logging.info('Dryrun')
		else:
			try:
				subprocess.check_call(run_command, shell=True)
			except subprocess.CalledProcessError as e:
				logging.error("%s exited with non-zero code %d" %(workflow_spinner, e.returncode))
			else:
				print "%s workflow created" %(experiment['accession'])
def main():

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.query:
        r = requests.get(
            args.query, auth=keypair, headers={"content-type": "application/json", "accept": "application/json"}
        )
        experiments = r.json()["@graph"]
        exp_ids = [e["accession"] for e in experiments]
    elif args.experiments:
        exp_ids = args.experiments
    else:
        exp_ids = args.infile

    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.strip()
        logger.info("%s" % (exp_id))

        url = urlparse.urljoin(server, "/experiments/%s" % (exp_id))
        experiment_object = common.encoded_get(url, keypair)
        original_files = [
            common.encoded_get(urlparse.urljoin(server, "%s" % (uri)), keypair)
            for uri in experiment_object.get("original_files")
        ]
        bams = [
            f
            for f in original_files
            if f.get("file_format") == "bam" and f.get("status") not in ["revoked", "deleted", "replaced"]
        ]
        fastqs = [
            f
            for f in original_files
            if f.get("file_format") == "fastq" and f.get("status") not in ["revoked", "deleted", "replaced"]
        ]
        for f in fastqs:
            f["replicate"] = common.encoded_get(urlparse.urljoin(server, "%s" % (f.get("replicate"))), keypair)
        for bam in bams:
            bioreps = common.biorep_ns(bam.get("accession"), server, keypair)
            if len(bioreps) != 1:
                logger.error(
                    "Expected to find 1 biorep for bam %s, found %d.  Skipping." % (bam.get("accession"), len(bioreps))
                )
                continue
            else:
                bam_biorep = bioreps[0]
            try:
                derived_from = [
                    common.encoded_get(urlparse.urljoin(server, "%s" % (uri)), keypair)
                    for uri in bam.get("derived_from")
                ]
            except:
                derived_from = None
            if not derived_from:
                logger.error("bam %s is derived from nothing. Skipping" % (bam.get("accession")))
                continue
            for f in derived_from:
                if f.get("file_format") != "fastq":
                    logger.error(
                        "bam %s appears to be derived from non-fastq %s. Continuing with other derived_from files."
                        % (bam.get("accession"), f.get("accession"))
                    )
                    continue
                try:
                    if common.after(f.get("date_created"), bam.get("date_created")):
                        logger.error(
                            "Date conflict. Bam %s is derived from newer Fastq %s"
                            % (bam.get("accession"), f.get("accession"))
                        )
                except:
                    logger.error(
                        "Cannot compare bam date %s with fastq date %s. Continuing with other derived_from files."
                        % (bam.get("date_created"), f.get("date_created"))
                    )
                    continue
            for f in fastqs:
                if f.get("replicate").get("biological_replicate_number") == bam_biorep:
                    if common.after(f.get("date_created"), bam.get("date_created")):
                        logger.info(
                            "bam %s is out-of-date.  fastq %s is newer" % (bam.get("accession"), f.get("accession"))
                        )
                        if re.search("control", experiment_object.get("target").lower()):
                            logger.info(
                                "WARNING, %s is a control experiment so many other experiments may be out-of-date."
                                % (experiment_object.get("accession"))
                            )
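
# common.after() is used above to compare ENCODE 'date_created' stamps but is
# defined in the shared common module, not here.  A plausible sketch, assuming
# ISO-8601 date strings and using dateutil for parsing; the real helper may
# differ in tie-breaking or timezone handling.
from dateutil.parser import parse as parse_date

def after(date1, date2):
    """Return True if date1 is strictly later than date2."""
    return parse_date(date1) > parse_date(date2)
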
Example #36
def main():

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    elif args.created_after:
        analyses = []
        for state in args.state:
            analyses.extend(
                dxpy.find_analyses(name="ENCSR*",
                                   name_mode='glob',
                                   state=state,
                                   include_subjobs=True,
                                   return_handler=True,
                                   created_after="%s" % (args.created_after)))
        ids = [
            analysis.get_id() for analysis in analyses
            if analysis.describe()['executableName'] == 'tf_chip_seq'
            or analysis.describe()['executableName'].startswith(
                'ENCSR783QUL Peaks')
        ]
    elif args.infile:
        ids = args.infile
    else:
        # never reached because infile defaults to stdin
        raise InputError(
            "Must supply analysis id's in arguments, --infile or supply search string in --created_after"
        )

    fieldnames = [
        'date', 'analysis', 'experiment', 'target', 'biosample_term_name',
        'biosample_type', 'lab', 'rfa', 'assembly', 'Nt', 'Np', 'N1', 'N2',
        'rescue_ratio', 'self_consistency_ratio', 'reproducibility_test',
        'state', 'total price', 'notes'
    ]
    writer = csv.DictWriter(sys.stdout,
                            fieldnames=fieldnames,
                            delimiter='\t',
                            quotechar='"')
    writer.writeheader()

    for (i, analysis_id) in enumerate(ids):
        if analysis_id.startswith('#'):
            continue
        analysis_id = analysis_id.rstrip()
        logger.debug('%s' % (analysis_id))
        analysis = dxpy.DXAnalysis(analysis_id)
        desc = analysis.describe()
        project = desc.get('project')

        m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name'])
        if m:
            experiment_accession = m.group(1)
        else:
            logger.error("No accession in %s, skipping." % (desc['name']))
            continue

        experiment = common.encoded_get(
            urlparse.urljoin(server,
                             '/experiments/%s' % (experiment_accession)),
            keypair)
        logger.debug('ENCODEd experiment %s' % (experiment['accession']))
        if args.lab and experiment['lab'].split('/')[2] not in args.lab:
            continue
        try:
            idr_stage = next(
                s['execution'] for s in desc['stages']
                if s['execution']['name'] == "Final IDR peak calls")
        except:
            logging.error('Failed to find final IDR stage in %s' %
                          (analysis_id))
            continue
        else:
            if idr_stage[
                    'state'] != 'done':  #Final IDR peak calls stage not done, so loop through intermediate IDR stages to find errors
                Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None
                notes = []
                # note: this list includes a misspelled form of 'IDR Pooled Pseudoreplicates' because the pipeline stage name was misspelled until 11/13/15; we still need to report on those runs
                idr_stage_names = [
                    'IDR True Replicates', 'IDR Rep 1 Self-pseudoreplicates',
                    'IDR Rep 2 Self-pseudoreplicates',
                    'IDR Pooled Pseudoreplicates', 'IDR Pooled Pseudoeplicates'
                ]
                for stage_name in idr_stage_names:
                    try:
                        idr_stage = next(
                            s['execution'] for s in desc['stages']
                            if s['execution']['name'] == stage_name)
                    except StopIteration:
                        continue
                    except:
                        raise
                    if idr_stage['state'] == 'failed':
                        try:
                            job_log = subprocess.check_output(
                                'dx watch %s' % (idr_stage['id']),
                                shell=True,
                                stderr=subprocess.STDOUT)
                        except subprocess.CalledProcessError as e:
                            job_log = e.output
                        else:
                            job_log = None
                        if job_log:
                            patterns = [
                                r'Peak files must contain at least 20 peaks post-merge'
                            ]
                            for p in patterns:
                                m = re.search(p, job_log)
                                if m:
                                    notes.append("%s: %s" %
                                                 (stage_name, m.group(0)))
                        if not notes:
                            notes.append(idr_stage['failureMessage'])
                try:
                    done_time = next(transition['setAt']
                                     for transition in desc['stateTransitions']
                                     if transition['newState'] == "failed")
                except StopIteration:
                    done_time = "Not done or failed"
                except:
                    raise
            else:
                Np = idr_stage['output'].get('Np')
                N1 = idr_stage['output'].get('N1')
                N2 = idr_stage['output'].get('N2')
                Nt = idr_stage['output'].get('Nt')
                rescue_ratio = idr_stage['output'].get('rescue_ratio')
                self_consistency_ratio = idr_stage['output'].get(
                    'self_consistency_ratio')
                reproducibility_test = idr_stage['output'].get(
                    'reproducibility_test')
                notes = "IDR Complete"
                done_time = next(transition['setAt']
                                 for transition in desc['stateTransitions']
                                 if transition['newState'] == "done")

        if isinstance(done_time, (int, float)):
            date = time.strftime("%Y-%m-%d %H:%M:%S",
                                 time.localtime(done_time / 1000))
        elif done_time:
            date = done_time  # e.g. "Not done or failed"
        else:
            date = "Running"
        analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
            desc.get('project').split('-')[1], desc.get('id').split('-')[1])
        experiment_link = 'https://www.encodeproject.org/experiments/%s' % (
            experiment.get('accession'))
        row = {
            'date':
            date,
            'analysis':
            analysis_link,
            'experiment':
            experiment_link,
            'target':
            experiment['target'].split('/')[2],
            'biosample_term_name':
            experiment.get('biosample_term_name'),
            'biosample_type':
            experiment.get('biosample_type'),
            'lab':
            experiment['lab'].split('/')[2],
            'rfa':
            common.encoded_get(server + experiment.get('award'),
                               keypair).get('rfa'),
            'assembly':
            args.assembly,  #TODO ... derive this from the analysis
            'Np':
            Np,
            'N1':
            N1,
            'N2':
            N2,
            'Nt':
            Nt,
            'rescue_ratio':
            rescue_ratio,
            'self_consistency_ratio':
            self_consistency_ratio,
            'reproducibility_test':
            reproducibility_test,
            'state':
            desc.get('state'),
            'total price':
            desc.get('totalPrice')
        }

        if notes:
            row.update({'notes': '%s' % (notes)})
        else:
            row.update({'notes': '%s' % ('OK')})
        #log = subprocess.check_output('dx watch %s' %(analysis.))
        writer.writerow(row)
def main(outfn, assembly, debug, key, keyfile, dryrun, force, pipeline, analysis_ids=None, infile=None, project=None):

	if debug:
		logger.info('setting logger level to logging.DEBUG')
		logger.setLevel(logging.DEBUG)
	else:
		logger.info('setting logger level to logging.INFO')
		logger.setLevel(logging.INFO)

	if infile is not None:
		infile = dxpy.DXFile(infile)
		dxpy.download_dxfile(infile.get_id(), "infile")
		ids = open("infile",'r')
	elif analysis_ids is not None:
		ids = analysis_ids
	else:
		logger.error("Must supply one of --infile or a list of one or more analysis-ids")
		return

	authid, authpw, server = common.processkey(key, keyfile)
	keypair = (authid,authpw)

	common_metadata.update({'assembly': assembly})

	with open(outfn, 'w') as fh:
		if dryrun:
			fh.write('---DRYRUN: No files have been modified---\n')
		fieldnames = ['analysis','experiment','assembly','dx_pipeline','files','error']
		output_writer = csv.DictWriter(fh, fieldnames, delimiter='\t')
		output_writer.writeheader()

		for (i, analysis_id) in enumerate(ids):
			logger.debug('debug %s' %(analysis_id))
			analysis = dxpy.describe(analysis_id.strip())
			experiment = get_experiment_accession(analysis)
			output = {
				'analysis': analysis_id,
				'experiment': experiment,
				'assembly': assembly
			}
			logger.info('Accessioning analysis name %s executableName %s' %(analysis.get('name'), analysis.get('executableName')))

			if analysis.get('name') == 'histone_chip_seq':
				output.update({'dx_pipeline':'histone_chip_seq'})
				try:
					accessioned_files = accession_peaks_analysis_files(analysis, keypair, server, dryrun, force)
				except:
					accessioned_files = None
					output.update({'error':sys.exc_info()[0]})
				else:
					output.update({'error':""})
			elif analysis.get('executableName') == 'ENCODE mapping pipeline':
				output.update({'dx_pipeline':'ENCODE mapping pipeline'})
				try:
					accessioned_files = accession_mapping_analysis_files(analysis, keypair, server, dryrun, force)
				except:
					accessioned_files = None
					output.update({'error':sys.exc_info()[0]})
				else:
					output.update({'error':""})
			else:
				logger.error('unrecognized analysis pattern %s %s ... skipping.' %(analysis.get('name'), analysis.get('executableName')))
				output.update({'dx_pipeline':'unrecognized'})
				accessioned_files = None
				output.update({'error':'unrecognized analysis pattern %s %s' %(analysis.get('name'), analysis.get('executableName'))})

			file_accessions = [f.get('accession') for f in (accessioned_files or [])]
			logger.info("Accessioned: %s" %(file_accessions))
			output.update({'files':file_accessions})
			output_writer.writerow(output)

	common.touch(outfn)
	outfile = dxpy.upload_local_file(outfn)

	output = {}
	output["outfile"] = dxpy.dxlink(outfile)

	return output
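
# The accessioning loop above records sys.exc_info()[0], which is the exception
# *class*, in the report's 'error' column.  If a fuller error column is wanted,
# one option (a sketch, not what the pipeline currently does) is to capture the
# formatted message of the exception being handled:
import traceback

def describe_error():
    """Return a one-line summary of the exception currently being handled."""
    return traceback.format_exc().strip().splitlines()[-1]

# Inside the except blocks above this could replace sys.exc_info()[0], e.g.:
#     except Exception:
#         accessioned_files = None
#         output.update({'error': describe_error()})
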
def main():

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    elif args.created_after:
        analyses = []
        for state in args.state:
            analyses.extend(
                dxpy.find_analyses(
                    name="ENCSR*",
                    name_mode="glob",
                    state=state,
                    include_subjobs=True,
                    return_handler=True,
                    created_after="%s" % (args.created_after),
                )
            )
        ids = [
            analysis.get_id()
            for analysis in analyses
            if analysis.describe()["executableName"] == "tf_chip_seq"
            or analysis.describe()["executableName"].startswith("ENCSR783QUL Peaks")
        ]
    elif args.infile:
        ids = args.infile
    else:
        # never reached because infile defaults to stdin
        raise InputError("Must supply analysis id's in arguments, --infile or supply search string in --created_after")

    fieldnames = [
        "name",
        "date",
        "analysis",
        "experiment",
        "target",
        "biosample_term_name",
        "biosample_type",
        "lab",
        "rfa",
        "assembly",
        "Nt",
        "Np",
        "N1",
        "N2",
        "rescue_ratio",
        "self_consistency_ratio",
        "reproducibility_test",
        "state",
        "release",
        "total price",
        "notes",
    ]
    writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, delimiter="\t", quotechar='"')
    writer.writeheader()

    for (i, analysis_id) in enumerate(ids):
        if analysis_id.startswith("#"):
            continue
        analysis_id = analysis_id.rstrip()
        logger.debug("%s" % (analysis_id))
        analysis = dxpy.DXAnalysis(analysis_id)
        desc = analysis.describe()
        project = desc.get("project")

        m = re.match("^(ENCSR[0-9]{3}[A-Z]{3}) Peaks", desc["name"])
        if m:
            experiment_accession = m.group(1)
        else:
            logger.error("No accession in %s, skipping." % (desc["name"]))
            continue

        experiment = common.encoded_get(urlparse.urljoin(server, "/experiments/%s" % (experiment_accession)), keypair)
        logger.debug("ENCODEd experiment %s" % (experiment["accession"]))
        if args.lab and experiment["lab"].split("/")[2] not in args.lab:
            continue
        try:
            idr_stage = next(s["execution"] for s in desc["stages"] if s["execution"]["name"] == "Final IDR peak calls")
        except:
            logging.error("Failed to find final IDR stage in %s" % (analysis_id))
            continue
        else:
            if (
                idr_stage["state"] != "done"
            ):  # Final IDR peak calls stage not done, so loop through intermediate IDR stages to find errors
                Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None
                notes = []
                # note: this list includes a misspelled form of 'IDR Pooled Pseudoreplicates' because the pipeline stage name was misspelled until 11/13/15; we still need to report on those runs
                idr_stage_names = [
                    "IDR True Replicates",
                    "IDR Rep 1 Self-pseudoreplicates",
                    "IDR Rep 2 Self-pseudoreplicates",
                    "IDR Pooled Pseudoreplicates",
                    "IDR Pooled Pseudoeplicates",
                ]
                for stage_name in idr_stage_names:
                    try:
                        idr_stage = next(s["execution"] for s in desc["stages"] if s["execution"]["name"] == stage_name)
                    except StopIteration:
                        continue
                    except:
                        raise
                    if idr_stage["state"] == "failed":
                        try:
                            job_log = subprocess.check_output(
                                "dx watch %s" % (idr_stage["id"]), shell=True, stderr=subprocess.STDOUT
                            )
                        except subprocess.CalledProcessError as e:
                            job_log = e.output
                        else:
                            job_log = None
                        if job_log:
                            patterns = [r"Peak files must contain at least 20 peaks post-merge"]
                            for p in patterns:
                                m = re.search(p, job_log)
                                if m:
                                    notes.append("%s: %s" % (stage_name, m.group(0)))
                        if not notes:
                            notes.append(idr_stage["failureMessage"])
                try:
                    done_time = next(
                        transition["setAt"]
                        for transition in desc["stateTransitions"]
                        if transition["newState"] == "failed"
                    )
                except StopIteration:
                    done_time = "Not done or failed"
                except:
                    raise
            else:
                Np = idr_stage["output"].get("Np")
                N1 = idr_stage["output"].get("N1")
                N2 = idr_stage["output"].get("N2")
                Nt = idr_stage["output"].get("Nt")
                rescue_ratio = idr_stage["output"].get("rescue_ratio")
                self_consistency_ratio = idr_stage["output"].get("self_consistency_ratio")
                reproducibility_test = idr_stage["output"].get("reproducibility_test")
                notes = "IDR Complete"
                done_time = next(
                    transition["setAt"] for transition in desc["stateTransitions"] if transition["newState"] == "done"
                )

        if isinstance(done_time, (int, float)):
            date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(done_time / 1000))
        elif done_time:
            date = done_time  # e.g. "Not done or failed"
        else:
            date = "Running"
        analysis_link = "https://platform.dnanexus.com/projects/%s/monitor/analysis/%s" % (
            desc.get("project").split("-")[1],
            desc.get("id").split("-")[1],
        )
        experiment_link = "https://www.encodeproject.org/experiments/%s" % (experiment.get("accession"))
        row = {
            "name": desc.get("name"),
            "date": date,
            "analysis": analysis_link,
            "experiment": experiment_link,
            "target": experiment["target"].split("/")[2],
            "biosample_term_name": experiment.get("biosample_term_name"),
            "biosample_type": experiment.get("biosample_type"),
            "lab": experiment["lab"].split("/")[2],
            "rfa": common.encoded_get(server + experiment.get("award"), keypair).get("rfa"),
            "assembly": args.assembly,  # TODO ... derive this from the analysis
            "Np": Np,
            "N1": N1,
            "N2": N2,
            "Nt": Nt,
            "rescue_ratio": rescue_ratio,
            "self_consistency_ratio": self_consistency_ratio,
            "reproducibility_test": reproducibility_test,
            "state": desc.get("state"),
            "release": experiment["status"],
            "total price": desc.get("totalPrice"),
        }

        if notes:
            row.update({"notes": "%s" % (notes)})
        else:
            row.update({"notes": "%s" % ("OK")})
            # log = subprocess.check_output('dx watch %s' %(analysis.))
        writer.writerow(row)
def main():
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    experiments = []
    if args.experiments:
        experiments.extend(args.experiments)
    if args.infile:
        with open(args.infile, 'r') as fh:
            experiments.extend([e for e in fh])

    for exp_id in experiments:
        if exp_id.startswith('#'):
            continue
        exp_id = exp_id.rstrip()
        print "Experiment %s" % (exp_id)
        experiment_url = server + '/experiments/%s/' % (exp_id)
        experiment = common.encoded_get(experiment_url, keypair)
        if experiment.get('target'):
            target_url = server + experiment.get('target')
            target = common.encoded_get(target_url, keypair)
        else:
            logging.error('Experiment has no target ... skipping')
            continue

        print "%s %s %s" % (experiment['accession'],
                            target.get('investigated_as'),
                            experiment.get('description'))
        # ctl_id = get_control_id(experiment)
        # if ctl_id:
        #   print "Control %s" %(ctl_id)
        # else:
        #   print "Found no control ... skipping %s" %(exp_id)
        #   continue
        # (rep1_ta,rep1_pe), (rep2_ta,rep2_pe) = get_exp_tas(experiment, server, keypair, args.project, args.inf)
        # (ctl1_ta,ctl1_pe), (ctl2_ta,ctl2_pe) = get_ctl_tas(experiment, server, keypair, args.project, args.inf)

        tas = get_tas(experiment, server, keypair, args.project, args.inf)
        if not tas:
            logging.error('Failed to resolve all tagaligns for %s' %
                          (experiment['accession']))
            continue

        pprint.pprint(tas)
        # sys.exit()
        #continue

        skip_flag = False
        for key, value in tas.iteritems():
            if not value:
                logging.error('Missing %s ... skipping' % (key))
                skip_flag = True
        if skip_flag:
            continue

        workflow_title = '%s Peaks' % (exp_id)
        if args.tag:
            workflow_title += ' %s' % (args.tag)
        outf = args.outf

        if not outf.startswith('/') and outf != '/':
            outf = '/' + outf
        if not outf.endswith('/') and outf != '/':
            outf += '/'
        outf += '%s/peaks/' % (exp_id)

        try:
            investigated_as = target['investigated_as']
        except:
            print "%s: Failed to determine target type ... skipping" % (exp_id)
            continue
        else:
            print investigated_as

        rep1_pe = tas['rep1_ta']['paired_end']
        rep2_pe = tas['rep2_ta']['paired_end']

        if None in [rep1_pe, rep2_pe]:
            print "%s: Cannot determine paired end: rep1 PE = %s, rep2 PE = %s ... skipping" % (
                exp_id, rep1_pe, rep2_pe)
            continue
        if rep1_pe != rep2_pe:
            print "%s: rep1 PE %s differs from rep2 PE %s ... skipping" % (
                exp_id, rep1_pe, rep2_pe)
            continue

        if any('histone' in target_type for target_type in investigated_as):
            print "Found to be histone.  No blacklist will be used."
            IDR_default = False
            workflow_spinner = '~/chip-seq-pipeline/dnanexus/histone_workflow.py'
            blacklist = None
        else:
            print "Assumed to be tf"
            IDR_default = True
            workflow_spinner = '~/chip-seq-pipeline/dnanexus/tf_workflow.py'
            if args.assembly == "hg19":
                blacklist = "ENCODE Reference Files:/hg19/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed.gz"
            else:
                print "WARNING: No blacklist known for assembly %s, proceeding with no blacklist" % (
                    args.assembly)
                blacklist = None

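        # Assemble the command line for the workflow-builder script and append optional flags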
        run_command = \
            '%s --title "%s" --outf "%s" --nomap --yes ' % (workflow_spinner, workflow_title, outf) + \
            '--rep1pe %s --rep2pe %s ' % (str(rep1_pe).lower(), str(rep2_pe).lower()) + \
            '--rep1 %s --rep2 %s ' % (tas['rep1_ta'].get('file_id'), tas['rep2_ta'].get('file_id')) + \
            '--ctl1 %s --ctl2 %s ' % (tas['rep1_ta'].get('control_id'), tas['rep2_ta'].get('control_id')) + \
            '--genomesize %s --chrom_sizes "%s"' % (args.gsize, args.csizes)
        if blacklist:
            run_command += ' --blacklist "%s"' % (blacklist)
        if args.debug:
            run_command += ' --debug'
        if args.idr or IDR_default:
            run_command += ' --idr --idrversion %s' % (args.idrversion)

        print run_command
        if args.dryrun:
            logging.info('Dryrun')
        else:
            try:
                subprocess.check_call(run_command, shell=True)
            except subprocess.CalledProcessError as e:
                logging.error("%s exited with non-zero code %d" %
                              (workflow_spinner, e.returncode))
            else:
                print "%s workflow created" % (experiment['accession'])
                logging.debug("patching internal_status to url %s" %
                              (experiment_url))
                r = common.encoded_patch(experiment_url,
                                         keypair,
                                         {'internal_status': 'processing'},
                                         return_response=True)
                try:
                    r.raise_for_status()
                except Exception:
                    logging.error(
                        "Tried but failed to update experiment internal_status to processing"
                    )
                    logging.error(r.text)
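
# get_args() is not part of this snippet. The sketch below is an assumption: a
# minimal argparse parser covering only the attributes the loop above reads
# (flag names are inferred from the args.<name> accesses, not taken from the
# original script).
def get_args():
    import argparse
    parser = argparse.ArgumentParser(
        description='Set up ChIP-seq peak-calling workflows for ENCODE experiments')
    parser.add_argument('experiments', nargs='*', help='experiment accessions')
    parser.add_argument('--infile', help='file with one experiment accession per line')
    parser.add_argument('--key', help='name of the key to use from the keyfile')
    parser.add_argument('--keyfile', help='path to the credentials keyfile')
    parser.add_argument('--project', help='DNAnexus project to search for tagAligns')
    parser.add_argument('--inf', help='input folder to search for tagAligns')
    parser.add_argument('--outf', help='output folder root for the workflow')
    parser.add_argument('--tag', help='string appended to the workflow title')
    parser.add_argument('--assembly', help='genome assembly, e.g. hg19')
    parser.add_argument('--gsize', help='genome size passed through as --genomesize')
    parser.add_argument('--csizes', help='chrom.sizes reference passed through as --chrom_sizes')
    parser.add_argument('--idr', action='store_true', help='force IDR even for histone targets')
    parser.add_argument('--idrversion', help='IDR version to request')
    parser.add_argument('--dryrun', action='store_true', help='print the command without running it')
    parser.add_argument('--debug', action='store_true', help='pass --debug to the workflow builder')
    return parser.parse_args()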
Example #40
def main():

    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.query:
        r = requests.get(args.query,
                         auth=keypair,
                         headers={
                             'content-type': 'application/json',
                             'accept': 'application/json'
                         })
        experiments = r.json()['@graph']
        exp_ids = [e['accession'] for e in experiments]
    elif args.experiments:
        exp_ids = args.experiments
    else:
        # materialize args.infile so len() below works even when it is a file handle
        exp_ids = list(args.infile)

    logger.info('Checking %d experiments' % (len(exp_ids)))
    for (i, exp_id) in enumerate(exp_ids):
        exp_id = exp_id.strip()
        #logger.info('%s' %(exp_id))

        url = urlparse.urljoin(server, '/experiments/%s' % (exp_id))
        experiment_object = common.encoded_get(url, keypair)
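        # Fetch every original file and keep only current bams and fastqs
        # (revoked/deleted/replaced files are ignored)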
        original_files = [
            common.encoded_get(urlparse.urljoin(server, '%s' % (uri)), keypair)
            for uri in experiment_object.get('original_files')
        ]
        bams = [
            f for f in original_files if f.get('file_format') == 'bam'
            and f.get('status') not in ['revoked', 'deleted', 'replaced']
        ]
        fastqs = [
            f for f in original_files if f.get('file_format') == 'fastq'
            and f.get('status') not in ['revoked', 'deleted', 'replaced']
        ]
        for f in fastqs:
            f['replicate'] = common.encoded_get(
                urlparse.urljoin(server, '%s' % (f.get('replicate'))), keypair)
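        # Flag a bam as out-of-date if any fastq it derives from, or any fastq in the
        # same biological replicate, was created after the bam itself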
        for bam in bams:
            bioreps = common.biorep_ns(bam.get('accession'), server, keypair)
            if len(bioreps) != 1:
                logger.error(
                    "Expected to find 1 biorep for bam %s, found %s.  Skipping."
                    % (bam.get('accession'), bioreps))
                continue
            else:
                bam_biorep = bioreps[0]
            try:
                derived_from = [
                    common.encoded_get(urlparse.urljoin(server, '%s' % (uri)),
                                       keypair)
                    for uri in bam.get('derived_from')
                ]
            except Exception:
                derived_from = None
            if not derived_from:
                logger.error('bam %s is derived from nothing. Skipping' %
                             (bam.get('accession')))
                continue
            for f in derived_from:
                if f.get('output_category') == 'reference':
                    continue
                if f.get('file_format') != 'fastq':
                    logger.error(
                        "bam %s appears to be derived from non-fastq %s. Continuing with other derived_from files."
                        % (bam.get('accession'), f.get('accession')))
                    continue
                try:
                    if common.after(f.get('date_created'),
                                    bam.get('date_created')):
                        logger.error(
                            "Date conflict. Bam %s is derived from newer Fastq %s"
                            % (bam.get('accession'), f.get('accession')))
                except Exception:
                    logger.error(
                        "Cannot compare bam date %s with fastq date %s. Continuing with other derived_from files."
                        % (bam.get('date_created'), f.get('date_created')))
                    continue
            for f in fastqs:
                if f.get('replicate').get(
                        'biological_replicate_number') == bam_biorep:
                    if common.after(f.get('date_created'),
                                    bam.get('date_created')):
                        logger.info(
                            "bam %s is out-of-date.  fastq %s is newer" %
                            (bam.get('accession'), f.get('accession')))
                        if re.search('control',
                                     (experiment_object.get('target') or '').lower()):
                            logger.info(
                                "WARNING, %s is a control experiment so many other experiments may be out-of-date."
                                % (experiment_object.get('accession')))