Пример #1
0
def test_chromsizes():
    """Exercise the pybedtools chromsizes helpers.

    The two argument-validation checks (bad ``mysql`` path -> OSError,
    ``timeout=0`` -> ValueError) always run.  The remaining checks sit in a
    ``try`` block: an OSError raised there is reported on stdout as a mysql
    error instead of failing the test.

    The ``hg17.genome`` file written during the test is removed in a
    ``finally`` clause, so it is cleaned up even when an assertion fails.
    """
    assert_raises(OSError, pybedtools.get_chromsizes_from_ucsc, 'dm3', mysql='wrong path')
    assert_raises(ValueError, pybedtools.get_chromsizes_from_ucsc, 'dm3', timeout=0)

    genome_fn = 'hg17.genome'
    try:
        # print() with a single argument behaves the same on Python 2 and 3.
        print(pybedtools.chromsizes('dm3'))
        print(pybedtools.get_chromsizes_from_ucsc('dm3'))
        assert pybedtools.chromsizes('dm3') == pybedtools.get_chromsizes_from_ucsc('dm3')

        hg17 = pybedtools.chromsizes('hg17')

        assert hg17['chr1'] == (0, 245522847)

        expected = 'chr1\t245522847\n'

        # explicitly named output file
        fn = pybedtools.chromsizes_to_file(hg17, fn=genome_fn)
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        # make sure the tempfile version works, too
        fn = pybedtools.chromsizes_to_file(hg17, fn=None)
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        assert_raises(OSError,
                      pybedtools.get_chromsizes_from_ucsc,
                      **dict(genome='hg17', mysql='nonexistent'))
    except OSError:
        sys.stdout.write("mysql error -- test for chromsizes from UCSC didn't run")
    finally:
        # Guarded so that a run which never got as far as writing the file
        # does not raise a second OSError from the cleanup itself.
        if os.path.exists(genome_fn):
            os.unlink(genome_fn)
Пример #2
0
def _ensure_parent_dir(path):
    """Create the directory that will contain ``path`` if it is missing."""
    parent = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs("") raises.
    if parent:
        os.makedirs(parent, exist_ok=True)


def generate_sample_input_files(analysis, matrix):
    """Generate input files (BAM, peaks) for a sample depending on its data type.

    For every sample in ``analysis.samples`` the following attributes are
    looked up; each one that exists and is not None triggers the matching
    generator:

    - ``aligned_filtered_bam`` -> ``generate_bam_file``
    - ``peaks`` -> ``generate_peak_file(..., summits=False)``
    - ``summits`` -> ``generate_peak_file(..., summits=True)``
    - ``log2_read_counts`` (mapping of resolution -> file path)
      -> ``generate_log2_profiles``

    Parent directories of the output paths are created as needed.

    Parameters
    ----------
    analysis
        Object providing ``data_type``, ``genome`` and ``samples``, plus
        ``sites`` (or a ``load_data`` method able to load it) when peak or
        summit files are requested.
    matrix
        Count data indexed as ``matrix.loc[:, sample.name]`` for BAM files
        and as ``matrix[res].loc[:, sample.name]`` for log2 profiles.

    Raises
    ------
    AttributeError
        If the data type is region-based and ``analysis.sites`` is still
        missing after attempting to load it.
    """
    # Stays None for non region-based data types; it is then forwarded as
    # ``chrom_sizes_file=None`` instead of raising NameError further down.
    chrom_sizes_file = None

    if analysis.data_type in REGION_BASED_DATA_TYPES:
        # Reserve a temp path that survives the handle being closed; the
        # chromosome sizes are then written to that path.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            chrom_sizes_file = tmp.name
        pybedtools.get_chromsizes_from_ucsc(genome=analysis.genome,
                                            saveas=chrom_sizes_file)

        if not hasattr(analysis, "sites"):
            analysis.load_data(only_these_keys=["sites"], permissive=True)
        if not hasattr(analysis, "sites"):
            raise AttributeError(
                "Need a consensus peak set to generate sample input files.")

    for sample in analysis.samples:
        bam_path = getattr(sample, "aligned_filtered_bam", None)
        if bam_path is not None:
            _ensure_parent_dir(bam_path)
            generate_bam_file(
                matrix.loc[:, sample.name],
                bam_path,
                genome_assembly=analysis.genome,
                chrom_sizes_file=chrom_sizes_file,
            )

        peaks_path = getattr(sample, "peaks", None)
        if peaks_path is not None:
            _ensure_parent_dir(peaks_path)
            generate_peak_file(analysis.sites,
                               peaks_path,
                               summits=False,
                               genome_assembly=analysis.genome)

        summits_path = getattr(sample, "summits", None)
        if summits_path is not None:
            _ensure_parent_dir(summits_path)
            generate_peak_file(analysis.sites,
                               summits_path,
                               summits=True,
                               genome_assembly=analysis.genome)

        log2_files = getattr(sample, "log2_read_counts", None)
        if log2_files is not None:
            for res, path in log2_files.items():
                _ensure_parent_dir(path)
                generate_log2_profiles(
                    (2**matrix[res].loc[:, sample.name]).astype(int),
                    # NOTE: this should be the background vector; the
                    # sample's own counts are passed as a stand-in.
                    (2**matrix[res].loc[:, sample.name]).astype(int),
                    path,
                )
Пример #3
0
def generate_bam_file(count_vector,
                      output_bam,
                      genome_assembly="hg38",
                      chrom_sizes_file=None,
                      index=True):
    """Generate BAM file containing reads matching the counts in a vector of features.

    Parameters
    ----------
    count_vector
        Series-like object whose index is converted to BED intervals with
        ``location_index_to_bed`` and whose values give the number of reads
        to emit for each interval.  Its ``name`` is used as the read-name
        prefix.
    output_bam
        Path the BAM file is saved to.
    genome_assembly
        Genome name used to truncate intervals to chromosome bounds and,
        when ``chrom_sizes_file`` is None, to fetch chromosome sizes.
    chrom_sizes_file
        Path to a chromosome sizes file.  When None, one is fetched with
        ``pybedtools.get_chromsizes_from_ucsc`` into a temporary file.
    index
        Whether to index the resulting BAM file with ``pysam``.
    """
    regions = location_index_to_bed(count_vector.index)

    # get reads per region: repeat each region label once per counted read.
    # Series.items() is used because iteritems() was removed in pandas 2.0.
    labels = [
        label for label, count in count_vector.items() for _ in range(count)
    ]
    regions = regions.reindex(labels)

    # shorten/enlarge by a random fraction; name reads.
    # Start and end are each shifted by up to +/-20% of the region width.
    widths = regions["end"] - regions["start"]
    n_reads = regions.shape[0]
    regions = regions.assign(
        start=(regions["start"] +
               widths * np.random.uniform(-0.2, 0.2, n_reads)).astype(int),
        end=(regions["end"] +
             widths * np.random.uniform(-0.2, 0.2, n_reads)).astype(int),
        name=[
            "{}_read_{}".format(count_vector.name, k) for k in range(n_reads)
        ],
    )

    bed = pybedtools.BedTool.from_dataframe(regions).truncate_to_chrom(
        genome=genome_assembly).sort()

    # get a file with chromosome sizes (usually not needed but only for bedToBam)
    if chrom_sizes_file is None:
        # delete=False keeps the reserved path in place after the handle is
        # closed, rather than relying on the temp object being collected.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            chrom_sizes_file = tmp.name
        pybedtools.get_chromsizes_from_ucsc(genome=genome_assembly,
                                            saveas=chrom_sizes_file)
    bed.to_bam(g=chrom_sizes_file).saveas(output_bam)

    if index:
        # Imported lazily so pysam is only needed when indexing is requested.
        import pysam

        pysam.index(output_bam)
Пример #4
0
def test_chromsizes():
    """Exercise the pybedtools chromsizes helpers.

    The two argument-validation checks (bad ``mysql``/``fetchchromsizes``
    paths -> OSError, ``timeout=0`` -> ValueError) always run.  The remaining
    checks sit in a ``try`` block: an OSError raised there is reported on
    stdout as a mysql error instead of failing the test.

    The ``hg17.genome`` file written during the test is removed in a
    ``finally`` clause, so it is cleaned up even when an assertion fails.
    """
    with pytest.raises(OSError):
        pybedtools.get_chromsizes_from_ucsc("dm3",
                                            mysql="wrong path",
                                            fetchchromsizes="wrongtoo")
    with pytest.raises(ValueError):
        pybedtools.get_chromsizes_from_ucsc("dm3", timeout=0)

    genome_fn = "hg17.genome"
    try:
        print(pybedtools.chromsizes("dm3"))
        print(pybedtools.get_chromsizes_from_ucsc("dm3"))
        assert pybedtools.chromsizes(
            "dm3") == pybedtools.get_chromsizes_from_ucsc("dm3")

        hg17 = pybedtools.chromsizes("hg17")

        assert hg17["chr1"] == (0, 245522847)

        expected = "chr1\t245522847\n"

        # explicitly named output file
        fn = pybedtools.chromsizes_to_file(hg17, fn=genome_fn)
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        # make sure the tempfile version works, too
        fn = pybedtools.chromsizes_to_file(hg17, fn=None)
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        with pytest.raises(OSError):
            pybedtools.get_chromsizes_from_ucsc(**dict(
                genome="hg17", mysql="nonexistent", fetchchromsizes="missing"))
    except OSError:
        sys.stdout.write(
            "mysql error -- test for chromsizes from UCSC didn't run")
    finally:
        # Guarded so that a run which never got as far as writing the file
        # does not raise a second OSError from the cleanup itself.
        if os.path.exists(genome_fn):
            os.unlink(genome_fn)
Пример #5
0
def enrichment(id, a, b, background, organism, name=None, score=None, strand=None, n=10, run=None):
    """Perform enrichment analysis between two BED files.

    id - run identifier, used in log and progress messages
    a - path to Feature of Interest BED file (FOI)
    b - path to Genomic Feature BED file (GF)
    background - path to a custom background BED file; it is only loaded
        when the path exists
    organism - genome name passed to pybedtools.get_chromsizes_from_ucsc
    name, score, strand - passed to make_filter to build the filter applied
        to the GF file
    n - number of Monte-Carlo iterations
    run - collection of analysis names to execute.  Recognised names are
        'pvalue', 'pybedtool', 'jaccard', 'kolmogorov', 'proximity' and
        'hypergeometric'; results of analyses that are not requested are
        reported as "NA".  Defaults to running none of them.

    Returns an Enrichment record.  When either file has no features after
    filtering, every result field of that record is "NA".
    """
    # A None default avoids sharing one mutable list between calls.
    if run is None:
        run = []

    write_debug("START", True)
    r = {}
    e = Enrichment_Par(a=a, b=b, organism=organism, n=n, background=background)

    # Skip the lookup when no background was given; os.path.exists(None)
    # raises TypeError.
    if e.background and os.path.exists(e.background):
        # Uses _replace like every other update below (presumably
        # Enrichment_Par is a namedtuple, which has no ``replace`` method).
        e = e._replace(Background=BedTool(e.background))

    e = e._replace(A=BedTool(str(e.a)))
    e = e._replace(B=BedTool(str(e.b)))
    e = e._replace(genome=pybedtools.get_chromsizes_from_ucsc(e.organism))
    e = e._replace(genome_fn=pybedtools.chromsizes_to_file(e.genome))

    e = e._replace(organism=str(e.organism))
    flt = make_filter(name, score, strand)
    e = e._replace(B=e.B.filter(flt).saveas())
    e = e._replace(nA=len(e.A))
    e = e._replace(nB=len(e.B))
    # Exits if there are 0 GFs or 0 FOI
    if not e.nA or not e.nB:
        logger.info("Filter resulted in 0 Features of Interest. Terminating Run. {} (id={})".format(b, id))
        # The 13 result fields that follow the four identifying fields are all "NA".
        return Enrichment(e.a, basename(e.b), e.nA, e.nB, *(["NA"] * 13))

    e.A.set_chromsizes(e.genome)
    e.B.set_chromsizes(e.genome)
    e = e._replace(obs=len(e.A.intersect(e.B, u=True)))

    # This is the Monte-Carlo step.  If custom background present, it is used
    if 'pvalue' in run:
        logger.info("Running Monte Carlo ({}): (id={})".format(b, id))
        write_progress(id, "Running Monte Carlo {}".format(b))
        r.update(run_montecarlo(e))
    else:
        r['p_value'], r['exp'] = "NA", "NA"
        logger.info("Skipping Monte Carlo ({}): (id={})".format(b, id))
    ## Uncomment to print global parameters in debug file
    #write_debug("Global parameters",a = e.a,b=e.b,A= e.A,B=e.B,background = e.background,
    #            n=e.n,flt = e.flt,genome = e.genome,genome_fn = e.genome_fn,organism = e.organism,obs = e.obs)

    # expected calculated using pybed method CANNOT use custom background
    if 'pybedtool' in run:
        logger.info("Running Random Intersections ({}): (id={})".format(b, id))
        write_progress(id, "Running Random Intersections: {0}".format(b))
        r.update(run_pybedtool(e))
    else:
        r['pybedp_value'], r['pybed_exp'] = "NA", "NA"
        logger.info("Skipping Random Intersections")

    # expected calculated using jaccard method
    if 'jaccard' in run:
        logger.info("Running Jaccard ({}): (id={})".format(e.b, id))
        write_progress(id, "Running Jaccard {}".format(e.b))
        r.update(run_jaccard(e))
    else:
        r['jaccardp_value'], r['jaccard_obs'], r['jaccard_exp'] = "NA", "NA", "NA"
        logger.info("Skipping Jaccard ({}): (id={})".format(b, id))

    # run Kolmogorov-Smirnov test
    if 'kolmogorov' in run:
        logger.info("Running Kolmogorov-Smornov {} (id={})".format(b, id))
        write_progress(id, "Running Kolmogorov-Smornov{}".format(b))
        r.update(run_kolmogorov(e))
    else:
        r['kol_smor_p_value'] = "NA"
        logger.info("Skipping Kolmogorov-Smornov {} (id={})".format(b, id))

    # run proximity analysis
    if 'proximity' in run:
        logger.info("Running proximity {} (id={})".format(b, id))
        write_progress(id, "Running proximity analysis{}".format(b))
        r.update(run_proximity(e))
    else:
        logger.info("Skipping Proximity")
        r['obsprox'], r['expprox'], r['proximityp_value'] = "NA", "NA", "NA"

    # run hypergeometric distribution analysis
    if 'hypergeometric' in run:
        write_progress(id, "Running")
        logger.info("Running hypergeometric analysis {} (id={})".format(b, id))
        r.update(run_hypergeometric(e))
    else:
        logger.info("Skipping hypergeometric")
        r['hypergeometric_p_value'] = "NA"
    ## Uncomment to print global parameters in debug file
    #write_debug("Global parameters",a = e.a,b=e.b,A= e.A,B=e.B,background = e.background,
    #        n=e.n,flt = e.flt,genome = e.genome,genome_fn = e.genome_fn,organism = e.organism,obs = e.obs)

    # the order of these arguments IS IMPORTANT
    return Enrichment(
        e.a, basename(e.b), e.nA, e.nB, e.obs,
        r['exp'], r['p_value'], r['obsprox'], r['expprox'],
        r['pybedp_value'], r['pybed_exp'],
        r['jaccard_obs'], r['jaccardp_value'], r['jaccard_exp'],
        r['proximityp_value'], r['kol_smor_p_value'], r['hypergeometric_p_value'],
    )