def processBed(inbed, col, posF, posS, delimeter, outbasename):
    """Annotate a BED file with feature lengths, save it and plot the result.

    Every feature of ``inbed`` is passed through
    ``addFeatureLengthToScoreCol`` together with ``col``, ``posF``, ``posS``
    and ``delimeter``.  The transformed features are written to
    ``<outbasename>.bed`` and the saved BedTool is handed, with the same
    arguments, to ``plotFeaturesLength``.
    """
    logging.info("Creating BedTool object.")
    source = BedTool(inbed)

    logging.info("Retrieving feature length.")
    annotated = source.each(addFeatureLengthToScoreCol, col, posF, posS,
                            delimeter)
    saved = annotated.saveas(outbasename + ".bed")

    plotFeaturesLength(saved, col, posF, posS, delimeter, outbasename)
Exemplo n.º 2
0
def promoter(bed, n, genome, delimiter, index, out):
    """Rewrite ``bed`` into ``out``, modifying one boundary feature per gene.

    Features are grouped by gene name, taken from field ``index`` of the BED
    name column split on ``delimiter``.  Features of one gene are assumed to
    be consecutive in the file.

    * "+" strand: the first feature of a gene is replaced by the output of
      ``add_bp_positive_feature(feature, n)``; later features are copied.
    * any other strand: the last feature of a gene is replaced by the output
      of ``add_bp_negative_feature(feature, n, genome_dict)``; earlier
      features are copied.

    Parameters
    ----------
    bed : path (or anything ``BedTool`` accepts) of the input features
    n : value forwarded to the ``add_bp_*`` helpers
    genome : input for ``process_genome``; its result is forwarded to
        ``add_bp_negative_feature``
    delimiter, index : how the gene name is extracted from the name column
    out : path of the output file; an existing file is removed first
    """
    if os.path.exists(out):
        os.remove(out)

    gnm_d = process_genome(genome)
    bedobj = BedTool(bed)
    # NOTE(review): BedTool.each() is lazy and its result is discarded here,
    # so check_name_col is presumably never invoked -- confirm whether the
    # validation was meant to be forced.
    bedobj.each(check_name_col)
    # Gene name of every record, used to look one record ahead.
    fnames_list = [f.name.split(delimiter)[index] for f in bedobj]
    last_idx = len(fnames_list) - 1
    previous_g = ""

    with open(out, 'w') as outfile:
        for obj_idx, f in enumerate(bedobj):
            gname = get_feature_name(f.name, delimiter, index)
            is_new_gene = gname != previous_g
            previous_g = gname

            if f.strand == "+":
                # Only the first feature of a "+" gene is modified.
                if is_new_gene:
                    outfile.write(add_bp_positive_feature(f, n))
                else:
                    outfile.write(str(f))
            elif obj_idx == last_idx:
                # Last record of the file: there is no next record to compare.
                # NOTE(review): a last record whose strand is neither "+" nor
                # "-" is not written at all (behaviour kept from the original).
                if f.strand == "-":
                    outfile.write(add_bp_negative_feature(f, n, gnm_d))
            elif fnames_list[obj_idx + 1] == gname:
                # Not yet the last feature of this gene: copy unchanged.
                outfile.write(str(f))
            else:
                # Last feature of a non-"+" gene (also single-feature genes).
                outfile.write(add_bp_negative_feature(f, n, gnm_d))
def processIntronicBed(inbed,lengths,outbasename,featureType):
    """Score features by length, save them, and plot all plus filtered subsets.

    The features of ``inbed`` are passed through ``addIntronLengthToScoreCol``
    and saved as ``<outbasename>.bed``; that set is plotted with a cutoff
    label of 0.  For every value in ``lengths`` the saved set is additionally
    run through ``filterByLength``, saved as
    ``<outbasename>_smaller<value>.bed`` and plotted with that value.
    """
    logging.info("Creating BedTool object.")
    source = BedTool(inbed)

    logging.info("Writing " + featureType + " length.")
    full_path = outbasename + ".bed"
    scored = source.each(addIntronLengthToScoreCol).saveas(full_path)
    plotIntrons(scored, outbasename, featureType, 0)

    for cutoff in lengths:
        subset_path = outbasename + "_smaller" + str(cutoff) + ".bed"
        shorter = scored.each(filterByLength, cutoff).saveas(subset_path)
        plotIntrons(shorter, outbasename, featureType, cutoff)
Exemplo n.º 4
0
def get_merged_exons(genes, gtf, genome_fasta, strand):
    '''
    Get all exons from the specified genes, merging any overlapping exonic
    regions, and return them together with their sequences.

    genes: a single gene name (str) or an indexable collection of gene names;
        the first name is used to label the merged blocks.
    gtf: DataFrame with at least the columns 'gene' and 'strand'; written
        out tab-separated (without the 'gene' column) and loaded by BedTool.
    genome_fasta: FASTA file passed to BedTool.sequence() as ``fi``.
    strand: strand assigned to the blocks; '' means "use the strand of the
        first matching GTF row".

    Returns (blocks, block_seqs): ``blocks`` is a DataFrame with columns
    chr, start, end, name, score, strand, where name is '<gene>|<number>';
    ``block_seqs`` is whatever get_block_seqs() returns for the extracted
    sequences.  When no GTF row matches, returns (empty DataFrame, {}).
    '''
    # Series.isin() rejects a bare string, so wrap a single gene name.
    gene_list = [genes] if isinstance(genes, str) else genes

    gene_gtf = gtf[gtf.gene.isin(gene_list)]
    if gene_gtf.empty:
        return pd.DataFrame(), {}
    gene_gtf = gene_gtf.drop('gene', axis=1)
    gene_strand = gene_gtf.strand.values[0]
    strand = gene_strand if strand == '' else strand

    with tempfile.NamedTemporaryFile(mode='r+') as temp_gtf:
        gene_gtf.to_csv(temp_gtf.name, index=False, header=False, sep='\t')

        # load gene GTF info, extract and merge exons
        g = BedTool(temp_gtf.name)
        exons = BedTool(subset_featuretypes(g, 'exon'))
        exons = exons.remove_invalid().sort().merge()

        # merged intervals carry no strand; add it before the stranded
        # sequence extraction
        exseq = exons.each(add_strand, strand)
        exseq = exseq.sequence(fi=genome_fasta, s=True)
        block_seqs = get_block_seqs(exseq)

        with tempfile.NamedTemporaryFile(mode='r+') as temp_exons:
            exons.saveas(temp_exons.name)
            # read back by name so the result does not depend on how
            # saveas() wrote the file relative to the open handle
            blocks = pd.read_csv(temp_exons.name,
                                 header=None,
                                 sep='\t',
                                 names=['chr', 'start', 'end'])

    blocks['name'] = gene_list[0]  # use first gene as representative
    blocks['score'] = '.'
    blocks['strand'] = strand

    blocks['chr'] = blocks['chr'].map(str)
    blocks.start = blocks.start.map(int)
    blocks.end = blocks.end.map(int)

    # reverse numbering if the gene is on the reverse strand
    numbering = range(1, len(blocks) + 1)
    if gene_strand == '-':
        numbering = reversed(numbering)
    blocks['name'] = blocks['name'] + ['|' + str(i) for i in numbering]

    return (blocks, block_seqs)
Exemplo n.º 5
0
elif args.verbose:
    logging.basicConfig(level=logging.INFO,
                        format="%(filename)s - %(levelname)s - %(message)s")
else:
    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
logging.info("Parsed arguments:")
if args.outfile:
    logging.info("  outfile: enabled writing to file")
# Logged exactly once, whether or not an output file was requested.
logging.info("  outfile: '{}'".format(args.outfile))
logging.info("")

# data processing
alns = BedTool(args.infile)
# select either from 5' or 3'-end
if args.threeprime:
    clnts = alns.each(three_prime, upstream=0, downstream=1)
else:
    clnts = alns.each(five_prime, upstream=1, downstream=0)

# write to file or to stdout
if args.outfile:
    clnts.saveas(args.outfile)
else:
    # saveas() without a name materialises the result in a temporary file,
    # which is then copied line by line to stdout.
    tmptool = clnts.saveas()
    logging.debug("results written to temporary file :" + tmptool.fn)
    with open(tmptool.fn) as tmp:
        for line in tmp:
            stdout.write(line)
Exemplo n.º 6
0
# first flank the introns by $flank either side

def flank_introns(feature):
	"""Widen ``feature`` in place by the module-level ``flank`` on both sides.

	``flank`` is converted with ``int()``.  The start coordinate is clamped
	at 0 so that features lying closer than ``flank`` to the beginning of a
	sequence do not end up with a negative start.  Returns the same (mutated)
	feature so the function can be used with ``BedTool.each``.
	"""
	width = int(flank)
	feature.start = max(0, feature.start - width)
	feature.end = feature.end + width
	return feature

#if not os.path.exists(os.path.dirname(outFile)):
#	os.mkdir(os.path.dirname(outFile))

#if not outFile.endswith("csv"):
#	outFile = outFile + "_" + str(flank) + "_coverage.csv"

# Widen every intron by `flank` bases on each side (see flank_introns) so the
# regions either side of the intron are included.
flanked = introns.each(flank_introns)

# intersect the flanked introns with the iCLIP clusters
# (wa/wb: keep both the intron record and the overlapping cluster record;
# `stranded` controls whether only same-strand overlaps are reported)
#intersect = introns.intersect(clusters, s=True,wa=True, wb = True)
intersect = flanked.intersect(clusters, s=stranded,wa=True, wb = True)

# debugging output (Python 2 print statement)
print type(intersect)

# store as dictionary
intervals = dict()
for feature in intersect:
		coord = str(feature.chrom) + ":" + str(feature.start) + "-" + str(feature.stop)
		length = int(feature.stop) - int(feature.start)
		strand = str(feature.strand)
		start_pos = int(feature.start)
		if coord not in intervals:
Exemplo n.º 7
0
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
elif args.verbose:
    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
else:
    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
logging.info("Parsed arguments:")
if args.outfile:
    logging.info("  outfile: enabled writing to file")
# Logged exactly once, whether or not an output file was requested.
logging.info("  outfile: '{}'".format(args.outfile))
logging.info("")

# data processing
alns = BedTool(args.infile)
# select either from 5' or 3'-end
if args.threeprime:
    clnts = alns.each(three_prime, upstream=0, downstream=1)
else:
    clnts = alns.each(five_prime, upstream=1, downstream=0)

# write to file or to stdout
if args.outfile:
    clnts.saveas(args.outfile)
else:
    # saveas() without a name materialises the result in a temporary file,
    # which is then copied line by line to stdout.
    tmptool = clnts.saveas()
    logging.debug("results written to temporary file :" + tmptool.fn)
    with open(tmptool.fn) as tmp:
        for line in tmp:
            stdout.write(line)
Exemplo n.º 8
0
        feature.strand = "+"
        return (feature)
    elif feature.strand == "+":
        feature.end = feature.start
        feature.start = feature.start - 10
        # switch strand
        feature.strand = "-"
        return (feature)
    else:
        print "feature must be + or - "
    return (feature)


print("getting flanking coordinates")

# Run every cluster through get_downstream_sequence, which rewrites the
# feature's coordinates and flips its strand.
downstream = clusters.each(get_downstream_sequence)

# get fasta sequence for each coordinate

print("retrieving genomic sequences")
# s=True: strand-aware extraction; tab=True: tab-delimited name/sequence output
fasta = downstream.sequence(fi=reference, s=True, tab=True)

# apply tests. 1. is there a stretch of 6*A or are there 7 A in any order?
print("testing each cluster for A-rich sequence")

# random_primers = []
# test = []
# with open(fasta.seqfn, "r") as f:
# 	for line in f:
# 		seq = line.strip().upper().split("\t")[1]
# 		if "AAAAAA" in seq or seq.count("A") >= 7:
Exemplo n.º 9
0
        for fup, core, fdown in izip(fup_reader, core_reader, fdown_reader):
            assert fup[0] == core[0] == fdown[
                0], "Error: sequence ids of cores and flanks don't match."
            # setup fasta headers and sequences
            fa_header = ">" + core[0]
            seq_viewpoint = fup[1].lower() + core[1].upper() + fdown[1].lower()
            # seq_normal = fup[1].upper() + core[1].upper() + fdown[1].upper()

            viewpointfa.write(fa_header + "\n")
            viewpointfa.write(seq_viewpoint + "\n")
    viewpointfa.close()


# prepare input coordinates
# Sort the binding sites; saveas() without a file name stores the result in a
# temporary file.
bsites = BedTool(args.bsites_fn).sort().saveas()
# Pass every binding site through `midpoint` and store that result as well.
centers = bsites.each(midpoint).saveas()

# prepare positive instances
logging.info("preparing positive instances")
if (args.chromosome_limits):
    logging.debug("using chromosome_limits " + args.chromosome_limits)
    cores = centers.slop(
        s=True,
        l=int(args.core_length / 2),
        # -1 to account for the center nucleotide!
        r=int(args.core_length / 2) + (args.core_length % 2) - 1,
        g=args.chromosome_limits).each(offset_zero_by_one).saveas(
            pos_core_bed_fn)
else:
    cores = centers.slop(
        s=True,
Exemplo n.º 10
0
        core_reader = reader(core_tabseq, delimiter="\t")
        fdown_reader = reader(fdown_tabseq, delimiter="\t")
        for fup, core, fdown in izip(fup_reader, core_reader, fdown_reader):
            assert fup[0] == core[0] == fdown[0], "Error: sequence ids of cores and flanks don't match."
            # setup fasta headers and sequences
            fa_header = ">" + core[0]
            seq_viewpoint = fup[1].lower() + core[1].upper() + fdown[1].lower()
            # seq_normal = fup[1].upper() + core[1].upper() + fdown[1].upper()

            viewpointfa.write(fa_header + "\n")
            viewpointfa.write(seq_viewpoint + "\n")
    viewpointfa.close()

# prepare input coordinates
# Sort the binding sites; saveas() without a file name stores the result in a
# temporary file.
bsites = BedTool(args.bsites_fn).sort().saveas()
# Pass every binding site through `midpoint` and store that result as well.
centers = bsites.each(midpoint).saveas()

# prepare positive instances
logging.info("preparing positive instances")
if (args.chromosome_limits):
    logging.debug("using chromosome_limits " + args.chromosome_limits)
    cores = centers.slop(s=True,
                         l=int(args.core_length / 2),
                         # -1 to account for the center nucleotide!
                         r=int(args.core_length / 2) +
                         (args.core_length % 2) - 1,
                         g=args.chromosome_limits).each(offset_zero_by_one).saveas(pos_core_bed_fn)
else:
    cores = centers.slop(s=True,
                         l=int(args.core_length / 2),
                         # -1 to account for the center nucleotide!
Exemplo n.º 11
0
def subset(bed, n, keep_small, out):
    """Run ``apply_subset`` over every feature of ``bed`` and save to ``out``.

    ``n`` and ``keep_small`` are forwarded unchanged to ``apply_subset``.
    """
    print("Applying subset")
    transformed = BedTool(bed).each(apply_subset, n, keep_small)
    transformed.saveas(out)
Exemplo n.º 12
0
#!/usr/bin/env python
import os
import sys
import subprocess
sys.path.insert(0,'/mnt/lustre/home/cusanovich/Programs/lib/python2.6/site-packages/pybedtools-0.6.2-py2.6-linux-x86_64.egg/pybedtools')
from pybedtools import BedTool, featurefuncs

# Window (in bp) searched around each TSS, and the label used in file names.
windowsize = 10000
# Floor division keeps the label an integer ('10kb') on Python 2 and 3 alike.
windowname = str(windowsize // 1000) + 'kb'
indir = '/mnt/lustre/home/cusanovich/Kd_Arrays/GenomeAnnotations/StartAnnots/'
outdir = '/mnt/lustre/home/cusanovich/Kd_Arrays/GenomeAnnotations/FinalAnnots/'
annots = ['GSE31388_eQtlTable_cleaned.bed','sorted_PritchardQTLs_merged.bed','gwascatalog_ucsc_merged.bed']
jacked = BedTool('/mnt/lustre/home/cusanovich/centipede/hg19_jack_centipede_sorted_pwms_clean.bed')
tss = BedTool('/mnt/lustre/home/cusanovich/Kd_Arrays/Centipede/Annotation/HT12ensemblTSScombinedsorted.bed')

# For every annotation: intersect it with the `jacked` sites, reduce the hits
# to midpoints, then collect the TSSs within `windowsize` of those midpoints.
for annot in annots:
	print(annot)
	currannot = BedTool(indir + annot)
	currout = annot.split('.')[0]
	# Common prefix of the three output files written for this annotation.
	outprefix = outdir + windowname + '_' + currout
	print('Intersecting...')
	# moveto() already returns a BedTool on the new path, so no re-open needed.
	inter = jacked.intersect(currannot, wa=True, wb=True).moveto(outprefix + '_centipede_intersect.bed')
	print('Calculating midpoints...')
	intermid = inter.each(featurefuncs.midpoint).moveto(outprefix + '_centipede_intersect_midpoint.bed')
	print('Finding TSSs...')
	outter = tss.window(intermid, w=windowsize).moveto(outprefix + '_insite.bed')
Exemplo n.º 13
0
def _center_extend_save(bedtool, size, outfile):
    '''Apply center_feature and extend_feature(size=size) to every feature of
    bedtool, drop invalid features and save the result to outfile.'''
    bedtool.each(center_feature).each(
        extend_feature, size=size).remove_invalid().saveas(outfile)


def main(use_config=True,
         bed1=None,
         bed2=None,
         method=None,
         tempdir=None,
         md=None,
         largewindow=None,
         scanner=None,
         debug=False,
         label1=None,
         label2=None,
         jobid=None,
         md_bedfile1=None,
         md_bedfile2=None):
    '''This is the main script of the combine function that is called within
        TFEA. Default arguments are assigned to variables within config.vars.

    Parameters
    ----------
    use_config : boolean
        Whether to use a config module to assign variables. If True, every
        other argument is overwritten by the matching config.vars entry.
    bed1 : list
        A list of strings specifying full paths to bed files corresponding to
        a single condition (replicates)
    bed2 : list
        A list of strings specifying full paths to bed files corresponding to
        a single condition (replicates)
    method : str
        Method for combining input bed files into a single bed file. One of
        'mumerge', 'mergeall', 'tfitclean', 'intersectall',
        'tfitremovesmall', 'intersect/merge'.
    tempdir : path-like supporting the '/' operator
        Directory where files will be saved
    md : boolean
        Whether md-score bed files are generated
    largewindow : int
        Half-length of window size to use when generating md-score related
        bed files
    scanner : str
        Scanner method to use in SCANNER module. Read from the config but not
        otherwise used by this function.
    debug : boolean
        If True, multiprocess.current_mem_usage(jobid) is called at the end
    label1, label2 : str
        Condition labels forwarded to mumerge
    jobid : passed to multiprocess.current_mem_usage when debug is set
    md_bedfile1, md_bedfile2 : path-like or None
        User-supplied md bed files. When both are given they are centered and
        extended, and no md bed files are generated from bed1/bed2.

    Returns
    -------
    None - Assigns variables within config if use_config set to True

    Raises
    ------
    InputError
        If method is not one of the recognized options
    FileEmptyError
        If any resulting file is empty
    '''
    start_time = time.time()
    if use_config:
        bed1 = config.vars['BED1']
        bed2 = config.vars['BED2']
        method = config.vars['COMBINE']
        tempdir = config.vars['TEMPDIR']
        md = config.vars['MD']
        md_bedfile1 = config.vars['MD_BEDFILE1']
        md_bedfile2 = config.vars['MD_BEDFILE2']
        largewindow = config.vars['LARGEWINDOW']
        scanner = config.vars['SCANNER']
        label1 = config.vars['LABEL1']
        label2 = config.vars['LABEL2']
        debug = config.vars['DEBUG']
        jobid = config.vars['JOBID']

    print("Combining Regions...", end=' ', flush=True, file=sys.stderr)

    if md_bedfile1 and md_bedfile2:
        centered_md_bedfile1 = tempdir / 'md_bedfile1.centered.bed'
        centered_md_bedfile2 = tempdir / 'md_bedfile2.centered.bed'
        #Both md bed files were supplied, so none are generated below
        md = False
        _center_extend_save(BedTool(str(md_bedfile1)), largewindow,
                            centered_md_bedfile1)
        _center_extend_save(BedTool(str(md_bedfile2)), largewindow,
                            centered_md_bedfile2)
        if use_config:
            config.vars['MD_BEDFILE1'] = centered_md_bedfile1
            config.vars['MD_BEDFILE2'] = centered_md_bedfile2

    #Use MuMerge to merge bed files
    if method == 'mumerge':
        mumerge_input = tempdir / 'mumerge_input.txt'
        combined_file = tempdir / 'combined_file.mumerge'

        #MuMerge Command - returns the path of the merged output
        combined_file = mumerge(mumerge_input,
                                combined_file,
                                bed1=bed1,
                                bed2=bed2,
                                label1=label1,
                                label2=label2)
        clean_combined_file = tempdir / 'combined_file.mumerge.clean.bed'
        BedTool(str(combined_file)).remove_invalid().saveas(
            clean_combined_file)
        combined_file = clean_combined_file

        #For MD, run MuMerge separately on each condition
        if md:
            mumerged_md1 = mumerge(tempdir / "md_mumerge_input1.txt",
                                   tempdir / "md_bedfile1.mumerge",
                                   bed1=bed1,
                                   label1=label1,
                                   label2=label2)
            md_bedfile1 = tempdir / "md_bedfile1.mumerge.final.bed"
            _center_extend_save(BedTool(str(mumerged_md1)), largewindow,
                                md_bedfile1)
            mumerged_md2 = mumerge(tempdir / "md_mumerge_input2.txt",
                                   tempdir / "md_bedfile2.mumerge",
                                   bed2=bed2,
                                   label1=label1,
                                   label2=label2)
            md_bedfile2 = tempdir / "md_bedfile2.mumerge.final.bed"
            _center_extend_save(BedTool(str(mumerged_md2)), largewindow,
                                md_bedfile2)

    #Merge all bed regions, for MD merge condition replicates
    elif method == 'mergeall':
        combined_file = tempdir / "combined_file.mergeall.bed"
        merged_bed = merge_bed(beds=bed1 + bed2)
        merged_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.merge.bed"
            md_bedfile2 = tempdir / "md_bedfile2.merge.bed"
            md_merged_bed1 = merge_bed(beds=bed1).each(
                featurefuncs.extend_fields, 4)
            md_merged_bed2 = merge_bed(beds=bed2).each(
                featurefuncs.extend_fields, 4)
            _center_extend_save(md_merged_bed1, largewindow, md_bedfile1)
            _center_extend_save(md_merged_bed2, largewindow, md_bedfile2)

    elif method == 'tfitclean':
        combined_file = tempdir / "combined_file.tfitclean.bed"
        size_cut = 200
        cleaned_bed = clean_bed(beds=bed1 + bed2, size_cut=size_cut)
        cleaned_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.clean.bed"
            md_bedfile2 = tempdir / "md_bedfile2.clean.bed"
            md_cleaned_bed1 = clean_bed(beds=bed1)
            md_cleaned_bed2 = clean_bed(beds=bed2)
            #Save to the md bed files (previously these overwrote
            #combined_file and left the md bed files missing).
            #NOTE(review): unlike other methods, regions are not centered
            #or extended here - confirm that is intended.
            md_cleaned_bed1.saveas(md_bedfile1)
            md_cleaned_bed2.saveas(md_bedfile2)

    #Intersect all bed regions, for MD intersect condition replicates
    elif method == 'intersectall':
        combined_file = tempdir / 'combined_file.intersectall.bed'
        intersected_bed = intersect_bed(beds=bed1 + bed2)
        intersected_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.intersect.bed"
            md_bedfile2 = tempdir / "md_bedfile2.intersect.bed"
            md_intersected_bed1 = intersect_bed(beds=bed1)
            md_intersected_bed2 = intersect_bed(beds=bed2)
            _center_extend_save(md_intersected_bed1, largewindow, md_bedfile1)
            _center_extend_save(md_intersected_bed2, largewindow, md_bedfile2)

    #Merge all regions, filter small regions. For MD perform this for each condition
    elif method == 'tfitremovesmall':
        size_cut = 200
        combined_file = tempdir / "combined_file.mergeallnosmall.bed"
        merged_bed = merge_bed(beds=bed1 + bed2)
        merged_bed.filter(lambda b: b.stop - b.start > size_cut).saveas(
            combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.merge.bed"
            md_bedfile2 = tempdir / "md_bedfile2.merge.bed"
            md_merged_bed1 = merge_bed(beds=bed1)
            md_merged_bed2 = merge_bed(beds=bed2)
            #Save to the md bed files (previously these overwrote
            #combined_file and left the md bed files missing).
            #NOTE(review): unlike other methods, regions are not centered
            #or extended here - confirm that is intended.
            md_merged_bed1.filter(
                lambda b: b.stop - b.start > size_cut).saveas(md_bedfile1)
            md_merged_bed2.filter(
                lambda b: b.stop - b.start > size_cut).saveas(md_bedfile2)

    #Intersect replicates, merge conditions. For MD intersect condition replicates
    elif method == 'intersect/merge':
        combined_file = tempdir / 'combined_file.intermerge.bed'
        intersected_bed1 = intersect_bed(beds=bed1)
        intersected_bed2 = intersect_bed(beds=bed2)
        merged_bed = intersected_bed1.cat(intersected_bed2).merge().sort()
        merged_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.intersect.bed"
            md_bedfile2 = tempdir / "md_bedfile2.intersect.bed"
            md_intersected_bed1 = intersect_bed(beds=bed1).each(
                featurefuncs.extend_fields, 4).each(featurefuncs.rename, '1')
            md_intersected_bed2 = intersect_bed(beds=bed2).each(
                featurefuncs.extend_fields, 4).each(featurefuncs.rename, '1')
            _center_extend_save(md_intersected_bed1, largewindow, md_bedfile1)
            _center_extend_save(md_intersected_bed2, largewindow, md_bedfile2)

    else:
        raise exceptions.InputError("Error: COMBINE option not recognized.")

    #Check to make sure no files are empty
    if os.stat(combined_file).st_size == 0:
        raise exceptions.FileEmptyError(
            "Error in COMBINE module. Resulting bed file is empty.")

    if md:
        if os.stat(md_bedfile1).st_size == 0 or os.stat(
                md_bedfile2).st_size == 0:
            raise exceptions.FileEmptyError(
                "Error in COMBINE module. Resulting md bed file is empty.")
        if use_config:
            #Assign MD_BEDFILE variables in config
            config.vars['MD_BEDFILE1'] = md_bedfile1
            config.vars['MD_BEDFILE2'] = md_bedfile2

    #Assign COMBINED_FILE variable in config
    if use_config:
        config.vars['COMBINED_FILE'] = combined_file

    #Record time, print
    total_time = time.time() - start_time
    if use_config:
        config.vars['COMBINEtime'] = total_time
    #splitlines() does not count the empty string after a trailing newline
    print("done in: " + str(datetime.timedelta(seconds=int(total_time))),
          ". Processing",
          len(combined_file.read_text().splitlines()),
          "regions",
          file=sys.stderr)

    if debug:
        multiprocess.current_mem_usage(jobid)
Exemplo n.º 14
0
def newber(binner,factor):
	"""Collapse all binding records of one factor and label them with its name.

	The records in ``binner`` are sorted and merged (``nms=True``), each merged
	record is passed through ``featurefuncs.midpoint`` and then through
	``featurefuncs.rename`` with ``factor``.  The resulting BedTool is returned.
	"""
	merged = BedTool(binner).sort().merge(nms=True)
	centered = merged.each(featurefuncs.midpoint)
	return centered.each(featurefuncs.rename, factor)