Example No. 1
def annotate_peaks(peaks, ref_path):
    """
    peak to gene annotation strategy:
        1. if a peak overlaps with promoter region (-1kb, + 100) of any TSS, call it a promoter peak
        2. if a peak is within 200kb of the closest TSS, AND if it is not a promoter peak, call it a distal peak
        3. if a peak overlaps a transcript, AND it is not a promoter nor a distal peak of the gene, call it a distal peak
            This step is optional
        4. otherwise, call it an intergenic peak
    """

    ref_mgr = ReferenceManager(ref_path)
    tss = BedTool(ref_mgr.tss_track)

    # if tss.bed contains the 7th column (gene type), then apply filter. Otherwise use all tss sites
    if tss.field_count() == 7:
        tss_filtered = tss.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
    else:
        df_tss = tss.to_dataframe()
        df_tss['gene_type'] = '.'
        tss_filtered = BedTool.from_dataframe(df_tss).saveas()

    # including transcripts.bed is optional
    if ref_mgr.transcripts_track is None:
        transcripts_filtered = BedTool([])
    else:
        transcripts = BedTool(ref_mgr.transcripts_track)
        if transcripts.field_count() == 7:
            transcripts_filtered = transcripts.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
        else:
            df_tx = transcripts.to_dataframe()
            df_tx['gene_type'] = '.'
            transcripts_filtered = BedTool.from_dataframe(df_tx).saveas()

    # run bedtools closest for peaks against filtered tss, group by peaks and summarize annotations from select columns
    peaks_nearby_tss = peaks.closest(tss_filtered, D='b', g=ref_mgr.fasta_index).groupby(g=[1, 2, 3], c=[7, 11], o=['collapse']).saveas()

    results = []
    peaks_nearby_tss_butno_tx = peaks_nearby_tss.intersect(transcripts_filtered, v=True).saveas()

    # avoid an error when no peaks overlap any transcripts
    if len(peaks_nearby_tss_butno_tx) < len(peaks_nearby_tss):
        peaks_nearby_tss_and_tx = peaks_nearby_tss \
            .intersect(transcripts_filtered, wa=True, wb=True) \
            .groupby(g=[1, 2, 3, 4, 5], c=[9], o=['distinct'])

        for peak in peaks_nearby_tss_and_tx:
            results.append(get_peak_nearby_genes(peak))

    for peak in peaks_nearby_tss_butno_tx:
        results.append(get_peak_nearby_genes(peak))

    return results
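A minimal usage sketch, assuming a cellranger-ATAC-style reference directory and a sorted peaks BedTool (both paths are hypothetical):

peaks = BedTool('outs/peaks.bed').sort()
annotations = annotate_peaks(peaks, '/refdata/GRCh38-atac')
print(annotations[:3])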
Example No. 2
def filter_bed(bedfile, snp_list, outfile=sys.stdout):
    """Filter a bedfile to only include snps in snp_list, print to outfile.

    :bedfile:  A bed file of all the SNPs, can be gzipped.
    :snp_list: List/tuple/set/frozenset of snp names.
    :outfile:  Something .bed or .bed.gz, default STDOUT.
    :returns:  0 on success 1 on failure

    """
    try:
        from pybedtools import BedTool
    except ImportError:
        logme.log('pybedtools is not installed.\n' +
                  'Please install and try again. You can get it from here:\n' +
                  'https://github.com/daler/pybedtools',
                  level='error')
        return 1

    if not isinstance(snp_list, (tuple, list, set, frozenset)):
        raise Exception('snp_list must be tuple/list/set/frozenset ' +
                        'it is: {}'.format(type(snp_list)))

    bed      = BedTool(bedfile)
    filtered = bed.filter(lambda a: a.name in snp_list)

    with open_zipped(outfile, 'w') as fout:
        fout.write(str(filtered))
    return 0
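A usage sketch under the same assumptions (file names invented; gzipped input and output are handled by open_zipped):

keep = frozenset({'rs111', 'rs222', 'rs333'})
filter_bed('all_snps.bed.gz', keep, outfile='subset.bed.gz')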
Example No. 3
def clean_bed(beds=None, size_cut=None):
    '''This function separates a list of beds into small and large regions based
        on a size_cut, intersects small regions, merges large regions, and then
        merges the small and large regions.

    Parameters
    ----------
    beds : list or array
        full paths to bed files (python Path objects from pathlib)

    size_cut : int
        cutoff value to separate large and small regions

    Returns
    -------
    clean_bed : BedTool object 
        resulting clean bed object 
    '''
    small_regions = list()
    large_regions = list()
    for bed in beds:
        bed = BedTool(bed)
        small_regions.append(bed.filter(lambda b: b.stop - b.start < size_cut))
        large_regions.append(bed.filter(lambda b: b.stop - b.start >= size_cut))
    small_bed = intersect_bed(beds=small_regions)
    large_bed = merge_bed(beds=large_regions)
    clean_bed = large_bed.cat(small_bed).merge().sort()

    return clean_bed
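A hedged usage sketch; intersect_bed and merge_bed are helpers from the same module, and the paths are invented:

from pathlib import Path

beds = [Path('rep1_peaks.bed'), Path('rep2_peaks.bed')]
cleaned = clean_bed(beds=beds, size_cut=1000)
cleaned.saveas('clean_regions.bed')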
Example No. 4
def _get(relative_path, genome=None):
    """
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain chromosome name after comma, like hg19-chr20,
                   in case of BED, the returning BedTool will be with added filter.
    :return: BedTools object if it's a BED file, or filepath
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
        else:
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)

        if chrom:
            debug('Filtering BEDTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)
        return bed
    else:
        return path
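A usage sketch for _get (the relative path is hypothetical; the -chr suffix triggers the per-chromosome filter described in the docstring):

cpg_chr20 = _get('data/{genome}/cpg.bed', genome='hg19-chr20')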
Example No. 5
def main():
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('peaks', help='peaks bed')
    p.add_argument('exons', help='refseq exons from UCSC')
    p.add_argument('gtf', help='refseq gtf with feature of interest')
    p.add_argument('feature', help='feature of interest in the gtf')
    p.add_argument('-v', '--verbose', action="store_true", help='maximum verbosity')
    args = p.parse_args()
    
    if args.verbose: sys.stderr.write(">> building exon library...\n")
    exon_lib = make_exon_lib(args.exons)
    
    peaks = BedTool(args.peaks)
    exons = BedTool(args.exons)
    full_ref = BedTool(args.gtf)
    
    if args.verbose: sys.stderr.write(">> filtering for feature...\n")
    filtered_ref = full_ref.filter(lambda gtf: gtf[2] == args.feature)
    
    if args.verbose: sys.stderr.write(">> selecting exonic peaks...\n")
    exonic_peaks = peaks.intersect(exons, wo=True)
    
    if args.verbose: sys.stderr.write(">> calculating distance fractions...\n")
    # D for distance (returns negative if upstream)
    for peak in exonic_peaks.closest(filtered_ref, D="a"):
        try:
            p = ComplexLine(peak)
            corrected_distance = 0.0
            total_exon_length = 0.0
            # parse gtf attrs; str.lstrip() strips a character set, not a prefix,
            # so drop the 'gene_id' key and quotes explicitly
            gene_id = p.gtfattrs.split(';')[0].replace('gene_id', '').strip().strip('"')

            # looking downstream wrt peak
            if p.gtfdistance > 0:
                # exon with peak
                corrected_distance = p.exonstop - p.peakstop
                for exon in exon_lib[p.exoninfo.name]:
                    # add downstream exon lengths
                    if exon > p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
                        
            # looking upstream wrt peak
            else:
                # exon with peak
                corrected_distance = p.peakstart - p.exonstart
                for exon in exon_lib[p.exoninfo.name]:
                    # add upstream exon lengths
                    if exon < p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
            
            for exon in exon_lib[p.exoninfo.name]:
                total_exon_length += exon_lib[p.exoninfo.name][exon]
            
            # fraction
            print (corrected_distance / total_exon_length)
        
        except ValueError:
            continue
Example No. 6
def getCDSs(bedfilename, reffilename, strand):
    """
    return iterator of coding sequences
    """
    bed = BedTool(bedfilename)
    bed = bed.filter(lambda x: x.strand == strand)
    fasta = reffilename
    bed = bed.sequence(fi=fasta, s=True)
    return SeqIO.parse(bed.seqfn, "fasta")
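A usage sketch (file names invented; the function returns a Bio.SeqIO iterator over strand-matched intervals):

for record in getCDSs('cds.bed', 'genome.fa', strand='+'):
    print(record.id, len(record.seq))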
Example No. 8
def window_genome(window_width_, filtered_save_name):
    if genome_size_file is not None:
        genome_windowed = BedTool().window_maker(g=genome_size_file,
                                                 w=window_width_)
        genome_windowed.saveas(filtered_save_name)
    else:
        genome_windowed = BedTool().window_maker(genome=genome,
                                                 w=window_width_)
        genome_windowed = genome_windowed.filter(
            lambda p: p.chrom in valids)
        genome_windowed.saveas(filtered_save_name)
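window_maker is the core call above; a standalone sketch of tiling an assembly into fixed-width bins (the genome name is an assumption):

bins = BedTool().window_maker(genome='hg19', w=10000)
print(bins.count())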
Example No. 9
def filterReadsByLength(inbam, minlength, maxlength):
    '''
    Takes a bam file and selects intervals that are within the defined lengths.
    Input: bam file and min/max lengths
    Output: BedTool
    '''
    # convert bam to bed
    intervals = BedTool(inbam).bam_to_bed()
    filt = intervals.filter(lambda x: len(x) > minlength and len(x) < maxlength).saveas()
    return filt
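A usage sketch (BAM path invented); note that the length test is exclusive on both ends:

fragments = filterReadsByLength('sample.bam', minlength=100, maxlength=700)
print(len(fragments))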
Example No. 11
def clean_bed(bed_fpath, work_dir):
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        bed = BedTool(bed_fpath)
        bed = bed.filter(lambda x: x.chrom and
                         not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            bed.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
Example No. 13
    def _bed(self):
        def by_name(rec):
            # Drop first part before underscore.
            if "_" in self.name:
                name = "_".join(self.name.split("_")[1:])
            else:
                name = self.name
            return (name + "*" in rec.name) or (name == rec.name)

        bt = BedTool(self.path)
        if not self.custom and '_all' not in self.name:
            bt = bt.filter(by_name).saveas()

        if len(bt) > 0 and len(bt[0].fields) > 6:
            bt = bt.bed6().saveas()

        return bt
Example No. 14
class GenomicSubset(object):
    def __init__(self, name, path=paths.genome_subsets, assembly='hg19'):
        self.assembly = assembly
        self.name = name
        self.bedtool = BedTool(path + name + '.bed').sort()

        # Intersect the pathway with the appropriate genome build
        # TODO: this step should be unnecessary if the pathways are correct
        if name != self.assembly:
            self.bedtool = GenomicSubset.reference_genome(
                    self.assembly).bedtool.intersect(self.bedtool).sort().saveas()

    def expand_by(self, expansion_in_each_direction_Mb):
        window_size_str = str(expansion_in_each_direction_Mb) + 'Mb'
        print('total size before window addition:', self.bedtool.total_coverage(), 'bp')

        # compute the flanks
        # TODO: use 1cM instead of 1Mb
        print('computing flanks')
        flanks = self.bedtool.flank(
            genome=self.assembly,
            b=expansion_in_each_direction_Mb*1000000).sort().merge().saveas()

        # compute the union of the flanks and the pathway
        print('computing union')
        union = self.bedtool.cat(flanks, postmerge=False).sort()
        merged = union.merge().saveas()
        print('total size after window addition:', merged.total_coverage(), 'bp')
        self.bedtool = merged

    def restricted_to_chrom_bedtool(self, chrnum):
        return self.bedtool.filter(
                lambda x : x[0] == 'chr' + str(int(chrnum))).saveas()

    @classmethod
    def reference_genome(cls, assembly='hg19'):
        return GenomicSubset(assembly, path=paths.reference, assembly=assembly)

    @classmethod
    def reference_chrom_bedtool(cls, chrnum, assembly='hg19'):
        return cls.reference_genome(assembly=assembly).restricted_to_chrom_bedtool(chrnum)

    @classmethod
    def whole_genome(cls, assembly='hg19'):
        return cls(assembly, path=paths.reference)
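A hedged usage sketch; the subset name and the contents of the paths module are assumptions:

subset = GenomicSubset('my_pathway')   # loads my_pathway.bed from paths.genome_subsets
subset.expand_by(1)                    # add 1 Mb flanks on each side and re-merge
chr22 = subset.restricted_to_chrom_bedtool(22)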
Example No. 15
def generateBedfileFromBam(outFile, bamFile, delta):
    # for the provided BAM file, generate bed files indicating adequately covered regions
    # print( "bam files given=", args.bamFiles )
    ##TODO check whether bams are indexed and sorted, then perform indexing or sorting if needed
    print("generating adequately covered bed file for ", bamFile)
    bedFile = BedTool(bamFile).genome_coverage(
        bg=True)  # bg=True gives read depth in bed graph format

    # keep only the regions with read depth >= adequate sample coverage "delta";
    # a BedTool interval exposes the fourth (bedgraph score) column as attribute "name"
    filteredBedFile = bedFile.filter(lambda x: int(x.name) >= delta)

    # merge adequately covered regions together
    mergedBedFile = filteredBedFile.merge()
    mergedBedFile.saveas(outFile)

    print("bed file generation is done, written to ", outFile)
Example No. 16
def _find_cnv_cpx_redundancies(potentially_clusterable: pybedtools.BedTool,
                               is_carrier: Mapping[Text, numpy.ndarray],
                               min_cpx_reciprocal_overlap: float,
                               cnv_cpx_reciprocal_overlap: float,
                               cnv_cpx_sample_overlap: float) -> Set[Text]:
    """
    Subset potentially clusterable intervals to those that meet required minimum overlap with a CPX event.
    Then find clusters, and remove redundant CNVs from those clusters.

    Parameters
    ----------
    potentially_clusterable: BedTool
        bed object with intervals that could potentially be used for clustering
    is_carrier: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called non-ref for this Variant, and False
        otherwise (including no-call).
    min_cpx_reciprocal_overlap: float
        Minimum reciprocal overlap with a CPX interval for a CNV interval to be clusterable.
    cnv_cpx_reciprocal_overlap: float
        Minimum reciprocal overlap between two intervals to be part of a cluster.
    cnv_cpx_sample_overlap: float
        Minimum Jaccard index for variant interval to have with sample cluster in order for it to be redundant.

    Returns
    -------
    vids_to_remove: Set[Text]
        Set of variant IDs that are redundant and should be removed from the output VCF.
    """
    # find all potentially clusterable intervals that meet required minimum overlap with CPX
    precluster_subset = potentially_clusterable.intersect(
        potentially_clusterable.filter(_is_cpx),
        u=True,
        f=min_cpx_reciprocal_overlap,
        r=True,
        sorted=True,
        nonamecheck=True)

    # find clusters of intervals with high reciprocal overlap, then check each cluster for redundant variant IDs
    return {
        variant_id
        for cluster in _get_clusters(
            precluster_subset,
            min_reciprocal_overlap=cnv_cpx_reciprocal_overlap)
        for variant_id in _get_redundant_cluster_cnv_cpx_vids(
            cluster, is_carrier, cnv_cpx_sample_overlap=cnv_cpx_sample_overlap)
    }
Example No. 17
    def _get_genome_bedtool(self, genome_name, region, genes=None):
        """get the bedtool object for a genome depending on the name and the region"""
        genome = Genome.path_by_name(genome_name)
        mapping = { "any":        "all",
                    "CDS":        "cds",
                    "3prime":     "3_utr",
                    "5prime":     "5_utr",
                    "intron":     "intron",
                    "intergenic": "intergenic" }

        if region not in mapping:
            raise ValueError("Invalid region: %r" % region)
        else:
            bed = BedTool(path.join(genome, "%s.gff" % mapping[region]))

        # Optionally, filter by gene.
        if genes is None or 'all' in genes:
            return bed
        else:
            return bed.filter(lambda x: x.name in genes).saveas()
Example No. 18
    def add_bed(self, bedfile):
        """Add a list of pybedtools Interval objects to self as self.bed.

        Requires pybedtools, adds only records for snps in this individual.

        Note: This is a slow operation.

        :returns: True on success, False on failure.
        """
        try:
            from pybedtools import BedTool
        except ImportError:
            logme.log('add_bed() failed.\n' +
                      'pybedtools is not installed.\n' +
                      'Please install and try again. You can get it from here:\n' +
                      'https://github.com/daler/pybedtools',
                      level='error')
            return False
        bed = BedTool(bedfile)
        self.bed = [i for i in bed.filter(lambda a: a.name in self.snps)]
        return True
Example No. 19
def make_regions(region_size: int, ambiguity_thr: float = 0.5):
    path = os.path.join(
        "/tmp/", f"cached.make_regions({region_size}, {ambiguity_thr}).bed")

    if not os.path.exists(path):
        regions = BedTool().window_maker(w=region_size,
                                         genome="hg19")  # type: BedTool

        # pybedtools is not very consistent when dealing with Interval objects.
        # For instance, str(interval) will in some cases return only 3 fields
        # (chrom, start, end). Other times, when fields are specified explicitly,
        # 4 or more fields are printed. It is possible to invoke intersection with
        # '3-field' Intervals and receive mixed Intervals: intersected ones are
        # wrongly read with strand and name carrying the interval-hit information,
        # while non-intersected ones are turned into intervals with default
        # additional fields. The workaround is to recreate each interval so that
        # all include the same number of fields; strand must be included.
        regions = BedTool([Interval(x.chrom, x.start, x.end)
                           for x in regions]).saveas()

        regions = regions.filter(lambda r: r.length == region_size and r.chrom
                                 in hg19.annotation.CHROMOSOME_SIZE).saveas()
        regions = dataset.filter.by_ambiguity(
            regions, BedTool(hg19.annotation.AMBIGUOUS), ambiguity_thr)
        regions.saveas(path, compressed=False)
    return BedTool(path)
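A usage sketch; the helper caches its result under /tmp and depends on the project's hg19 and dataset modules:

regions = make_regions(region_size=1000)   # default ambiguity_thr=0.5
print(regions.count())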
Example No. 20
    print("Running in auto mode. Finding region types present in the GTF.")
    df = gtf_ref.to_dataframe().dropna()
    feature_list = list(df.feature.value_counts().index)
else:
    feature_list = list(args.region_list)

print()
print("Extracting the following regions: " + str(feature_list))

p = Pool(args.ncores)
p.map(extract_features, feature_list)

if args.do_introns:
    print()
    print("Extracting intron positions and generating GTF")
    genes = gtf_ref.filter(lambda x: x[2] == 'gene').saveas()
    exons = gtf_ref.filter(lambda x: x[2] == 'exon').saveas()
    introns = genes.subtract(exons, s=True, nonamecheck=True).saveas()
    introns.saveas('gtf_regions/' + args.outfile + '_intron.gtf')

    if args.split_introns:
        print()
        print("Splitting introns into proximal and distal regions and generating GTF")
        introns_distal = introns.to_dataframe().copy()
        introns_distal.start = introns_distal.start + 500
        introns_distal.end = introns_distal.end - 500

        introns_distal_bed = BedTool.from_dataframe(
            introns_distal).remove_invalid().saveas('gtf_regions/' +
Example No. 21
def py_peak_calling(bedgraph, threshold, min_length, inter_peak_distance, max_length=10000,
                    generate_ID=True, output_name=None):
    """
    Written by Pete Skene ([email protected]). Free for academic use.
    
    - need to install a more up-to-date version of bedtools before invoking Jupyter
      type: module load bedtools/2.21.0
    - (1) filters bedgraph based on threshold; (2) merges adjacent basepairs that are over threshold;
      (3) retains peaks that satisfy min/max length criteria; (4) merges any peaks that are closer
      than the inter-peak distance cutoff
    - max length is typically defaulted to be very large
    - outputs a bed file (default col4 is the sum of the bedgraph scores; sorted by chrom;start;stop)
    - generate ID: will auto-generate an integer list as an ID number (1... number of peaks). This will
    be reported as column 4 and the bedgraph scores will be shifted to column 5 as per standard bed format
    - note the peak score for a merged peak is *just* the sum of the two individual peaks, not the
    total score in the merged region (i.e. there could be some sub-threshold scores in the intervening
    space that won't be included)
    - assumes bedgraph in standard format <chr> <start> <stop> <score>
    - output_name = option for user-defined name (type with '...'), otherwise will generate name bedgraph_peaks.bed
    """
    
    import pybedtools
    import glob
    from pybedtools import BedTool
    import pandas as pd
    
    #generate name for output
    bedgraph_name = glob.glob(bedgraph)
    
    if output_name is not None:
        filename = output_name
    else:
        filename = bedgraph_name[0].replace('.bg', '_peaks.bed')

    print('input bedgraph file: ' + bedgraph_name[0])
    print('output filename: ' + filename)
    
    #import data as BedTool
    data = BedTool(bedgraph) 
    
    #retains intervals above threshold
    above_thresh = data.filter(lambda b: float(b.name) >= threshold) 
    
    #merge adjacent above-threshold regions and sum bedgraph scores (assumes bedgraph score in col 4);
    #increasing the d value allows merging across small gaps
    merge_regions = above_thresh.merge(d=0, c=4, o='sum')
    
    #filter based on length criteria
    peaks = BedTool(merge_regions.filter(lambda x: len(x) >= min_length and len(x) <= max_length))
    
    #merge bona fide peaks if they are closer than the inter-peak distance, sum their scores, and sort
    merge_peaks = peaks.merge(d=inter_peak_distance, c=4, o='sum').sort()

    print('number of peaks found: ' + str(merge_peaks.count()))
    
    if not generate_ID:
        print('saving sorted peak bed file with no ID')
        merge_peaks.saveas(filename)

    if generate_ID:
        print('saving sorted peak bed file with ID names')

        #change to pandas dataframe
        DF_peaks = merge_peaks.to_dataframe()

        #insert new column with id: 1... # of peaks
        DF_peaks.insert(3, 'id', ['id' + str(item) for item in range(1, len(DF_peaks) + 1)])

        #save output
        DF_peaks.to_csv(filename, sep='\t', header=False, index=False)
        
    return 'Finished'
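A hedged usage sketch (the bedgraph path is invented, and sensible thresholds depend entirely on the data):

py_peak_calling('sample.bg', threshold=10, min_length=50,
                inter_peak_distance=100, output_name='sample_peaks.bed')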
Example No. 22
    def create(
        cls: Type[T],
        outdir: str,
        data_files: List[str],
        enhancer_file: str,
        annotation_file: str,
        genome: str,
        window: Optional[int] = 2000,
        anno_file: Optional[str] = None,
        anno_from: Optional[str] = None,
        anno_to: Optional[str] = None,
        gene_mapping: Optional[str] = None,
        threshold: Optional[float] = 1.0,
        version: Optional[str] = "0.1.0",
    ) -> T:
        outdir = Path(outdir)
        basename = outdir.name
        meanstd_file = outdir / f"{basename}.{genome}.meanstd.tsv.gz"
        target_file = outdir / f"{basename}.{genome}.target.npz"
        gene_file = outdir / "annotation.tss.merged1kb.bed"
        link_file = outdir / "enhancers2genes.feather"

        g = Genome(genome)
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        info = {
            "genes": "genes.txt",
            "enhancers": "enhancers.feather",
            "link_file": os.path.basename(link_file),
            "genome": genome,
            "window": window,
            "meanstd_file": os.path.basename(meanstd_file),
            "target_file": os.path.basename(target_file),
            "gene_file": os.path.basename(gene_file),
            "version": version,
            "schema_version": __schema_version__,
        }

        if anno_file is not None:
            if not os.path.exists(anno_file):
                raise ValueError(f"{anno_file} does not exist")
            if anno_from is None or anno_to is None:
                raise ValueError("Need anno_from and anno_to columns!")
            copyfile(anno_file, outdir / os.path.basename(anno_file))
            info.update({
                "anno_file": os.path.basename(anno_file),
                "anno_from": anno_from,
                "anno_to": anno_to,
            })

        if gene_mapping is not None:
            if not os.path.exists(gene_mapping):
                raise ValueError(f"{gene_mapping} does not exist")
            copyfile(gene_mapping, outdir / os.path.basename(gene_mapping))
            info["gene_mapping"] = os.path.basename(gene_mapping)

        logger.info("processing gene annotation")
        # Convert gene annotation
        b = BedTool(annotation_file)
        chroms = set([f.chrom for f in BedTool(enhancer_file)])
        b = b.filter(lambda x: x.chrom in chroms)

        b = (b.flank(g=g.sizes_file, l=1,
                     r=0).sort().merge(d=1000, c=4,
                                       o="distinct"))  # noqa: E741
        b.saveas(str(gene_file))

        logger.info("processing data files")
        # create coverage_table
        df = coverage_table(
            enhancer_file,
            data_files,
            window=window,
            log_transform=True,
            normalization="quantile",
            ncpus=12,
        )

        df.index.rename("loc", inplace=True)
        df.reset_index().to_feather(f"{outdir}/enhancers.feather")
        np.savez(target_file, target=df.iloc[:, 0].sort_values())
        meanstd = pd.DataFrame(index=df.index)
        meanstd["mean"] = df.mean(1)
        meanstd["std"] = df.std(1)
        meanstd = meanstd.reset_index().rename(columns={"loc": "index"})

        meanstd.to_csv(meanstd_file, compression="gzip", index=False, sep="\t")
        df.index.rename("loc", inplace=True)
        df = df.sub(df.mean(1), axis=0)
        df = df.div(df.std(1), axis=0)
        df.reset_index().to_feather(f"{outdir}/enhancers.feather")

        link = create_link_file(meanstd_file, gene_file, genome=genome)
        link.to_feather(link_file)

        genes = _create_gene_table(
            df,
            meanstd_file,
            gene_file,
            gene_mapping,
            genome=genome,
            link_file=link_file,
            threshold=threshold,
        )
        genes.to_csv(f"{outdir}/genes.txt", sep="\t")

        with open(f"{outdir}/info.yaml", "w") as f:
            yaml.dump(info, f)

        return ScepiaDataset(outdir)
Example No. 23
def _update_cnv_cnv_redundances(vids_to_remove: Set[Text],
                                potentially_clusterable: pybedtools.BedTool,
                                is_carrier: Mapping[Text, numpy.ndarray],
                                is_ref: Mapping[Text, numpy.ndarray],
                                cnv_cnv_reciprocal_overlap: float,
                                cnv_cnv_sample_overlap: float):
    """
    Update vids_to_remove by finding CNVs that are redundant with other CNVs (as opposed to CPX)
    -Find CNVs with very high reciprocal overlap, and very high carrier sample Jaccard index
    -For each CNV that is connected to any other CNVs
        Add that CNV and all its connections to vids_to_remove
        Find the "best" CNV: the maximum choosing 1st by number of carriers, 2nd by number of called refs
        Add the best CNV to set of vids that will be put back in (no matter what, even if previously or subsequently
        "removed")
    -Update vids_to_remove by removing the "best" variant IDs

    Parameters
    ----------
    vids_to_remove: Set[Text]
        set of variant IDs that are redundant and should be removed. NOTE: this function updates this set in place.
    potentially_clusterable: BedTool
        bed object with intervals that could potentially be used for clustering
    is_carrier: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called non-ref for this Variant, and False
        otherwise (including no-call).
    is_ref: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called ref for this Variant, and False otherwise
        (including no-call).
    cnv_cnv_reciprocal_overlap: float
        minimum reciprocal overlap for two CNVs to be connected
    cnv_cnv_sample_overlap: float
        minimum carrier samples Jaccard index for two CNVs to be connected
    """
    # for each non-CPX interval, find all non-CPX intervals it has sufficient reciprocal overlap and sample overlap with
    variant_pairwise_connections = {}

    non_cpx_potentially_clusterable = potentially_clusterable.filter(
        _is_not_cpx).saveas()
    for name_1, name_2 in _iter_pairwise_connections(
            non_cpx_potentially_clusterable,
            min_reciprocal_overlap=cnv_cnv_reciprocal_overlap,
            min_sample_overlap=cnv_cnv_sample_overlap,
            is_carrier=is_carrier):
        variant_pairwise_connections[
            name_1] = variant_pairwise_connections.get(name_1,
                                                       (name_1, )) + (name_2, )

    vids_to_remove.update(variant_pairwise_connections.keys()
                          )  # set all the clustered variants to be removed

    # for each of these variant and its direct connections
    #    - choose one "best" variant to represent it, with priority given to most carriers, followed by most ref calls
    #    - keep the "best" variant (even if it's previously or subsequently "removed") and remove all others
    num_carrier = {
        variant_id: variant_is_carrier.sum()
        for variant_id, variant_is_carrier in is_carrier.items()
    }
    num_ref = {
        variant_id: variant_is_ref.sum()
        for variant_id, variant_is_ref in is_ref.items()
    }

    def _best_variant_id(variant_id: Text) -> (int, int, str):
        return num_carrier[variant_id], num_ref[variant_id], variant_id

    # then remove the best ones
    vids_to_remove.difference_update(
        max(variant_id_cluster, key=_best_variant_id)
        for variant_id_cluster in variant_pairwise_connections.values())
Example No. 24
maxdist=6

seqnames = list()

for a in amplicons:
    seqnames.append(a.chrom)

def chrom_filter(feature, chrom):
    return feature.chrom == chrom

for chr in set(seqnames):
    left_lengths = dict()
    right_lengths = dict()
    a_starts = dict()
    a_ends = dict()
    amplicons_chrom = amplicons.filter(chrom_filter, chrom=chr)
    for a in amplicons_chrom:
        #map() is not subscriptable in Python 3, so build a list of ints first
        primer_lengths = [int(v) for v in a.fields[10].split(",")]
        left_lengths[a.name] = primer_lengths[0]
        right_lengths[a.name] = primer_lengths[1]
        a_starts[a.name] = a.start
        a_ends[a.name] = a.stop

    chrom = chr[3:]
    print(chrom)
    for read in samfile.fetch(str(chrom)):
        if read.is_reverse:
            if not read.is_unmapped:
                dists = dict()
                abs_dists = dict()
                for k, v in a_ends.items():
                    dists[k] = read.reference_end - v
Example No. 25
def py_peak_calling(bedgraph, threshold, min_length, inter_peak_distance,
                    merge_close_peaks, keep_highest_close_peak, max_length,
                    generate_ID, output_name, delete_overlap_bed):
    import pybedtools
    import glob
    from pybedtools import BedTool
    import pandas as pd
    import csv

    if merge_close_peaks == keep_highest_close_peak:
        print('Exiting... merge_close_peaks and keep_highest_close_peak set the same')
        sys.exit()

    #generate name for output
    bedgraph_name = glob.glob(bedgraph)
    filtered_name = bedgraph_name[0].replace('.bedgraph', '_filtered.bedgraph')

    if output_name != 'None':
        filename = output_name
    else:
        filename = bedgraph_name[0].replace('.bedgraph', '_peaks.bed')

    print('input bedgraph file: ' + bedgraph_name[0])
    print('output filename: ' + filename)

    #import data as BedTool
    data = BedTool(bedgraph)
    print('total sites read: ' + str(len(data)))

    #retains intervals above threshold
    above_thresh = data.filter(
        lambda b: float(b.name) >= float(threshold)).saveas(filtered_name)
    print('sites above threshold: ' + str(len(above_thresh)))
    if len(above_thresh) == 0:
        print('no regions are above the threshold\n')
        sys.exit()

    #merge adjacent above threshold regions and sum bedgraph scores (assumes bedgraph score in col 4)
    #d max distance between merged peaks, c: column modified
    merge_regions = above_thresh.merge(d=10, c=4, o='sum').saveas('temp.bed')

    #filter based on length criteria
    peaks = BedTool(
        merge_regions.filter(lambda x: len(x) >= min_length and len(x) <=
                             max_length)).saveas('temp2.bed')
    print('number of regions identified: ' + str(peaks.count()))

    if merge_close_peaks == 'True':
        if len(peaks) > 0:
            #merge bona fide peaks if they are closer than the inter-peak distance, sum scores, and sort
            print('merging peaks that are closer than: ' +
                  str(inter_peak_distance))
            merge_peaks = peaks.merge(d=inter_peak_distance, c=4,
                                      o='sum').sort().saveas('temp3.bed')
        if len(peaks) == 0:
            print('no regions can be merged')
            merge_close_peaks = 'False'
            keep_highest_close_peak = 'True'

    if keep_highest_close_peak == 'True':
        #need to read each line to find close peaks and throw away the one with the lowest score out of the two
        print('entering loop')
        peaks.saveas('temp_input.bed')

        last_line = [
            str(item)
            for item in (BedTool('temp_input.bed').to_dataframe().tail(
                n=1).iloc[0, :].tolist())
        ]

        with open('temp_input.bed') as myfile:
            with open('test_output.bed', 'w') as output:
                file_output = csv.writer(output, delimiter='\t')

                prev_line = None

                for line in csv.reader(myfile, delimiter='\t'):
                    print('testing line: ' + str(line))

                    if prev_line is None:
                        prev_line = line

                    elif float(prev_line[2]) + float(
                            inter_peak_distance) <= float(line[1]):
                        print('features far apart, so adding')
                        file_output.writerow(prev_line)
                        prev_line = line

                    else:
                        print('features must be close')
                        #keep whichever of the two close peaks has the higher score
                        if float(prev_line[3]) < float(line[3]):
                            prev_line = line
                            print('prev_line smaller, so new prev_line')

                print('finished reading lines')
                if line == last_line:
                    print('must be last line')
                    file_output.writerow(prev_line)

        merge_peaks = BedTool('test_output.bed')

    print('number of peaks found: ' + str(merge_peaks.count()))

    if delete_overlap_bed is not None:
        print('delete_overlap_bed provided: ' + delete_overlap_bed)
        merge_peaks = merge_peaks.intersect(b=delete_overlap_bed, v=True)
        print('number of peaks retained: ' + str(merge_peaks.count()))

    if not generate_ID:
        print('saving sorted peak bed file with no ID')
        merge_peaks.saveas(filename)

    if generate_ID:
        print('saving sorted peak bed file with ID names')

        #change to pandas dataframe
        DF_peaks = merge_peaks.to_dataframe()

        #insert new column with id: 1... # of peaks
        DF_peaks.insert(
            3, 'id',
            ['id' + str(item) for item in range(1, len(DF_peaks) + 1)])

        #save output
        DF_peaks.to_csv(filename, sep='\t', header=False, index=False)

    return 'Finished'
Example No. 26
def find_longest_transcript(input,
                            output,
                            clip_start=0,
                            clip_end=0,
                            clip_strand_specific=False,
                            output_strand_specific=False):
    tmp_file = tempfile.NamedTemporaryFile(delete=False)

    # Load GFF file
    annotations = BedTool(input)

    # Select only transcripts and convert to BED format
    transcripts = annotations.filter(filter_transcript).\
        each(gff2bed, name_field="gene_id").sort().\
        saveas(tmp_file.name).\
        to_dataframe().\
        assign(length=lambda x: x.end - x.start + 1)

    # Select longest transcript per gene
    transcripts_longest = transcripts.loc[transcripts.reset_index().groupby(['name'])['length'].idxmax()].\
        drop(['length'], axis=1)

    # Clip at the end or the beginning of the gene (can be strand specific)
    pos_strand = (transcripts_longest["strand"]
                  == "+").values | np.invert(clip_strand_specific)
    if clip_start > 0:
        transcripts_longest.loc[pos_strand, "start"] = transcripts_longest.loc[
            pos_strand, "start"] + clip_start
        # on the negative strand the transcript start is the genomic end
        transcripts_longest.loc[
            ~pos_strand,
            "end"] = transcripts_longest.loc[~pos_strand, "end"] - clip_start
    if clip_end > 0:
        transcripts_longest.loc[
            pos_strand,
            "end"] = transcripts_longest.loc[pos_strand, "end"] - clip_end
        # likewise, the transcript end on the negative strand is the genomic start
        transcripts_longest.loc[~pos_strand,
                                "start"] = transcripts_longest.loc[
                                    ~pos_strand, "start"] + clip_end

    # Notify about genes with negative length
    transcripts_negative_length = transcripts_longest.query(
        "start >= end").name.values
    if len(transcripts_negative_length) > 0:
        transcripts_longest = transcripts_longest.query("end > start")
        print(
            "Removing transcripts with negative length from the output file: {}"
            .format(", ".join(transcripts_negative_length)))

    # make a copy of genes and reverse strand
    transcripts_longest_reversed = transcripts_longest.copy()
    transcripts_longest_reversed["strand"] = [
        "+" if s == "-" else "-"
        for s in transcripts_longest_reversed["strand"].values
    ]
    transcripts_longest_reversed[
        "name"] = transcripts_longest_reversed["name"] + "_rev"
    transcripts_longest_stack = pd.concat([transcripts_longest, transcripts_longest_reversed]).\
        sort_values(["chrom", "start", "name", "strand"]).\
        reset_index(drop=True)

    # Save final data frame to file
    if not output_strand_specific:
        transcripts_longest_stack.to_csv(output,
                                         index=False,
                                         sep="\t",
                                         header=False)
    else:
        for s, s_name in {'+': 'pos', '-': 'neg'}.items():
            transcripts_longest_strand = transcripts_longest_stack.query(
                "strand == @s")
            transcripts_longest_strand.to_csv("{}_{}".format(output, s_name),
                                              index=False,
                                              sep="\t",
                                              header=False)
Example No. 27
import os

import numpy as np
import pandas as pd

os.chdir(
    '/Users/dem44/Documents/Manuscripts/cuRRBS/Figures/Figure_3/3X/exon_intron_sites/'
)

#### 1. Obtain the coordinates for the 5' end and 3' end of all the introns in the
#       human genome (hg38).

print('Retrieving introns from the human genome ...')

path_to_gencode_ann = "/Users/dem44/Documents/Manuscripts/cuRRBS/Figures/Figure_1/1C/Annotation_files/gencode.v25.basic.annotation.gtf"
gencode_ann = BedTool(path_to_gencode_ann).sort()

protein_coding_genes_ann = gencode_ann.filter(lambda x: x[2] == 'gene').filter(
    lambda x: 'gene_type "protein_coding"' in x[8]).sort()
exon_protein_coding_ann = gencode_ann.filter(lambda x: x[2] == 'exon').filter(
    lambda x: 'gene_type "protein_coding"' in x[8]).sort()
intron_protein_coding_ann = protein_coding_genes_ann.subtract(
    exon_protein_coding_ann, s=True).sort()

five_prime_ends = [x.start
                   for x in intron_protein_coding_ann]  # 0-based coordinates
three_prime_ends = [(x.end - 1)
                    for x in intron_protein_coding_ann]  # 0-based coordinates
chromosomes = [str(x.chrom) for x in intron_protein_coding_ann]

#### 2. Find the coordinates of all the CpG sites that are found in +- 5 bp of both
#       of the ends of the intron (i.e. close to the exon-intron boundary).

print('Finding the CpG coordinates ...')
Example No. 28
def main():
    parser = argparse.ArgumentParser(
        description='Use a sliding window to aggregate breaks in bed file')
    parser.add_argument('genome',
                        help='Name of the model used to produce input')
    parser.add_argument('input', help='Input .bed file with detected breaks')
    parser.add_argument(
        'annotations',
        help='Annotation file. If the annotation file has a gtf or gff extension '
             '(possibly .gz) then only transcripts are selected. If a .bed file '
             'is provided then all annotations from the bed file are used')
    parser.add_argument('output',
                        help='Output .bed file with longest transcripts')
    parser.add_argument('-w', '--window-size',
                        dest="window_size",
                        default=int(1e5),
                        type=int,
                        help='Window at which to aggregate breaks number')
    parser.add_argument('-s', '--window-step',
                        dest="window_step",
                        default=int(1e4),
                        type=int,
                        help='Step after each window')
    parser.add_argument('-f', '--features',
                        dest="features",
                        action="append",
                        nargs="*",
                        help='Additional features to annotate input file')

    args = parser.parse_args()
    start = time.time()

    if args.features is None:
        features = []
    else:
        features = list(itertools.chain.from_iterable(args.features))

    output_dir = os.path.dirname(args.output)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print(
        'Processing "{input}" using annotation="{annotation}" window {window}/{step}. Writing output to "{output}"...'
        .format(input=args.input,
                window=args.window_size,
                step=args.window_step,
                output=args.output,
                annotation=args.annotations))

    # Create temporary files
    tmp = {
        n: tempfile.NamedTemporaryFile(delete=False).name
        for n in [
            "genome_bin_pos", "genome_bin_neg", "genome_bin", "breaks_bin",
            "results", "all_transcripts", "transcripts"
        ]
    }

    # Create windows template for sliding window
    genome_bin_pos = BedTool().window_maker(
        genome=args.genome, w=args.window_size,
        s=args.window_step).each(strand, "+").saveas(tmp["genome_bin_pos"])
    genome_bin_neg = BedTool().window_maker(
        genome=args.genome, w=args.window_size,
        s=args.window_step).each(strand, "-").saveas(tmp["genome_bin_neg"])
    genome_bin = genome_bin_pos.cat(
        genome_bin_neg, postmerge=False).sort().saveas(tmp["genome_bin"])

    # Read input file
    dna_breaks = BedTool(args.input)

    # Read annotation file
    if re.search(r"\.(gtf|gff)(\.gz)?$", args.annotations):
        annotations = BedTool(args.annotations)
        annotations = annotations.filter(filter_transcript).\
            each(gff2bed, name_field="gene_id").sort().\
            saveas(tmp["all_transcripts"]).\
            groupby(g="1,2,3,6", c="4,5", o="distinct").\
            cut([0,1,2,4,5,3]).\
            saveas(tmp["transcripts"])
    elif re.search(r"\.bed$", args.annotations):
        annotations = BedTool(args.annotations)
    else:
        parser.error(
            "Annotation have to be either in gtf/gff or in bed format")

    bin_breaks = BedTool().intersect(a=genome_bin, b=dna_breaks, wa=True, c=True, s=True). \
        saveas(tmp["breaks_bin"])

    # Map breaks statistics to annotation file
    results = BedTool().map(a=bin_breaks, b=annotations, c="4",
                            o="distinct").cut([0, 1, 2, 7, 6, 5]).sort().saveas(tmp["results"])
    results_df = splitDataFrameList(results.to_dataframe(), "name", ",")
    results_df = results_df[results_df.name != "."]
    results_df.to_csv(args.output, sep="\t", header=True, index=False)

    # Remove old temporary files
    for f in tmp.values():
        os.remove(f)

    end = time.time()
    print("Total time: {:.1f} minutes".format((end - start) / 60))
Example No. 29
def py_peak_calling(bedgraph,
                    threshold,
                    min_length,
                    inter_peak_distance,
                    merge_close_peaks=True,
                    keep_highest_close_peak=False,
                    max_length=10000,
                    generate_ID=True,
                    output_name=None,
                    delete_overlap_bed=None):
    """
    - need to install a more up-to-date varsion of bedtools before invoking Jupyter
      type: module load bedtools/2.21.0
	(1) filters bedgraph based on threshold;
	
	(2) merges adjacent basepairs that are over threshold;
	
  (3) retains peaks that satisfy min/max length criteria; 
	
	(4) merges any peaks that are closer than the inter-peak distance cutoff -or-
  alternatively keeps just the highest peak (this is beta functionality)
	
    - max length is typically defaulted to be very large
    - outputs a bed file (default col4 is the sum of the bedgraph scores; sorted by chrom;start;stop)
    - generate ID: will auto generate a integer list as a ID number (1... number of peaks). This will 
    be reported as column 4 and the bedgraph scores will be shifted to column 5 as per standard bed format
    - note the peak score for merged peak is the *just* the sum of the two individual peaks not the 
    total score in the merged region (i.e. there could be some sub-threshold scores in the intervening 
    space that won't be included)
    -assumes bedgraph in standard format <chr> <start> <stop> <score>
    -output_name = option for user defined name (type with '...'), otherwise will generate name bedgraph_peaks.bed
    -delete_overlap_bed = option to add path to bedfile (as string), whereby any peaks that overlap this bed file will be discarded
    """

    import pybedtools
    import glob
    from pybedtools import BedTool
    import pandas as pd
    import csv

    if merge_close_peaks == keep_highest_close_peak:
        return 'Exiting... merge_close_peaks and keep_highest_close_peak set the same'

    #generate name for output
    bedgraph_name = glob.glob(bedgraph)

    if output_name is not None:
        filename = output_name
    else:
        filename = bedgraph_name[0].replace('.bg', '_peaks.bed')

    print('input bedgraph file: ' + bedgraph_name[0])
    print('output filename: ' + filename)

    #import data as BedTool
    data = BedTool(bedgraph)

    #retains intervals above threshold
    above_thresh = data.filter(lambda b: float(b.name) >= threshold)

    #merge adjacent above threshold regions and sum bedgraph scores (assumes bedgraph score in col 4)
    #by increasing d value can allow for
    merge_regions = above_thresh.merge(d=0, c=4, o='sum')

    #filter based on length criteria
    peaks = BedTool(
        merge_regions.filter(
            lambda x: len(x) >= min_length and len(x) <= max_length))

    #     print 'number of regions identified before merging or filtering: ' + str(peaks.count())

    if merge_close_peaks:
        #merge bona fide peaks if they are closer than the inter-peak distance, sum their scores, and sort
        print('merging peaks that are closer than: ' + str(inter_peak_distance))
        merge_peaks = peaks.merge(d=inter_peak_distance, c=4, o='sum').sort()

    if keep_highest_close_peak:
        #need to read each line to find close peaks and throw away the one with the lowest score out of the two
        print('entering loop')

        peaks.saveas('temp_input.bed')

        print('before keeping highest, number of regions identified: ' +
              str(BedTool('temp_input.bed').count()))

        last_line = [
            str(item)
            for item in (BedTool('temp_input.bed').to_dataframe().tail(
                n=1).iloc[0, :].tolist())
        ]

        with open('temp_input.bed') as myfile:
            with open('test_output.bed', 'w') as output:
                file_output = csv.writer(output, delimiter='\t')

                prev_line = None

                for line in csv.reader(myfile, delimiter='\t'):

                    if prev_line is None:
                        prev_line = line

                    elif float(prev_line[2]) + float(
                            inter_peak_distance) <= float(line[1]):
                        #features are far apart, so write out the previous peak
                        file_output.writerow(prev_line)
                        prev_line = line

                    else:
                        #features are close: keep whichever has the higher score
                        if float(prev_line[3]) < float(line[3]):
                            prev_line = line

                if line == last_line:
                    #flush the pending peak after reading the last line
                    file_output.writerow(prev_line)

            merge_peaks = BedTool('test_output.bed')

    print('number of peaks found: ' + str(merge_peaks.count()))

    if delete_overlap_bed is not None:
        print('delete_overlap_bed provided: ' + delete_overlap_bed)
        merge_peaks = merge_peaks.intersect(b=delete_overlap_bed, v=True)
        print('number of peaks retained: ' + str(merge_peaks.count()))

    if not generate_ID:
        print('saving sorted peak bed file with no ID')
        merge_peaks.saveas(filename)

    if generate_ID:
        print('saving sorted peak bed file with ID names')

        #change to pandas dataframe
        DF_peaks = merge_peaks.to_dataframe()

        #insert new column with id: 1... # of peaks
        DF_peaks.insert(
            3, 'id',
            ['id' + str(item) for item in range(1, len(DF_peaks) + 1)])

        #save output
        DF_peaks.to_csv(filename, sep='\t', header=False, index=False)

    return 'Finished'
Example No. 30
        get_gff(GFF_URL, GENOMICS_DIR)

    if not os.path.isfile(CPG_PATH):
        logging.info("Downloading CpG metadata at " + CPG_PATH)
        get_cpgs(CPG_URL, GENOMICS_DIR)

    # derive promoter gff and extract sequences from genome
    if os.path.isfile(PROMS_GFF_PATH):
        proms_bed = BedTool(PROMS_GFF_PATH)
        logging.info("Found proms gff at " + PROMS_GFF_PATH)
    else:
        # point to genome gff3
        genome_bed = BedTool(GENOME_GFF_PATH)

        # filter for genes
        genes_bed = (genome_bed.filter(lambda x: (x[2] == 'gene') and (
            x.chrom in CHROMOSOMES)).saveas(GENES_GFF_PATH))
        logging.info("Extracted # genes = " + str(len(genes_bed)) +
                     ", saved at " + GENES_GFF_PATH)

        # extract promoters from genes features
        proms_bed = genes_bed.each(func=five_prime,
                                   upstream=UPSTREAM_LENGTH,
                                   downstream=0)
        proms_bed, genes_data = add_geneIDs(proms_bed)

        with open(GENOMICS_DIR + "genes.json", 'w+') as f:
            json.dump(genes_data, f)
        proms_bed.saveas(PROMS_GFF_PATH)

        logging.info("Extracted # promoters = " + str(len(proms_bed)) +
                     ", saved at " + PROMS_GFF_PATH)
Example No. 31
def getListOfBlackZones(chrom):
    blackList = BedTool('../wgEncodeDacMapabilityConsensusExcludable.bed')
    blackListChrom = blackList.filter(lambda b: b.chrom == chrom)
    return [(i.start, i.end) for i in blackListChrom]
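A common thread in these examples: BedTool.filter returns a lazily evaluated, stream-backed BedTool that can only be consumed once, which is why many of the snippets above chain .saveas() onto it. A minimal sketch (input path hypothetical):

bt = BedTool('regions.bed')
chr1_only = bt.filter(lambda iv: iv.chrom == 'chr1').saveas()  # materialize to a temp file
print(len(chr1_only))  # safe to reuse after saveas()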
Example No. 32
perc_intergenic_sites = []

perc_CGI_sites = []
perc_shore_sites = []
perc_shelf_sites = []

perc_promoter_CGI_sites = []
perc_promoter_non_CGI_sites = []

## Annotation files.

print('Calculating annotations ...')

gencode_ann = BedTool(path_to_gencode_ann).sort()

protein_coding_genes_ann = gencode_ann.filter(lambda x: x[2] == 'gene').filter(
    lambda x: 'gene_type "protein_coding"' in x[8]).sort()
exon_protein_coding_ann = gencode_ann.filter(lambda x: x[2] == 'exon').filter(
    lambda x: 'gene_type "protein_coding"' in x[8]).sort()
intron_protein_coding_ann = protein_coding_genes_ann.subtract(
    exon_protein_coding_ann, s=True).sort()
non_coding_RNA_genes_ann = gencode_ann.filter(lambda x: x[2] == 'gene').filter(
    lambda x: ('gene_type "Mt_rRNA"' in x[8]) or
    ('gene_type "Mt_tRNA"' in x[8]) or ('gene_type "miRNA"' in x[8]) or
    ('gene_type "misc_RNA"' in x[8]) or ('gene_type "rRNA"' in x[8]) or
    ('gene_type "scRNA"' in x[8]) or ('gene_type "snRNA"' in x[8]) or
    ('gene_type "snoRNA"' in x[8]) or ('gene_type "ribozyme"' in x[8]) or
    ('gene_type "sRNA"' in x[8]) or ('gene_type "scaRNA"' in x[8]) or
    ('gene_type "lincRNA"' in x[8])).sort()
intragenic_ann = protein_coding_genes_ann.slop(g=path_to_chr_lengths,
                                               b=2500).merge().sort()
intergenic_ann = intragenic_ann.complement(g=path_to_chr_lengths).sort()