def add_promoter(genes_gff, chromsize, promoter_length):
    """Add a promoter of the given length upstream of each gene in the
    input GFF file and return the result as a pybedtools BedTool object."""
    # Parse the GFF file containing only genes.
    genes = BedTool(genes_gff)
    # Extract promoters upstream using the chromosome-sizes file and the
    # specified promoter length. l/r: bp to add before the start / after
    # the end coordinate; s: define upstream/downstream relative to strand.
    promoters = genes.flank(g=chromsize, l=promoter_length, r=0, s=True)

    return promoters
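
A minimal usage sketch for the function above (the file names are hypothetical, not from the original snippet):

from pybedtools import BedTool

# 2 kb promoters for every gene, clipped to chromosome bounds by flank.
promoters = add_promoter('genes.gff', 'genome.chrom.sizes', 2000)
promoters.saveas('promoters.bed')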
Example No. 2
class GenomicSubset(object):
    def __init__(self, name, path=paths.genome_subsets, assembly='hg19'):
        self.assembly = assembly
        self.name = name
        self.bedtool = BedTool(path + name + '.bed').sort()

        # Intersect the pathway with the appropriate genome build
        # TODO: this step should be unnecessary if the pathways are correct
        if name != self.assembly:
            self.bedtool = GenomicSubset.reference_genome(
                    self.assembly).bedtool.intersect(self.bedtool).sort().saveas()

    def expand_by(self, expansion_in_each_direction_Mb):
        window_size_str = str(expansion_in_each_direction_Mb) + 'Mb'
        print('total size before window addition:', self.bedtool.total_coverage(), 'bp')

        # compute the flanks
        # TODO: use 1cM instead of 1Mb
        print('computing flanks')
        flanks = self.bedtool.flank(
            genome=self.assembly,
            b=expansion_in_each_direction_Mb*1000000).sort().merge().saveas()

        # compute the union of the flanks and the pathway
        print('computing union')
        union = self.bedtool.cat(flanks, postmerge=False).sort()
        merged = union.merge().saveas()
        print('total size after window addition:', merged.total_coverage(), 'bp')
        self.bedtool = merged

    def restricted_to_chrom_bedtool(self, chrnum):
        return self.bedtool.filter(
            lambda x: x[0] == 'chr' + str(int(chrnum))).saveas()

    @classmethod
    def reference_genome(cls, assembly='hg19'):
        return GenomicSubset(assembly, path=paths.reference, assembly=assembly)

    @classmethod
    def reference_chrom_bedtool(cls, chrnum, assembly='hg19'):
        return cls.reference_genome(assembly=assembly).restricted_to_chrom_bedtool(chrnum)

    @classmethod
    def whole_genome(cls, assembly='hg19'):
        return cls(assembly, path=paths.reference)
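
A minimal usage sketch (assumes a `paths` module exposing `genome_subsets` and `reference` directories containing the relevant .bed files, as the class above expects):

subset = GenomicSubset('my_pathway')            # reads paths.genome_subsets + 'my_pathway.bed'
subset.expand_by(1)                             # add 1 Mb of flank on each side, then merge
chr22 = subset.restricted_to_chrom_bedtool(22)
print(chr22.total_coverage(), 'bp of chr22 covered')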
Example No. 3
    def get_gene_dataframe(self, peak_bed, up=100000, down=100000):
        # All overlapping enhancer-TSS pairs, with the TSS window spanning
        # `up` bp upstream to `down` bp downstream, plus their distances.
        peaks = BedTool(peak_bed)
        b = BedTool(self.gene_bed)
        b = b.flank(l=1, r=0, s=True, g=self.gsize).slop(  # noqa: E741
            l=up,
            r=down,
            g=self.gsize,
            s=True  # noqa: E741
        )
        # bedtools flank -l 1 -r 0: the 1 bp position upstream of each gene,
        # i.e. its TSS (e.g. Chr01 12800 12801 for the minus-strand gene
        # Chr01 4170 12800 Xetrov90000001m.g).
        # bedtools slop -l up -r down: extend each TSS to |up--TSS--down|.

        vals = []
        # for f in b.intersect(peaks, wo=True, nonamecheck=True):
        for f in b.intersect(peaks, wo=True):
            # bedtools intersect -wo -a b.bed -b peaks.bed
            chrom = f[0]
            strand = f[5]
            # The slop above is strand-aware, so the TSS sits `up` bp after
            # the start of '+' features and `down` bp after the start of
            # '-' features.
            if strand == "+":
                tss = f.start + up
            else:
                tss = f.start + down
            gene = f[3]
            # Peak coordinates come from the fields appended after the gene
            # entry in the -wo output.
            peak_start, peak_end = int(f[13]), int(f[14])
            vals.append([chrom, tss, gene, peak_start, peak_end])
        p = pd.DataFrame(
            vals, columns=["chrom", "tss", "gene", "peak_start", "peak_end"])
        p["peak"] = [int(i) for i in (p["peak_start"] + p["peak_end"]) / 2]
        # peak with int function, let distance int
        p["dist"] = np.abs(p["tss"] - p["peak"])
        p["loc"] = (p["chrom"] + ":" + p["peak_start"].astype(str) + "-" +
                    p["peak_end"].astype(str))
        p = p.sort_values("dist").drop_duplicates(
            ["loc", "gene"], keep="first")[["gene", "loc", "dist"]]

        p = p[p["dist"] < up - 1]
        # Drop interactions farther than the window (here >100 kb); these
        # get no weight in the downstream calculation.
        p.gene = [i.upper() for i in list(p.gene)]

        return p
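
A minimal usage sketch (hypothetical: assumes an object `obj` whose class defines the method above along with `gene_bed`, a BED12 gene-model file, and `gsize`, a chromosome-sizes file):

pairs = obj.get_gene_dataframe('atac_peaks.bed', up=100000, down=100000)
print(pairs.head())  # columns: gene, loc ('chrom:start-end'), dist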
Example No. 4
    def get_promoter_dataframe(self, peak_bed, up=2000, down=2000):
        # All overlapping enhancer-TSS pairs within (up, down) of each TSS.
        peaks = BedTool(peak_bed)
        b = BedTool(self.gene_bed)
        b = b.flank(l=1, r=0, s=True, g=self.gsize).slop(  # noqa: E741
            l=up,
            r=down,
            g=self.gsize,
            s=True  # noqa: E741
        )
        vals = []
        # for f in b.intersect(peaks, wo=True, nonamecheck=True):
        for f in b.intersect(peaks, wo=True):
            chrom = f[0]
            gene = f[3]
            peak_start, peak_end = int(f[13]), int(f[14])
            vals.append([chrom, gene, peak_start, peak_end])
        prom = pd.DataFrame(
            vals, columns=["chrom", "gene", "peak_start", "peak_end"])
        prom["loc"] = (prom["chrom"] + ":" + prom["peak_start"].astype(str) +
                       "-" + prom["peak_end"].astype(str))
        prom.gene = [i.upper() for i in list(prom.gene)]
        return prom
Example No. 5
def mk_matrix(inputfile=None,
              outputfile=None,
              bigwiglist=None,
              ft_type=None,
              pseudo_count=0,
              upstream=1000,
              downstream=1000,
              bin_around_frac=0.1,
              chrom_info=None,
              bin_nb=100,
              nb_proc=None,
              labels=None,
              no_stranded=False,
              zero_to_na=False):
    """
 Description: Create a matrix to be used by 'profile' and 'heatmap' commands.
    """

    # -------------------------------------------------------------------------
    # Check argument consistency
    #
    # -------------------------------------------------------------------------

    if ft_type in ['single_nuc', 'promoter', 'tts']:
        region_size = upstream + downstream + 1
        if region_size < bin_nb:
            message(
                "The region (-u/-d) needs to be extended given the number "
                "of bins (--bin-nb)",
                type="ERROR")

    # -------------------------------------------------------------------------
    # Check that the output file name does not end with .zip
    #
    # -------------------------------------------------------------------------

    if outputfile.name.endswith(".zip"):
        outfn = outputfile.name.replace(".zip", "")
        outputfile = open(outfn, "w")

    # -------------------------------------------------------------------------
    # Check input file is in bed or GTF format
    #
    # -------------------------------------------------------------------------

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
        if ft_type == 'user_regions':
            message(
                "--ft-type can not be set to user_regions"
                " when a gtf is provided.",
                type="ERROR")
    else:
        try:
            region_bo = BedTool(inputfile.name)
            # Force parsing so malformed input raises here.
            len(region_bo)
        except IndexError:
            message("Unable to read the input file. Check format",
                    type="ERROR")
        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")

        if region_bo.file_type == 'gff':
            message('Loading the GTF file.')
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

            if ft_type != 'user_regions' and ft_type != 'single_nuc':
                message(
                    "Set --ft-type to 'user_regions' or 'single_nuc'"
                    " when using input bed file.",
                    type="ERROR")
            # Check that the strand is provided and
            # check it is located in the right column
            # (not checked by BedTool...).
            if region_bo.field_count() < 6:
                if not no_stranded:
                    message("Strand is undefined. Use -nst.", type="ERROR")
            else:
                region_name = dict()
                for i in region_bo:
                    if region_name.get(i.name, None) is None:
                        region_name[i.name] = 1
                    else:
                        message(
                            "Regions in bed file should have "
                            "unique identifier (col 4).",
                            type="ERROR")
                    if i.strand[0] not in ['.', '+', '-']:
                        message("Strand should be one of '+','-' or '.'.",
                                type="ERROR")
                    if ft_type == 'single_nuc':
                        if i.end - i.start != 1:
                            message(
                                "Region length should be 1 nucleotide "
                                "long when 'single_nuc' is set. Use 'user_regions'.",
                                type="ERROR")
                    elif ft_type == 'user_regions':
                        if i.end - i.start == 1:
                            message(
                                "Region length should not be 1 nucleotide "
                                "long when 'user_regions' is set. Use 'single_nuc'.",
                                type="ERROR")

    # -------------------------------------------------------------------------
    # Create a list of labels for the diagrams.
    # Take user input in account
    # -------------------------------------------------------------------------
    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bigwiglist):
            message(
                "The number of labels should be the same as the number of"
                " bigwig files.",
                type="ERROR")
        # Ensure labels are non-redundant
        if len(labels) > len(set(labels)):
            message("Labels must be unique.", type="ERROR")
    else:
        labels = []
        for i in range(len(bigwiglist)):
            labels += [
                os.path.splitext(os.path.basename(bigwiglist[i].name))[0]
            ]

    # -------------------------------------------------------------------------
    #
    # Get the requested transcript lines in BED format.
    # Tx are restricted to those found on chromosomes
    # declared in the bigwig files.
    # -------------------------------------------------------------------------
    message('Getting the list of chromosomes declared in bigwig files.')
    bw_chrom = list()
    for i in bigwiglist:
        bw_chrom += list(pyBigWig.open(i.name).chroms().keys())

    bed_col = [0, 1, 2, 3, 4, 5]

    if is_gtf:

        message('Selecting chromosomes declared in bigwig from gtf.')
        tmp = gtf.select_by_key("feature", "transcript").select_by_key(
            "seqid", ",".join(bw_chrom))

        tmp_tx_name = tmp.extract_data("transcript_id", as_list=True)

        # If several transcript records are associated with
        # the same transcript_id, raise an error.
        if len(tmp_tx_name) > len(set(tmp_tx_name)):
            message('Transcripts should have a unique identifier.',
                    type="ERROR")

        message('Selecting requested regions.')

        # ----------------------------------------------------------------------
        #
        # Slop TSSs and promoters.
        # Not needed if 'transcript' was requested (it will be flanked by
        # upstream and downstream regions later on).
        # ----------------------------------------------------------------------

        if ft_type == 'transcript':
            message("Getting transcript boundaries (input gtf).")

            main_region_bo = tmp.to_bed(name=["transcript_id"])

        elif ft_type == 'promoter':

            message("Getting promoter regions [-%d,+%d]." %
                    (upstream, downstream))

            main_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)

        elif ft_type == 'tts':

            main_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)

    else:
        message("Loading regions")

        if ft_type == 'user_regions':
            main_region_bo = BedTool(inputfile.name).cut(bed_col)
        elif ft_type == 'single_nuc':
            main_region_bo = BedTool(inputfile.name).cut(bed_col).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)
        else:
            message("Unknown method.", type="ERROR")

    # Save for traceability
    main_region_bed = make_tmp_file(prefix="region" + ft_type, suffix=".bed")
    main_region_bo.saveas(main_region_bed.name)

    # -------------------------------------------------------------------------
    #
    # Print a header in the output file
    #
    # -------------------------------------------------------------------------
    message("Preparing comments")

    comments = "#"
    comments += "ft_type:" + ft_type + ";"
    comments += "from:" + str(upstream) + ";"
    comments += "to:" + str(downstream) + ";"
    comments += "labels:" + ",".join(labels) + ";"

    # -------------------------------------------------------------------------
    # Compute coverage of requested region
    # Each worker will send a file
    # -------------------------------------------------------------------------

    outputfile_list = {}
    message("Using %d bins for main region." % bin_nb)

    tmp_file = bw_profile_mp(in_bed_file=main_region_bed.name,
                             nb_proc=nb_proc,
                             big_wig=[x.name for x in bigwiglist],
                             bin_nb=bin_nb,
                             pseudo_count=pseudo_count,
                             stranded=not no_stranded,
                             type="main",
                             labels=labels,
                             outputfile=outputfile.name,
                             zero_to_na=zero_to_na,
                             verbose=pygtftk.utils.VERBOSITY)

    outputfile_list["main"] = tmp_file

    # -------------------------------------------------------------------------
    # If 'transcript' or 'user_regions' was requested,
    # we must also process the flanking regions.
    # Promoter coverage over [-upstream, 0] is retrieved separately because
    # the transcript coverage window size depends on transcript length,
    # whereas the promoter window length is fixed.
    # -------------------------------------------------------------------------

    if ft_type in ['transcript', 'user_regions']:

        # Number of bins for TTS and TSS
        around_bin_nb = int(round(bin_nb * bin_around_frac))
        if around_bin_nb < 1:
            around_bin_nb = 1

        if upstream > 0:

            if ft_type == 'transcript':
                message("Getting promoter (using %d bins)." % around_bin_nb)
                ups_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                    s=True, l=upstream, r=-1, g=chrom_info.name).cut(bed_col)

            else:
                message("Getting upstream regions (%d bins)." % around_bin_nb)
                ups_region_bo = main_region_bo.flank(s=True,
                                                     l=upstream,
                                                     r=0,
                                                     g=chrom_info.name)

            upstream_bed_file = make_tmp_file(prefix="upstream_region" +
                                              ft_type,
                                              suffix=".bed")

            ups_region_bo.saveas(upstream_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=upstream_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="upstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["upstream"] = tmp_file

        if downstream > 0:

            if ft_type == 'transcript':
                message("Getting TTS (using %d bins)." % around_bin_nb)
                dws_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                    s=True, l=-1, r=downstream, g=chrom_info.name).cut(bed_col)
            else:
                message("Getting downstream regions (%d bins)." %
                        around_bin_nb)

                dws_region_bo = main_region_bo.flank(s=True,
                                                     l=0,
                                                     r=downstream,
                                                     g=chrom_info.name)
            dws_bed_file = make_tmp_file(prefix="downstream_region" + ft_type,
                                         suffix=".bed")

            dws_region_bo.saveas(dws_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=dws_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="downstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["downstream"] = tmp_file

    # -------------------------------------------------------------------------
    #
    # Merge file using pandas
    #
    # -------------------------------------------------------------------------

    message("Reading (pandas): " + outputfile_list["main"].name, type="DEBUG")
    df_main = pd.read_csv(outputfile_list["main"].name, sep="\t")
    # Save strand, start and end; they will be re-joined later.
    df_copy = df_main[['bwig', 'chrom', 'gene', 'strand', 'start', 'end']]

    # Drop start/end before merging with the flanking tables.
    df_main.pop('start')
    df_main.pop('end')

    if "upstream" in outputfile_list:
        message("Merging upstream file")
        message("Reading (pandas): " + outputfile_list["upstream"].name,
                type="DEBUG")
        df_up = pd.read_csv(outputfile_list["upstream"].name, sep="\t")
        df_up = df_up.drop(['start', 'end'], axis=1)
        df_main = df_up.merge(df_main.loc[:, df_main.columns],
                              on=['bwig', 'chrom', 'gene', 'strand'])

    if "downstream" in outputfile_list:
        message("Merging downstream file")
        message("Reading (pandas): " + outputfile_list["downstream"].name,
                type="DEBUG")
        df_dws = pd.read_csv(outputfile_list["downstream"].name, sep="\t")
        df_dws = df_dws.drop(['start', 'end'], axis=1)
        df_main = df_main.merge(df_dws.loc[:, df_dws.columns],
                                on=['bwig', 'chrom', 'gene', 'strand'])

    # Re-join start and end, then move them back to columns 3 and 4.
    df_main = df_main.merge(df_copy.loc[:, df_copy.columns],
                            on=['bwig', 'chrom', 'gene', 'strand'])
    df_start = df_main.pop('start')
    df_end = df_main.pop('end')
    df_main.insert(2, 'start', df_start)
    df_main.insert(3, 'end', df_end)

    message("Writing to file")
    outputfile.close()

    with open(outputfile.name, 'a') as f:
        f.write(comments + "\n")
        df_main.to_csv(f,
                       sep="\t",
                       index=False,
                       mode='a',
                       columns=df_main.columns,
                       na_rep='NA')

    # -------------------------------------------------------------------------
    #
    # Compress
    #
    # -------------------------------------------------------------------------

    message("Compressing")
    path = os.path.abspath(outputfile.name)
    filename = os.path.basename(path)
    message("filename: " + filename, type="DEBUG")
    zip_filename = filename + '.zip'
    message("zip_filename: " + zip_filename, type="DEBUG")
    zip_path = os.path.join(os.path.dirname(path), zip_filename)
    message("zip_path: " + zip_path, type="DEBUG")

    with zipfile.ZipFile(zip_path, 'w', allowZip64=True) as zf:
        zf.write(filename=path, arcname=filename)

    for i in outputfile_list:
        message("deleting " + outputfile_list[i].name)
        os.remove(outputfile_list[i].name)
    os.remove(outputfile.name)

    gc.disable()
    close_properly(inputfile, outputfile)
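
A minimal usage sketch (hypothetical file names; assumes the pygtftk helpers used by the function above are importable):

with open('genes.gtf') as gtf_in, open('matrix.txt', 'w') as mat_out:
    mk_matrix(inputfile=gtf_in,
              outputfile=mat_out,
              bigwiglist=[open('signal.bw', 'rb')],
              ft_type='promoter',
              upstream=1000,
              downstream=1000,
              chrom_info=open('hg38.chromInfo'),
              bin_nb=100,
              nb_proc=2)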
Example No. 6

# The top of this snippet was truncated; the loop header below is a
# hypothetical reconstruction (`target_lines`, a list of tab-separated CpG
# records, is assumed rather than taken from the original).
gff_string = ''
for line in target_lines:
    l_split = line.strip().split('\t')
    temp_string = l_split[1] + '\t' + 'na' + '\t' + 'na' + '\t' + l_split[2] + '\t' + l_split[2] + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + l_split[0] + '\n'
    gff_string = gff_string + temp_string
target_gff = BedTool(gff_string, from_string=True)
    


## Annotation files.

print('Calculating annotations ...')

gencode_ann = BedTool(path_to_gencode_ann).sort()
protein_coding_genes_ann = gencode_ann.filter(lambda x: x[2] == 'gene').filter(lambda x: 'gene_type "protein_coding"' in x[8]).sort()

CGI_ann = BedTool(path_to_CGI).sort()
# Shores: the 2 kb flanking each CGI on both sides; shelves: 2-4 kb from
# the CGI (the 4 kb flanks minus the shores).
shore_ann = CGI_ann.flank(g=path_to_chr_lengths, b=2000).sort()
shelf_ann = CGI_ann.flank(g=path_to_chr_lengths, b=4000).subtract(shore_ann).sort()

ChrHMM_ann = BedTool(path_to_ChrHMM).sort()


## Intersections

print('Performing gene bodies / CGI intersections ...')

in_gene_bodies_cgs = list(set(list(target_gff.intersect(protein_coding_genes_ann).sort().to_dataframe()['attributes']))) # 15319 / 21368 CpGs are in gene bodies

in_CGI_cgs = list(set(list(target_gff.intersect(CGI_ann).sort().to_dataframe()['attributes']))) # 9319 / 21368 CpGs are in CGIs
in_shore_cgs = list(set(list(target_gff.intersect(shore_ann).sort().to_dataframe()['attributes']))) # 7920 / 21368 CpGs are in shores
in_shelf_cgs = list(set(list(target_gff.intersect(shelf_ann).sort().to_dataframe()['attributes']))) # 1138 / 21368 CpGs are in shelves
# 2991 / 21368 CpGs are in open sea
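
A minimal sketch of the open-sea count quoted above (assumes the CpG identifier sits in the GFF attributes column, as built earlier in this script):

all_cgs = set(f[8] for f in target_gff)
open_sea_cgs = all_cgs - set(in_CGI_cgs) - set(in_shore_cgs) - set(in_shelf_cgs)
print(len(open_sea_cgs), '/', len(all_cgs), 'CpGs are in open sea')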
Example No. 7
    def create(
        cls: Type[T],
        outdir: str,
        data_files: List[str],
        enhancer_file: str,
        annotation_file: str,
        genome: str,
        window: Optional[int] = 2000,
        anno_file: Optional[str] = None,
        anno_from: Optional[str] = None,
        anno_to: Optional[str] = None,
        gene_mapping: Optional[str] = None,
        threshold: Optional[float] = 1.0,
        version: Optional[str] = "0.1.0",
    ) -> T:
        outdir = Path(outdir)
        basename = outdir.name
        meanstd_file = outdir / f"{basename}.{genome}.meanstd.tsv.gz"
        target_file = outdir / f"{basename}.{genome}.target.npz"
        gene_file = outdir / "annotation.tss.merged1kb.bed"
        link_file = outdir / "enhancers2genes.feather"

        g = Genome(genome)
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        info = {
            "genes": "genes.txt",
            "enhancers": "enhancers.feather",
            "link_file": os.path.basename(link_file),
            "genome": genome,
            "window": window,
            "meanstd_file": os.path.basename(meanstd_file),
            "target_file": os.path.basename(target_file),
            "gene_file": os.path.basename(gene_file),
            "version": version,
            "schema_version": __schema_version__,
        }

        if anno_file is not None:
            if not os.path.exists(anno_file):
                raise ValueError(f"{anno_file} does not exist")
            if anno_from is None or anno_to is None:
                raise ValueError("Need anno_from and anno_to columns!")
            copyfile(anno_file, outdir / os.path.basename(anno_file))
            info.update({
                "anno_file": os.path.basename(anno_file),
                "anno_from": anno_from,
                "anno_to": anno_to,
            })

        if gene_mapping is not None:
            if not os.path.exists(gene_mapping):
                raise ValueError(f"{gene_mapping} does not exist")
            copyfile(gene_mapping, outdir / os.path.basename(gene_mapping))
            info["gene_mapping"] = os.path.basename(gene_mapping)

        logger.info("processing gene annotation")
        # Convert gene annotation
        b = BedTool(annotation_file)
        chroms = set([f.chrom for f in BedTool(enhancer_file)])
        b = b.filter(lambda x: x.chrom in chroms)

        # TSS positions (1 bp upstream flank), merged within 1 kb with gene
        # names joined.
        b = (b.flank(g=g.sizes_file, l=1, r=0)  # noqa: E741
             .sort()
             .merge(d=1000, c=4, o="distinct"))
        b.saveas(str(gene_file))

        logger.info("processing data files")
        # create coverage_table
        df = coverage_table(
            enhancer_file,
            data_files,
            window=window,
            log_transform=True,
            normalization="quantile",
            ncpus=12,
        )

        df.index.rename("loc", inplace=True)
        np.savez(target_file, target=df.iloc[:, 0].sort_values())
        meanstd = pd.DataFrame(index=df.index)
        meanstd["mean"] = df.mean(1)
        meanstd["std"] = df.std(1)
        meanstd = meanstd.reset_index().rename(columns={"loc": "index"})

        meanstd.to_csv(meanstd_file, compression="gzip", index=False, sep="\t")
        # Scale each enhancer to zero mean and unit variance, then save.
        df = df.sub(df.mean(1), axis=0)
        df = df.div(df.std(1), axis=0)
        df.reset_index().to_feather(f"{outdir}/enhancers.feather")

        link = create_link_file(meanstd_file, gene_file, genome=genome)
        link.to_feather(link_file)

        genes = _create_gene_table(
            df,
            meanstd_file,
            gene_file,
            gene_mapping,
            genome=genome,
            link_file=link_file,
            threshold=threshold,
        )
        genes.to_csv(f"{outdir}/genes.txt", sep="\t")

        with open(f"{outdir}/info.yaml", "w") as f:
            yaml.dump(info, f)

        return ScepiaDataset(outdir)
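
A minimal usage sketch (hypothetical paths; assumes the method above is a classmethod of ScepiaDataset and that its supporting helpers are importable):

sd = ScepiaDataset.create(
    outdir='my_dataset',
    data_files=['sample1.bw', 'sample2.bw'],
    enhancer_file='enhancers.bed',
    annotation_file='genes.bed12',
    genome='hg38',
)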
Example No. 8

import numpy as np
import pandas as pd

from scipy import stats
from pybedtools import BedTool


def filter_chipseq_files(infile):
    df = pd.read_table(infile,
                       sep='\t',
                       header=None,
                       names=[
                           'chrom', 'start', 'end', 'name', 'score', 'strand',
                           'signal', 'p', 'q', 'peak'
                       ])
    df['signal_zscore'] = stats.zscore(df.signal)
    # Keep peaks with signal above the one-sided 95th percentile (z > 1.64).
    fltr_df = df[df.signal_zscore > 1.64]
    return BedTool.from_dataframe(fltr_df)


# make promoter file | general promoter set
genes = BedTool('rnaseq_ensembl_genemodels_filter.bed')
tss_window = genes.flank(l=2000, r=0, s=True, genome='hg19')
tss_window.saveas('hg19_tss_2kb.bed')

# make promoter file | requires overlap with high signal H3K27ac/H3K4me3
h3k27ac = filter_chipseq_files('E073-H3K27ac.narrowPeak.gz')
h3k4me3 = filter_chipseq_files('E073-H3K4me3.narrowPeak.gz')
tss_high_conf = (tss_window.intersect(h3k27ac, u=True)).intersect(h3k4me3,
                                                                  u=True)
tss_high_conf.saveas('hg19_tss_high_conf.bed')