Пример #1
0
    def genePairTAD(genes, tads, fraction=0.7):
        """
        Pair each gene with the TAD it overlaps and return the mapping.

        Runs ``bedtools intersect -a genes -b tads -wao -f fraction`` and
        keeps columns 4-7 of its output. For a bed4 gene file and a bed3
        TAD file (as documented below) these are the gene name followed by
        the TAD chrom, start and end.

        Params:
        --------
        genes: `str` bed4 file of gene
        tads: `str` bed3 file of tad
        fraction: `str` or `float` minimum fraction of a gene that must
                    overlap a tad, in range [0, 1] [default: 0.7]

        Returns:
        --------
        out: `OrderedDict` mapping gene name -> TAD ID as built by
                    `chrRangeID`. Genes without an overlapping TAD are
                    left out; a gene reported more than once keeps the
                    last TAD seen.

        Raises:
        --------
        ValueError: if `fraction` cannot be converted to float.
        SystemExit: if `fraction` is outside [0, 1] (exit status 1).

        Examples:
        --------
        >>> db = genePairTAD("gene.bed", "tad.bed")

        """

        # Validate the cheap argument first, before touching the filesystem.
        # A chained test of the form `0 > x > 1` can never be true, so the
        # range has to be checked as "not inside [0, 1]".
        if not 0 <= float(fraction) <= 1:
            logging.error('The option `-f` must set in '
                          'range [0, 1], and you set {}'.format(fraction))
            # Non-zero status so shell pipelines can see the failure.
            sys.exit(1)

        check_file_exists(tads)
        check_file_exists(genes)

        # NOTE: the paths are interpolated into a shell command unquoted, so
        # they must not contain spaces or shell metacharacters.
        bedtools_cmd = "bedtools intersect -a {} -b {} -wao -f {} | cut -f 4-7 ".format(
            genes, tads, fraction)
        db = OrderedDict()
        # `with` closes the pipe even if parsing a line raises.
        with os.popen(bedtools_cmd) as pipe:
            for line in pipe:
                gene, chrom, start, end = line.strip().split()
                # With `-wao`, a gene that overlaps nothing is still reported,
                # with "." in place of the TAD chrom; such genes are skipped.
                if chrom == ".":
                    continue
                db[gene] = chrRangeID([chrom, start, end])

        return db
Пример #2
0
def testPipe(args):
    """
    Group gene names read from stdin by region and print one line per region.

    Each non-blank input line is split on whitespace; the first three fields
    are turned into an ID with `chrRangeID` and the fourth field is taken as
    the gene name. Output is ``ID<TAB>gene1,gene2,...``, one line per ID, in
    order of first appearance. Stdin is read unconditionally, also when it
    is a terminal.

    Params:
    --------
    args: accepted but not used.

    Raises:
    --------
    IndexError: if a non-blank line has fewer than four fields.
    """
    db = OrderedDict()

    for line in sys.stdin:
        fields = line.strip().split()
        if not fields:
            # Blank lines carry no record; skip them instead of crashing.
            continue
        ID = chrRangeID(fields[:3])
        gene = fields[3]
        db.setdefault(ID, []).append(gene)

    for ID, gene_list in db.items():
        print(ID + "\t" + ",".join(gene_list))
Пример #3
0
def annotate(args):
    """
    %prog tad.bed gene.bed [Options]
    Annotate tads with gene.
    """
    # The docstring above doubles as the parser's usage text, so it is kept
    # verbatim; the description of the function lives in these comments.
    #
    # Parses `args` (exactly two positionals are required: tads, then genes),
    # asks TADConserved().getGene for the annotation and, unless --isnum is
    # given, prints one tab-separated row per ID: the pieces returned by
    # chrRangeID(ID, axis=1), the sorted comma-joined gene list, and the gene
    # count (0 when the list contains the "." placeholder).
    parser = OptionParser(annotate.__doc__)
    parser.add_option(
        '-F',
        dest='fraction',
        default='0.7',
        help='the fraction of gene overlap of tads [default: %default]',
    )
    parser.add_option(
        '--isnum',
        default=False,
        action='store_true',
        help='if output the gene number [default: %default]',
    )
    parser.add_option(
        '--plot',
        default=False,
        action='store_true',
        help='if plot the gene number distribution [default: %default]',
    )
    options, positional = parser.parse_args(args)

    # print_help() returns None, so this exits after showing the usage.
    if len(positional) != 2:
        sys.exit(parser.print_help())

    tads, genes = positional
    db = TADConserved().getGene(
        tads, genes, options.fraction, options.isnum, options.plot)
    if options.isnum:
        # In this mode nothing is printed here.
        return

    for ID, members in db.items():
        gene_list = sorted(members)
        if "." in gene_list:
            length = 0
        else:
            length = len(gene_list)
        row = "\t".join([
            "\t".join(chrRangeID(ID, axis=1)),
            ",".join(gene_list),
            str(length),
        ])
        print(row, file=sys.stdout)
Пример #4
0
    def getGene(tads, genes, fraction=0.7, isnum=False, isPlot=False):
        """
        Annotate tads with genes and return as dict.

        Runs ``bedtools intersect -a tads -b genes -wao -F fraction`` and
        keeps columns 1-3 and 7 of its output. For a bed3 TAD file and a
        bed4 gene file (as documented below) these are the TAD chrom, start,
        end and the gene name, which is "." when no gene overlaps the TAD.

        Params:
        --------
        tads: `str` bed3 file of tad
        genes: `str` bed4 file of gene
        fraction: `str` or `float` fraction of gene
                    overlap with tads, in range [0, 1] [default: 0.7]
        isnum: `bool` if set output the gene number instead
                    of gene list. [default: False]
        isPlot: `bool` if plot the gene number per TADs
                    distribution; requires `isnum`. The figure is saved
                    as `<genes without extension>.gene_num_dist.pdf`.
                    [default: False]

        Returns:
        --------
        out: `OrderedDict` keyed by the TAD ID built by `chrRangeID`.
                    Values are sets of gene names (a TAD without genes
                    holds {"."}), or, when `isnum` is set, gene counts
                    in which the "." placeholder is not counted.

        Raises:
        --------
        ValueError: if `fraction` cannot be converted to float.
        SystemExit: if `fraction` is outside [0, 1] (exit status 1).
        AssertionError: if `isPlot` is set without `isnum`.

        Examples:
        --------
        >>> db = TADConserved().getGene("tad.bed", "gene.bed")
        """
        # Validate arguments before any filesystem or subprocess work.
        # A chained test of the form `0 > x > 1` can never be true, so the
        # range has to be checked as "not inside [0, 1]".
        if not 0 <= float(fraction) <= 1:
            logging.error('The option `-F` must set in '
                          'range [0, 1], and you set {}'.format(fraction))
            # Non-zero status so shell pipelines can see the failure.
            sys.exit(1)
        # Raised explicitly (rather than via `assert`) so the check also
        # runs under `python -O`; the exception type is the one an `assert`
        # would produce.
        if isPlot and not isnum:
            raise AssertionError('isnum must specify as True')

        check_file_exists(tads)
        check_file_exists(genes)

        # NOTE: the paths are interpolated into a shell command unquoted, so
        # they must not contain spaces or shell metacharacters.
        bedtools_cmd = ("bedtools intersect -a {} -b {} -wao -F {} | "
                        "cut -f 1-3,7 ").format(tads, genes, fraction)
        db = OrderedDict()

        # `with` closes the pipe even if parsing a line raises.
        with os.popen(bedtools_cmd) as pipe:
            for line in pipe:
                line_list = line.strip().split()
                ID = chrRangeID(line_list[:3])
                db.setdefault(ID, set()).add(line_list[3])

        if isnum:
            for ID in db:
                # "." marks a TAD with no overlapping gene; it is not a gene,
                # so it must not contribute to the count.
                db[ID] = len(db[ID] - {"."})

        if isPlot:
            # Materialise the counts: a dict view is not a sequence.
            counts = list(db.values())
            prefix = genes.rsplit('.', 1)[0]
            fig, ax = plt.subplots(figsize=(5, 5))
            sns.distplot(counts, hist=False, kde=True, ax=ax)
            ax.set_xticks(range(0, 41, 5))
            ax.set_xlim(0, 40)
            ax.set_xlabel('Gene number')
            ax.set_ylabel('Frequence')
            ax.set_title('Gene Number Distribution ({:,})'.format(
                sum(counts)))
            plt.savefig('{}.gene_num_dist.pdf'.format(prefix),
                        dpi=300)
            logging.debug('Successful to plot gene number distribution '
                          '`{}.gene_num_dist.pdf`.'.format(prefix))

        return db