def genePairTAD(genes, tads, fraction=0.7):
    """
    Pair genes with the TAD they overlap and return the mapping as dict.

    Runs ``bedtools intersect -a genes -b tads -wao -f fraction`` and keeps
    columns 4-7 of its output (the gene name followed by chrom, start and
    end of the matched TAD). Records whose TAD chromosome is ``.`` (the
    placeholder bedtools writes when nothing overlaps) are skipped. If a
    gene is reported more than once, the last record wins.

    Params:
    --------
    genes: `str` bed4 file of gene
    tads: `str` bed3 file of tad
    fraction: `str` or `float` fraction of gene overlap with tads
                [default: 0.7]

    Returns:
    --------
    out: `OrderedDict` gene name -> TAD ID as produced by `chrRangeID`

    Raises:
    --------
    SystemExit: if `fraction` is outside the range [0, 1].

    Examples:
    --------
    >>> db = TADConserved().genePairTAD("gene.bed", "tad.bed")
    """
    check_file_exists(tads)
    check_file_exists(genes)

    # The previous test `0 > fraction > 1` could never be true, so invalid
    # fractions slipped through to bedtools unchecked.
    if not 0 <= float(fraction) <= 1:
        logging.error('The option `-f` must set in '
                      'range [0, 1], and you set {}'.format(fraction))
        sys.exit(1)

    # NOTE(review): the file paths are interpolated into a shell command
    # without quoting; paths containing spaces or shell metacharacters
    # will not work.
    bedtools_cmd = "bedtools intersect -a {} -b {} -wao -f {} | cut -f 4-7 ".format(
        genes, tads, fraction)

    db = OrderedDict()
    # `with` makes sure the pipe is closed once the output is consumed.
    with os.popen(bedtools_cmd) as pipe:
        for line in pipe:
            line_list = line.strip().split()
            if not line_list:
                # Ignore blank lines instead of failing on the unpack below.
                continue
            gene, chrom, start, end = line_list
            if chrom == ".":
                # Gene without a qualifying TAD.
                continue
            db[gene] = chrRangeID([chrom, start, end])
    return db
def testPipe(args):
    """
    test pipe for tad annotate.

    Reads whitespace-separated records from stdin. The first three fields
    of each record are turned into an ID with `chrRangeID` and the fourth
    field is collected under that ID. One line per ID is then printed:
    the ID, a tab, and the collected fourth fields joined by commas, in
    first-seen order.

    Params:
    --------
    args: unused; kept for signature compatibility.
    """
    db = OrderedDict()
    # stdin is read directly; the former isatty() branch only assigned an
    # unused variable and had no effect on what was read.
    for line in sys.stdin:
        line_list = line.strip().split()
        if not line_list:
            # Skip blank lines instead of failing on the field lookup.
            continue
        ID = chrRangeID(line_list[:3])
        db.setdefault(ID, []).append(line_list[3])

    for ID, gene_list in db.items():
        print(ID + "\t" + ",".join(gene_list))
def annotate(args):
    """
    %prog tad.bed gene.bed [Options]
    Annotate tads with gene.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage
    # text, so it is part of the command-line help output.
    parser = OptionParser(annotate.__doc__)
    parser.add_option('-F', dest='fraction', default='0.7',
                      help='the fraction of gene overlap of tads'
                           ' [default: %default]')
    parser.add_option('--isnum', default=False, action='store_true',
                      help='if output the gene number [default: %default]')
    parser.add_option('--plot', default=False, action='store_true',
                      help='if plot the gene number '
                           'distribution [default: %default]')

    opts, args = parser.parse_args(args)
    # Exactly two positional arguments are required: TAD file, gene file.
    if len(args) != 2:
        sys.exit(parser.print_help())
    tads, genes = args

    db = TADConserved().getGene(tads, genes, opts.fraction,
                                opts.isnum, opts.plot)

    # In number mode nothing is printed here.
    if opts.isnum:
        return

    for ID, members in db.items():
        gene_list = sorted(members)
        # A "." entry means the TAD has no gene, so report a count of 0.
        length = 0 if "." in gene_list else len(gene_list)
        region = "\t".join(chrRangeID(ID, axis=1))
        row = "\t".join((region, ",".join(gene_list), str(length)))
        print(row, file=sys.stdout)
def getGene(tads, genes, fraction=0.7, isnum=False, isPlot=False):
    """
    Annotate tads with genes and return as dict.

    Runs ``bedtools intersect -a tads -b genes -wao -F fraction`` and keeps
    columns 1-3 and 7 of its output (the TAD coordinates and the gene
    name). TADs without any qualifying gene are kept; bedtools reports
    them with ``.`` in place of the gene name.

    Params:
    --------
    tads: `str` bed3 file of tad
    genes: `str` bed4 file of gene
    fraction: `str` or `float` fraction of gene overlap with tads
                [default: 0.7]
    isnum: `bool` if set output the gene number instead of gene list.
                The ``.`` placeholder is not counted as a gene.
                [default: False]
    isPlot: `bool` if plot the gene number per TADs distribution;
                requires `isnum`. The figure is written to
                ``<genes without extension>.gene_num_dist.pdf``.
                [default: False]

    Returns:
    --------
    out: `OrderedDict` TAD ID (as produced by `chrRangeID`) -> set of
         gene names, or -> gene count when `isnum` is set

    Raises:
    --------
    SystemExit: if `fraction` is outside the range [0, 1].
    AssertionError: if `isPlot` is set without `isnum`.

    Examples:
    --------
    >>> db = TADConserved().getGene("tad.bed", "gene.bed")
    """
    check_file_exists(tads)
    check_file_exists(genes)

    # The previous test `0 > fraction > 1` could never be true, so invalid
    # fractions slipped through to bedtools unchecked.
    if not 0 <= float(fraction) <= 1:
        logging.error('The option `-F` must set in '
                      'range [0, 1], and you set {}'.format(fraction))
        sys.exit(1)

    # Validate before the bedtools run. Raised explicitly (same exception
    # type as the former `assert`) so the check survives `python -O`.
    if isPlot and not isnum:
        raise AssertionError('isnum must specify as True')

    # NOTE(review): the file paths are interpolated into a shell command
    # without quoting; paths containing spaces or shell metacharacters
    # will not work.
    bedtools_cmd = ("bedtools intersect -a {} -b {} -wao -F {} | "
                    "cut -f 1-3,7 ").format(tads, genes, fraction)

    db = OrderedDict()
    # `with` makes sure the pipe is closed once the output is consumed.
    with os.popen(bedtools_cmd) as pipe:
        for line in pipe:
            line_list = line.strip().split()
            if not line_list:
                # Skip blank lines instead of failing on the field lookup.
                continue
            ID = chrRangeID(line_list[:3])
            db.setdefault(ID, set()).add(line_list[3])

    if isnum:
        for ID in db:
            # "." marks a TAD with no gene; it must not count as one.
            db[ID] = len(db[ID] - {"."})

    if isPlot:
        prefix = genes.rsplit('.', 1)[0]
        fig, ax = plt.subplots(figsize=(5, 5))
        # Materialise the dict view: it is not array-like on Python 3.
        sns.distplot(list(db.values()), hist=False, kde=True, ax=ax)
        ax.set_xticks(range(0, 41, 5))
        ax.set_xlim(0, 40)
        ax.set_xlabel('Gene number')
        ax.set_ylabel('Frequence')
        ax.set_title('Gene Number Distribution ({:,})'.format(
            sum(db.values())))
        plt.savefig('{}.gene_num_dist.pdf'.format(prefix), dpi=300)
        logging.debug('Successful to plot gene number distribution '
                      '`{}.gene_num_dist.pdf`.'.format(prefix))

    return db