def main():
    """Print Cufflinks expression for one gene or transcript id.

    The single positional argument is treated as a gene id when it contains
    the substring 'XLOC', and as a transcript id otherwise.

    Without -t, expression is read from <cuff_dir>/genes.fpkm_tracking; a
    transcript id is first mapped to its gene through the lncRNA catalog
    GTF.  With -t, expression is read from <cuff_dir>/isoforms.fpkm_tracking;
    a gene id is expanded to every transcript_id listed for it in the
    catalog GTF, and each transcript is printed under a '<transcript id>:'
    header.

    Exits with a usage error (via parser.error) when no id is given or when
    a transcript id cannot be mapped to a gene.
    """
    usage = 'usage: %prog [options] <gene/transcript id>'
    parser = OptionParser(usage)

    # Fall back to expanduser so an unset HOME does not abort option setup
    # when the user supplies -c/-l explicitly anyway.
    home = os.environ.get('HOME', os.path.expanduser('~'))

    parser.add_option('-c', dest='cuff_dir', default='%s/research/common/data/lncrna' % home, help='Cufflinks output directory with .fpkm_tracking files [Default: %default]')
    parser.add_option('-l', dest='lnc_gtf', default='%s/research/common/data/lncrna/lnc_catalog.gtf' % home, help='lncRNA catalog gtf file [Default: %default]')
    parser.add_option('-t', dest='transcript_expr', default=False, action='store_true', help='Return transcript expression rather than gene [Default: %default]')
    (options, args) = parser.parse_args()

    # Extra positional arguments are still tolerated (and ignored), as before;
    # only the missing-argument case is turned into a proper usage error.
    if len(args) < 1:
        parser.error('Must provide a gene or transcript id')
    query_id = args[0]
    is_gene_id = 'XLOC' in query_id

    if options.transcript_expr:
        cuff = cufflinks.fpkm_tracking('%s/isoforms.fpkm_tracking' % options.cuff_dir)

        if is_gene_id:
            # collect every transcript the catalog lists under this gene
            trans_ids = set()
            with open(options.lnc_gtf) as gtf_in:
                for line in gtf_in:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])
                    if kv['gene_id'] == query_id:
                        trans_ids.add(kv['transcript_id'])
        else:
            trans_ids = [query_id]

        # sorted() makes the report order reproducible between runs
        for trans_id in sorted(trans_ids):
            print('%s:' % trans_id)
            cuff.gene_expr_print(trans_id)

    else:
        cuff = cufflinks.fpkm_tracking('%s/genes.fpkm_tracking' % options.cuff_dir)

        if is_gene_id:
            gene_id = query_id
        else:
            t2g = gff.t2g(options.lnc_gtf)
            try:
                gene_id = t2g[query_id]
            except KeyError:
                parser.error('Transcript id %s not found in %s' % (query_id, options.lnc_gtf))

        cuff.gene_expr_print(gene_id)
def main():
    """Print a per-transcript conservation summary for a GTF/GFF file.

    Every line of the input file contributes its (chrom, start, end)
    interval to a per-chromosome interval tree and its length to the
    transcript named by the line's transcript_id attribute.  The
    conservation files are then handed to process_file (defined elsewhere),
    which is expected to fill lnc_cons[tid] with the scores overlapping
    each transcript -- TODO confirm against process_file.

    One row is printed per transcript with the columns: transcript id,
    gene id, total length, fraction of bases with a score, mean score,
    median score, fraction of scores < 1, fraction of scores > 1.

    Exits with a usage error when the file argument is missing, or when -l
    is given and no 'lnc_catalog.*wigFix*' file exists in the conservation
    directory.
    """
    usage = 'usage: %prog [options] <gff file>'
    parser = OptionParser(usage)

    # Fall back to expanduser so an unset HOME cannot abort option setup.
    home = os.environ.get('HOME', os.path.expanduser('~'))

    parser.add_option('-c', dest='cons_dir', default='%s/research/common/data/phylop' % home, help='Conservation directory [Default: %default]')
    parser.add_option('-l', dest='lncrna', action='store_true', default=False, help='Use the lncRNA specific file to speed things up [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gff file to intersect')
    gff_file = args[0]

    t2g = gff.t2g(gff_file)

    # build interval trees
    lnc_lengths = {}    # transcript id -> summed feature length
    chr_features = {}   # chromosome -> IntervalTree of feature intervals
    interval2lnc = {}   # (chrom, start, end) -> set of transcript ids
    lnc_cons = {}       # transcript id -> list of scores (filled later)
    with open(gff_file) as gff_in:
        for line in gff_in:
            a = line.split('\t')
            chrom = a[0]
            start = int(a[3])
            end = int(a[4])
            tid = gff.gtf_kv(a[8])['transcript_id']
            align = (chrom, start, end)

            lnc_cons[tid] = []
            # coordinates are treated as inclusive, hence the +1
            lnc_lengths[tid] = lnc_lengths.get(tid, 0) + (end - start + 1)
            interval2lnc.setdefault(align, set()).add(tid)

            # create the tree only once per chromosome rather than building
            # a throwaway IntervalTree on every line
            if chrom not in chr_features:
                chr_features[chrom] = IntervalTree()
            chr_features[chrom].insert_interval(Interval(start, end))

    # process overlapping chromosome blocks
    if options.lncrna:
        lnc_wigs = glob.glob('%s/lnc_catalog.*wigFix*' % options.cons_dir)
        if not lnc_wigs:
            parser.error('No lnc_catalog.*wigFix* file found in %s' % options.cons_dir)
        process_file(chr_features, interval2lnc, lnc_cons, lnc_wigs[0])
    else:
        for cons_file in glob.glob('%s/chr*' % options.cons_dir):
            process_file(chr_features, interval2lnc, lnc_cons, cons_file)

    # print table
    for tid in lnc_lengths:
        scores = lnc_cons[tid]
        cons_len = len(scores)
        cons_cov = float(cons_len) / lnc_lengths[tid]
        if cons_len == 0:
            cons_mean = 0.0
            cons_median = 0.0
            cons_pos = 0.0
            cons_neg = 0.0
        else:
            cons_mean = stats.mean(scores)
            cons_median = stats.median(scores)
            cons_pos = sum(1 for c in scores if c > 1) / float(cons_len)
            # NOTE(review): this counts scores below +1, not below -1, so
            # "neg" and "pos" together cover everything except exactly 1.
            # Threshold kept as-is -- confirm whether -1 was intended.
            cons_neg = sum(1 for c in scores if c < 1) / float(cons_len)

        cols = (tid, t2g[tid], lnc_lengths[tid], cons_cov, cons_mean, cons_median, cons_neg, cons_pos)
        print('%-15s %-15s %7d %9.4f %9.4f %9.4f %9.4f %9.4f' % cols)
def _load_gtf_features(gff_file):
    """Index the features of gff_file by transcript and by interval.

    Returns a tuple (lnc_lengths, chr_features, interval2lnc, lnc_cons):
    summed feature length per transcript id, one IntervalTree per
    chromosome, the set of transcript ids sharing each (chrom, start, end)
    interval, and an empty score list per transcript id.
    """
    lnc_lengths = {}
    chr_features = {}
    interval2lnc = {}
    lnc_cons = {}

    with open(gff_file) as gff_in:
        for line in gff_in:
            a = line.split('\t')
            chrom = a[0]
            start = int(a[3])
            end = int(a[4])
            tid = gff.gtf_kv(a[8])['transcript_id']

            lnc_cons[tid] = []
            # coordinates are treated as inclusive, hence the +1
            lnc_lengths[tid] = lnc_lengths.get(tid, 0) + (end - start + 1)
            interval2lnc.setdefault((chrom, start, end), set()).add(tid)

            # only build a tree the first time a chromosome is seen
            if chrom not in chr_features:
                chr_features[chrom] = IntervalTree()
            chr_features[chrom].insert_interval(Interval(start, end))

    return lnc_lengths, chr_features, interval2lnc, lnc_cons


def _conservation_stats(scores, length):
    """Return (coverage, mean, median, neg fraction, pos fraction).

    coverage is len(scores) / length; the remaining values are 0.0 when
    there are no scores.
    """
    cons_len = len(scores)
    cons_cov = float(cons_len) / length
    if cons_len == 0:
        return (cons_cov, 0.0, 0.0, 0.0, 0.0)

    cons_mean = stats.mean(scores)
    cons_median = stats.median(scores)
    cons_pos = sum(1 for c in scores if c > 1) / float(cons_len)
    # NOTE(review): "neg" is every score below +1 rather than below -1.
    # Threshold kept unchanged -- confirm whether -1 was intended.
    cons_neg = sum(1 for c in scores if c < 1) / float(cons_len)
    return (cons_cov, cons_mean, cons_median, cons_neg, cons_pos)


def main():
    """Print a conservation summary row for every transcript in a GTF/GFF.

    The input file is indexed into per-chromosome interval trees, the
    conservation files are passed to process_file (defined elsewhere;
    presumably it appends overlapping scores to lnc_cons -- TODO confirm),
    and one row per transcript is printed: transcript id, gene id, length,
    score coverage, mean, median, fraction of scores < 1, fraction of
    scores > 1.

    Exits with a usage error when the file argument is missing, or when -l
    is given but the conservation directory holds no
    'lnc_catalog.*wigFix*' file.
    """
    usage = 'usage: %prog [options] <gff file>'
    parser = OptionParser(usage)

    # An unset HOME should not crash the script before options are parsed.
    home = os.environ.get('HOME', os.path.expanduser('~'))

    parser.add_option('-c', dest='cons_dir', default='%s/research/common/data/phylop' % home, help='Conservation directory [Default: %default]')
    parser.add_option('-l', dest='lncrna', action='store_true', default=False, help='Use the lncRNA specific file to speed things up [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gff file to intersect')
    gff_file = args[0]

    t2g = gff.t2g(gff_file)

    # build interval trees
    lnc_lengths, chr_features, interval2lnc, lnc_cons = _load_gtf_features(gff_file)

    # process overlapping chromosome blocks
    if options.lncrna:
        pattern = '%s/lnc_catalog.*wigFix*' % options.cons_dir
        matches = glob.glob(pattern)
        if not matches:
            parser.error('No lnc_catalog.*wigFix* file found in %s' % options.cons_dir)
        cons_files = matches[:1]
    else:
        cons_files = glob.glob('%s/chr*' % options.cons_dir)

    for cons_file in cons_files:
        process_file(chr_features, interval2lnc, lnc_cons, cons_file)

    # print table
    for tid in lnc_lengths:
        summary = _conservation_stats(lnc_cons[tid], lnc_lengths[tid])
        cols = (tid, t2g[tid], lnc_lengths[tid]) + summary
        print('%-15s %-15s %7d %9.4f %9.4f %9.4f %9.4f %9.4f' % cols)
def _print_reassigned(line, gene_id, gid_names):
    """Print one GTF line with its gene_id (and gene_name, if known) replaced."""
    a = line.split('\t')
    kv = gff.gtf_kv(a[8])
    kv['gene_id'] = gene_id
    if gene_id in gid_names:
        kv['gene_name'] = gid_names[gene_id]
    a[8] = gff.kv_gtf(kv)
    print('\t'.join(a))


def main():
    """Rewrite the gene_ids of a merged GTF to agree with a reference GTF.

    For every gene in the merged GTF, the reference genes of its
    transcripts are looked up and one of three things happens:

    * no reference gene: the gene's lines are printed unchanged;
    * exactly one reference gene: every line gets that gene_id, plus the
      reference gene_name when one is known;
    * several reference genes: an overlap graph of the gene's transcripts
      is built (make_overlap_graph, defined elsewhere), new transcripts
      adjacent to more than one reference gene are removed (reported on
      stderr), edges joining different reference genes are cut, and
      map_new_gid (defined elsewhere) assigns the new gene_ids.
      Transcripts missing from that mapping are not printed.

    Output goes to stdout.  Exits with a usage error unless exactly two
    file arguments are given.
    """
    usage = 'usage: %prog [options] <ref gtf> <merged gtf>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    ref_gtf, merged_gtf = args

    # get mappings
    ref_t2g = gff.t2g(ref_gtf)
    merged_g2t = gff.g2t(merged_gtf)

    # hash gene_name's by gene_id
    ref_gid_names = {}
    with open(ref_gtf) as ref_in:
        for line in ref_in:
            kv = gff.gtf_kv(line.split('\t')[8])
            if 'gene_name' in kv:
                ref_gid_names[kv['gene_id']] = kv['gene_name']

    # hash merged lines by tid
    merged_tid_lines = {}
    with open(merged_gtf) as merged_in:
        for line in merged_in:
            tid = gff.gtf_kv(line.split('\t')[8])['transcript_id']
            merged_tid_lines.setdefault(tid, []).append(line)

    # initialize orphan gene_id counter; map_new_gid returns the updated value
    orphan_num = 1

    for mgene_id in merged_g2t:
        gene_tids = merged_g2t[mgene_id]

        # collect the reference genes represented in this merged gene
        ref_genes = set(ref_t2g[tid] for tid in gene_tids if tid in ref_t2g)

        if not ref_genes:
            # no known genes: leave it alone
            for tid in gene_tids:
                for line in merged_tid_lines[tid]:
                    # re-terminate each record so a final line without a
                    # newline cannot run into the next record
                    print(line.rstrip('\n'))

        elif len(ref_genes) == 1:
            # one known gene: set gene_id to it
            new_gene_id = next(iter(ref_genes))
            for tid in gene_tids:
                for line in merged_tid_lines[tid]:
                    _print_reassigned(line, new_gene_id, ref_gid_names)

        else:
            # two or more known genes were combined: split them apart

            # compute transcript overlaps and build overlap graph
            tid_overlap_graph = make_overlap_graph(mgene_id, merged_g2t, merged_tid_lines)

            # map each new transcript to the ref gene_id's it overlaps
            tid_ref_genes = {}
            for (tid1, tid2) in tid_overlap_graph.edges():
                if tid1 in ref_t2g and tid2 not in ref_t2g:
                    tid_ref_genes.setdefault(tid2, set()).add(ref_t2g[tid1])
                elif tid1 not in ref_t2g and tid2 in ref_t2g:
                    tid_ref_genes.setdefault(tid1, set()).add(ref_t2g[tid2])

            # remove new transcripts overlapping multiple ref gene_id's
            for tid in tid_ref_genes:
                if len(tid_ref_genes[tid]) > 1:
                    sys.stderr.write('Removing %s\n' % tid)
                    tid_overlap_graph.remove_node(tid)

            # remove edges connecting separate reference genes; iterate over
            # a snapshot because the graph is modified inside the loop and
            # edges() may return a live view rather than a list
            for (tid1, tid2) in list(tid_overlap_graph.edges()):
                if tid1 in ref_t2g and tid2 in ref_t2g and ref_t2g[tid1] != ref_t2g[tid2]:
                    tid_overlap_graph.remove_edge(tid1, tid2)

            # map to new gene_id's; missing means eliminate transcript
            tid_new_gid, orphan_num = map_new_gid(tid_overlap_graph, orphan_num, ref_t2g)

            for tid in gene_tids:
                if tid in tid_new_gid:
                    for line in merged_tid_lines[tid]:
                        _print_reassigned(line, tid_new_gid[tid], ref_gid_names)