def main(input_prefix):
    """Write per-isoform and per-exon summary stats for <input_prefix>.gff.

    Produces two files:
      <input_prefix>.simple_stats.txt : pbid, locus, total exon length, exon count
      <input_prefix>.exon_stats.txt   : pbid, 1-based exon index, exon size, and
                                        the intron size preceding the exon
                                        ('NA' for the first exon)
    Exits with -1 if the input GFF does not exist.
    """
    input_gff = input_prefix + '.gff'
    if not os.path.exists(input_gff):
        print("Looking for input GFF {0} but not found! Abort!".format(
            input_gff))
        sys.exit(-1)

    # use context managers so both handles are flushed/closed even on error
    with open(input_prefix + '.simple_stats.txt', 'w') as f1, \
         open(input_prefix + '.exon_stats.txt', 'w') as f2:
        f1.write("pbid\tlocus\tlength\tnum_exon\n")
        f2.write("pbid\texon_index\texon_size\tintron_size\n")
        for r in collapseGFFReader(input_gff):
            f1.write(r.seqid + '\t')
            # seqid is expected to look like PB.<locus>.<isoform>
            f1.write(r.seqid.split('.')[1] + '\t')
            sum_len = 0
            for i, e in enumerate(r.ref_exons):
                exon_len = e.end - e.start
                sum_len += exon_len
                f2.write("{0}\t{1}\t{2}\t".format(r.seqid, i + 1, exon_len))
                if i == 0:
                    f2.write("NA\n")  # no intron before the first exon
                else:
                    f2.write(str(e.start - r.ref_exons[i - 1].end) + '\n')
            f1.write(str(sum_len) + '\t')
            f1.write(str(len(r.ref_exons)) + '\n')
    print("Output written to: {0},{1}\n".format(f1.name, f2.name))
def calc_ontarget_rate(tree, gene_info, input_fasta, is_gtf, sam_or_gtf, output_filename=None):
    """Compute per-read probe hit info and write a TSV report.

    :param tree: probe interval structure (forwarded to get_probe_hit)
    :param gene_info: gene annotation lookup (forwarded to get_probe_hit)
    :param input_fasta: reads in FASTA/FASTQ, used only to get read lengths
    :param is_gtf: True if sam_or_gtf is a collapsed GFF, False if a SAM file
    :param sam_or_gtf: the alignment/isoform file to evaluate
    :param output_filename: output TSV path; writes to stdout if None
    """
    # renamed from `type` so the builtin is not shadowed
    seq_type = 'fasta' if input_fasta.upper().endswith(('.FA', '.FASTA')) else 'fastq'
    query_len_dict = dict(
        (r.id, len(r.seq)) for r in SeqIO.parse(open(input_fasta), seq_type))

    if output_filename is None:
        f = sys.stdout
    else:
        f = open(output_filename, 'w')

    FIELDS = [
        'read_id', 'read_len', 'num_probe', 'num_base_overlap', 'loci', 'genes'
    ]
    writer = DictWriter(f, FIELDS, delimiter='\t')
    writer.writeheader()

    if is_gtf:
        for r in collapseGFFReader(sam_or_gtf):
            num_probe, base_hit, genes_seen = get_probe_hit(
                tree, gene_info, r, is_gtf)
            writer.writerow({
                'read_id': r.seqid,
                'read_len': 'NA',  # GFF records carry no read length
                'num_probe': num_probe,
                'num_base_overlap': base_hit,
                'loci': "{0}:{1}-{2}".format(r.chr, r.start, r.end),
                'genes': ",".join(genes_seen)
            })
    else:
        reader = BioReaders.GMAPSAMReader(sam_or_gtf, True,
                                          query_len_dict=query_len_dict)
        for r in reader:
            if r.sID == '*':  # skip unmapped records
                continue
            num_probe, base_hit, genes_seen = get_probe_hit(
                tree, gene_info, r, is_gtf)
            writer.writerow({
                'read_id': r.qID,
                'read_len': r.qLen,
                'num_probe': num_probe,
                'num_base_overlap': base_hit,
                'loci': "{0}:{1}-{2}".format(r.sID, r.sStart, r.sEnd),
                'genes': ",".join(genes_seen)
            })

    # BUGFIX: only close the handle we opened; the original unconditionally
    # closed f, which closes sys.stdout when no output_filename is given
    if f is not sys.stdout:
        f.close()
def read_GFF(gff_filename, logf):
    """
    Read a GFF filename and get the gene regions

    :param gff_filename: collapsed GFF whose seqids look like PB.X.Y
    :param logf: open writable file handle that receives WARNING lines
    :return: dict of (PB.X) --> LocusInfo

    Relies on module-level names: rex_pbid (PBID regex), ClusterTree,
    LocusInfo, extra_bp_around_junctions and __padding_before_after__
    (padding constants).
    """
    gff_info = {}  # loci --> LocusInfo
    tmp = {}  # loci PB.X --> list of GFF records for PB.X.Y

    for r in collapseGFFReader(gff_filename):
        m = rex_pbid.match(r.seqid)
        if m is None:
            raise Exception("Expected PBID format PB.X.Y but saw {0}".format(
                r.seqid))
        locus = m.group(1)  # ex: PB.1
        if locus not in tmp:
            # first isoform seen for this locus; regions/isoforms filled later
            tmp[locus] = [r]
            gff_info[locus] = LocusInfo(chrom=r.chr,
                                        strand=r.strand,
                                        regions=None,
                                        isoforms=None)
        else:
            # all isoforms of one locus should share a chromosome; warn if not
            if gff_info[locus].chrom != r.chr:
                logf.write("WARNING: Expected {0} to be on {1} but saw {2}. Could be minimap2 multi-mapping inconsistency for repetitive genes. Check later.\n".format(\
                    r.seqid, gff_info[locus].chrom, r.chr))
            tmp[locus].append(r)

    # now figure out the exonic regions for each gene PB.X
    for locus, records in tmp.items():
        c = ClusterTree(0, 0)
        for r in records:
            for e in r.ref_exons:
                # pad around junctions; starts are clamped at 0
                c.insert(max(0, e.start - extra_bp_around_junctions),
                         e.end + extra_bp_around_junctions, 1)
        regions = [(a, b) for (a, b, junk) in c.getregions()]
        # extra padding before the first region and after the last one
        regions[0] = (max(0, regions[0][0] - __padding_before_after__),
                      regions[0][1])
        regions[-1] = (max(0, regions[-1][0]),
                       regions[-1][1] + __padding_before_after__)
        gff_info[locus] = LocusInfo(chrom=gff_info[locus].chrom,
                                    strand=gff_info[locus].strand,
                                    regions=regions,
                                    isoforms=[r.seqid for r in records])
    return gff_info
def read_annotation_for_junction_info(gff_filename):
    """
    :param gff_filename: annotation GFF
    :return: dict of (chrom, strand, 'donor' or 'acceptor') --> sorted list of
             donor or acceptor sites. all 0-based.
    """
    # defaultdict(set) collects unique junction coordinates per key
    d = defaultdict(set)
    for r in collapseGFFReader(gff_filename):
        if r.strand == '+':
            # + strand: an exon end is a donor, the next exon start an acceptor
            # BUGFIX: py2-only xrange replaced with range
            for i in range(len(r.ref_exons) - 1):
                d[(r.chr, r.strand, 'donor')].add(r.ref_exons[i].end - 1)
                d[(r.chr, r.strand, 'acceptor')].add(r.ref_exons[i + 1].start)
        else:
            # - strand: the donor/acceptor roles are swapped
            for i in range(len(r.ref_exons) - 1):
                d[(r.chr, r.strand, 'acceptor')].add(r.ref_exons[i].end - 1)
                d[(r.chr, r.strand, 'donor')].add(r.ref_exons[i + 1].start)
    # convert each set into the sorted list promised by the docstring
    for k in d:
        d[k] = sorted(d[k])
    return d
def read_annotation_for_junction_info(gff_filename):
    """
    :param gff_filename: annotation GFF
    :return: dict of (chrom, strand, 'donor' or 'acceptor') --> sorted list of
             donor or acceptor sites. all 0-based.
    """
    # defaultdict(set) collects unique junction coordinates per key
    d = defaultdict(set)
    for r in collapseGFFReader(gff_filename):
        if r.strand == '+':
            # + strand: an exon end is a donor, the next exon start an acceptor
            # BUGFIX: py2-only xrange replaced with range
            for i in range(len(r.ref_exons) - 1):
                d[(r.chr, r.strand, 'donor')].add(r.ref_exons[i].end - 1)
                d[(r.chr, r.strand, 'acceptor')].add(r.ref_exons[i + 1].start)
        else:
            # - strand: the donor/acceptor roles are swapped
            for i in range(len(r.ref_exons) - 1):
                d[(r.chr, r.strand, 'acceptor')].add(r.ref_exons[i].end - 1)
                d[(r.chr, r.strand, 'donor')].add(r.ref_exons[i + 1].start)
    # convert each set into the sorted list promised by the docstring
    for k in d:
        d[k] = sorted(d[k])
    return d
def read_GFF(gff_filename, logf):
    """
    Read a GFF filename and get the gene regions

    :param gff_filename: collapsed GFF whose seqids look like PB.X.Y
    :param logf: open writable file handle that receives WARNING lines
    :return: dict of (PB.X) --> LocusInfo
    """
    gff_info = {}  # loci --> LocusInfo
    tmp = {}  # loci PB.X --> list of GFF records for PB.X.Y

    for r in collapseGFFReader(gff_filename):
        m = rex_pbid.match(r.seqid)
        if m is None:
            # BUGFIX: py2-only `raise Exception, "..."` replaced with a call
            raise Exception("Expected PBID format PB.X.Y but saw {0}".format(r.seqid))
        locus = m.group(1)  # ex: PB.1
        if locus not in tmp:
            tmp[locus] = [r]
            gff_info[locus] = LocusInfo(chrom=r.chr, strand=r.strand,
                                        regions=None, isoforms=None)
        else:
            # all isoforms of one locus should share a chromosome; warn if not
            if gff_info[locus].chrom != r.chr:
                logf.write("WARNING: Expected {0} to be on {1} but saw {2}. Could be minimap2 multi-mapping inconsistency for repetitive genes. Check later.\n".format(\
                    r.seqid, gff_info[locus].chrom, r.chr))
            tmp[locus].append(r)

    # now figure out the exonic regions for each gene PB.X
    # BUGFIX: py2-only .iteritems() replaced with .items()
    for locus, records in tmp.items():
        c = ClusterTree(0, 0)
        for r in records:
            for e in r.ref_exons:
                # clamp at 0 so junction padding cannot yield negative coords
                # (consistent with the updated read_GFF variant in this file)
                c.insert(max(0, e.start - extra_bp_around_junctions),
                         e.end + extra_bp_around_junctions, 1)
        regions = [(a, b) for (a, b, junk) in c.getregions()]
        regions[0] = (max(0, regions[0][0] - __padding_before_after__),
                      regions[0][1])
        regions[-1] = (regions[-1][0],
                       regions[-1][1] + __padding_before_after__)
        gff_info[locus] = LocusInfo(chrom=gff_info[locus].chrom,
                                    strand=gff_info[locus].strand,
                                    regions=regions,
                                    isoforms=[r.seqid for r in records])
    return gff_info
def collate_info(fusion_prefix, class_filename, genepred_filename,
                 total_fl_count=None, config_filename=None, genome_dict=None,
                 cds_gff_filename=None, min_fl_count=2):
    """Collate fusion isoform annotation into <fusion_prefix>.annotated.txt.

    Combines the SQANTI3 classification, genePred gene-name-to-ID mapping,
    representative sequences, FL abundance counts, and breakpoint info from
    the GFF into one CSV row per PBfusion.X gene. Rows that look novel, map
    both ends to the same gene, or have fewer than min_fl_count FL reads go
    to <fusion_prefix>.annotated_ignored.txt instead.
    """
    global_info = {}  # extra key=value pairs copied into every output row
    if config_filename is not None:
        print("Reading config file {0}...".format(config_filename), file=sys.stdout)
        for line in open(config_filename):
            k, v = line.strip().split('=')
            global_info[k] = v

    # gene name (genePred col 12) --> ensembl ID (col 1)
    gene_to_id = {}
    for line in open(genepred_filename):
        raw = line.strip().split()
        gene_to_id[raw[11]] = raw[0]

    d = defaultdict(lambda: {})  # PBfusion.X --> isoform index -> sqanti3 record
    orf_dict = {}  # isoform id --> ORF sequence

    # read SQANTI3 classification file
    for r in DictReader(open(class_filename), delimiter='\t'):
        m = fusion_pbid.match(r['isoform'])
        if m is None:
            print("ERROR: fusion pbid must follow format `PBfusion.X.Y`. Abort!",
                  file=sys.stderr)
            sys.exit(-1)
        # BUGFIX: keep the isoform index as int so the later sort is numeric,
        # matching the int-keyed GFF side ('10' < '2' when sorted as strings)
        gene_index, isoform_index = m.group(1), int(m.group(2))
        d[gene_index][isoform_index] = r
        orf_dict[r['isoform']] = r['ORF_seq']

    # get sequences
    seq_dict = dict((r.id.split('|')[0], r.seq)
                    for r in SeqIO.parse(open(fusion_prefix + '.rep.fa'), 'fasta'))

    # get count information
    count_d = defaultdict(lambda: 'NA')
    count_filename = fusion_prefix + '.abundance.txt'
    if os.path.exists(count_filename):
        for r in DictReader(open(count_filename), delimiter='\t'):
            count_d[r['pbid']] = int(r['count_fl'])
        if total_fl_count is None:
            print("Total FL count not given --- using the sum FL count from fusions only instead.",
                  file=sys.stdout)
            total_fl_count = sum(count_d.values())

    # get breakpoint information
    gff_d = defaultdict(lambda: {})  # PBfusion.X --> isoform index -> GFF record
    if cds_gff_filename is None:
        gff_filename = fusion_prefix + '.gff'
    else:
        gff_filename = cds_gff_filename

    for r in collapseGFFReader(gff_filename):
        m = fusion_pbid.match(r.seqid)
        if m is None:
            print("ERROR: fusion pbid in {0} must follow format `PBfusion.X.Y`. Abort!".format(gff_filename),
                  file=sys.stderr)
            sys.exit(-1)
        gene_index, isoform_index = m.group(1), int(m.group(2))
        gff_d[gene_index][isoform_index] = r
        if r.strand not in ('+', '-'):
            print("ERROR: fusion {0} did not specify strand in {1}! Abort!".
                  format(r.seqid, gff_filename))
            sys.exit(-1)

    fields2 = list(global_info.keys()) + FIELDS
    f = open(fusion_prefix + '.annotated.txt', 'w')
    f_bad = open(fusion_prefix + '.annotated_ignored.txt', 'w')
    writer = DictWriter(f, fields2, delimiter=',')
    writer.writeheader()
    writer_bad = DictWriter(f_bad, fields2, delimiter=',')
    writer_bad.writeheader()

    for gene_index, iso_dict in d.items():
        iso_dict = list(iso_dict.items())  # (isoform index, classification record)
        iso_dict.sort(key=lambda x: x[0])
        has_novel = any(r['associated_gene'].startswith('novelGene') or
                        r['associated_gene'] == '' for junk, r in iso_dict)
        pbid = 'PBfusion.' + str(gene_index)
        gff_info = list(gff_d[gene_index].items())
        gff_info.sort(key=lambda x: x[0])
        rec1 = gff_info[0][1]   # first (left) half of the fusion
        rec2 = gff_info[-1][1]  # last (right) half
        left_breakpoint, left_seq, right_breakpoint, right_seq = \
            get_breakpoint_n_seq(rec1, rec2, genome_dict)
        left_exon_count = len(rec1.ref_exons)
        right_exon_count = len(rec2.ref_exons)
        gene1 = iso_dict[0][1]['associated_gene']
        gene2 = iso_dict[-1][1]['associated_gene']
        if cds_gff_filename is not None:
            left_cds_exon_count = len(rec1.cds_exons)
            right_cds_exon_count = len(rec2.cds_exons)
        else:
            left_cds_exon_count = 'NA'
            right_cds_exon_count = 'NA'
        left_orf, right_orf = 'NA', 'NA'
        if orf_dict is not None:
            seqid1 = gff_info[0][1].seqid
            seqid2 = gff_info[-1][1].seqid
            left_orf = orf_dict[seqid1]
            right_orf = orf_dict[seqid2]
        info = {
            'UniqueID': pbid,
            'FusionName': "--".join([_r['associated_gene'] for (_index, _r) in iso_dict]),
            'LeftGeneName': gene1,
            'LeftGeneID': gene_to_id[gene1] if gene1 in gene_to_id else 'NA',
            'LeftBreakpoint': left_breakpoint,
            'LeftFlankingSequence': left_seq,
            'RightGeneName': gene2,
            'RightGeneID': gene_to_id[gene2] if gene2 in gene_to_id else 'NA',
            'RightBreakpoint': right_breakpoint,
            'RightFlankingSequence': right_seq,
            'JunctionSupport': 'NA',
            'SpanningReads': count_d[pbid],
            # BUGFIX: compare strings with != rather than identity (`is not`)
            'ReadCountScore': count_d[pbid] * (10**6) / total_fl_count if count_d[pbid] != 'NA' else 'NA',
            'Sequence': seq_dict[pbid],
            'LeftORF': left_orf,
            'RightORF': right_orf,
            'LeftExonCount': left_exon_count,
            'RightExonCount': right_exon_count,
            'LeftCDSExonCount': left_cds_exon_count,
            'RightCDSExonCount': right_cds_exon_count
        }
        info.update(global_info)
        if has_novel or \
           gene1 == gene2 or \
           (info['SpanningReads'] != 'NA' and info['SpanningReads'] < min_fl_count):
            writer_bad.writerow(info)
        else:
            writer.writerow(info)
    f.close()
    f_bad.close()  # BUGFIX: f_bad was never closed
def make_file_for_subsample(input_prefix, output_prefix, demux_file=None,
                            matchAnnot_parsed=None, sqanti_class=None,
                            include_single_exons=False):
    """
    Two files must exist: .abundance.txt and .rep.fq so we can make the length
    """
    count_filename = input_prefix + '.abundance.txt'
    fq_filename = input_prefix + '.rep.fq'

    # BUGFIX: always define good_ids (as a set, for O(1) lookup). Previously it
    # was only created when include_single_exons was False, and the later
    # `pbid in good_ids` was evaluated before the include_single_exons
    # short-circuit, raising NameError when include_single_exons=True.
    good_ids = set()
    if not include_single_exons:
        from cupcake.io.GFF import collapseGFFReader
        gff_filename = input_prefix + '.gff'
        print("Reading {0} to exclude single exons...".format(gff_filename),
              file=sys.stderr)
        for r in collapseGFFReader(gff_filename):
            if len(r.ref_exons) >= 2:
                good_ids.add(r.seqid)

    if demux_file is None and not os.path.exists(count_filename):
        print("Cannot find {0}. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(fq_filename):
        print("Cannot find {0}. Abort!".format(fq_filename), file=sys.stderr)
        sys.exit(-1)
    if matchAnnot_parsed is not None and not os.path.exists(matchAnnot_parsed):
        print("Cannot find {0}. Abort!".format(matchAnnot_parsed), file=sys.stderr)
        sys.exit(-1)
    if sqanti_class is not None and not os.path.exists(sqanti_class):
        print("Cannot find {0}. Abort!".format(sqanti_class), file=sys.stderr)
        sys.exit(-1)

    if matchAnnot_parsed is not None:
        match_dict = dict(
            (r['pbid'], r)
            for r in DictReader(open(matchAnnot_parsed), delimiter='\t'))
        # matchAnnot has no structural category column; reuse the score column
        for k in match_dict:
            match_dict[k]['category'] = match_dict[k]['score']
    elif sqanti_class is not None:
        print("Reading {0} to get gene/isoform assignment...".format(
            sqanti_class), file=sys.stderr)
        match_dict = {}
        for r in DictReader(open(sqanti_class), delimiter='\t'):
            if r['associated_transcript'] == 'novel':
                refisoform = 'novel_' + r['isoform']
            else:
                refisoform = r['associated_transcript']
            match_dict[r['isoform']] = {
                'refgene': r['associated_gene'],
                'refisoform': refisoform,
                'category': r['structural_category']
            }
    else:
        match_dict = None

    seqlen_dict = dict((r.id.split('|')[0], len(r.seq))
                       for r in SeqIO.parse(open(fq_filename), 'fastq'))

    to_write = {}
    if demux_file is None:
        to_write['all'] = {}
        f = open(count_filename)
        # skip over the leading '#' comment lines of the abundance file
        while True:
            cur = f.tell()
            if not f.readline().startswith('#'):
                f.seek(cur)
                break
        for r in DictReader(f, delimiter='\t'):
            # BUGFIX: test include_single_exons first so good_ids is only
            # consulted when it was actually populated
            if include_single_exons or r['pbid'] in good_ids:
                to_write['all'][r['pbid']] = r['count_fl']
    else:
        d, samples = read_demux_fl_count_file(demux_file)
        for s in samples:
            to_write[s] = {}
        for pbid, d2 in d.items():
            for s in samples:
                if include_single_exons or pbid in good_ids:
                    to_write[s][pbid] = d2[s]

    for sample in to_write:
        h = open(output_prefix + '.' + sample + '.txt', 'w')
        if matchAnnot_parsed is None and sqanti_class is None:
            h.write("pbid\tpbgene\tlength\tfl_count\n")
        else:
            h.write(
                "pbid\tpbgene\tlength\trefisoform\trefgene\tcategory\tfl_count\n"
            )
        for pbid in to_write[sample]:
            if matchAnnot_parsed is not None or sqanti_class is not None:
                if pbid not in match_dict:
                    print(
                        "Ignoring {0} because not on annotation (SQANTI/MatchAnnot) file."
                        .format(pbid), file=sys.stdout)
                    continue
                m = match_dict[pbid]
                h.write("{0}\t{1}\t{2}\t".format(pbid, pbid.split('.')[1],
                                                 seqlen_dict[pbid]))
                h.write("{0}\t{1}\t{2}\t".format(m['refisoform'], m['refgene'],
                                                 m['category']))
            else:
                h.write("{0}\t{1}\t{2}\t".format(pbid, pbid.split('.')[1],
                                                 seqlen_dict[pbid]))
            h.write("{0}\n".format(to_write[sample][pbid]))
        h.close()
        print("Output written to {0}.".format(h.name), file=sys.stderr)
def sqanti_filter_lite(args):
    """Apply the 'lite' SQANTI3 filter and write all matching outputs.

    Reads the classification file, decides keep/filter per isoform
    (intrapriming, mono-exonic, RT switching, low-coverage non-canonical),
    then writes: reasons CSV, filtered FASTA/FASTQ, classification,
    junctions, GTF, SAM and FAA subsets as requested via args, and finally
    kicks off the SQANTI3 report R script.
    """
    fafq_type = 'fasta'
    with open(args.isoforms) as h:
        # FASTQ records start with '@'; FASTA with '>'
        if h.readline().startswith('@'):
            fafq_type = 'fastq'

    prefix = args.sqanti_class[:args.sqanti_class.rfind('.')]

    fcsv = open(prefix + '.filtered_lite_reasons.txt', 'w')
    fcsv.write("# classification: {0}\n".format(args.sqanti_class))
    fcsv.write("# isoform: {0}\n".format(args.isoforms))
    fcsv.write("# intrapriming cutoff: {0}\n".format(args.intrapriming))
    fcsv.write("# min_cov cutoff: {0}\n".format(args.min_cov))
    fcsv.write("filtered_isoform,reason\n")

    # NOTE: this file is created even when args.skipFaFq is set (original behavior)
    fout = open(prefix + '.filtered_lite.' + fafq_type, 'w')

    seqids_to_keep = set()
    total_count = 0
    for r in DictReader(open(args.sqanti_class), delimiter='\t'):
        total_count += 1
        filter_flag, filter_msg = False, ""
        percA = float(r['perc_A_downstream_TTS']) / 100
        assert 0 <= percA <= 1
        # length of the leading run of 'A's downstream of the TTS
        runA = 0
        while runA < len(r['seq_A_downstream_TTS']):
            if r['seq_A_downstream_TTS'][runA] != 'A':
                break
            runA += 1
        min_cov = float(r['min_cov']) if r['min_cov'] != 'NA' else None
        num_exon = int(r['exons'])
        is_RTS = r['RTS_stage'] == 'TRUE'
        is_canonical = r['all_canonical'] == 'canonical'
        is_monoexonic = (num_exon == 1)
        cat = CATEGORY_DICT[r['structural_category']]

        # BUGFIX: the 'NA' guard must test diff_to_gene_TTS -- the same field
        # converted with int() in the next clause. The original tested
        # diff_to_gene_TSS, which let int('NA') raise ValueError whenever
        # TTS was 'NA' but TSS was not.
        potential_intrapriming = (percA >= args.intrapriming or runA >= args.runAlength) and \
            r['polyA_motif'] == 'NA' and \
            (r['diff_to_gene_TTS'] == 'NA' or
             abs(int(r['diff_to_gene_TTS'])) > args.max_dist_to_known_end)

        if cat in ['FSM']:
            # FSMs are only filtered for intrapriming / mono-exon
            if potential_intrapriming:
                filter_flag, filter_msg = True, "IntraPriming"
            elif args.filter_mono_exonic and is_monoexonic:
                filter_flag, filter_msg = True, "Mono-Exonic"
        else:
            if potential_intrapriming:
                filter_flag, filter_msg = True, "IntraPriming"
            elif args.filter_mono_exonic and is_monoexonic:
                filter_flag, filter_msg = True, "Mono-Exonic"
            elif is_RTS:
                filter_flag, filter_msg = True, "RTSwitching"
            # (min_cov is None or min_cov < cutoff) -- simplified from the
            # redundant `min_cov is None or (min_cov is not None and ...)`
            elif (not is_canonical) and (min_cov is None or min_cov < args.min_cov):
                filter_flag, filter_msg = True, "LowCoverage/Non-Canonical"

        if not filter_flag:
            seqids_to_keep.add(r['isoform'])
        else:
            fcsv.write("{0},{1}\n".format(r['isoform'], filter_msg))
    fcsv.close()  # BUGFIX: reasons file was never closed

    print("{0} isoforms read from {1}. {2} to be kept.".format(
        total_count, args.sqanti_class, len(seqids_to_keep)), file=sys.stdout)

    if not args.skipFaFq:
        for r in SeqIO.parse(open(args.isoforms), fafq_type):
            if r.id in seqids_to_keep:
                SeqIO.write(r, fout, fafq_type)
        fout.close()
        print("Output written to: {0}".format(fout.name), file=sys.stdout)

    # write out a new .classification.txt, .junctions.txt
    outputClassPath = prefix + '.filtered_lite_classification.txt'
    with open(outputClassPath, 'w') as f:
        reader = DictReader(open(args.sqanti_class), delimiter='\t')
        writer = DictWriter(f, reader.fieldnames, delimiter='\t')
        writer.writeheader()
        for r in reader:
            if r['isoform'] in seqids_to_keep:
                writer.writerow(r)
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    if not args.skipJunction:
        outputJuncPath = prefix + '.filtered_lite_junctions.txt'
        with open(outputJuncPath, 'w') as f:
            reader = DictReader(open(
                args.sqanti_class.replace('_classification', '_junctions')),
                delimiter='\t')
            writer = DictWriter(f, reader.fieldnames, delimiter='\t')
            writer.writeheader()
            for r in reader:
                if r['isoform'] in seqids_to_keep:
                    writer.writerow(r)
            print("Output written to: {0}".format(f.name), file=sys.stdout)

    if not args.skipGTF:
        outputGTF = prefix + '.filtered_lite.gtf'
        with open(outputGTF, 'w') as f:
            for r in collapseGFFReader(args.gtf_file):
                if r.seqid in seqids_to_keep:
                    write_collapseGFF_format(f, r)
            print("Output written to: {0}".format(f.name), file=sys.stdout)

    if args.sam is not None:
        outputSam = prefix + '.filtered_lite.sam'
        with open(outputSam, 'w') as f:
            reader = GMAPSAMReader(args.sam, True)
            f.write(reader.header)
            for r in reader:
                if r.qID in seqids_to_keep:
                    f.write(r.record_line + '\n')
            print("Output written to: {0}".format(f.name), file=sys.stdout)

    if args.faa is not None:
        outputFAA = prefix + '.filtered_lite.faa'
        with open(outputFAA, 'w') as f:
            for r in SeqIO.parse(open(args.faa), 'fasta'):
                if r.id in seqids_to_keep:
                    f.write(">{0}\n{1}\n".format(r.description, r.seq))
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    print("**** Generating SQANTI3 report....", file=sys.stderr)
    # NOTE(review): if args.skipJunction is set, outputJuncPath is undefined
    # here (same in the original) -- confirm the two flags are never combined
    cmd = RSCRIPTPATH + " {d}/{f} {c} {j} {p} {d}".format(d=utilitiesPath,
                                                          f=RSCRIPT_REPORT,
                                                          c=outputClassPath,
                                                          j=outputJuncPath,
                                                          p="mock")
    # check_call raises CalledProcessError on non-zero exit; this branch only
    # reports failure if that behavior is changed upstream
    if subprocess.check_call(cmd, shell=True) != 0:
        print("ERROR running command: {0}".format(cmd), file=sys.stderr)
        sys.exit(-1)
def collate_info(fusion_prefix, class_filename, gtf_filename,
                 total_fl_count=None, config_filename=None, genome_dict=None,
                 cds_gff_filename=None, min_fl_count=2,
                 min_breakpoint_dist_kb=10, include_Mt_genes=False):
    """Collate fusion isoform info into <fusion_prefix>.annotated.txt.

    Combines SQANTI3 classification, the original GTF (gene name -> ENSG
    mapping), representative sequences, FL abundance counts and breakpoint
    info from the GFF. Each PBfusion.X gene gets a 'Comments' field of
    'PASS' or 'FAIL:<reason>'; FAIL rows are written to
    <fusion_prefix>.annotated_ignored.txt instead.
    """
    global_info = {}  # extra key=value pairs copied into every output row
    if config_filename is not None:
        print("Reading config file {0}...".format(config_filename), file=sys.stdout)
        for line in open(config_filename):
            k, v = line.strip().split('=')
            global_info[k] = v

    # in order to get gene name to ensembl gene ID (ENSG), we need the original GTF that was fed to SQANTI3
    gene_to_id = defaultdict(lambda: set())  # gene name --> set of ensembl IDs
    print(f"Reading {gtf_filename} to extract gene name to ENSG ID mapping...")
    gtf_info = GTF(gtf_filename)
    for v in gtf_info.transcript_info.values():
        gene_to_id[v['gname']].add(v['gid'])
    for k in gene_to_id:
        # collapse multiple IDs per gene name into a single '_'-joined string
        gene_to_id[k] = "_".join(gene_to_id[k])

    d = defaultdict(lambda: {})  # PBfusion.X --> isoform index -> sqanti3 record
    orf_dict = {}  # isoform id --> ORF sequence

    # read SQANTI3 classification file
    for r in DictReader(open(class_filename), delimiter='\t'):
        m = fusion_pbid.match(r['isoform'])
        if m is None:
            print("ERROR: fusion pbid must follow format `PBfusion.X.Y`. Abort!",
                  file=sys.stderr)
            sys.exit(-1)
        # BUGFIX: keep the isoform index numeric so the later sort agrees with
        # the int-keyed GFF side ('10' < '2' when sorted as strings)
        gene_index, isoform_index = m.group(1), int(m.group(2))
        d[gene_index][isoform_index] = r
        orf_dict[r['isoform']] = r['ORF_seq']

    # get sequences
    seq_dict = dict((r.id.split('|')[0], r.seq)
                    for r in SeqIO.parse(open(fusion_prefix + '.rep.fa'), 'fasta'))

    # get count information
    count_d = defaultdict(lambda: 'NA')
    count_filename = fusion_prefix + '.abundance.txt'
    if os.path.exists(count_filename):
        for r in DictReader(open(count_filename), delimiter='\t'):
            count_d[r['pbid']] = int(r['count_fl'])
        if total_fl_count is None:
            print("Total FL count not given --- using the sum FL count from fusions only instead.",
                  file=sys.stdout)
            total_fl_count = sum(count_d.values())

    # get breakpoint information
    gff_d = defaultdict(lambda: {})  # PBfusion.X --> isoform index -> GFF record
    if cds_gff_filename is None:
        gff_filename = fusion_prefix + '.gff'
    else:
        gff_filename = cds_gff_filename

    for r in collapseGFFReader(gff_filename):
        m = fusion_pbid.match(r.seqid)
        if m is None:
            print("ERROR: fusion pbid in {0} must follow format `PBfusion.X.Y`. Abort!".format(gff_filename),
                  file=sys.stderr)
            sys.exit(-1)
        gene_index, isoform_index = m.group(1), int(m.group(2))
        gff_d[gene_index][isoform_index] = r
        if r.strand not in ('+', '-'):
            print("ERROR: fusion {0} did not specify strand in {1}! Abort!".format(r.seqid, gff_filename))
            sys.exit(-1)

    fields2 = list(global_info.keys()) + FIELDS
    f = open(fusion_prefix + '.annotated.txt', 'w')
    f_bad = open(fusion_prefix + '.annotated_ignored.txt', 'w')
    writer = DictWriter(f, fields2, delimiter=',')
    writer.writeheader()
    writer_bad = DictWriter(f_bad, fields2, delimiter=',')
    writer_bad.writeheader()

    for gene_index, iso_dict in d.items():
        iso_dict = list(iso_dict.items())  # (isoform index, classification record)
        iso_dict.sort(key=lambda x: x[0])
        has_novel = any(r['associated_gene'].startswith('novelGene') or
                        r['associated_gene'] == '' for junk, r in iso_dict)
        pbid = 'PBfusion.' + str(gene_index)
        gff_info = list(gff_d[gene_index].items())
        gff_info.sort(key=lambda x: x[0])
        rec1 = gff_info[0][1]   # first (left) half of the fusion
        rec2 = gff_info[-1][1]  # last (right) half
        left_breakpoint, left_seq, right_breakpoint, right_seq = \
            get_breakpoint_n_seq(rec1, rec2, genome_dict)
        left_exon_count = len(rec1.ref_exons)
        right_exon_count = len(rec2.ref_exons)
        gene1 = iso_dict[0][1]['associated_gene']
        gene2 = iso_dict[-1][1]['associated_gene']
        if cds_gff_filename is not None:
            left_cds_exon_count = len(rec1.cds_exons)
            right_cds_exon_count = len(rec2.cds_exons)
        else:
            left_cds_exon_count = 'NA'
            right_cds_exon_count = 'NA'
        left_orf, right_orf = 'NA', 'NA'
        if orf_dict is not None:
            seqid1 = gff_info[0][1].seqid
            seqid2 = gff_info[-1][1].seqid
            left_orf = orf_dict[seqid1]
            right_orf = orf_dict[seqid2]
        info = {'UniqueID': pbid,
                'FusionName': "--".join([_r['associated_gene'] for (_index, _r) in iso_dict]),
                'LeftGeneName': gene1,
                'LeftGeneID': gene_to_id[gene1] if gene1 in gene_to_id else 'NA',
                'LeftBreakpoint': left_breakpoint,
                'LeftFlankingSequence': left_seq,
                'RightGeneName': gene2,
                'RightGeneID': gene_to_id[gene2] if gene2 in gene_to_id else 'NA',
                'RightBreakpoint': right_breakpoint,
                'RightFlankingSequence': right_seq,
                'JunctionSupport': 'NA',
                'SpanningReads': count_d[pbid],
                # BUGFIX: compare strings with != rather than identity (`is not`)
                'ReadCountScore': count_d[pbid] * (10**6) / total_fl_count if count_d[pbid] != 'NA' else 'NA',
                'Sequence': seq_dict[pbid],
                'LeftORF': left_orf,
                'RightORF': right_orf,
                'LeftExonCount': left_exon_count,
                'RightExonCount': right_exon_count,
                'LeftCDSExonCount': left_cds_exon_count,
                'RightCDSExonCount': right_cds_exon_count,
                'Comments': 'PASS'}
        info.update(global_info)

        # breakpoints are formatted as <chrom>:<pos>:<strand>
        left_chr, left_break, left_strand = left_breakpoint.split(':')
        right_chr, right_break, right_strand = right_breakpoint.split(':')
        if has_novel:
            info['Comments'] = 'FAIL:NovelGene'
        elif gene1 == gene2:
            info['Comments'] = 'FAIL:SameGene'
        elif (info['SpanningReads'] != 'NA' and info['SpanningReads'] < min_fl_count):
            info['Comments'] = 'FAIL:TooFewFLReads'
        elif (not include_Mt_genes and
              (gene1.startswith('MT-') or gene2.startswith('MT-'))):
            info['Comments'] = 'FAIL:MtGenes'
        elif (left_chr == right_chr and
              abs(int(left_break) - int(right_break)) / 1000 <= min_breakpoint_dist_kb):
            info['Comments'] = 'FAIL:BreakpointTooClose'

        if info['Comments'].startswith('FAIL:'):
            writer_bad.writerow(info)
        else:
            writer.writerow(info)
    f.close()
    f_bad.close()  # BUGFIX: f_bad was never closed
def make_file_for_subsample(input_prefix, output_filename, matchAnnot_parsed=None,
                            sqanti_class=None, include_single_exons=False):
    """
    Two files must exist: .abundance.txt and .rep.fq so we can make the length
    """
    count_filename = input_prefix + '.abundance.txt'
    fq_filename = input_prefix + '.rep.fq'

    good_ids = set()  # multi-exon pbids; set gives O(1) membership tests
    if not include_single_exons:
        from cupcake.io.GFF import collapseGFFReader
        gff_filename = input_prefix + '.gff'
        # BUGFIX: py2-only `print >> sys.stderr` replaced with print(..., file=)
        print("Reading {0} to exclude single exons...".format(gff_filename),
              file=sys.stderr)
        for r in collapseGFFReader(gff_filename):
            if len(r.ref_exons) >= 2:
                good_ids.add(r.seqid)

    if not os.path.exists(count_filename):
        print("Cannot find {0}. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(fq_filename):
        print("Cannot find {0}. Abort!".format(fq_filename), file=sys.stderr)
        sys.exit(-1)
    if matchAnnot_parsed is not None and not os.path.exists(matchAnnot_parsed):
        print("Cannot find {0}. Abort!".format(matchAnnot_parsed), file=sys.stderr)
        sys.exit(-1)
    if sqanti_class is not None and not os.path.exists(sqanti_class):
        print("Cannot find {0}. Abort!".format(sqanti_class), file=sys.stderr)
        sys.exit(-1)

    if matchAnnot_parsed is not None:
        match_dict = dict(
            (r['pbid'], r)
            for r in DictReader(open(matchAnnot_parsed), delimiter='\t'))
    elif sqanti_class is not None:
        print("Reading {0} to get gene/isoform assignment...".format(
            sqanti_class), file=sys.stderr)
        match_dict = {}
        for r in DictReader(open(sqanti_class), delimiter='\t'):
            if r['associated_transcript'] == 'novel':
                refisoform = 'novel_' + r['isoform']
            else:
                refisoform = r['associated_transcript']
            match_dict[r['isoform']] = {'refgene': r['associated_gene'],
                                        'refisoform': refisoform}
    else:
        match_dict = None

    seqlen_dict = dict((r.id.split('|')[0], len(r.seq))
                       for r in SeqIO.parse(open(fq_filename), 'fastq'))

    h = open(output_filename, 'w')
    if matchAnnot_parsed is None and sqanti_class is None:
        h.write("pbid\tpbgene\tlength\tfl_count\n")
    else:
        h.write("pbid\tpbgene\tlength\trefisoform\trefgene\tfl_count\n")

    f = open(count_filename)
    # skip over the leading '#' comment lines of the abundance file
    while True:
        cur = f.tell()
        if not f.readline().startswith('#'):
            f.seek(cur)
            break
    for r in DictReader(f, delimiter='\t'):
        if not include_single_exons and r['pbid'] not in good_ids:
            print("Exclude {0} because single exon.".format(r['pbid']),
                  file=sys.stderr)
            continue
        # BUGFIX: check the annotation lookup BEFORE writing the row; the
        # original wrote pbid/pbgene/length first and then did an unguarded
        # match_dict[...] lookup, so a missing pbid raised KeyError and left a
        # partially-written line (the sibling version in this file guards this)
        if matchAnnot_parsed is not None or sqanti_class is not None:
            if r['pbid'] not in match_dict:
                print("Ignoring {0} because not on annotation (SQANTI/MatchAnnot) file.".format(r['pbid']),
                      file=sys.stdout)
                continue
            m = match_dict[r['pbid']]
            h.write("{0}\t{1}\t{2}\t".format(r['pbid'], r['pbid'].split('.')[1],
                                             seqlen_dict[r['pbid']]))
            h.write("{0}\t{1}\t".format(m['refisoform'], m['refgene']))
        else:
            h.write("{0}\t{1}\t{2}\t".format(r['pbid'], r['pbid'].split('.')[1],
                                             seqlen_dict[r['pbid']]))
        h.write("{0}\n".format(r['count_fl']))
    h.close()
    print("Output written to {0}.".format(output_filename), file=sys.stderr)
def main(corrected_csv, cluster_info, output_prefix, fasta_file=None, gff_file=None, faa_file=None):
    """Deduplicate BC/UMI-corrected records and emit per-cluster outputs.

    Writes <output_prefix>.csv with one row per unique (BC, UMI, gene) tag
    (annotated with cluster and duplicate count), plus optional master and
    per-cluster FASTA / FAA / GFF files restricted to the surviving pbids.
    """
    # read corrected CSV
    reader = DictReader(open(corrected_csv), delimiter='\t')
    for k in CORRECTED_CSV_FILELDS:
        if k not in reader.fieldnames:
            print("The following fields must exist in {0}!\n{1}".format(
                corrected_csv, "\n".join(CORRECTED_CSV_FILELDS)))
            sys.exit(-1)

    per_unique = {}  # tag -> record
    per_unique_count = Counter()  # tag -> number of duplicates
    # pbid --> {'gene', 'transcript', 'clusters': set}. A plain dict is used on
    # purpose: entries are always created explicitly below, and the old
    # defaultdict default supplied a *list* for 'clusters' although the code
    # needs a set (.add) -- a latent trap if the default ever fired.
    per_pbid = {}

    for r in reader:
        tag = "{bc}-{umi}-{gene}".format(bc=r['BC_ed'], umi=r['UMI_ed'],
                                         gene=r['gene'])
        per_unique[tag] = r
        per_unique_count[tag] += 1

    # now link barcode to cell type, also PCR dup counts
    for tag in per_unique:
        c = cluster_info[per_unique[tag]['BC_ed']]
        rec = per_unique[tag]
        rec['cluster'] = c
        rec['num_dups'] = per_unique_count[tag]
        pbid = rec['pbid']
        if pbid in per_pbid:
            per_pbid[pbid]['clusters'].add(c)
        else:
            per_pbid[pbid] = {'gene': rec['gene'],
                              'transcript': rec['transcript'],
                              'clusters': set([c])}

    # write out de-dup CSV file
    with open(output_prefix + '.csv', 'w') as f:
        writer = DictWriter(f, CORRECTED_CSV_FILELDS + ['cluster', 'num_dups'],
                            delimiter='\t', extrasaction='ignore')
        writer.writeheader()
        for k in sorted(per_unique.keys()):
            writer.writerow(per_unique[k])

    if fasta_file is not None:
        f_d = {}  # cluster --> file handle
        # write pbid master file
        with open(output_prefix + '.fasta', 'w') as f:
            for r in SeqIO.parse(open(fasta_file), 'fasta'):
                if r.id in per_pbid:
                    newid = "{pbid}|{gene}|{transcript}|{clusters}".format(
                        pbid=r.id,
                        gene=per_pbid[r.id]['gene'],
                        transcript=per_pbid[r.id]['transcript'],
                        clusters=";".join(per_pbid[r.id]['clusters']))
                    f.write(">{0}\n{1}\n".format(newid, r.seq))
                    for c in per_pbid[r.id]['clusters']:
                        if c not in f_d:
                            f_d[c] = open("{o}.{c}.fasta".format(
                                o=output_prefix, c=c), 'w')
                        f_d[c].write(">{0}\n{1}\n".format(newid, r.seq))
        # BUGFIX: close the per-cluster fasta handles; previously f_d was
        # rebound by the faa/gff sections before these were ever closed
        for handle in f_d.values():
            handle.close()

    if faa_file is not None:
        f_d = {}  # cluster --> file handle
        # write pbid master file
        with open(output_prefix + '.faa', 'w') as f:
            for r in SeqIO.parse(open(faa_file), 'fasta'):
                if r.id in per_pbid:
                    newid = "{pbid}|{gene}|{transcript}|{clusters}".format(
                        pbid=r.id,
                        gene=per_pbid[r.id]['gene'],
                        transcript=per_pbid[r.id]['transcript'],
                        clusters=";".join(per_pbid[r.id]['clusters']))
                    f.write(">{0}\n{1}\n".format(newid, r.seq))
                    for c in per_pbid[r.id]['clusters']:
                        if c not in f_d:
                            f_d[c] = open("{o}.{c}.faa".format(
                                o=output_prefix, c=c), 'w')
                        f_d[c].write(">{0}\n{1}\n".format(newid, r.seq))
        for handle in f_d.values():
            handle.close()

    if gff_file is not None:
        f_d = {}  # cluster --> file handle
        # write pbid master file
        with open(output_prefix + '.gff', 'w') as f:
            for r in collapseGFFReader(gff_file):
                if r.seqid in per_pbid:
                    # NOTE(review): newid is computed but write_collapseGFF_format
                    # writes r unchanged, so the GFF output keeps the original
                    # seqid (unlike the fasta/faa outputs) -- confirm intended
                    newid = "{pbid}|{gene}|{transcript}|{clusters}".format(
                        pbid=r.seqid,
                        gene=per_pbid[r.seqid]['gene'],
                        transcript=per_pbid[r.seqid]['transcript'],
                        clusters=";".join(per_pbid[r.seqid]['clusters']))
                    write_collapseGFF_format(f, r)
                    for c in per_pbid[r.seqid]['clusters']:
                        if c not in f_d:
                            f_d[c] = open("{o}.{c}.gff".format(
                                o=output_prefix, c=c), 'w')
                        write_collapseGFF_format(f_d[c], r)
        for handle in f_d.values():
            handle.close()
def make_file_for_subsample(input_prefix, output_filename, matchAnnot_parsed=None,
                            sqanti_class=None, include_single_exons=False):
    """
    Two files must exist: .abundance.txt and .rep.fq so we can make the length
    """
    count_filename = input_prefix + '.abundance.txt'
    fq_filename = input_prefix + '.rep.fq'

    good_ids = set()  # multi-exon pbids; set gives O(1) membership tests
    if not include_single_exons:
        from cupcake.io.GFF import collapseGFFReader
        gff_filename = input_prefix + '.gff'
        # BUGFIX: py2-only `print >> sys.stderr` replaced with print(..., file=)
        print("Reading {0} to exclude single exons...".format(gff_filename),
              file=sys.stderr)
        for r in collapseGFFReader(gff_filename):
            if len(r.ref_exons) >= 2:
                good_ids.add(r.seqid)

    if not os.path.exists(count_filename):
        print("Cannot find {0}. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(fq_filename):
        print("Cannot find {0}. Abort!".format(fq_filename), file=sys.stderr)
        sys.exit(-1)
    if matchAnnot_parsed is not None and not os.path.exists(matchAnnot_parsed):
        print("Cannot find {0}. Abort!".format(matchAnnot_parsed), file=sys.stderr)
        sys.exit(-1)
    if sqanti_class is not None and not os.path.exists(sqanti_class):
        print("Cannot find {0}. Abort!".format(sqanti_class), file=sys.stderr)
        sys.exit(-1)

    if matchAnnot_parsed is not None:
        match_dict = dict(
            (r['pbid'], r)
            for r in DictReader(open(matchAnnot_parsed), delimiter='\t'))
    elif sqanti_class is not None:
        print("Reading {0} to get gene/isoform assignment...".format(
            sqanti_class), file=sys.stderr)
        match_dict = {}
        for r in DictReader(open(sqanti_class), delimiter='\t'):
            if r['associated_transcript'] == 'novel':
                refisoform = 'novel_' + r['isoform']
            else:
                refisoform = r['associated_transcript']
            match_dict[r['isoform']] = {'refgene': r['associated_gene'],
                                        'refisoform': refisoform}
    else:
        match_dict = None

    seqlen_dict = dict((r.id.split('|')[0], len(r.seq))
                       for r in SeqIO.parse(open(fq_filename), 'fastq'))

    h = open(output_filename, 'w')
    if matchAnnot_parsed is None and sqanti_class is None:
        h.write("pbid\tpbgene\tlength\tfl_count\n")
    else:
        h.write("pbid\tpbgene\tlength\trefisoform\trefgene\tfl_count\n")

    f = open(count_filename)
    # skip over the leading '#' comment lines of the abundance file
    while True:
        cur = f.tell()
        if not f.readline().startswith('#'):
            f.seek(cur)
            break
    for r in DictReader(f, delimiter='\t'):
        if not include_single_exons and r['pbid'] not in good_ids:
            print("Exclude {0} because single exon.".format(r['pbid']),
                  file=sys.stderr)
            continue
        if matchAnnot_parsed is not None or sqanti_class is not None:
            # pbids absent from the annotation file are skipped, not fatal
            if r['pbid'] not in match_dict:
                print("Ignoring {0} because not on annotation (SQANTI/MatchAnnot) file.".format(r['pbid']),
                      file=sys.stdout)
                continue
            m = match_dict[r['pbid']]
            h.write("{0}\t{1}\t{2}\t".format(r['pbid'], r['pbid'].split('.')[1],
                                             seqlen_dict[r['pbid']]))
            h.write("{0}\t{1}\t".format(m['refisoform'], m['refgene']))
        else:
            h.write("{0}\t{1}\t{2}\t".format(r['pbid'], r['pbid'].split('.')[1],
                                             seqlen_dict[r['pbid']]))
        h.write("{0}\n".format(r['count_fl']))
    h.close()
    print("Output written to {0}.".format(output_filename), file=sys.stderr)