def read_group_file(group_filename, is_cid=True, sample_prefixes=None):
    """
    Make the connection between partitioned results and final (ex: PB.1.1)
    The partitioned results could either be ICE cluster (ex: i1_c123) or CCS

    Return: dict of seq_or_ice_cluster --> collapsed cluster ID
    """
    cid_info = {}  # ex: i1 --> c123 --> PB.1.1, or None --> c123 --> PB.1.1
    if sample_prefixes is not None:
        for sample_prefix in sample_prefixes:
            cid_info[sample_prefix] = {}
    else:
        cid_info[None] = {}

    reader = GroupReader(group_filename)
    for group in reader:
        pbid, members = group.name, group.members
        for cid in members:
            # ex: cid is 'i1_c123/f3p0/123' or
            # 'm131116_014707_42141_c100591062550000001823103405221462_s1_p0/93278/31_1189_CCS'
            if sample_prefixes is None:
                if is_cid:
                    cid = cid.split('/')[0]
                cid_info[None][cid] = pbid
            else:
                if any(cid.startswith(sample_prefix + '|')
                       for sample_prefix in sample_prefixes):
                    sample_prefix, cid = cid.split('|', 1)
                    if is_cid:
                        cid = cid.split('/')[0]
                    cid_info[sample_prefix][cid] = pbid
    reader.close()
    return cid_info
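
# A minimal usage sketch for read_group_file(). The group file name and the
# sample prefixes below are hypothetical, chosen only to match the member
# format 'SampleA|i1_c123/f3p0/123' handled above.
cid_info = read_group_file("combined.group.txt", is_cid=True,
                           sample_prefixes=["SampleA", "SampleB"])
# e.g. cid_info["SampleA"]["i1_c123"] --> "PB.1.1"
print(cid_info["SampleA"].get("i1_c123"))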
def good_isoform_ids_by_count(in_group_filename, in_abundance_filename, min_count):
    """Return a list of collapsed isoform ids whose supportive FL count >= min_count.

    Parameters:
      in_group_filename -- group file of collapsed isoforms
      in_abundance_filename -- abundance file of collapsed isoforms
      min_count -- min number of supportive FL reads for an isoform to be 'good'
    """
    # Read the group file and record, for each collapsed isoform, the maximum
    # FL and nFL counts among its member clusters.
    group_max_count_fl = {}
    group_max_count_nfl = {}
    with GroupReader(in_group_filename) as g_reader:
        for g in g_reader:
            pbid, members = g.name, g.members
            group_max_count_fl[pbid] = 0
            group_max_count_nfl[pbid] = 0
            for m in members:
                s = SampleIsoformName.fromString(m)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid], s.num_fl)
                group_max_count_nfl[pbid] = max(group_max_count_nfl[pbid], s.num_nfl)

    # Read the abundance file to decide good collapsed isoforms based on count.
    good = [r.pbid for r in AbundanceReader(in_abundance_filename)
            if r.count_fl >= min_count and group_max_count_fl[r.pbid] >= min_count]
    return good
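
# A usage sketch for good_isoform_ids_by_count(), assuming hypothetical
# collapsed-isoform output files 'out.group.txt' and 'out.abundance.txt'.
good_ids = good_isoform_ids_by_count(in_group_filename="out.group.txt",
                                     in_abundance_filename="out.abundance.txt",
                                     min_count=2)
print("%d collapsed isoforms pass the FL count filter" % len(good_ids))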
def read_group(group_filename, group_prefix):
    """Read a group file and group_prefix into a dict.

    If group_prefix is None:
        return {group.pbid --> group.members}
    else:
        return {group.pbid --> [group_prefix + '|' + m for m in group.members]}
    """
    return {group.name: group.members
            for group in GroupReader(group_filename, group_prefix)}
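
# A sketch of the two return shapes of read_group(), assuming a hypothetical
# 'sample.group.txt' whose record PB.1.1 has members 'i1_c123/f3p0/123' and
# 'i1_c124/f2p1/98'.
groups = read_group("sample.group.txt", group_prefix=None)
# groups["PB.1.1"] --> ["i1_c123/f3p0/123", "i1_c124/f2p1/98"]

tagged = read_group("sample.group.txt", group_prefix="SampleA")
# tagged["PB.1.1"] --> ["SampleA|i1_c123/f3p0/123", "SampleA|i1_c124/f2p1/98"],
# i.e. each member is tagged "<prefix>|<member>", the format read_group_file expects.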
def run_after(self, rtc, output_dir):
    rep_fn = rtc.task.output_files[0]
    gff_fn = rtc.task.output_files[1]
    abundance_fn = rtc.task.output_files[2]
    group_fn = rtc.task.output_files[3]
    read_stat_fn = rtc.task.output_files[4]
    from pbcore.io import FastqReader
    from pbtranscript.io import CollapseGffReader, AbundanceReader, GroupReader, ReadStatReader
    self.assertEqual(len([r for r in FastqReader(rep_fn)]), 65)
    self.assertEqual(len([r for r in CollapseGffReader(gff_fn)]), 65)
    self.assertEqual(len([r for r in AbundanceReader(abundance_fn)]), 65)
    self.assertEqual(len([r for r in GroupReader(group_fn)]), 86)
    self.assertEqual(len([r for r in ReadStatReader(read_stat_fn)]), 10873)
def pick_rep(isoform_filename, gff_filename, group_filename, output_filename,
             pick_least_err_instead=False, bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If input is a FASTA file -- always pick the longest sequence.
    If input is a FASTQ file --
        If pick_least_err_instead is True, pick the sequence with the fewest
        expected base errors; otherwise, pick the longest sequence.
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and (_fns[0].endswith(".fq") or _fns[0].endswith(".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be indexed FASTA, or contain exactly one FASTQ file
                raise IOError("%s must contain either indexed FASTA files or "
                              "exactly one FASTQ file!" % isoform_filename)
    else:
        raise IOError("Unable to recognize file type of %s." % isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None
    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." % isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    # Map each transcript id to its genomic coordinates from the gff file(s).
    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    if bad_gff_filename is not None:
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if pb_id not in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members:
            if is_fq and pick_least_err_instead:
                # Expected number of base errors from Phred QVs: sum of 10^(-Q/10)
                err = sum(10 ** -(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
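
# The pick_least_err_instead branch scores each FASTQ member by its expected
# number of base errors, i.e. the sum of 10^(-Q/10) over its Phred quality
# values. A small standalone sketch of that scoring, with made-up qualities:
def expected_errors(quality):
    """Expected number of base errors given a list of Phred quality values."""
    return sum(10 ** -(q / 10.) for q in quality)

# Made-up Phred values for two candidate reads of the same collapsed isoform.
read_a = [30, 30, 30, 20]   # ~0.013 expected errors
read_b = [13, 13, 13, 13]   # ~0.2 expected errors
best = min([("read_a", read_a), ("read_b", read_b)],
           key=lambda t: expected_errors(t[1]))
print(best[0])  # read_a has fewer expected errors and would be picked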
def collapse_fuzzy_junctions(gff_filename, group_filename,
                             fuzzy_gff_filename, fuzzy_group_filename,
                             allow_extra_5exon, max_fuzzy_junction):
    """
    Collapse transcripts in gff_filename which have fuzzy junctions.
    Returns fuzzy_match.

    Parameters:
      gff_filename -- input unfuzzy gff filename
      group_filename -- input unfuzzy group filename
      fuzzy_gff_filename -- output gff filename in which transcripts with
                            fuzzy junctions are further collapsed
      fuzzy_group_filename -- output group filename
      allow_extra_5exon -- whether or not to allow extra 5' exons
      max_fuzzy_junction -- maximum difference for two junctions to be called a match
    """
    d = {}  # seqid --> GmapRecord
    recs = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    fuzzy_match = defaultdict(list)  # seqid --> [seqid of fuzzy match GmapRecords]

    for r in CollapseGffReader(gff_filename):
        # r: a GmapRecord which represents a transcript and its associated exons.
        d[r.seqid] = r
        has_match = False
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            # Compare r with r2 and get the match pattern: exact, super, subset, partial or nonmatch
            m = compare_fuzzy_junctions(r.ref_exons, r2.ref_exons,
                                        max_fuzzy_junction=max_fuzzy_junction)
            if can_merge(m, r, r2, allow_extra_5exon=allow_extra_5exon,
                         max_fuzzy_junction=max_fuzzy_junction):
                logging.debug("Collapsing fuzzy transcript %s to %s", r.seqid, r2.seqid)
                fuzzy_match[r2.seqid].append(r.seqid)  # collapse r to r2
                has_match = True
                break
        if not has_match:
            logging.debug("No fuzzy transcript found for %s", r.seqid)
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    # Get group info from input group_filename
    group_info = {group.name: group.members for group in GroupReader(group_filename)}

    # Pick for each fuzzy group the record that has the most exons (if tie, then most FL).
    keys = sorted(fuzzy_match.keys(),
                  key=lambda x: [int(i) for i in x.split('.')[1:]])

    fuzzy_gff_writer = CollapseGffWriter(fuzzy_gff_filename)
    fuzzy_group_writer = GroupWriter(fuzzy_group_filename)
    for k in keys:  # iterate over each group of fuzzy-matched GmapRecords
        all_members = []
        # Assume the first GmapRecord best represents this group of fuzzy-matched GmapRecords.
        best_pbid = fuzzy_match[k][0]  # e.g., PB.1.1
        if best_pbid not in group_info:
            raise ValueError("Could not find %s in Group file %s" % (best_pbid, group_filename))
        best_size = get_fl_from_id(group_info[best_pbid])
        best_num_exons = len(d[best_pbid].ref_exons)
        all_members += group_info[best_pbid]

        for pbid in fuzzy_match[k][1:]:  # continue to look for a better representative
            if pbid not in group_info:
                raise ValueError("Could not find %s in Group file %s" % (pbid, group_filename))
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or \
               (_num_exons == best_num_exons and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons

        # Write the best GmapRecord of the group to fuzzy_gff_filename
        fuzzy_gff_writer.writeRecord(d[best_pbid])
        # Write all members of the group to fuzzy_group_filename
        fuzzy_group_writer.writeRecord(GroupRecord(best_pbid, all_members))
    fuzzy_gff_writer.close()
    fuzzy_group_writer.close()
    return fuzzy_match
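
# A usage sketch for collapse_fuzzy_junctions(), assuming hypothetical unfuzzy
# outputs from an earlier collapse step ('in.gff', 'in.group.txt') and the
# output names shown; the parameter values are illustrative, not defaults
# guaranteed by this module.
fuzzy_match = collapse_fuzzy_junctions(
    gff_filename="in.gff",
    group_filename="in.group.txt",
    fuzzy_gff_filename="out.fuzzy.gff",
    fuzzy_group_filename="out.fuzzy.group.txt",
    allow_extra_5exon=True,
    max_fuzzy_junction=5)
# fuzzy_match maps each representative seqid to the seqids collapsed into it.
for rep, members in fuzzy_match.items():
    print("%s represents %d fuzzy-matched transcripts" % (rep, len(members)))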