def __init__(
    self,
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    internal_fuzzy_max_dist: int = 0,
    self_prefix: Optional[str] = None,
    fastq_filename: Optional[Union[str, Path]] = None,
    fusion_max_dist: int = 10,
):
    """Initialize a fusion-aware MegaPBTree.

    Differences with non-fusion MegaPBTree:

    1. allow_5merge is always FALSE. Not a parameter.
    2. fusion_max_dist --- maximum allowed distance on internal fusion
       sites to be called as equivalent fusions

    :param gff_filename: collapsed fusion GFF to seed the tree from
    :param group_filename: group file matching the GFF
    :param internal_fuzzy_max_dist: fuzziness allowed on internal junctions
    :param self_prefix: prefix for this sample's IDs (may be None)
    :param fastq_filename: optional FASTQ with the transcript sequences
    :param fusion_max_dist: max distance on internal fusion sites for
        two fusions to be considered equivalent
    """
    super().__init__(
        gff_filename,
        group_filename,
        internal_fuzzy_max_dist,
        self_prefix,
        False,  # allow_5merge is always False for fusions
        fastq_filename,
    )
    self.fusion_max_dist = fusion_max_dist
    # ex: PBfusion.1 -> [PBfusion.1.1, PBfusion.1.2]
    # collapseGFFFusionReader yields (fusion_id, records) pairs, so the
    # identity dict comprehension is just dict(...)
    self.record_d_fusion = dict(GFF.collapseGFFFusionReader(gff_filename))
def add_sample(
    self,
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    sample_prefix: str,
    output_prefix: str,
    fastq_filename: Optional[Union[str, Path]] = None,
) -> None:
    """Merge a new sample's fusion records into the current tree.

    Matches each fusion in *gff_filename* against the existing records,
    pairs them up, rebuilds the loci via ClusterTrees, and writes the
    combined result with ``write_cluster_tree_as_gff``.

    :param gff_filename: collapsed fusion GFF of the new sample
    :param group_filename: group file of the new sample
    :param sample_prefix: ID prefix of the new sample
    :param output_prefix: prefix for the merged output files
    :param fastq_filename: optional FASTQ of the new sample
    """
    # list of (r1, r2) pairs: r1 from the existing tree, r2 from the new
    # sample; exactly one side may be None when only that sample has it.
    combined = []
    # Insertion-ordered "set" of tree fusion IDs not yet matched.
    # dict.fromkeys keeps the original iteration order (unlike set) while
    # giving O(1) removal — the previous list.remove was O(n) per match.
    unmatched_recs = dict.fromkeys(self.record_d_fusion)
    for _, records in GFF.collapseGFFFusionReader(gff_filename):
        match_seqid = self.match_fusion_record(records)
        if match_seqid is not None:
            combined.append((self.record_d_fusion[match_seqid], records))
            # May already have been removed — OK, this happens for
            # single-exon transcripts.
            unmatched_recs.pop(match_seqid, None)
        else:
            # records not present in the current tree
            combined.append((None, records))
    # put whatever is left from the tree in
    for seqid in unmatched_recs:
        combined.append((self.record_d_fusion[seqid], None))

    # create a ClusterTree to re-calc the loci/transcripts
    final_tree = defaultdict(
        lambda: {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
    )
    for i, (r1s, r2s) in enumerate(combined):
        # Index each pair by the span of the longer record's first exon
        # (or whichever side is present).
        if r2s is None or (
            r1s is not None
            and r1s[0].end - r1s[0].start > r2s[0].end - r2s[0].start
        ):
            final_tree[r1s[0].chr][r1s[0].strand].insert(
                r1s[0].start, r1s[0].end, i
            )
        else:
            final_tree[r2s[0].chr][r2s[0].strand].insert(
                r2s[0].start, r2s[0].end, i
            )
    self.write_cluster_tree_as_gff(
        final_tree,
        combined,
        group_filename,
        sample_prefix,
        output_prefix,
        fastq_filename2=fastq_filename,
    )
def sample_sanity_check(
    group_filename: Union[str, Path],
    gff_filename: Union[str, Path],
    count_filename: Union[str, Path],
    fastq_filename: Optional[Union[str, Path]] = None,
) -> None:
    """
    Double check that the formats are expected and all PBIDs are concordant
    across the files.

    :param group_filename: group file (whitespace-separated, PBID first)
    :param gff_filename: collapsed fusion GFF
    :param count_filename: count file with a 14-line header then a TSV body
    :param fastq_filename: optional FASTQ whose IDs are "PBID|..." records
    :return: raise Exception if sanity check failed
    """
    logger.info(
        f"Sanity checking. Retrieving PBIDs from {group_filename},{gff_filename},{count_filename}..."
    )
    # Use context managers so file handles are always closed.
    with open(group_filename) as f:
        ids1 = [line.strip().split()[0] for line in f]
    ids2 = [
        fusion_id for fusion_id, rs in GFF.collapseGFFFusionReader(gff_filename)
    ]
    with open(count_filename) as f:
        for _ in range(14):
            f.readline()  # just through the header
        ids3 = [r["pbid"] for r in DictReader(f, delimiter="\t")]
    # BUG FIX: the original built `{ids2}` — a set containing the *list*
    # itself, which raises TypeError (lists are unhashable). The intent is
    # set(ids2) so the PBIDs from the GFF can be compared element-wise.
    ids2_set = set(ids2)
    if ids2_set.difference(ids1) or ids2_set.difference(ids3):
        # BUG FIX: the original message interpolated gff_filename twice;
        # the second file is the group file.
        raise Exception(
            f"Sanity check failed! Please make sure the PBIDs listed in {gff_filename} are also in {group_filename} and {count_filename}"
        )
    if fastq_filename is not None:
        ids4 = [
            r.id.split("|")[0] for r in SeqIO.parse(fastq_filename, "fastq")
        ]
        if ids2_set.difference(ids4):
            raise Exception(
                f"Sanity check failed! Please make sure the PBIDs listed in {gff_filename} are also in {fastq_filename}"
            )