def __init__(
        self,
        gff_filename: Union[str, Path],
        group_filename: Union[str, Path],
        internal_fuzzy_max_dist: int = 0,
        self_prefix: Optional[str] = None,
        fastq_filename: Optional[Union[str, Path]] = None,
        fusion_max_dist: int = 10,
    ):
        """
        Differences with non-fusion MegaPBTree:

        1. allow_5merge is always FALSE. Not a parameter.
        2. fusion_max_dist --- maximum allowed distance on internal fusion sites to be called as equivalent fusions

        :param gff_filename: collapsed fusion GFF to seed the tree from
        :param group_filename: group file matching the GFF
        :param internal_fuzzy_max_dist: fuzzy tolerance on internal junctions (passed to base class)
        :param self_prefix: prefix identifying this sample's IDs (passed to base class)
        :param fastq_filename: optional FASTQ of the collapsed sequences (passed to base class)
        :param fusion_max_dist: maximum allowed distance on internal fusion sites
        """
        # allow_5merge is hard-coded to False (see docstring above)
        super().__init__(
            gff_filename,
            group_filename,
            internal_fuzzy_max_dist,
            self_prefix,
            False,
            fastq_filename,
        )

        self.fusion_max_dist = fusion_max_dist

        # ex: PBfusion.1 -> [PBfusion.1.1, PBfusion.1.2]
        self.record_d_fusion = {
            fusion_id: records
            for fusion_id, records in GFF.collapseGFFFusionReader(gff_filename)
        }
    def add_sample(
        self,
        gff_filename: Union[str, Path],
        group_filename: Union[str, Path],
        sample_prefix: str,
        output_prefix: str,
        fastq_filename: Optional[Union[str, Path]] = None,
    ) -> None:
        combined = (
            []
        )  # list of (r1 if r2 is None | r2 if r1 is None | longer of r1 or r2 if both not None)
        unmatched_recs = list(self.record_d_fusion.keys())

        for _, records in GFF.collapseGFFFusionReader(gff_filename):
            match_seqid = self.match_fusion_record(records)
            if match_seqid is not None:
                combined.append((self.record_d_fusion[match_seqid], records))
                try:
                    unmatched_recs.remove(match_seqid)
                except ValueError:
                    pass  # already deleted, OK, this happens for single-exon transcripts
            else:  # r is not present in current tree
                combined.append((None, records))
        # put whatever is left from the tree in
        for seqid in unmatched_recs:
            combined.append((self.record_d_fusion[seqid], None))

        # create a ClusterTree to re-calc the loci/transcripts
        final_tree = defaultdict(lambda: {
            "+": ClusterTree(0, 0),
            "-": ClusterTree(0, 0)
        })
        for i, (r1s, r2s) in enumerate(combined):
            if r2s is None or (r1s is not None and r1s[0].end - r1s[0].start >
                               r2s[0].end - r2s[0].start):
                final_tree[r1s[0].chr][r1s[0].strand].insert(
                    r1s[0].start, r1s[0].end, i)
            else:
                final_tree[r2s[0].chr][r2s[0].strand].insert(
                    r2s[0].start, r2s[0].end, i)

        self.write_cluster_tree_as_gff(
            final_tree,
            combined,
            group_filename,
            sample_prefix,
            output_prefix,
            fastq_filename2=fastq_filename,
        )
def sample_sanity_check(
    group_filename: Union[str, Path],
    gff_filename: Union[str, Path],
    count_filename: Union[str, Path],
    fastq_filename: Optional[Union[str, Path]] = None,
) -> None:
    """
    Double check that the formats are expected and all PBIDs are concordant across the files.

    :param group_filename: group file; first whitespace-separated column is the PBID
    :param gff_filename: collapsed fusion GFF
    :param count_filename: count file with a 14-line header before the TSV section
    :param fastq_filename: optional FASTQ whose record IDs are "<pbid>|..."
    :return: raise Exception if sanity check failed
    """

    logger.info(
        f"Sanity checking. Retrieving PBIDs from {group_filename},{gff_filename},{count_filename}..."
    )
    # BUG FIX: use `with` so the group file handle is closed (was a bare open()).
    with open(group_filename) as f:
        ids1 = [line.strip().split()[0] for line in f]
    ids2 = [
        fusion_id
        for fusion_id, rs in GFF.collapseGFFFusionReader(gff_filename)
    ]
    with open(count_filename) as f:
        for _ in range(14):
            f.readline()  # just skip through the 14-line header
        ids3 = [r["pbid"] for r in DictReader(f, delimiter="\t")]
        # BUG FIX: original wrote `{ids2}` — a set containing the *list* itself,
        # which raises TypeError (list is unhashable). Intended: set(ids2).
        # Also fixed the message: the first file checked is the group file,
        # not the GFF twice.
        if set(ids2).difference(ids1) or set(ids2).difference(ids3):
            raise Exception(
                f"Sanity check failed! Please make sure the PBIDs listed in {gff_filename} are also in {group_filename} and {count_filename}"
            )

    if fastq_filename is not None:
        # FASTQ IDs look like "<pbid>|..."; keep only the PBID part.
        ids4 = [
            r.id.split("|")[0] for r in SeqIO.parse(fastq_filename, "fastq")
        ]
        if set(ids2).difference(ids4):
            raise Exception(
                f"Sanity check failed! Please make sure the PBIDs listed in {gff_filename} are also in {fastq_filename}"
            )