Example #1
def parse_matchAnnot(fa_or_fq,
                     filename,
                     not_pbid=False,
                     parse_FL_coverage=False):
    pbids = []
    fl_cov = {}  # only used if parse_FL_coverage is True
    for r in SeqIO.parse(open(fa_or_fq), type_fa_or_fq(fa_or_fq)):
        _id = r.id if not_pbid else r.id.split("|")[0]
        pbids.append(_id)
        if parse_FL_coverage:
            try:
                cov = int(
                    r.description.split("full_length_coverage=")[1].split(";")
                    [0])
                fl_cov[_id] = cov
            except (IndexError, ValueError):
                logger.warning(
                    f"Unable to extract `full_length_coverage=` from {r.description}. Marking as NA."
                )
                fl_cov[_id] = "NA"

    match = defaultdict(lambda:
                        (None, None, 0))  # ex: PB.1.1 -> (NOC2L, NOC2L-001, 5)

    for line in open(filename):
        i = line.find("result:")
        if i >= 0:
            raw = line[i:].strip().split()
            if len(raw) < 8:  # need at least 8 fields to read the score below
                continue
            pbid = raw[1] if not_pbid else raw[1].split("|")[0]
            gene = raw[2]
            isoform = raw[3]
            score = int(raw[7])
            if score > match[pbid][2]:
                match[pbid] = (gene, isoform, score)

    f = open(f"{filename}.parsed.txt", "w")
    f.write("pbid\tpbgene\trefisoform\trefgene\tscore")
    if parse_FL_coverage:
        f.write("\tcount_fl")
    f.write("\n")
    for pbid in pbids:
        if not_pbid:
            pbpre = pbid
        else:
            pbpre = pbid.split(".")[1]
        _cov_text = f"\t{fl_cov[pbid]}" if parse_FL_coverage else ""
        if pbid not in match:
            f.write(f"{pbid}\t{pbpre}\tNA\tNA\tNA{_cov_text}\n")
        else:
            gene, isoform, score = match[pbid]
            if gene is None:
                f.write(f"{pbid}\t{pbpre}\tNA\tNA\tNA{_cov_text}\n")
            else:
                f.write(
                    f"{pbid}\t{pbpre}\t{isoform}\t{gene}\t{score}{_cov_text}\n"
                )
    f.close()
    logger.info(f"Output written to: {f.name}")
def get_abundance_post_collapse(
    group_file: Path,
    cluster_report_csv: Path,
    output_prefix: str,
    restricted_movies: Optional[List[str]] = None,
):
    """

    :param collapse_prefix: collapse prefix filename (must have .group.txt present)
    :param prefix_dict:
    :param output_prefix:
    :param restricted_movies:
    :return:
    """

    if not group_file.exists():
        logger.error(f"File {group_file.name} does not exist. Abort!")
        sys.exit(-1)

    if not cluster_report_csv.exists():
        logger.error(f"File {cluster_report_csv.name} does not exist. Abort!")
        sys.exit(-1)

    cid_info = read_group_filename(group_file, is_cid=True)

    output_read_count_IsoSeq_csv(cid_info, cluster_report_csv,
                                 f"{output_prefix}.read_stat.txt")
    logger.info(f"Read stat file written to {output_prefix}.read_stat.txt")
    make_abundance_file(
        f"{output_prefix}.read_stat.txt",
        f"{output_prefix}.abundance.txt",
        restricted_movies=restricted_movies,
    )
    logger.info(f"Abundance file written to {output_prefix}.abundance.txt")
    def match_fusion_record(
            self, records: List[GFF.gmapRecord]) -> Optional[str]:
        """
        records --- in order, the records of a single fusion.

        Returns the single matching fusion ID (ex: PB.8), or None if no candidate matches.
        """
        good = []
        # match the first record, requiring additionally that the precise 3' end matches
        cands = self.match_record_to_tree(records[0],
                                          check_5_dist=False,
                                          check_3_dist=True)
        # for each candidate (ex: PB.8.1), extract the full set of records and match them
        for cand in cands:
            m = seqid_rex.match(cand)
            fusion_id = m.group(1)
            if self.check_records_match(records,
                                        self.record_d_fusion[fusion_id]):
                good.append(fusion_id)
        if len(good) == 0:
            return None
        elif len(good) == 1:
            return good[0]
        else:
            logger.error(
                "ERROR! more than one possible candidate in match_fusion_record! DEBUG."
            )
            logger.error(f"MATCHED: {good}")
            sys.exit(-1)
Example #4
def iter_gmap_sam_for_fusion(gmap_sam_filename, fusion_candidates,
                             transfrag_len_dict):
    """
    Iterate through a sorted GMAP SAM file
    Continuously yield a group of overlapping records {'+': [r1, r2, ...], '-': [r3, r4....]}
    """
    records = []
    iterator = BioReaders.GMAPSAMReader(gmap_sam_filename,
                                        True,
                                        query_len_dict=transfrag_len_dict)
    for r in iterator:
        if r.qID in fusion_candidates:
            records = [r]
            break

    for r in iterator:
        if len(records) >= 1 and (r.sID == records[-1].sID
                                  and r.sStart < records[-1].sStart):
            logger.error("ERROR: SAM file is NOT sorted. ABORT!")
            sys.exit(-1)
        if len(records) >= 1 and (r.sID != records[0].sID
                                  or r.sStart > records[-1].sEnd):
            yield (sep_by_strand(records))
            records = []
        if r.qID in fusion_candidates:
            records.append(r)

    if len(records) > 0:
        yield (sep_by_strand(records))
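A consumption sketch, assuming fusion_candidates and transfrag_len_dict were built beforehand (e.g. from the fusion-finding step and the transcript FASTA):

# hypothetical inputs
for strand_groups in iter_gmap_sam_for_fusion("aligned.sorted.sam",
                                              fusion_candidates,
                                              transfrag_len_dict):
    # one group of overlapping records per iteration, split by strand
    plus_records = strand_groups["+"]
    minus_records = strand_groups["-"]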
def sanity_check_seqids(seqids: List[str]):
    for seqid in seqids:
        m = seqid_rex.match(seqid)
        if m is None:
            logger.error(
                f"Sequence ID {seqid} does not follow the expected format (ex: PB.1.2)! Abort!"
            )
            sys.exit(-1)
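seqid_rex is defined elsewhere in the module; a pattern consistent with the PB.X.Y check above would look like the following sketch (the exact regex is an assumption):

import re

# assumed definition: matches IDs such as PB.1.2 (group 1 = "PB.1", group 2 = "2")
seqid_rex = re.compile(r"(PB\.\d+)\.(\d+)")

sanity_check_seqids(["PB.1.1", "PB.1.2", "PB.20.3"])  # passes silently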
Example #6
def brangus(vcf_filename, out_filename, unzip_snps=None):
    if unzip_snps is None:
        unzip_snps = defaultdict(lambda: {})
        for r in vcfpy.Reader.from_path(vcf_filename):
            unzip_snps[r.CHROM][r.POS] = r

    logger.info(f"Finished reading {vcf_filename}")
    with open(out_filename, "w") as out_f:
        FIELDS = [
            "dir",
            "chrom",
            "pos",
            "strand",
            "ref",
            "alt_Short",
            "alt_PB",
            "in_Short",
            "in_PB",
            "cov_Short",
            "cov_PB",
            "genomic_HP",
        ]
        writer = DictWriter(out_f, FIELDS, delimiter="\t")
        writer.writeheader()
        dirs = glob.glob("by_loci/*size*/")
        for d1 in dirs:
            mpileup = Path(d1, "ccs.mpileup")
            mapfile = Path(d1, "fake.mapping.txt")
            vcffile = Path(d1, "phased.partial.vcf")
            config = Path(d1, "config")
            nosnp = Path(d1, "phased.partial.NO_SNPS_FOUND")
            if not vcffile.exists():
                if nosnp.exists():
                    logger.info(f"Skipping {d1} because no SNPs found.")
                else:
                    logger.error(
                        f"Skipping {d1}: neither {vcffile.name} nor {nosnp.name} exists!"
                    )
            else:
                logger.info(f"Evaluating {d1}.")
                strand = "NA"
                if config.exists():  # find the strand this gene family is on
                    for line in open(config):
                        if line.startswith("ref_strand="):
                            strand = line.strip().split("=")[1]
                good_positions, cov_at_pos = get_positions_to_recover(
                    mapfile, mpileup, unzip_snps, min_cov=30)
                name = d1.split("/")[1]
                eval_isophase(
                    vcffile,
                    unzip_snps,
                    good_positions,
                    cov_at_pos,
                    {},
                    {},
                    writer,
                    name,
                    strand,
                )

    return
Example #7
def main(
    input_csv: str = typer.Argument(..., help="Input CSV"),
    output_csv: str = typer.Argument(..., help="Output CSV"),
    bc_rank_file: Optional[str] = typer.Argument(
        None, help="Cell barcode rank file from short read data"
    ),
    only_top_ranked: bool = typer.Option(
        False,
        help="Only output those that are top-ranked. Must have --bc_rank_file.",
    ),
    dropseq_clean_report: Optional[str] = typer.Option(
        None,
        help="Output from running DetectBeadSubstitutionErrors in DropSeq cookbook (ex: star_gene_exon_tagged_clean_substitution.bam_report.txt)",
    ),
    dropseq_synthesis_report: Optional[str] = typer.Option(
        None,
        help="Output from running DetectBeadSynthesisErrors in DropSeq cookbook (ex: star_gene_exon_tagged_clean_substitution_clean2.bam_report.txt)",
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    shortread_bc = {}  # dict of cell barcode -> "Y" for top ranked
    if bc_rank_file is not None:
        reader = DictReader(open(bc_rank_file), delimiter="\t")
        for r in reader:
            shortread_bc[r["cell_barcode"]] = r["top_ranked"]
    else:
        if only_top_ranked:
            logger.error("--bc_rank_file must be given if using --only_top_ranked!")
            sys.exit(-1)

    bc_repair_dict = None
    if dropseq_clean_report is not None:
        bc_repair_dict = read_dropseq_clean_report(dropseq_clean_report)
    if dropseq_synthesis_report is not None:
        bc_repair_dict = read_dropseq_synthesis_report(
            dropseq_synthesis_report, bc_repair_dict
        )

    umi_bc_error_correct(
        input_csv,
        output_csv,
        shortread_bc,
        only_top_ranked,
        bc_repair_dict,
    )
Example #8
def main(
    fasta_filename: str = typer.Argument(...),
    output_prefix: str = typer.Argument(...),
    copy: int = typer.Option(
        1,
        help="Number of copies to simulate per input sequence (default: 1)",
    ),
    ins: float = typer.Option(
        0,
        "--ins",
        "-i",
        help="Insert error rate [0-1] (default: 0)",
    ),
    dele: float = typer.Option(
        0,
        "--dele",
        "-d",
        help="Deletion error rate [0-1] (default: 0)",
    ),
    sub: float = typer.Option(
        0,
        "--sub",
        "-s",
        help="Substitution error rate [0-1] (default: 0)",
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    if sub < 0 or sub > 1:
        logger.error("Substitution error must be between 0-1!")
        sys.exit(-1)
    if ins < 0 or ins > 1:
        logger.error("Insertion error must be between 0-1!")
        sys.exit(-1)
    if dele < 0 or dele > 1:
        logger.error("Deletion error must be between 0-1!")
        sys.exit(-1)

    if sub + ins + dele > 1:
        logger.error("Total sub+ins+del error cannot exceed 1!")
        sys.exit(-1)

    profile = [sub, sub + ins, sub + ins + dele, 1.0]  # cumulative thresholds: sub, ins, del, no error

    fasta_filename = Path(fasta_filename)
    idpre = output_prefix

    ith = 0
    for r in SeqIO.parse(open(fasta_filename), "fasta"):
        for _ in range(copy):
            ith += 1
            print(
                f">{idpre}_{ith}_{r.id[:r.id.find('|')]}\n{sim_seq(str(r.seq), profile)}"
            )
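sim_seq is defined elsewhere; the profile list above is assumed to hold cumulative probability thresholds (substitution, +insertion, +deletion, then 1.0 for no error). A minimal sketch of that interpretation:

import random

def sim_seq_sketch(seq: str, profile: list) -> str:
    # hypothetical sketch: profile = [p_sub, p_sub+p_ins, p_sub+p_ins+p_del, 1.0]
    out = []
    for base in seq:
        p = random.random()
        if p < profile[0]:  # substitution
            out.append(random.choice([b for b in "ACGT" if b != base]))
        elif p < profile[1]:  # insertion: keep the base and add a random one
            out.append(base)
            out.append(random.choice("ACGT"))
        elif p < profile[2]:  # deletion: drop the base
            continue
        else:  # no error
            out.append(base)
    return "".join(out)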
def get_roi_len(seqid: str):
    # before isoseq3: <movie>/<zmw>/<start>_<end>_CCS
    # for isoseq3: <movie>/<zmw>/ccs
    if seqid.endswith("/ccs"):
        logger.info(
            "WARNING: isoseq3 format detected. Output `length` column will be `NA`."
        )
        return "NA"
    elif not seqid.endswith("_CCS"):
        logger.error(
            "Sequence ID format must be <movie>/<zmw>/<start>_<end>_CCS or <movie>/<zmw>/ccs! Abort!"
        )
        sys.exit(-1)
    s, e, junk = seqid.split("/")[2].split("_")
    return abs(int(s) - int(e))
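Two example calls (values follow directly from the ID parsing above):

get_roi_len("m54006_170729/4391/30_1837_CCS")  # returns 1807
get_roi_len("m54006_170729/4391/ccs")          # isoseq3 format, returns "NA" (and logs a warning)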
Example #10
def main(
    genome_fasta: str = typer.Argument(..., help="Reference genome fasta"),
    flnc_filename: str = typer.Argument(..., help="FLNC fastq file"),
    gff_filename: str = typer.Argument(
        ..., help="GFF file of transcripts, IDs must be PB.X.Y"),
    stat_filename: str = typer.Argument(
        ..., help="Tab-delimited read stat file linking FLNC to PB.X.Y"),
    coverage: int = typer.Option(
        40,
        "--coverage",
        "-c",
        help="Minimum FLNC coverage required (default: 40)",
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    if Path("by_loci").exists() and Path("by_loci").is_dir():
        logger.error(
            "Directory by_loci/ already exists. Delete before running!")
        sys.exit(-1)

    if not Path(genome_fasta).exists():
        logger.error(f"Cannot find genome FASTA {genome_fasta}. Abort!")
        sys.exit(-1)

    if not Path(flnc_filename).exists():
        logger.error(f"Cannot find FLNC file {flnc_filename}. Abort!")
        sys.exit(-1)

    if not Path(gff_filename).exists():
        logger.error(f"Cannot find GFF file {gff_filename}. Abort!")
        sys.exit(-1)

    if not Path(stat_filename).exists():
        logger.error(f"Cannot find Stat file {stat_filename}. Abort!")
        sys.exit(-1)

    logger.info(f"Reading genome fasta {genome_fasta}...")
    genome_d = SeqIO.to_dict(SeqIO.parse(open(genome_fasta), "fasta"))

    select_loci_to_phase(genome_d, gff_filename, stat_filename, flnc_filename,
                         coverage)
def main(
    config: Union[str, Path] = typer.Argument(..., help="Config filename"),
    output_prefix: str = typer.Argument(..., help="Output prefix"),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
):

    try:
        (
            sample_dirs,
            gff_filename,
            genome_filename,
            junction_filename,
        ) = read_config(config)
    except FileNotFoundError as error:
        logger.error(error)
        sys.exit(-1)

    sanity_check(sample_dirs, gff_filename, genome_filename, junction_filename)

    if genome_filename is not None:
        logger.info(f"Reading genome file {genome_filename}...")
        genome_d = SeqIO.to_dict(SeqIO.parse(open(genome_filename), "fasta"))
    else:
        logger.info("No genome file given. Ignore.")
        genome_d = None

    if junction_filename is not None:
        logger.info(f"Reading junction file {junction_filename}....")
        junction_bed = read_annotation_junction_bed(junction_filename)
    else:
        logger.info("No junction file given. Ignore.")
        junction_bed = None

    summarize_junctions(
        sample_dirs,
        gff_filename,
        output_prefix,
        genome_d,
        junction_bed,
    )
def read_config(
        filename: Path
) -> Tuple[Dict[str, Path], Path, Optional[Path], Optional[Path]]:
    """
    SAMPLE=<name>;<path>

    must also have
    GFF_FILENAME=

    optional:
    GENOME_FILENAME=
    JUNCTION_FILENAME=
    GROUP_FILENAME=

    Everything else will be ignored (so you can re-use sample.config for chain_samples.py)
    """
    sample_dirs: Dict[str, Path] = {}
    sample_names: List[str] = []
    gff_filename: Optional[Union[str, Path]] = None
    genome_filename: Optional[Union[str, Path]] = None
    junction_filename: Optional[Union[str, Path]] = None

    if not filename.exists():
        raise FileNotFoundError(
            f"The config file {filename} could not be found!")

    with open(filename) as f:
        for line in f:
            if line.startswith("tmpSAMPLE="):
                logger.error(
                    "Please only use SAMPLE=, not tmpSAMPLE= for junction reports!"
                )
                sys.exit(-1)
            elif line.startswith("SAMPLE="):
                name, path = line.strip()[len("SAMPLE="):].split(";")
                if name.startswith("tmp_"):
                    logger.error(
                        f"Sample names are not allowed to start with tmp_! Please change {name} to something else."
                    )
                    sys.exit(-1)
                sample_dirs[name] = Path(path).resolve()
                sample_names.append(name)
            elif line.startswith("GFF_FILENAME="):
                gff_filename = Path(line.strip()[len("GFF_FILENAME="):])
            elif line.startswith("GENOME_FILENAME="):
                genome_filename = Path(line.strip()[len("GENOME_FILENAME="):])
            elif line.startswith("JUNCTION_FILENAME="):
                junction_filename = Path(
                    line.strip()[len("JUNCTION_FILENAME="):])

    if gff_filename is None:
        raise Exception(
            f"Expected GFF_FILENAME= but not in config file {filename}! Abort."
        )

    if len(sample_names) == 0:
        logger.error("No samples given. Exit.")
        sys.exit(-1)

    return sample_dirs, gff_filename, genome_filename, junction_filename
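An illustrative config file matching the format this reader expects (paths and names are hypothetical):

SAMPLE=heart;/data/isoseq/heart
SAMPLE=brain;/data/isoseq/brain
GFF_FILENAME=touse.gff
GENOME_FILENAME=hg38.fa
JUNCTION_FILENAME=junctions.bed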
Example #13
def main(
    bam_filename: str = typer.Argument(
        ..., help="CCS BAM with cDNA primer removed (post LIMA)"),
    output_prefix: str = typer.Argument(..., help="Output prefix"),
    umi_len: int = typer.Option(..., "-u", "--umi_len", help="Length of UMI"),
    bc_len: int = typer.Option(...,
                               "-b",
                               "--bc_len",
                               help="Length of cell barcode"),
    tso_len: int = typer.Option(0,
                                "-t",
                                "--tso_len",
                                help="Length of TSO (for G5-10X only)"),
    umi_type: umi_types = typer.Option(..., help="Location of the UMI"),
    g5_clip_seq: Optional[str] = typer.Option(
        None, help="Sequence before UMI for G5-clip (for G5-clip only)"),
    bc_rank_file: Optional[str] = typer.Option(
        None, help="(Optional) cell barcode rank file from short read data"),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
):
    if bc_len < 0:
        logger.error("bc_len can't be a negative number!")
        sys.exit(-1)
    if umi_len < 0:
        logger.error("umi_len can't be a negative number!")
        sys.exit(-1)
    if umi_len + bc_len <= 0:
        logger.error("umi_len + bc_len must be at least 1 bp long!")
        sys.exit(-1)

    # ToDo: figure out later how to do top ranked barcodes for 10X data
    shortread_bc = {}  # dict of cell barcode -> "Y" for top ranked
    if bc_rank_file is not None:
        reader = DictReader(open(bc_rank_file), delimiter="\t")
        for r in reader:
            shortread_bc[r["cell_barcode"]] = r["top_ranked"]

    clip_out(
        bam_filename,
        umi_len,
        bc_len,
        output_prefix,
        umi_type,
        shortread_bc,
        tso_len,
        g5_clip_seq,
    )
Example #14
def sanity_check_collapse_input(count_filename: Path, gff_filename: Path,
                                rep_filename: Path,
                                sample_directory: Path) -> None:
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of records agree among the three
    """
    # group_filename = f"{input_prefix}.group.txt"

    if not rep_filename.exists():
        raise RuntimeError(
            f"Input sequence file {rep_filename.name} not found. Abort!")
    if not count_filename.exists():
        raise RuntimeError(f"File {count_filename.name} not found. Abort!")
    if not gff_filename.exists():
        raise RuntimeError(f"File {gff_filename.name} not found. Abort!")
    if not sample_directory.exists():
        raise RuntimeError(
            f"The directory {sample_directory.name} not found. Abort!")

    rep_type = "fastq" if rep_filename.suffix in (".fq", ".fastq") else "fasta"

    pbids1 = {r.id for r in SeqIO.parse(open(rep_filename), rep_type)}
    pbids2 = {r.seqid for r in GFF.collapseGFFReader(gff_filename)}
    pbids3 = set(read_count_file(count_filename)[0].keys())

    if (len(pbids1) != len(pbids2) or len(pbids2) != len(pbids3)
            or len(pbids1) != len(pbids3)):
        logger.error(
            "The number of PBID records in the files disagree! Sanity check failed."
        )
        logger.error(f"# of PBIDs in {rep_filename}: {len(pbids1)}")
        logger.error(f"# of PBIDs in {gff_filename}: {len(pbids2)}")
        logger.error(f"# of PBIDs in {count_filename}: {len(pbids3)}")
        sys.exit(-1)

    return None
def sanity_check(
    sample_dirs: Dict[str, Union[str, Path]],
    gff_filename: Union[str, Path],
    genome_filename: Optional[Union[str, Path]] = None,
    junction_filename: Optional[Union[str, Path]] = None,
) -> None:
    for d in sample_dirs.values():
        file = Path(d, gff_filename)
        if not file.exists():
            logger.error(f"Expected GFF file {file} does not exist. Abort!")
            sys.exit(-1)

    if genome_filename is not None and not Path(genome_filename).exists():
        logger.error(
            f"Genome file {genome_filename} given but does not exist. Abort!")
        sys.exit(-1)

    if junction_filename is not None and not Path(junction_filename).exists():
        logger.error(
            f"Junction file {junction_filename} given but does not exist. Abort!"
        )
        sys.exit(-1)
Example #16
def sanity_check_collapse_input(input_prefix: str) -> Tuple[Path, Path, Path]:
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of records agree among the three
    """
    # group_filename =  f"{input_prefix}.group.txt"
    count_filename = Path(f"{input_prefix}.abundance.txt")
    gff_filename = Path(f"{input_prefix}.gff")
    rep_filename = Path(f"{input_prefix}.rep.fq")
    if not count_filename.exists():
        logger.error(f"File {count_filename} does not exist. Abort!")
        sys.exit(-1)
    if not gff_filename.exists():
        logger.error(f"File {gff_filename} does not exist. Abort!")
        sys.exit(-1)
    if not rep_filename.exists():
        logger.error(f"File {rep_filename} does not exist. Abort!")
        sys.exit(-1)

    pbids1 = {r.id for r in SeqIO.parse(open(rep_filename, "r"), "fastq")}
    pbids2 = {r.seqid for r in GFF.collapseGFFReader(gff_filename)}
    pbids3 = set(read_count_file(count_filename)[0].keys())

    if (
        len(pbids1) != len(pbids2)
        or len(pbids2) != len(pbids3)
        or len(pbids1) != len(pbids3)
    ):
        logger.error(
            "The number of PBID records in the files disagree! Sanity check failed."
        )
        logger.error(f"# of PBIDs in {rep_filename}: {len(pbids1)}")
        logger.error(f"# of PBIDs in {gff_filename}: {len(pbids2)}")
        logger.error(f"# of PBIDs in {count_filename}: {len(pbids3)}")
        sys.exit(-1)

    return count_filename, gff_filename, rep_filename
Example #17
    def iter_gmap_sam(self, gmap_sam_filename: Union[str, Path],
                      ignored_fout: TextIOWrapper):
        """
        Iterate over a SORTED GMAP SAM file.
        Return a collection of records that overlap by at least 1 base.
        """
        def sep_by_clustertree(
            records: List[BioReaders.GMAPSAMRecord],
        ) -> List[BioReaders.GMAPSAMRecord]:
            tree = ClusterTree(0, 0)
            for i, r in enumerate(records):
                tree.insert(r.sStart, r.sEnd, i)
            result = []
            for *_, indices in tree.getregions():
                result.append([records[i] for i in indices])
            return result

        def sep_by_strand(
            records: List[BioReaders.GMAPSAMRecord],
        ) -> Dict[str, List[BioReaders.GMAPSAMRecord]]:
            """
            Note! Must further separate again within each strand. Because of initially processing
            the strands together, could've collapesd some genes.
            """
            output = {"+": [], "-": []}
            for r in records:
                output[r.flag.strand].append(r)
            # process + strand using ClusterTree
            output["+"] = sep_by_clustertree(output["+"])
            output["-"] = sep_by_clustertree(output["-"])
            return output

        gmap_sam_reader = BioReaders.GMAPSAMReader(
            filename=gmap_sam_filename,
            has_header=True,
            query_len_dict=self.transfrag_len_dict,
        )
        quality_alignments = self.get_quality_alignments(
            gmap_sam_reader=gmap_sam_reader, ignored_fout=ignored_fout)

        # find first acceptably mapped read
        try:
            records: List[BioReaders.GMAPSAMRecord] = [
                next(quality_alignments)
            ]
            max_end = records[0].sEnd
        except StopIteration:
            logger.error(f"No valid records from {gmap_sam_filename}!")
            return
        # go through remainder of alignments and group by subject ID
        for r in quality_alignments:
            if r.sID == records[0].sID and r.sStart < records[-1].sStart:
                logger.error("SAM file is NOT sorted. ABORT!")
                sys.exit(-1)
            if r.sID != records[0].sID or r.sStart > max_end:
                yield sep_by_strand(records)
                records = [r]
                max_end = r.sEnd
            else:
                records.append(r)
                max_end = max(max_end, r.sEnd)
        yield sep_by_strand(records)
Example #18
    def read(self):
        """
        GFF files
        (0) seqname
        (1) annotation source
        (2) feature: gene|transcript|CDS|exon|UTR
        (3) 1-based start # MUST CONVERT TO 0-based!!!
        (4) 1-based end
        (5) score (I think it's similarity for GMAP)
        (6) strand: +|-
        (7) phase
        (8) extra stuff (gene ID, transcript ID...)

        For gmap output, a series is delimited by '###' line
        """
        cur = self.f.tell()
        line = self.f.readline().strip()
        if self.f.tell() == cur:
            raise StopIteration("EOF reached!!")
        raw = line.strip().split("\t")
        while raw[0].startswith("#"):
            line = self.f.readline().strip()
            raw = line.strip().split("\t")

        if len(raw) == 0 or raw[0] == "":
            raise StopIteration("EOF reached!!")

        assert raw[2] == "gene"
        raw = self.f.readline().strip().split("\t")
        assert raw[2] == "mRNA"
        seqname = raw[0]
        strand = raw[6]
        for blob in raw[8].split(";"):
            if blob.startswith("coverage="):
                coverage = float(blob[9:])
            elif blob.startswith("identity="):
                identity = float(blob[9:])
            elif blob.startswith("Name="):
                seqid = blob[5:]

        rec = gmapRecord(seqname, coverage, identity, strand, seqid)

        cds_exons = []
        cds_seq_start = None
        cds_seq_end = None
        while True:
            line = self.f.readline().strip()
            if line.startswith("##"):
                rec.cds_exons = cds_exons
                rec.cds_seq_start = cds_seq_start
                rec.cds_seq_end = cds_seq_end
                return rec
            raw = line.split("\t")
            feature = raw[2]
            if feature == "exon":
                rstart1, rend1 = int(raw[3]), int(raw[4])
                score = float(raw[5])
                rstrand = raw[6]  # this is the strand on the reference genome
                for blob in raw[8].split(";"):
                    if blob.startswith("Target="):
                        # sstrand is the strand on the query sequence
                        _, sstart1, send1, sstrand = blob.split()
                        sstart1 = int(sstart1)
                        send1 = int(send1)
                        rec.sstrand = sstrand
                try:
                    rec.add_exon(rstart1 - 1, rend1, sstart1 - 1, send1,
                                 rstrand, score)
                except AssertionError:
                    logger.error(f"{rec.seqid} has non-colinear exons!")
                    while True:
                        line = self.f.readline().strip()
                        if line.startswith("##"):
                            return rec
                rec.strand = rstrand
            elif feature == "CDS":
                rstart1, rend1 = int(raw[3]), int(raw[4])
                cds_exons.append(Interval(rstart1 - 1, rend1))
                for blob in raw[8].split(";"):
                    if blob.startswith("Target="):
                        junk, sstart1, send1, sstrand = blob.split()
                        sstart1 = int(sstart1)
                        send1 = int(send1)
                        cds_seq_start = (sstart1 - 1 if cds_seq_start is None
                                         else cds_seq_start)
                        cds_seq_end = send1
            else:
                raise Exception(f"Not supposed to see type {feature} here!!")
Example #19
def read_config(
    filename: Union[str, Path]
) -> Tuple[Dict[str, Path], List[str], str, str, str, str]:
    # Okay, why is this a thing?  Why not just pass arguments?
    """
    tmpSAMPLE=<name>;<path>
    SAMPLE=<name>;<path>

    must also have
    GROUP_FILENAME=
    GFF_FILENAME=
    COUNT_FILENAME=

    optional:
    FASTQ_FILENAME=
    """
    sample_dirs = {}
    sample_names = []
    group_filename, gff_filename, count_filename = None, None, None
    fastq_filename = None

    no_more_tmp = False

    with open(filename) as f:
        for line in f:
            if line.startswith("tmpSAMPLE="):
                if no_more_tmp:
                    logger.error(
                        "Cannot have tmp_ samples after non-tmp_ samples! Abort!"
                    )
                    sys.exit(-1)
                name, path = line.strip()[len("tmpSAMPLE="):].split(";")
                if name.startswith("tmp_"):
                    logger.error(
                        f"Sample names are not allowed to start with tmp_! "
                        f"Please change {name} to something else.")
                    sys.exit(-1)
                sample_dirs[name] = Path(path).resolve()
                sample_names.append(f"tmp_{name}")
            elif line.startswith("SAMPLE="):
                no_more_tmp = True
                name, path = line.strip()[len("SAMPLE="):].split(";")
                if name.startswith("tmp_"):
                    logger.error(
                        f"Sample names are not allowed to start with tmp_! "
                        f"Please change {name} to something else.")
                    sys.exit(-1)
                sample_dirs[name] = Path(path).resolve()
                sample_names.append(name)
            elif line.startswith("GROUP_FILENAME="):
                group_filename = line.strip()[len("GROUP_FILENAME="):]
            elif line.startswith("GFF_FILENAME="):
                gff_filename = line.strip()[len("GFF_FILENAME="):]
            elif line.startswith("COUNT_FILENAME="):
                count_filename = line.strip()[len("COUNT_FILENAME="):]
            elif line.startswith("FASTQ_FILENAME="):
                fastq_filename = line.strip()[len("FASTQ_FILENAME="):]

    if group_filename is None:
        raise FileNotFoundError(
            f"Expected GROUP_FILENAME= but not in config file {filename}! Abort."
        )
    if count_filename is None:
        raise FileNotFoundError(
            f"Expected COUNT_FILENAME= but not in config file {filename}! Abort."
        )
    if gff_filename is None:
        raise FileNotFoundError(
            f"Expected GFF_FILENAME= but not in config file {filename}! Abort."
        )

    if len(sample_names) == 0:
        logger.error("No samples given. Exit.")
        sys.exit(-1)

    # return signature is:
    # sample_dirs    = Dict[sample_name, Path(sample_path)]
    # sample_names   = List[sample_name]
    # group_filename = str
    # gff_filename   = str
    # count_filename = str
    # fastq_filename = str

    # so, for the test data, we get:
    # sample_dirs = {
    #   'A': Path('tests/test_data/chaining/A'),
    #   'B': Path('tests/test_data/chaining/B')
    #   }
    # sample_names = ["A", "B"]
    # group_filename = touse.group.txt
    # gff_filename   = touse.gff
    # count_filename = touse.count.txt
    # fastq_filename = touse.rep.fq

    return (
        sample_dirs,
        sample_names,
        group_filename,
        gff_filename,
        count_filename,
        fastq_filename,
    )
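An illustrative config for this reader, consistent with the docstring and the test-data comment above (paths are hypothetical):

SAMPLE=A;tests/test_data/chaining/A
SAMPLE=B;tests/test_data/chaining/B
GROUP_FILENAME=touse.group.txt
GFF_FILENAME=touse.gff
COUNT_FILENAME=touse.count.txt
FASTQ_FILENAME=touse.rep.fq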
Example #20
def collate_info(
    fusion_prefix: str,
    class_filename: str,
    genepred_filename: str,
    total_fl_count: Optional[int] = None,
    config_filename: Optional[str] = None,
    genome_dict: Optional[dict] = None,
    cds_gff_filename: Optional[str] = None,
    min_fl_count: int = 2,
    min_breakpoint_dist_kb: int = 10,
    include_Mt_genes: bool = False,
) -> None:

    global_info = {}  # holding information for general information
    if config_filename is not None:
        logger.info(f"Reading config file {config_filename}...")
        for line in open(config_filename):
            k, v = line.strip().split("=")
            global_info[k] = v

    gene_to_id = {}  # gene name --> ensembl ID
    for line in open(genepred_filename):
        raw = line.strip().split()
        gene_to_id[raw[11]] = raw[0]

    d = defaultdict(
        lambda: {})  # PBfusion.X --> isoform index -> sqanti3 record
    orf_dict = {}
    # read SQANTI3 classification file
    for r in DictReader(open(class_filename), delimiter="\t"):
        m = fusion_pbid.match(r["isoform"])
        if m is None:
            logger.error(
                "ERROR: fusion pbid must follow format `PBfusion.X.Y`. Abort!")
            sys.exit(-1)
        gene_index, isoform_index = m.group(1), m.group(2)
        d[gene_index][isoform_index] = r
        orf_dict[r["isoform"]] = r["ORF_seq"]

    # get sequences
    seq_dict = {
        r.id.split("|")[0]: r.seq
        for r in SeqIO.parse(open(f"{fusion_prefix}.rep.fa"), "fasta")
    }

    # get count information
    count_d = defaultdict(lambda: "NA")
    count_filename = f"{fusion_prefix}.abundance.txt"
    if Path(count_filename).exists():
        for r in DictReader(open(count_filename), delimiter="\t"):
            count_d[r["pbid"]] = int(r["count_fl"])

    if total_fl_count is None:
        logger.info(
            "Total FL count not given --- using the sum FL count from fusions only instead."
        )
        total_fl_count = sum(count_d.values())

    # get breakpoint information
    gff_d = defaultdict(
        lambda: {})  # PBfusion.X --> isoform index -> GFF record
    if cds_gff_filename is None:
        gff_filename = f"{fusion_prefix}.gff"
    else:
        gff_filename = cds_gff_filename

    for r in collapseGFFReader(gff_filename):
        m = fusion_pbid.match(r.seqid)
        if m is None:
            logger.error(
                f"ERROR: fusion pbid in {gff_filename} must follow format `PBfusion.X.Y`. Abort!"
            )
            sys.exit(-1)
        gene_index, isoform_index = m.group(1), int(m.group(2))
        gff_d[gene_index][isoform_index] = r
        if r.strand not in ("+", "-"):
            logger.error(
                f"ERROR: fusion {r.seqid} did not specify strand in {gff_filename}! Abort!"
            )
            sys.exit(-1)

    fields2 = list(global_info.keys()) + FIELDS
    with open(f"{fusion_prefix}.annotated.txt",
              "w") as f, open(f"{fusion_prefix}.annotated_ignored.txt",
                              "w") as f_bad:
        writer = DictWriter(f, fields2, delimiter=",")
        writer.writeheader()
        writer_bad = DictWriter(f_bad, fields2, delimiter=",")
        writer_bad.writeheader()

        for gene_index, iso_dict in d.items():
            iso_dict = list(
                iso_dict.items())  # (isoform index, classification record)
            iso_dict.sort(key=lambda x: x[0])
            has_novel = any(r["associated_gene"].startswith("novelGene")
                            or r["associated_gene"] == ""
                            for junk, r in iso_dict)
            pbid = f"PBfusion.{str(gene_index)}"

            gff_info = list(gff_d[gene_index].items())
            gff_info.sort(key=lambda x: x[0])

            rec1 = gff_info[0][1]
            rec2 = gff_info[-1][1]
            (
                left_breakpoint,
                left_seq,
                right_breakpoint,
                right_seq,
            ) = get_breakpoint_n_seq(rec1, rec2, genome_dict)
            left_exon_count = len(rec1.ref_exons)
            right_exon_count = len(rec2.ref_exons)
            gene1 = iso_dict[0][1]["associated_gene"]
            gene2 = iso_dict[-1][1]["associated_gene"]

            if cds_gff_filename is not None:
                left_cds_exon_count = len(rec1.cds_exons)
                right_cds_exon_count = len(rec2.cds_exons)
            else:
                left_cds_exon_count = "NA"
                right_cds_exon_count = "NA"

            left_orf, right_orf = "NA", "NA"
            if orf_dict is not None:
                seqid1 = gff_info[0][1].seqid
                seqid2 = gff_info[-1][1].seqid
                left_orf = orf_dict[seqid1]
                right_orf = orf_dict[seqid2]

            info = {
                "UniqueID": pbid,
                "FusionName": "--".join(
                    _r["associated_gene"] for (_index, _r) in iso_dict),
                "LeftGeneName": gene1,
                "LeftGeneID": gene_to_id[gene1] if gene1 in gene_to_id else "NA",
                "LeftBreakpoint": left_breakpoint,
                "LeftFlankingSequence": left_seq,
                "RightGeneName": gene2,
                "RightGeneID": gene_to_id[gene2] if gene2 in gene_to_id else "NA",
                "RightBreakpoint": right_breakpoint,
                "RightFlankingSequence": right_seq,
                "JunctionSupport": "NA",
                "SpanningReads": count_d[pbid],
                "ReadCountScore": (count_d[pbid] * (10**6) / total_fl_count)
                if count_d[pbid] != "NA"
                else "NA",
                "Sequence": seq_dict[pbid],
                "LeftORF": left_orf,
                "RightORF": right_orf,
                "LeftExonCount": left_exon_count,
                "RightExonCount": right_exon_count,
                "LeftCDSExonCount": left_cds_exon_count,
                "RightCDSExonCount": right_cds_exon_count,
                "Comments": "PASS",
            }
            info.update(global_info)

            left_chr, left_break, left_strand = left_breakpoint.split(":")
            right_chr, right_break, right_strand = right_breakpoint.split(":")

            if has_novel:
                info["Comments"] = "FAIL:NovelGene"
            elif gene1 == gene2:
                info["Comments"] = "FAIL:SameGene"
            elif info["SpanningReads"] != "NA" and info[
                    "SpanningReads"] < min_fl_count:
                info["Comments"] = "FAIL:TooFewFLReads"
            elif not include_Mt_genes and (gene1.startswith("MT-")
                                           or gene2.startswith("MT-")):
                info["Comments"] = "FAIL:MtGenes"
            elif (left_chr == right_chr
                  and abs(int(left_break) - int(right_break)) / 1000 <=
                  min_breakpoint_dist_kb):
                info["Comments"] = "FAIL:BreakpointTooClose"

            if info["Comments"].startswith("FAIL:"):
                writer_bad.writerow(info)
            else:
                writer.writerow(info)
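A call sketch with assumed inputs (a SQANTI3 classification file and a reference genePred; the prefix implies sample.fusion.rep.fa, sample.fusion.gff and sample.fusion.abundance.txt exist):

# hypothetical file names
collate_info(
    fusion_prefix="sample.fusion",
    class_filename="sample.fusion_classification.txt",
    genepred_filename="gencode.genePred",
    min_fl_count=2,
)
# writes sample.fusion.annotated.txt and sample.fusion.annotated_ignored.txt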
Example #21
def clip_out(
    bam_filename: str,
    umi_len: int,
    bc_len: int,
    output_prefix: str,
    UMI_type: umi_types,
    shortread_bc: Optional[Dict[str, str]] = None,
    tso_len: int = 0,
    g5_clip_seq: Optional[str] = None,
) -> None:
    """
    :param bam_filename: BAM of post-LIMA (primer-trimmed) CCS sequences
    :param UMI_type: either 'A3' or 'G5' or 'G5-10X'
    :param shortread_bc: a dict of barcode -> "Y|N" for top-ranked. If given, came from short read data.

    --------
    G5-10X
    --------
    5' primer -- BC --- UMI -- TSO --- GGG --- transcript --- polyA

    --------
    G5-clip
    assumes input is like below, where the 5'/3' primer already removed by lima
    Here, we will only clip out the UMI, and write out the rest of the sequence, keeping the RT + transcript
    There is no assumption about the polyA tail existing or not
    --------
    5' primer -- UMI -- [RT primer] --- transcript --- 3' primer
    """
    if shortread_bc is None:
        shortread_bc = dict()

    if UMI_type not in ("A3", "G5", "G5-10X", "G5-clip"):
        raise ValueError(
            f"UMI is of the wrong type.  Got {UMI_type} Must be one of 'A3', 'G5', 'G5-10X', 'G5-clip'"
        )

    umi_bc_len = umi_len + bc_len

    if UMI_type == "G5-clip":
        try:
            import parasail
        except ImportError:
            logger.error("need parasail library for G5-clip mode! Abort!")
            sys.exit(-1)
        para_mat = parasail.matrix_create("ACGT", 2, -5)
        para_search_len = umi_len + len(g5_clip_seq) + 10

    FIELDS = [
        "id",
        "clip_len",
        "extra",
        "UMI",
        "BC",
        "BC_rev",
        "BC_match",
        "BC_top_rank",
    ]
    if tso_len > 0:
        FIELDS += ["TSO"]

    with pysam.AlignmentFile(bam_filename, "rb", check_sq=False) as reader:
        with open(f"{output_prefix}.trimmed.csv",
                  "w") as f1, pysam.AlignmentFile(
                      f"{output_prefix}.trimmed.bam",
                      "wb",
                      header=reader.header) as f2:
            writer1 = DictWriter(
                f1,
                FIELDS,
                delimiter="\t",
                dialect="unix",
            )
            writer1.writeheader()

            for r in reader:
                d = r.to_dict()

                # is_rev_strand = r.flag >> 4 & 1
                if r.flag >> 4 & 1:
                    d["seq"] = str(Seq(r.seq).reverse_complement())
                    d["qual"] = r.qual[::-1]
                    new_tags = []
                    for tag in d["tags"]:
                        if (tag.startswith("dq:i:") or tag.startswith("iq:i:")
                                or tag.startswith("sq:i:")):
                            tag = tag[:5] + tag[::-1][:-5]
                        new_tags.append(tag)
                    d["tags"] = new_tags
                    d["flag"] = "4"  # convert it back to not being rev complemented

                if UMI_type == "A3":
                    A_start, A_end = find_Aend(d["seq"])
                    if A_end > 0:
                        seq2 = d["seq"][
                            A_end:]  # should be just UMI + BC, unless UMI started with 'A's

                        diff = len(seq2) - umi_bc_len
                        if diff < 0:  # UMI may have started with 'A's
                            seq2 = d["seq"][A_end + diff:]

                        seq_extra = "NA"
                        if diff > 0:
                            seq_extra = seq2[:diff]

                        if bc_len == 0:
                            seq_bc = ""
                        else:
                            seq_bc = seq2[-bc_len:]

                        if umi_len == 0:
                            seq_umi = ""
                        else:
                            if bc_len == 0:
                                seq_umi = seq2[-umi_len:]
                            else:
                                seq_umi = seq2[-(bc_len + umi_len):-bc_len]

                        # reverse complement BC because it's always listed in rev comp in short read data
                        seq_bc_rev = str(Seq(seq_bc).reverse_complement())

                        match = "Y" if seq_bc_rev in shortread_bc else "N"
                        match_top = ("Y" if
                                     (match == "Y"
                                      and shortread_bc[seq_bc_rev] == "Y") else
                                     "N")

                        rec = {
                            "id": r.qname,
                            "clip_len": len(seq2),
                            "extra": seq_extra,
                            "UMI": seq_umi,
                            "BC": seq_bc,
                            "BC_rev": seq_bc_rev,
                            "BC_match": match,
                            "BC_top_rank": match_top,
                        }
                        writer1.writerow(rec)

                        # subset the sequence to include only the polyAs
                        d["seq"] = d["seq"][:A_end]
                        d["qual"] = d["qual"][:A_end]
                        assert len(d["seq"]) == len(d["qual"])
                        new_tags = []
                        for tag in d["tags"]:
                            if tag.startswith(
                                    "zs:B"):  # defunct CCS tag, don't use
                                pass
                            elif (tag.startswith("dq:i:")
                                  or tag.startswith("iq:i:")
                                  or tag.startswith("sq:i:")):
                                tag = tag[:A_end + 5]
                                new_tags.append(tag)
                            else:
                                new_tags.append(tag)
                        d["tags"] = new_tags
                        x = pysam.AlignedSegment.from_dict(d, r.header)
                        f2.write(x)
                elif UMI_type == "G5":
                    G_start, G_end = find_Gstart(d["seq"])
                    if G_start > 0:
                        seq2 = d["seq"][:G_start]  # should be just UMI

                        diff = len(seq2) - umi_len
                        if diff < 0:  # UMI may have ended with Gs
                            seq2 = d["seq"][:G_start - diff]

                        seq_extra = "NA"
                        if diff > 0:
                            seq_extra = seq2[:diff]
                            seq2 = seq2[diff:]

                        rec = {
                            "id": r.qname,
                            "clip_len": len(seq2),
                            "extra": seq_extra,
                            "UMI": seq2,
                            "BC": "NA",  # Brendan's current design has only UMI, no BC
                            "BC_rev": "NA",
                            "BC_match": "NA",
                            "BC_top_rank": "NA",
                        }
                        writer1.writerow(rec)

                        # subset the sequence to remove the UMIs and "G"s
                        d["seq"] = d["seq"][G_end:]
                        d["qual"] = d["qual"][G_end:]
                        assert len(d["seq"]) == len(d["qual"])
                        new_tags = []
                        for tag in d["tags"]:
                            if tag.startswith(
                                    "zs:B"):  # defunct CCS tag, don't use
                                pass
                            elif (tag.startswith("dq:i:")
                                  or tag.startswith("iq:i:")
                                  or tag.startswith("sq:i:")):
                                tag = tag[:5] + tag[5 + G_end:]
                                new_tags.append(tag)
                            else:
                                new_tags.append(tag)
                        d["tags"] = new_tags
                        x = pysam.AlignedSegment.from_dict(d, r.header)
                        f2.write(x)
                elif UMI_type == "G5-clip":
                    o1 = parasail.sg_qx_trace(d["seq"][:para_search_len],
                                              g5_clip_seq, 10, 3, para_mat)

                    #  'tags': ['bx:B:i,22,20',
                    #   ...
                    #   'qe:i:2835',
                    #   'bc:B:S,0,1',
                    #   'bl:Z:CCCGCGTGGCCTCCTGAATTAT',
                    #   'bt:Z:CATTGCCACTGTCTTCTGCT',
                    #   'RG:Z:70de1488/0--1']}
                    c_num, c_type = next(
                        iter_cigar_string(str(o1.cigar.decode, "utf-8")))
                    if c_type == "I":  # this is the (extra) + UMI
                        seq2 = d["seq"][:c_num]
                        seq_extra = "NA"
                        diff = len(seq2) - umi_len
                        if diff < 0:  # we need to get a few more bases from the primers
                            tag_dict = dict(x.split(":", 1) for x in d["tags"])
                            try:
                                if tag_dict["bc"] == "B:S,0,1":  # + strand
                                    assert tag_dict["bl"].startswith("Z:")
                                    Fseq = tag_dict["bl"][
                                        2:]  # trimming away the Z:
                                elif tag_dict["bc"] == "B:S,1,0":  # - strand
                                    assert tag_dict["bt"].startswith("Z:")
                                    Fseq = str(
                                        Seq(tag_dict["bt"]
                                            [2:]).reverse_complement())
                                seq2 = (
                                    Fseq[diff:] + seq2
                                )  # rescue bases from the trimmed F primer
                            except KeyError:
                                pass  # just silently not do anything and output the shorter UMI
                                # print("WARNING: older version of lima output, lacking 'bc' tag. Ignoring read {0}...".format(r.qname))
                        elif diff > 0:  # there's extras
                            seq_extra = seq2[:diff]
                            seq2 = seq2[diff:]

                        rec = {
                            "id": r.qname,
                            "clip_len": len(seq2),
                            "extra": seq_extra,
                            "UMI": seq2,
                            "BC": "NA",  # Brendan's current design has only UMI, no BC
                            "BC_rev": "NA",
                            "BC_match": "NA",
                            "BC_top_rank": "NA",
                        }
                        writer1.writerow(rec)

                        # subset the sequence to remove the UMI (but keep the G5 clip seq)
                        d["seq"] = d["seq"][c_num:]
                        d["qual"] = d["qual"][c_num:]
                        assert len(d["seq"]) == len(d["qual"])
                        new_tags = []
                        for tag in d["tags"]:
                            if tag.startswith(
                                    "zs:B"):  # defunct CCS tag, don't use
                                pass
                            elif (tag.startswith("dq:i:")
                                  or tag.startswith("iq:i:")
                                  or tag.startswith("sq:i:")):
                                tag = tag[:5] + tag[5 + c_num:]
                                new_tags.append(tag)
                            else:
                                new_tags.append(tag)
                        d["tags"] = new_tags
                        x = pysam.AlignedSegment.from_dict(d, r.header)
                        f2.write(x)
                elif UMI_type == "G5-10X":
                    # need to first invert the sequence so polyA is at the end
                    d["seq"] = str(Seq(d["seq"]).reverse_complement())
                    d["qual"] = d["qual"][::-1]
                    # now it is BC -- UMI -- TSO -- GGG -- transcript -- polyA
                    umi_bc_tso_len = bc_len + umi_len + tso_len
                    G_start, G_end = find_Gstart(
                        d["seq"][umi_bc_tso_len:umi_bc_tso_len + 10])

                    # pdb.set_trace()

                    if G_start >= 0:
                        G_start += umi_bc_tso_len
                        G_end += umi_bc_tso_len

                        seq2 = d["seq"][:G_start]  # this is BC - UMI - TSO
                        seq_tso = seq2[-tso_len:] + d["seq"][G_start:G_end]

                        diff = len(seq2) - umi_bc_tso_len
                        if diff > 0:  # beginning may have included untrimmed primers
                            seq_extra = seq2[:diff]
                            seq2 = seq2[diff:]
                            seq_bc = seq2[:bc_len]
                            seq_umi = seq2[bc_len:umi_bc_len]
                        elif diff == 0:
                            seq_extra = "NA"
                            seq_bc = seq2[:bc_len]
                            seq_umi = seq2[bc_len:umi_bc_len]
                        elif diff < 0:
                            # we may have accidentally trimmed away some bases for BC, can't do anything
                            seq_extra = "NA"
                            seq_bc = seq2[:bc_len + diff]
                            seq_umi = seq2[bc_len + diff:umi_bc_len + diff]

                        # reverse complement BC because it's always listed in rev comp in short read data
                        seq_bc_rev = str(Seq(seq_bc).reverse_complement())
                        match = "Y" if seq_bc_rev in shortread_bc else "N"
                        match_top = ("Y" if
                                     (match == "Y"
                                      and shortread_bc[seq_bc_rev] == "Y") else
                                     "N")

                        rec = {
                            "id": r.qname,
                            "clip_len": len(seq2) + (G_end - G_start),
                            "extra": seq_extra,
                            "UMI": seq_umi,
                            "BC": seq_bc,
                            "TSO": seq_tso,
                            "BC_rev": seq_bc_rev,
                            "BC_match": match,
                            "BC_top_rank": match_top,
                        }
                        writer1.writerow(rec)

                        # subset the sequence to remove the UMIs and "G"s
                        d["seq"] = d["seq"][G_end:]
                        d["qual"] = d["qual"][G_end:]
                        assert len(d["seq"]) == len(d["qual"])
                        new_tags = []
                        for tag in d["tags"]:
                            if tag.startswith(
                                    "zs:B"):  # defunct CCS tag, don't use
                                pass
                            elif (tag.startswith("dq:i:")
                                  or tag.startswith("iq:i:")
                                  or tag.startswith("sq:i:")):
                                tag = tag[:5] + tag[5 + G_end:]
                                new_tags.append(tag)
                            else:
                                new_tags.append(tag)
                        d["tags"] = new_tags
                        x = pysam.AlignedSegment.from_dict(d, r.header)
                        f2.write(x)
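A call sketch for the G5-10X design (file names and lengths are hypothetical; UMI_type accepts the values checked at the top of the function):

clip_out(
    bam_filename="ccs.primer_removed.bam",
    umi_len=10,
    bc_len=16,
    output_prefix="sample.flt",
    UMI_type="G5-10X",
    tso_len=30,
)
# writes sample.flt.trimmed.csv and sample.flt.trimmed.bam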
Example #22
def filter_by_count(
    input_prefix: str,
    output_prefix: str,
    min_count: int,
    dun_use_group_count: bool = False,
) -> None:

    group_filename = f"{input_prefix}.group.txt"
    count_filename = f"{input_prefix}.abundance.txt"
    gff_filename = f"{input_prefix}.gff"
    rep_filenames = [
        (f"{input_prefix}.rep.fq", "fastq"),
        (f"{input_prefix}.rep.fastq", "fastq"),
        (f"{input_prefix}.rep.fa", "fasta"),
        (f"{input_prefix}.rep.fasta", "fasta"),
    ]

    rep_filename = None
    rep_type = None
    for x, feature in rep_filenames:
        if os.path.exists(x):
            rep_filename = x
            rep_type = feature

    if rep_filename is None:
        logger.error(
            f"Expected to find input fasta or fastq files {input_prefix}.rep.fa or {input_prefix}.rep.fq. Not found. Abort!"
        )
        sys.exit(-1)

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        for line in open(group_filename):
            # ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split("\t")
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(",")
            for m in members:
                i = m.find("|")
                if i > 0:
                    tmp = m.split("|")[1].split("/")[1]  # ex: tmp = f30p16
                else:
                    tmp = m.split("/")[1]
                fl_count, p_count = tmp.split("p")
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
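                # e.g. tmp = "f30p16" -> fl_count = 30 (leading "f" stripped), p_count = 16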
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)

    # read abundance first
    with open(count_filename) as f:
        count_header = ""
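        # collect the leading "#" comment lines so they can be re-emitted in the output abundance file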
        while True:
            cur_pos = f.tell()
            line = f.readline()
            if not line.startswith("#"):
                f.seek(cur_pos)
                break
            else:
                count_header += line
        d = {r["pbid"]: r for r in DictReader(f, delimiter="\t")}
        logger.debug(f"Read {len(d)} isoform records from {count_filename}")

    # group_max_count_p NOT used for now
    good = [
        x
        for x in d
        if int(d[x]["count_fl"]) >= min_count
        and (dun_use_group_count or group_max_count_fl[x] >= min_count)
    ]

    # write output GFF
    with open(f"{output_prefix}.gff", "w") as f:
        for r in GFF.collapseGFFReader(gff_filename):
            if r.seqid in good:
                GFF.write_collapseGFF_format(f, r)

    # write output rep.fq
    with open(
        f"{output_prefix}.rep.{('fq' if rep_type == 'fastq' else 'fa')}", "w"
    ) as f:
        for r in SeqIO.parse(open(rep_filename), rep_type):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, rep_type)

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(
        f"Output written to: {output_prefix}.gff\n"
        f"Output written to: {output_prefix}.rep.{'fq' if rep_type == 'fastq' else 'fa'}\n"
        f"Output written to: {output_prefix}.abundance.txt"
    )
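# Minimal usage sketch for filter_by_count; the "sample.collapsed" prefix is a hypothetical
# placeholder and assumes sample.collapsed.group.txt/.abundance.txt/.gff/.rep.fq exist.
if __name__ == "__main__":
    filter_by_count(
        input_prefix="sample.collapsed",
        output_prefix="sample.collapsed.min_fl_2",
        min_count=2,
    )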
def demux_for_subsamping(
    class_filename,
    fasta_filename,
    demux_count_file,
    output_prefix,
    out_group_dict,
    ignore_novel,
):
    # read SQANTI classification to get known gene/transcript name
    d = {}  # pbid --> record
    for r in DictReader(open(class_filename), delimiter="\t"):
        d[r["isoform"]] = r

    # get read lengths
    lens = {}  # pbid -> length
    for r in SeqIO.parse(open(fasta_filename), "fasta"):
        lens[r.id] = len(r.seq)

    writers = {}
    handles = {}
    out_groups = set(out_group_dict.values())
    for g in out_groups:
        handles[g] = open(
            f"{output_prefix}_{g}_only.{'ignore_novel' if ignore_novel else 'use_novel'}.for_subsampling.txt",
            "w",
        )
        writers[g] = DictWriter(handles[g], FIELDNAMES, delimiter="\t")
        writers[g].writeheader()

    reader = DictReader(open(demux_count_file), delimiter=",")
    for r in reader:
        if r["id"] not in d:
            logger.warning(
                f"Skipping {r['id']} because it is not in {class_filename}"
            )
            continue

        m = pbid_rex.match(r["id"])
        if m is None:
            logger.error(
                f"Unable to parse ID {r['id']}. Expected format PB.X.Y!"
            )
            sys.exit(-1)

        newrec = {
            "pbid": r["id"],
            "pbgene": m.group(1),
            "length": lens[r["id"]]
        }

        gene = d[r["id"]]["associated_gene"]
        trans = d[r["id"]]["associated_transcript"]
        if gene.startswith("novel") and ignore_novel:
            gene = "NA"
        if trans.startswith("novel"):
            if ignore_novel:
                trans = "NA"
            else:
                trans += r["id"]  # append the ID to make this "novel" transcript unique
        newrec["refgene"] = gene
        newrec["refisoform"] = trans

        group_counts = Counter()
        for b, g in out_group_dict.items():
            group_counts[g] += int(r[b])

        for g in out_groups:
            newrec["fl_count"] = group_counts[g]
            writers[g].writerow(newrec)

    for h in handles.values():
        h.close()
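# Minimal usage sketch for demux_for_subsamping; all filenames and the barcode-to-group
# mapping below are hypothetical placeholders, not values from the source.
if __name__ == "__main__":
    demux_for_subsamping(
        class_filename="sample_classification.txt",     # SQANTI classification (TSV)
        fasta_filename="sample.collapsed.rep.fa",
        demux_count_file="sample.mapped_fl_count.txt",  # demuxed FL counts (CSV)
        output_prefix="sample.demux",
        out_group_dict={"bc1001": "tumor", "bc1002": "normal"},
        ignore_novel=True,
    )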
def make_file_for_subsample(
    input_prefix: str,
    output_prefix: str,
    demux_file=None,
    matchAnnot_parsed=None,
    sqanti_class=None,
    include_single_exons=False,
) -> None:
    """
    Two files must exist: .abundance.txt and .rep.fq so we can make the length
    """
    count_filename = f"{input_prefix}.abundance.txt"

    rep_filenames = [
        (f"{input_prefix}.rep.fq", "fastq"),
        (f"{input_prefix}.rep.fastq", "fastq"),
        (f"{input_prefix}.rep.fa", "fasta"),
        (f"{input_prefix}.rep.fasta", "fasta"),
    ]

    rep_filename = None
    rep_type = None
    for x, feature in rep_filenames:
        if Path(x).exists():
            rep_filename = x
            rep_type = feature
            break  # use the first rep file found, in the priority order listed above

    if rep_filename is None:
        logger.error(
            f"Expected to find input fasta or fastq files {input_prefix}.rep.fa or {input_prefix}.rep.fq. Not found. Abort!"
        )
        sys.exit(-1)

    if not include_single_exons:
        from cupcake.sequence.GFF import collapseGFFReader

        gff_filename = f"{input_prefix}.gff"
        logger.info(f"Reading {gff_filename} to exclude single exons...")
        good_ids = [
            r.seqid for r in collapseGFFReader(gff_filename) if len(r.ref_exons) >= 2
        ]
    else:
        good_ids = []

    if demux_file is None and not Path(count_filename).exists():
        logger.error(f"Cannot find {count_filename}. Abort!")
        sys.exit(-1)

    if matchAnnot_parsed is not None and not Path(matchAnnot_parsed).exists():
        logger.error(f"Cannot find {matchAnnot_parsed}. Abort!")
        sys.exit(-1)

    if sqanti_class is not None and not Path(sqanti_class).exists():
        logger.error(f"Cannot find {sqanti_class}. Abort!")
        sys.exit(-1)

    if matchAnnot_parsed is not None:
        with open(matchAnnot_parsed) as ma:
            match_dict = {r["pbid"]: r for r in DictReader(ma, delimiter="\t")}
        for k in match_dict:
            match_dict[k]["category"] = match_dict[k]["score"]
    elif sqanti_class is not None:
        logger.info(f"Reading {sqanti_class} to get gene/isoform assignment...")
        match_dict = {}
        with open(sqanti_class) as sc:
            for r in DictReader(sc, delimiter="\t"):
                if r["associated_transcript"] == "novel":
                    refisoform = f"novel_{r['isoform']}"
                else:
                    refisoform = r["associated_transcript"]
                match_dict[r["isoform"]] = {
                    "refgene": r["associated_gene"],
                    "refisoform": refisoform,
                    "category": r["structural_category"],
                }
    else:
        match_dict = None
    with open(rep_filename) as rf:
        seqlen_dict = {
            r.id.split("|")[0]: len(r.seq) for r in SeqIO.parse(rf, rep_type)
        }

    to_write = {}
    if demux_file is None:
        to_write["all"] = {}
        with open(count_filename) as f:
            while True:
                cur = f.tell()
                if not f.readline().startswith("#"):
                    f.seek(cur)
                    break
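            # the "#" header lines have been skipped; DictReader now starts at the column-name row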
            for r in DictReader(f, delimiter="\t"):
                if r["pbid"] in good_ids or include_single_exons:
                    to_write["all"][r["pbid"]] = r["count_fl"]
    else:
        d, samples = read_demux_fl_count_file(demux_file)
        for s in samples:
            to_write[s] = {}
        for pbid, d2 in d.items():
            for s in samples:
                if pbid in good_ids or include_single_exons:
                    to_write[s][pbid] = d2[s]

    for sample in to_write:
        with Path(f"{output_prefix}.{sample}.txt").open("w") as h:
            if matchAnnot_parsed is None and sqanti_class is None:
                h.write("pbid\tpbgene\tlength\tfl_count\n")
            else:
                h.write(
                    "pbid\tpbgene\tlength\trefisoform\trefgene\tcategory\tfl_count\n"
                )
            for pbid in to_write[sample]:
                if matchAnnot_parsed is not None or sqanti_class is not None:
                    if pbid not in match_dict:
                        logger.warning(
                            f"Ignoring {pbid} because it is not in the annotation (SQANTI/MatchAnnot) file."
                        )
                        continue
                    m = match_dict[pbid]
                    h.write(f"{pbid}\t{pbid.split('.')[1]}\t{seqlen_dict[pbid]}\t")
                    h.write(f'{m["refisoform"]}\t{m["refgene"]}\t{m["category"]}\t')
                else:
                    h.write(f'{pbid}\t{pbid.split(".")[1]}\t{seqlen_dict[pbid]}\t')
                h.write(f"{to_write[sample][pbid]}\n")
            logger.info(
                f"Output written to {Path(f'{output_prefix}.{sample}.txt').resolve()}."
            )
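# Minimal usage sketch for make_file_for_subsample; the prefixes and the SQANTI classification
# filename are hypothetical placeholders.
if __name__ == "__main__":
    make_file_for_subsample(
        input_prefix="sample.collapsed",
        output_prefix="sample.for_subsample",
        sqanti_class="sample_classification.txt",
        include_single_exons=False,
    )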
Exemplo n.º 25
0
def collate_gene_info(
    group_filename,
    csv_filename,
    class_filename,
    output_filename,
    ontarget_filename=None,
    dedup_ORF_prefix=None,
    no_extra_base=False,
    is_clustered=False,
):
    """
    <id>, <pbid>, <length>, <transcript>, <gene>, <category>, <ontarget Y|N|NA>, <ORFgroup NA|NoORF|groupID>, <UMI>, <BC>
    """
    FIELDS = [
        "id",
        "pbid",
        "length",
        "transcript",
        "gene",
        "category",
        "ontarget",
        "ORFgroup",
        "UMI",
        "UMIrev",
        "BC",
        "BCrev",
    ]

    group_info = read_group_info(group_filename)
    umi_bc_info = {
        r["id"]: r
        for r in DictReader(open(csv_filename), delimiter="\t")
    }
    sqanti_info = {
        r["isoform"]: r
        for r in DictReader(open(class_filename), delimiter="\t")
    }
    if ontarget_filename is not None:
        ontarget_info = {
            r["read_id"]: r
            for r in DictReader(open(ontarget_filename), delimiter="\t")
        }

    if dedup_ORF_prefix is not None:
        # seqid --> which ORF group it belongs to (ex: PB.1.2 --> ORFgroup_PB.1_1)
        dedup_ORF_info = {}
        for line in open(f"{dedup_ORF_prefix}.group.txt"):
            group_id, members = line.strip().split("\t")
            for pbid in members.split(","):
                dedup_ORF_info[pbid] = group_id

    f = open(output_filename, "w")
    writer = DictWriter(f, FIELDS, delimiter="\t")
    writer.writeheader()

    for ccs_id, pbid in group_info.items():
        if pbid not in sqanti_info:
            logger.warning(f"Ignoring ID {pbid} because it is not in the classification file.")
            continue

        if is_clustered:
            # id: 1-ATCGAATGT-GCTTCTTTCACCTATCGATGATGGCTCAT-m64015_200531_015713/110297924/ccs
            _index, _umi, _bc, _ccs_id = ccs_id.split("-")
            ccs_id = _ccs_id

        if (no_extra_base and not is_clustered
                and umi_bc_info[ccs_id]["extra"] != "NA"):
            logger.info(f"Ignoring ID {pbid} because it has extra bases.")
            continue
        rec = {"id": ccs_id, "pbid": pbid}
        rec["length"] = sqanti_info[pbid]["length"]
        rec["category"] = sqanti_info[pbid]["structural_category"]
        rec["transcript"] = sqanti_info[pbid]["associated_transcript"]
        rec["gene"] = sqanti_info[pbid]["associated_gene"]

        if is_clustered:
            rec["UMI"] = _umi
            rec["BC"] = _bc
        else:
            rec["UMI"] = umi_bc_info[ccs_id]["UMI"]
            rec["BC"] = umi_bc_info[ccs_id]["BC"]
        rec["UMIrev"] = Seq(rec["UMI"]).reverse_complement()
        rec["BCrev"] = Seq(rec["BC"]).reverse_complement()
        if ontarget_filename is None:
            rec["ontarget"] = "NA"
        else:
            rec["ontarget"] = "Y" if ontarget_info[pbid]["genes"] != "" else "N"
        if dedup_ORF_prefix is None:
            rec["ORFgroup"] = "NA"
        else:
            if pbid not in dedup_ORF_info:
                rec["ORFgroup"] = "NoORF"
            else:
                rec["ORFgroup"] = dedup_ORF_info[pbid]

        writer.writerow(rec)

    f.close()
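# Minimal usage sketch for collate_gene_info; the filenames below are hypothetical placeholders
# for the collapse group file, the per-CCS UMI/BC CSV, and the SQANTI classification.
if __name__ == "__main__":
    collate_gene_info(
        group_filename="sample.collapsed.group.txt",
        csv_filename="sample.trimmed.csv",
        class_filename="sample_classification.txt",
        output_filename="sample.annotated.csv",
    )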