Exemplos de parse_ds_filename em Python, exemplos de pbtranscript.io.parse_ds_filename em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: FilteringUtils.py Projeto: natechols/pbtranscript

def write_good_collapsed_isoforms(in_abundance_filename, in_gff_filename, in_rep_filename,
                                  out_abundance_filename, out_gff_filename, out_rep_filename,
                                  good):
    """Write good collapsed isoforms.

    Copy from the input GFF, representative-sequence and abundance files only
    those records whose identifier is contained in `good`.

    Parameters:
      in_abundance_filename -- input abundance file
      in_gff_filename -- input collapsed GFF file
      in_rep_filename -- input representative isoforms, FASTA or FASTQ
      out_abundance_filename -- output abundance file
      out_gff_filename -- output collapsed GFF file
      out_rep_filename -- output representative isoforms, same format as input
      good -- container of identifiers to keep (only tested with `in`)

    Raises:
      ValueError -- if the input and output representative files differ in
                    format, or the format is neither FASTA nor FASTQ.
    """
    in_suffix = parse_ds_filename(in_rep_filename)[1]
    out_suffix = parse_ds_filename(out_rep_filename)[1]
    if in_suffix != out_suffix:
        raise ValueError("Format of input %s and output %s must match." %
                         (in_rep_filename, out_rep_filename))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError("Format of input %s and output %s must be either FASTA or FASTQ." %
                         (in_rep_filename, out_rep_filename))

    # first read gff, and write good gff record.
    with CollapseGffWriter(out_gff_filename) as gff_writer:
        for r in CollapseGffReader(in_gff_filename):
            if r.seqid in good:
                gff_writer.writeRecord(r)

    # next read rep fasta/fastq, and write good rep fasta/fastq record.
    if in_suffix == "fasta":
        rep_reader = FastaReader(in_rep_filename)
    else:
        rep_reader = FastqReader(in_rep_filename)
    try:
        if in_suffix == "fasta":
            rep_writer = FastaWriter(out_rep_filename)
        else:
            rep_writer = FastqWriter(out_rep_filename)
        try:
            for r in rep_reader:
                # r.name e.g., PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465
                # The first '|'-separated field is the identifier looked up in `good`.
                if r.name.split('|')[0] in good:
                    rep_writer.writeRecord(r)
        finally:
            # Close explicitly so the output is flushed even if filtering fails.
            rep_writer.close()
    finally:
        rep_reader.close()

    # finally write abundance info of good records.
    with AbundanceReader(in_abundance_filename) as a_reader, \
        AbundanceWriter(out_abundance_filename, comments=a_reader.comments) as a_writer:
        for r in a_reader:
            if r.pbid in good:
                a_writer.writeRecord(r)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: CollapseIsoforms.py Projeto: ylipacbio/pbtranscript

    def __init__(self, isoform_filename, sam_filename, output_prefix,
                 min_aln_coverage, min_aln_identity, min_flnc_coverage,
                 max_fuzzy_junction, allow_extra_5exon, skip_5_exon_alt):
        """
        Store the inputs and thresholds used to collapse isoforms.

        Parameters:
          isoform_filename -- input file containing isoforms, as fastq|fasta|contigset
          sam_filename -- input sam file, sorted, produced by mapping the isoforms to reference
          output_prefix -- prefix handed to the parent class initializer
          min_aln_coverage -- min coverage over reference to collapse a group of isoforms
          min_aln_identity -- min identity aligning to reference to collapse a group of isoforms
          min_flnc_coverage -- min supportive flnc reads to not ignore an isoform
                               Must be 1 when collapsing consensus isoforms, which is the case in production isoseq.
                               Can be >= 1 only when directly collapsing FLNC reads.
          max_fuzzy_junction -- max edit distance between fuzzy-matching exons
          allow_extra_5exon -- whether or not to allow shorter 5' exons
          skip_5_exon_alt -- whether or not to skip alternative 5' exons
        """
        # The format suffix is recorded before the parent initializer runs.
        parsed_name = parse_ds_filename(isoform_filename)
        self.suffix = parsed_name[1]
        super(CollapseIsoformsRunner, self).__init__(
            prefix=output_prefix,
            allow_extra_5exon=allow_extra_5exon)

        # Inputs: uncollapsed isoforms (fa|fq|ds) and the sorted gmap sam.
        self.isoform_filename = isoform_filename
        self.sam_filename = sam_filename

        # Thresholds are coerced to their numeric types.
        self.min_aln_coverage = float(min_aln_coverage)
        self.min_aln_identity = float(min_aln_identity)
        self.min_flnc_coverage = int(min_flnc_coverage)
        self.max_fuzzy_junction = int(max_fuzzy_junction)

        # Flags are coerced to plain booleans.
        self.allow_extra_5exon = bool(allow_extra_5exon)
        self.skip_5_exon_alt = bool(skip_5_exon_alt)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: CollapseIsoforms.py Projeto: wenmm/pbtranscript

    def __init__(self, isoform_filename, sam_filename, output_prefix,
                 min_aln_coverage, min_aln_identity, min_flnc_coverage,
                 max_fuzzy_junction, allow_extra_5exon, skip_5_exon_alt):
        """
        Record input files and collapsing thresholds on the runner.

        Parameters:
          isoform_filename -- input file containing isoforms, as fastq|fasta|contigset
          sam_filename -- input sam file, sorted, produced by mapping the isoforms to reference
          output_prefix -- prefix forwarded to the parent class initializer
          min_aln_coverage -- min coverage over reference to collapse a group of isoforms
          min_aln_identity -- min identity aligning to reference to collapse a group of isoforms
          min_flnc_coverage -- min supportive flnc reads to not ignore an isoform
                               Must be 1 when collapsing consensus isoforms, which is the case in production isoseq.
                               Can be >= 1 only when directly collapsing FLNC reads.
          max_fuzzy_junction -- max edit distance between fuzzy-matching exons
          allow_extra_5exon -- whether or not to allow shorter 5' exons
          skip_5_exon_alt -- whether or not to skip alternative 5' exons
        """
        # Format suffix of the input is set before delegating to the parent.
        name_parts = parse_ds_filename(isoform_filename)
        self.suffix = name_parts[1]
        super(CollapseIsoformsRunner, self).__init__(
            prefix=output_prefix, allow_extra_5exon=allow_extra_5exon)

        # input, uncollapsed fa|fq|ds
        self.isoform_filename = isoform_filename
        # input, sorted, gmap sam
        self.sam_filename = sam_filename

        # Numeric thresholds, converted to float/int.
        self.min_aln_coverage = float(min_aln_coverage)
        self.min_aln_identity = float(min_aln_identity)
        self.min_flnc_coverage = int(min_flnc_coverage)
        self.max_fuzzy_junction = int(max_fuzzy_junction)

        # Boolean switches.
        self.allow_extra_5exon = bool(allow_extra_5exon)
        self.skip_5_exon_alt = bool(skip_5_exon_alt)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: FilteringUtils.py Projeto: lpp1985/lpp_Script

def write_good_collapsed_isoforms(in_abundance_filename, in_gff_filename,
                                  in_rep_filename, out_abundance_filename,
                                  out_gff_filename, out_rep_filename, good):
    """Write good collapsed isoforms.

    Copy from the input GFF, representative-sequence and abundance files only
    those records whose identifier is contained in `good`.

    Parameters:
      in_abundance_filename -- input abundance file
      in_gff_filename -- input collapsed GFF file
      in_rep_filename -- input representative isoforms, FASTA or FASTQ
      out_abundance_filename -- output abundance file
      out_gff_filename -- output collapsed GFF file
      out_rep_filename -- output representative isoforms, same format as input
      good -- container of identifiers to keep (only tested with `in`)

    Raises:
      ValueError -- if the input and output representative files differ in
                    format, or the format is neither FASTA nor FASTQ.
    """
    in_suffix = parse_ds_filename(in_rep_filename)[1]
    out_suffix = parse_ds_filename(out_rep_filename)[1]
    if in_suffix != out_suffix:
        raise ValueError("Format of input %s and output %s must match." %
                         (in_rep_filename, out_rep_filename))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError(
            "Format of input %s and output %s must be either FASTA or FASTQ." %
            (in_rep_filename, out_rep_filename))

    # first read gff, and write good gff record.
    with CollapseGffWriter(out_gff_filename) as gff_writer:
        for r in CollapseGffReader(in_gff_filename):
            if r.seqid in good:
                gff_writer.writeRecord(r)

    # next read rep fasta/fastq, and write good rep fasta/fastq record.
    if in_suffix == "fasta":
        rep_reader = FastaReader(in_rep_filename)
    else:
        rep_reader = FastqReader(in_rep_filename)
    try:
        if in_suffix == "fasta":
            rep_writer = FastaWriter(out_rep_filename)
        else:
            rep_writer = FastqWriter(out_rep_filename)
        try:
            for r in rep_reader:
                # r.name e.g., PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465
                # The first '|'-separated field is the identifier looked up in `good`.
                if r.name.split('|')[0] in good:
                    rep_writer.writeRecord(r)
        finally:
            # Close explicitly so the output is flushed even if filtering fails.
            rep_writer.close()
    finally:
        rep_reader.close()

    # finally write abundance info of good records.
    with AbundanceReader(in_abundance_filename) as a_reader, \
        AbundanceWriter(out_abundance_filename, comments=a_reader.comments) as a_writer:
        for r in a_reader:
            if r.pbid in good:
                a_writer.writeRecord(r)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: collapse_mapped_isoforms.py Projeto: lpp1985/lpp_Script

def args_runner(args):
    """Run given input args.

    Collapse isoforms with a CollapseIsoformsRunner built from `args`, then,
    if `args.collapsed_isoforms` is given, expose the representative isoforms
    under that name.

    Returns:
      0 on success.

    Raises:
      IOError -- if the requested collapsed isoform file does not exist in
                 the requested format and cannot be derived from the FASTA.
    """
    c = CollapseIsoformsRunner(isoform_filename=args.input_isoforms,
                               sam_filename=args.sam_filename,
                               output_prefix=args.output_prefix,
                               min_aln_coverage=args.min_aln_coverage,
                               min_aln_identity=args.min_aln_identity,
                               min_flnc_coverage=args.min_flnc_coverage,
                               max_fuzzy_junction=args.max_fuzzy_junction,
                               allow_extra_5exon=args.allow_extra_5exon,
                               skip_5_exon_alt=args.skip_5_exon_alt)
    c.run()

    if args.collapsed_isoforms is not None:
        suffix = parse_ds_filename(args.collapsed_isoforms)[1]
        rep_fn = c.rep_fn(suffix)
        if op.exists(rep_fn):
            ln(rep_fn, args.collapsed_isoforms)
        elif suffix == "contigset.xml":
            # The suffix carries no leading dot (compare "fasta"/"fastq"), so
            # the previous ".contigset.xml" test could never match.
            # make contigset from fasta
            as_contigset(c.rep_fn("fasta"), args.collapsed_isoforms)
        else:
            raise IOError("Could not make collapsed isoform file %s" % args.collapsed_isoforms)
    return 0

Exemplo n.º 6

0

Exibir arquivo

Arquivo: collapse_mapped_isoforms.py Projeto: wenmm/pbtranscript

def args_runner(args):
    """Run given input args.

    Collapse isoforms with a CollapseIsoformsRunner built from `args`, then,
    if `args.collapsed_isoforms` is given, expose the representative isoforms
    under that name.

    Returns:
      0 on success.

    Raises:
      IOError -- if the requested collapsed isoform file does not exist in
                 the requested format and cannot be derived from the FASTA.
    """
    c = CollapseIsoformsRunner(isoform_filename=args.input_isoforms,
                               sam_filename=args.sam_filename,
                               output_prefix=args.output_prefix,
                               min_aln_coverage=args.min_aln_coverage,
                               min_aln_identity=args.min_aln_identity,
                               min_flnc_coverage=args.min_flnc_coverage,
                               max_fuzzy_junction=args.max_fuzzy_junction,
                               allow_extra_5exon=args.allow_extra_5exon,
                               skip_5_exon_alt=args.skip_5_exon_alt)
    c.run()

    if args.collapsed_isoforms is not None:
        suffix = parse_ds_filename(args.collapsed_isoforms)[1]
        rep_fn = c.rep_fn(suffix)
        if op.exists(rep_fn):
            ln(rep_fn, args.collapsed_isoforms)
        elif suffix == "contigset.xml":
            # The suffix carries no leading dot (compare "fasta"/"fastq"), so
            # the previous ".contigset.xml" test could never match.
            # make contigset from fasta
            as_contigset(c.rep_fn("fasta"), args.collapsed_isoforms)
        else:
            raise IOError("Could not make collapsed isoform file %s" %
                          args.collapsed_isoforms)
    return 0

Exemplo n.º 7

0

Exibir arquivo

Arquivo: CollapsingUtils.py Projeto: palfalvi/pbtranscript

def pick_rep(isoform_filename,
             gff_filename,
             group_filename,
             output_filename,
             pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If is FASTA file -- then always pick the longest one
    If is FASTQ file -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one

    Parameters:
      isoform_filename -- input isoforms, as fasta|fastq|contigset.xml
      gff_filename -- collapsed GFF giving coordinates per transcript id
      group_filename -- group file listing the members of each collapsed group
      output_filename -- output representatives, as fasta|fastq|contigset.xml
      pick_least_err_instead -- for FASTQ input, pick by expected errors
                                instead of by length
      bad_gff_filename -- optional second GFF whose coordinates are also loaded

    Raises:
      IOError -- if the input or output file type is not recognized, or a
                 contigset is neither a single FASTQ nor indexed.
      ValueError -- if FASTQ output is requested for non-FASTQ input, or a
                    group id has no coordinates in the GFF file(s).
    """
    fd = None
    is_fq = False
    _suffix = parse_ds_filename(isoform_filename)[1]
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        # Both extensions must be guarded by the single-file check; the old
        # `a and b or c` form indexed _fns[0] even when the list was empty.
        if len(_fns) == 1 and _fns[0].endswith((".fq", ".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        elif not fd.isIndexed:
            # Must be indexed FASTA, or exactly contains one FASTQ file
            raise IOError(
                "%s must contain either indexed FASTA files or " %
                isoform_filename + "contain exactly one FASTQ file!")
    else:
        raise IOError("Unable to recognize file type of %s." %
                      isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." %
                             isoform_filename)
        fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    # Error-based selection only applies to FASTQ input.
    use_err = is_fq and pick_least_err_instead

    fa_writer = None
    fq_writer = None
    try:
        if fa_out_fn is not None:
            fa_writer = FastaWriter(fa_out_fn)
        if fq_out_fn is not None:
            fq_writer = FastqWriter(fq_out_fn)

        # Load coordinates from the good GFF and, when given, the bad GFF
        # (previously the good GFF was read twice and the bad one never).
        gff_fns = [gff_filename]
        if bad_gff_filename is not None:
            gff_fns.append(bad_gff_filename)
        coords = {}
        for gff_fn in gff_fns:
            for r in CollapseGffReader(gff_fn):
                coords[r.transcript_id] = "{0}:{1}-{2}({3})".format(
                    r.seqid, r.start, r.end, r.strand)

        for group in GroupReader(group_filename):
            pb_id, members = group.name, group.members
            if pb_id not in coords:
                raise ValueError("Could not find %s in %s and %s" %
                                 (pb_id, gff_filename, bad_gff_filename))
            best_id = None
            best_seq = None
            best_qual = None
            best_err = float("inf")
            max_len = 0

            for x in members:
                rec = fd[x]  # fetch the record once per member
                if use_err:
                    # Expected number of errors: sum of 10^(-Q/10) over the
                    # qualities (assumes Phred-scaled values -- TODO confirm).
                    err = sum(10 ** -(q / 10.) for q in rec.quality)
                    is_better = err < best_err
                else:
                    err = best_err
                    # '>=' keeps the last of equally long members.
                    is_better = len(rec.sequence) >= max_len
                if is_better:
                    best_id = x
                    best_seq = rec.sequence
                    if is_fq:
                        best_qual = rec.quality
                        best_err = err
                    max_len = len(rec.sequence)

            _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
            if fq_writer is not None:
                fq_writer.writeRecord(_id_, best_seq, best_qual)
            if fa_writer is not None:
                fa_writer.writeRecord(_id_, best_seq)
    finally:
        # Always release the output handles, even when a group fails.
        if fa_writer is not None:
            fa_writer.close()
        if fq_writer is not None:
            fq_writer.close()

    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: post_mapping_to_genome.py Projeto: wenmm/pbtranscript

def post_mapping_to_genome_runner(
        in_isoforms,
        in_sam,
        in_pickle,
        out_isoforms,
        out_gff,
        out_abundance,
        out_group,
        out_read_stat,
        min_aln_coverage=cmi.Constants.MIN_ALN_COVERAGE_DEFAULT,
        min_aln_identity=cmi.Constants.MIN_ALN_IDENTITY_DEFAULT,
        min_flnc_coverage=cmi.Constants.MIN_FLNC_COVERAGE_DEFAULT,
        max_fuzzy_junction=cmi.Constants.MAX_FUZZY_JUNCTION_DEFAULT,
        allow_extra_5exon=cmi.Constants.ALLOW_EXTRA_5EXON_DEFAULT,
        skip_5_exon_alt=cmi.Constants.SKIP_5_EXON_ALT_DEFAULT,
        min_count=fci.Constants.MIN_COUNT_DEFAULT,
        to_filter_out_subsets=True):
    """
    (1) Collapse isoforms and merge fuzzy junctions if needed.
    (2) Generate read stat file and abundance file
    (3) Based on abundance file, filter collapsed isoforms by min FL count
    (4) If to_filter_out_subsets is True, remove collapsed isoforms which are
        a subset of another isoform
    (5) Link the final files to the requested output names (outputs given as
        None are skipped)

    Raises:
      ValueError -- if in_isoforms and out_isoforms differ in format, or the
                    format is neither FASTA nor FASTQ.
    """
    # Must stay the first statement so locals() holds only the arguments.
    log.info('args: {!r}'.format(locals()))
    # Check input and output format
    in_suffix = parse_ds_filename(in_isoforms)[1]
    out_prefix, out_suffix = parse_ds_filename(out_isoforms)
    if in_suffix != out_suffix:
        raise ValueError(
            "Format of input and output isoforms %s, %s must be the same." %
            (in_isoforms, out_isoforms))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError(
            "Format of input and output isoforms %s, %s must be FASTA or FASTQ."
            % (in_isoforms, out_isoforms))

    #(1) Collapse isoforms and merge fuzzy junctions if needed.
    cf = CollapsedFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon)
    cir = CollapseIsoformsRunner(isoform_filename=in_isoforms,
                                 sam_filename=in_sam,
                                 output_prefix=out_prefix,
                                 min_aln_coverage=min_aln_coverage,
                                 min_aln_identity=min_aln_identity,
                                 min_flnc_coverage=min_flnc_coverage,
                                 max_fuzzy_junction=max_fuzzy_junction,
                                 allow_extra_5exon=allow_extra_5exon,
                                 skip_5_exon_alt=skip_5_exon_alt)
    cir.run()

    # (2) Generate read stat file and abundance file
    cr = CountRunner(group_filename=cf.group_fn,
                     pickle_filename=in_pickle,
                     output_read_stat_filename=cf.read_stat_fn,
                     output_abundance_filename=cf.abundance_fn)
    cr.run()

    # (3) Filter collapsed isoforms by min FL count based on abundance file.
    fff = FilteredFiles(prefix=out_prefix,
                        allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count,
                        filter_out_subsets=False)
    filter_by_count(in_group_filename=cf.group_fn,
                    in_abundance_filename=cf.abundance_fn,
                    in_gff_filename=cf.good_gff_fn,
                    in_rep_filename=cf.rep_fn(out_suffix),
                    out_abundance_filename=fff.filtered_abundance_fn,
                    out_gff_filename=fff.filtered_gff_fn,
                    out_rep_filename=fff.filtered_rep_fn(out_suffix),
                    min_count=min_count)

    fft = FilteredFiles(prefix=out_prefix,
                        allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count,
                        filter_out_subsets=True)
    # (4) Remove collapsed isoforms which are a subset of another isoform
    if to_filter_out_subsets is True:
        filter_out_subsets(in_abundance_filename=fff.filtered_abundance_fn,
                           in_gff_filename=fff.filtered_gff_fn,
                           in_rep_filename=fff.filtered_rep_fn(out_suffix),
                           out_abundance_filename=fft.filtered_abundance_fn,
                           out_gff_filename=fft.filtered_gff_fn,
                           out_rep_filename=fft.filtered_rep_fn(out_suffix),
                           max_fuzzy_junction=max_fuzzy_junction)
        # From here on the subset-filtered files are the final ones.
        fff = fft

    # (5) ln outputs files
    # Each entry: (produced file, requested output or None, log message).
    outputs = [
        (fff.filtered_rep_fn(out_suffix), out_isoforms,
         "Collapsed and filtered isoform sequences written to %s"),
        (fff.filtered_gff_fn, out_gff,
         "Collapsed and filtered isoform annotations written to %s"),
        (fff.filtered_abundance_fn, out_abundance,
         "Collapsed and filtered isoform abundance info written to %s"),
        (fff.group_fn, out_group,
         "Collapsed isoform groups written to %s"),
        (fff.read_stat_fn, out_read_stat,
         "Read status of FL and nFL reads written to %s"),
    ]
    for src, dst, _ in outputs:
        if dst is not None:
            ln(src, dst)

    # Log the flag value; `filter_out_subsets` is the function, not the flag.
    logging.info("Filter arguments: min_count = %s, filter_out_subsets=%s",
                 min_count, to_filter_out_subsets)
    for src, dst, message in outputs:
        logging.info(message,
                     realpath(dst) if dst is not None else realpath(src))

Exemplo n.º 9

0

Exibir arquivo

Arquivo: CollapsingUtils.py Projeto: lpp1985/lpp_Script

def pick_rep(isoform_filename, gff_filename,
             group_filename, output_filename,
             pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If is FASTA file -- then always pick the longest one
    If is FASTQ file -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one

    Parameters:
      isoform_filename -- input isoforms, as fasta|fastq|contigset.xml
      gff_filename -- collapsed GFF giving coordinates per transcript id
      group_filename -- group file listing the members of each collapsed group
      output_filename -- output representatives, as fasta|fastq|contigset.xml
      pick_least_err_instead -- for FASTQ input, pick by expected errors
                                instead of by length
      bad_gff_filename -- optional second GFF whose coordinates are also loaded

    Raises:
      IOError -- if the input or output file type is not recognized, or a
                 contigset is neither a single FASTQ nor indexed.
      ValueError -- if FASTQ output is requested for non-FASTQ input, or a
                    group id has no coordinates in the GFF file(s).
    """
    fd = None
    is_fq = False
    _suffix = parse_ds_filename(isoform_filename)[1]
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        # Both extensions must be guarded by the single-file check; the old
        # `a and b or c` form indexed _fns[0] even when the list was empty.
        if len(_fns) == 1 and _fns[0].endswith((".fq", ".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        elif not fd.isIndexed:
            # Must be indexed FASTA, or exactly contains one FASTQ file
            raise IOError("%s must contain either indexed FASTA files or " % isoform_filename +
                          "contain exactly one FASTQ file!")
    else:
        raise IOError("Unable to recognize file type of %s." % isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." % isoform_filename)
        fq_out_fn = output_filename
    elif _suffix == "contigset.xml": # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    # Error-based selection only applies to FASTQ input.
    use_err = is_fq and pick_least_err_instead

    fa_writer = None
    fq_writer = None
    try:
        if fa_out_fn is not None:
            fa_writer = FastaWriter(fa_out_fn)
        if fq_out_fn is not None:
            fq_writer = FastqWriter(fq_out_fn)

        # Load coordinates from the good GFF and, when given, the bad GFF
        # (previously the good GFF was read twice and the bad one never).
        gff_fns = [gff_filename]
        if bad_gff_filename is not None:
            gff_fns.append(bad_gff_filename)
        coords = {}
        for gff_fn in gff_fns:
            for r in CollapseGffReader(gff_fn):
                coords[r.transcript_id] = "{0}:{1}-{2}({3})".format(
                    r.seqid, r.start, r.end, r.strand)

        for group in GroupReader(group_filename):
            pb_id, members = group.name, group.members
            if pb_id not in coords:
                raise ValueError("Could not find %s in %s and %s" %
                                 (pb_id, gff_filename, bad_gff_filename))
            best_id = None
            best_seq = None
            best_qual = None
            best_err = float("inf")
            max_len = 0

            for x in members:
                rec = fd[x]  # fetch the record once per member
                if use_err:
                    # Expected number of errors: sum of 10^(-Q/10) over the
                    # qualities (assumes Phred-scaled values -- TODO confirm).
                    err = sum(10 ** -(q / 10.) for q in rec.quality)
                    is_better = err < best_err
                else:
                    err = best_err
                    # '>=' keeps the last of equally long members.
                    is_better = len(rec.sequence) >= max_len
                if is_better:
                    best_id = x
                    best_seq = rec.sequence
                    if is_fq:
                        best_qual = rec.quality
                        best_err = err
                    max_len = len(rec.sequence)

            _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
            if fq_writer is not None:
                fq_writer.writeRecord(_id_, best_seq, best_qual)
            if fa_writer is not None:
                fa_writer.writeRecord(_id_, best_seq)
    finally:
        # Always release the output handles, even when a group fails.
        if fa_writer is not None:
            fa_writer.close()
        if fq_writer is not None:
            fq_writer.close()

    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)

Exemplo n.º 10

0

Exibir arquivo

Arquivo: post_mapping_to_genome.py Projeto: natechols/pbtranscript

def post_mapping_to_genome_runner(in_isoforms, in_sam, in_pickle,
                                  out_isoforms, out_gff, out_abundance, out_group, out_read_stat,
                                  min_aln_coverage=cmi.Constants.MIN_ALN_COVERAGE_DEFAULT,
                                  min_aln_identity=cmi.Constants.MIN_ALN_IDENTITY_DEFAULT,
                                  min_flnc_coverage=cmi.Constants.MIN_FLNC_COVERAGE_DEFAULT,
                                  max_fuzzy_junction=cmi.Constants.MAX_FUZZY_JUNCTION_DEFAULT,
                                  allow_extra_5exon=cmi.Constants.ALLOW_EXTRA_5EXON_DEFAULT,
                                  skip_5_exon_alt=cmi.Constants.SKIP_5_EXON_ALT_DEFAULT,
                                  min_count=fci.Constants.MIN_COUNT_DEFAULT,
                                  to_filter_out_subsets=True):
    """
    (1) Collapse isoforms and merge fuzzy junctions if needed.
    (2) Generate read stat file and abundance file
    (3) Based on abundance file, filter collapsed isoforms by min FL count
    (4) If to_filter_out_subsets is True, remove collapsed isoforms which are
        a subset of another isoform
    (5) Link the final files to the requested output names (outputs given as
        None are skipped)

    Raises:
      ValueError -- if in_isoforms and out_isoforms differ in format, or the
                    format is neither FASTA nor FASTQ.
    """
    # Check input and output format
    in_suffix = parse_ds_filename(in_isoforms)[1]
    out_prefix, out_suffix = parse_ds_filename(out_isoforms)
    if in_suffix != out_suffix:
        raise ValueError("Format of input and output isoforms %s, %s must be the same." %
                         (in_isoforms, out_isoforms))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError("Format of input and output isoforms %s, %s must be FASTA or FASTQ." %
                         (in_isoforms, out_isoforms))

    #(1) Collapse isoforms and merge fuzzy junctions if needed.
    cf = CollapsedFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon)
    cir = CollapseIsoformsRunner(isoform_filename=in_isoforms,
                                 sam_filename=in_sam,
                                 output_prefix=out_prefix,
                                 min_aln_coverage=min_aln_coverage,
                                 min_aln_identity=min_aln_identity,
                                 min_flnc_coverage=min_flnc_coverage,
                                 max_fuzzy_junction=max_fuzzy_junction,
                                 allow_extra_5exon=allow_extra_5exon,
                                 skip_5_exon_alt=skip_5_exon_alt)
    cir.run()

    # (2) Generate read stat file and abundance file
    cr = CountRunner(group_filename=cf.group_fn, pickle_filename=in_pickle,
                     output_read_stat_filename=cf.read_stat_fn,
                     output_abundance_filename=cf.abundance_fn)
    cr.run()

    # (3) Filter collapsed isoforms by min FL count based on abundance file.
    fff = FilteredFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count, filter_out_subsets=False)
    filter_by_count(in_group_filename=cf.group_fn, in_abundance_filename=cf.abundance_fn,
                    in_gff_filename=cf.good_gff_fn, in_rep_filename=cf.rep_fn(out_suffix),
                    out_abundance_filename=fff.filtered_abundance_fn,
                    out_gff_filename=fff.filtered_gff_fn,
                    out_rep_filename=fff.filtered_rep_fn(out_suffix),
                    min_count=min_count)

    fft = FilteredFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count, filter_out_subsets=True)
    # (4) Remove collapsed isoforms which are a subset of another isoform
    if to_filter_out_subsets is True:
        filter_out_subsets(in_abundance_filename=fff.filtered_abundance_fn,
                           in_gff_filename=fff.filtered_gff_fn,
                           in_rep_filename=fff.filtered_rep_fn(out_suffix),
                           out_abundance_filename=fft.filtered_abundance_fn,
                           out_gff_filename=fft.filtered_gff_fn,
                           out_rep_filename=fft.filtered_rep_fn(out_suffix),
                           max_fuzzy_junction=max_fuzzy_junction)
        # From here on the subset-filtered files are the final ones.
        fff = fft

    # (5) ln outputs files
    # Each entry: (produced file, requested output or None, log message).
    outputs = [
        (fff.filtered_rep_fn(out_suffix), out_isoforms,
         "Collapsed and filtered isoform sequences written to %s"),
        (fff.filtered_gff_fn, out_gff,
         "Collapsed and filtered isoform annotations written to %s"),
        (fff.filtered_abundance_fn, out_abundance,
         "Collapsed and filtered isoform abundance info written to %s"),
        (fff.group_fn, out_group,
         "Collapsed isoform groups written to %s"),
        (fff.read_stat_fn, out_read_stat,
         "Read status of FL and nFL reads written to %s"),
    ]
    for src, dst, _ in outputs:
        if dst is not None:
            ln(src, dst)

    # Log the flag value; `filter_out_subsets` is the function, not the flag.
    logging.info("Filter arguments: min_count = %s, filter_out_subsets=%s",
                 min_count, to_filter_out_subsets)
    for src, dst, message in outputs:
        logging.info(message,
                     realpath(dst) if dst is not None else realpath(src))