예제 #1
0
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False):
    """
    Run pls2fasta on every bax/bas.h5 file listed in fofn_filename and
    write the paths of the resulting fasta files to out_filename, which
    should usually be 'input.fasta.fofn'.

    An input <name>.h5 produces fasta_out_dir/<name>.h5.fasta. A fasta file
    that already exists is reused unless force_overwrite is True.

    Raises ValueError if an entry does not end with .bax.h5 or .bas.h5.

    Modified: 09/14/2015, both ends of subreads in fasta files will
    be trimmed in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    h5_paths = get_files_from_file_or_fofn(fofn_filename)
    fasta_paths = []
    mkdir(fasta_out_dir)
    for h5_path in h5_paths:
        logging.debug("converting h5 file: {f}.".format(f=h5_path))
        if not h5_path.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==> fasta_out_dir/m111xxxx.1.bax.h5.fasta
        fasta_path = op.join(fasta_out_dir, op.basename(h5_path) + '.fasta')
        if not op.exists(fasta_path) or force_overwrite:
            cmd_parts = [
                "pls2fasta {in_fn} ".format(in_fn=real_upath(h5_path)),
                " {out} ".format(out=real_upath(fasta_path)),
                "-minSubreadLength 300 -minReadScore 750 -trimByRegion",
            ]
            execute(cmd="".join(cmd_parts))
        else:
            logging.debug("File {0} already exists. skipping.".format(fasta_path))
        fasta_paths.append(fasta_path)
    write_files_to_fofn(fasta_paths, out_filename)
예제 #2
0
def convert_fofn_to_fasta(fofn_filename,
                          out_filename,
                          fasta_out_dir,
                          force_overwrite=False):
    """
    Convert each bax/bas.h5 file in fofn_filename to fasta with pls2fasta.

    The fasta for an input file is placed in fasta_out_dir under the input's
    basename plus '.fasta'. Existing fasta files are kept as they are unless
    force_overwrite is True. The list of fasta paths is written to
    out_filename, which should usually be 'input.fasta.fofn'.

    Raises ValueError if the fofn lists a file that is not .bax.h5/.bas.h5.

    Modified: 09/14/2015, both ends of subreads in fasta files will
    be trimmed in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    sources = get_files_from_file_or_fofn(fofn_filename)
    produced = []
    mkdir(fasta_out_dir)
    for source in sources:
        logging.debug("converting h5 file: {f}.".format(f=source))
        is_h5 = source.endswith('.bax.h5') or source.endswith('.bas.h5')
        if not is_h5:
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==> target = m111xxxx.1.bax.h5.fasta
        target = op.join(fasta_out_dir, op.basename(source) + '.fasta')

        if op.exists(target) and not force_overwrite:
            # Reuse the fasta produced by an earlier run.
            logging.debug(
                "File {0} already exists. skipping.".format(target))
            produced.append(target)
            continue

        cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(source))
        cmd += " {out} ".format(out=real_upath(target))
        cmd += "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
        execute(cmd=cmd)
        produced.append(target)
    write_files_to_fofn(produced, out_filename)
예제 #3
0
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5, ccs.bam or ccs FOFN.

    Parameters:
      in_fa    -- input FASTA, read through ContigSetReaderWrapper.
      ccs_fofn -- ccs.h5 / ccs.bam file or a FOFN of them; resolved with
                  get_files_from_file_or_fofn.
      out_fq   -- output FASTQ, written through FastqWriter.

    Raises:
      IOError    -- the input format is neither H5 nor BAM, or (H5 input)
                    a read could not be located in the indexed bas files.
      ValueError -- a record's sequence and its QVs differ in length.

    Every reader opened here is closed before returning, including when an
    exception aborts the conversion.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)

    # BasH5Reader objects, opened lazily and keyed by bas file. Stays empty
    # for BAM input, which keeps the cleanup below format-agnostic.
    bas_handlers = {}

    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." % ccs_fofn)

    try:
        with ContigSetReaderWrapper(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                logging.debug("Getting QVs for {name} ...".format(name=r.name))
                # The read id is the first space-separated token of the name.
                seqid = r.name.split(' ')[0]
                parsed_read_name = _Parsed_Read_Name(seqid)
                if fmt == FILE_FORMATS.H5:
                    try:
                        bas_file = qver.bas_files[parsed_read_name.movie][seqid]
                        if bas_file not in bas_handlers:
                            bas_handlers[bas_file] = BasH5Reader(bas_file)
                    except KeyError:
                        raise IOError("Could not read {s} from {f}.".format(
                            s=seqid, f=ccs_fofn))
                    qvs = get_qv_from_bas_handler(
                        bas_handler=bas_handlers[bas_file],
                        parsed_read_name=parsed_read_name,
                        qv_name="QualityValue")
                else:
                    # fmt was validated above, so this is the BAM case.
                    qvs = get_qvs_from_bam(reader=qver,
                                           parsed_read_name=parsed_read_name,
                                           qv_name="QualityValue")

                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence[:], qvs)
    finally:
        # Release the readers even if the conversion failed part-way.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
        if fmt == FILE_FORMATS.BAM:
            qver.close()
예제 #4
0
    def validate_inputs(self):
        """Validate the inputs and create quivered_dir and quivered_log_dir.

        Checks, in order: log_dir is an existing directory; bas_fofn is
        given and exists; nfl_all_pickle_fn exists; final_pickle_fn exists.
        If all of these pass and the subreads input is not BAM, fasta_fofn
        must be given, must exist, and every file it lists must exist.

        The first problem found is logged at ERROR level and raised.

        Raises:
          IOError -- any of the checks above failed.
        """
        self.add_log("Validating inputs.")

        # Create directories: root_dir/quivered and root_dir/log_dir/quivered
        try:
            mkdir(self.quivered_dir)
            mkdir(self.quivered_log_dir)
        except OSError:
            # Multiple ice_quiver_i jobs may run at the same time and try to
            # mkdir, race condition may happen, so ignore OSError here.
            pass

        errMsg = ""

        if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
            errMsg = "Log dir {l} is not an existing directory.".\
                format(l=self.log_dir)
        elif self.bas_fofn is None:
            errMsg = "Please specify subreads file (e.g., --bas_fofn=input.fofn|subreadset.xml)."
        elif not nfs_exists(self.bas_fofn):
            errMsg = "Specified subreads file (bas_fofn={f}) does not exist.".format(
                f=self.bas_fofn)
        elif not nfs_exists(self.nfl_all_pickle_fn):
            # e.g. "output/map_noFL/noFL.ALL.partial_uc.pickle"
            errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                     "which assigns all non-full-length reads to isoforms " + \
                     "does not exist. Please check 'ice_partial.py *' are " + \
                     "all done."
        elif not nfs_exists(self.final_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                     "which assigns full-length non-chimeric reads to " + \
                     "isoforms does not exist."

        # Only look at fasta_fofn when nothing has failed so far: this keeps
        # the first error from being overwritten and guarantees bas_fofn is
        # set and exists before its format is inspected.
        if errMsg == "" and \
                guess_file_format(self.bas_fofn) != FILE_FORMATS.BAM:
            # No need to convert subreads.bam to fasta
            if self.fasta_fofn is None:
                errMsg = "Please make sure ice_make_fasta_fofn has " + \
                         "been called, and specify fasta_fofn."
            elif not nfs_exists(self.fasta_fofn):
                errMsg = "Input fasta_fofn {f} does not exists.".\
                         format(f=self.fasta_fofn)
            else:
                # fasta_fofn exists, so it is safe to read it and verify
                # every file it lists; report the first missing one.
                for fasta_file in get_files_from_file_or_fofn(self.fasta_fofn):
                    if not nfs_exists(fasta_file):
                        errMsg = "A file {f} in fasta_fofn does not exist.".\
                                 format(f=fasta_file)
                        break

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
예제 #5
0
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5, ccs.bam or ccs FOFN.

    Parameters:
      in_fa    -- input FASTA, read through ContigSetReaderWrapper.
      ccs_fofn -- ccs.h5 / ccs.bam file or a FOFN of them; resolved with
                  get_files_from_file_or_fofn.
      out_fq   -- output FASTQ, written through FastqWriter.

    Raises:
      IOError    -- the input format is neither H5 nor BAM, or (H5 input)
                    a read could not be located in the indexed bas files.
      ValueError -- a record's sequence and its QVs differ in length.

    All readers opened by this function are closed on exit, whether the
    conversion succeeds or raises.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)

    # Lazily opened BasH5Reader objects keyed by bas file; remains empty
    # for BAM input so the cleanup code needs no special casing.
    bas_handlers = {}

    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." %
                      ccs_fofn)

    try:
        with ContigSetReaderWrapper(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                logging.debug("Getting QVs for {name} ...".format(name=r.name))
                # The read id is the first space-separated token of the name.
                seqid = r.name.split(' ')[0]
                parsed_read_name = _Parsed_Read_Name(seqid)
                if fmt == FILE_FORMATS.H5:
                    try:
                        bas_file = qver.bas_files[parsed_read_name.movie][seqid]
                        if bas_file not in bas_handlers:
                            bas_handlers[bas_file] = BasH5Reader(bas_file)
                    except KeyError:
                        raise IOError("Could not read {s} from {f}.".
                                      format(s=seqid, f=ccs_fofn))
                    qvs = get_qv_from_bas_handler(
                        bas_handler=bas_handlers[bas_file],
                        parsed_read_name=parsed_read_name,
                        qv_name="QualityValue")
                else:
                    # Only H5 and BAM pass the format check above.
                    qvs = get_qvs_from_bam(reader=qver,
                                           parsed_read_name=parsed_read_name,
                                           qv_name="QualityValue")

                if len(r.sequence) != len(qvs):
                    raise ValueError("Sequence and QVs of {r} should be the same!".
                                     format(r=r.name))
                writer.writeRecord(r.name, r.sequence[:], qvs)
    finally:
        # Close every reader even when an error interrupted the loop.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
        if fmt == FILE_FORMATS.BAM:
            qver.close()
예제 #6
0
    def validate_inputs(self):
        """Validate the inputs and create quivered_dir and quivered_log_dir.

        Checks, in order: log_dir is an existing directory; bas_fofn is
        given and exists; nfl_all_pickle_fn exists; final_pickle_fn exists.
        If all of these pass and the subreads input is not BAM, fasta_fofn
        must be given, must exist, and every file it lists must exist.

        The first problem found is logged at ERROR level and raised.

        Raises:
          IOError -- any of the checks above failed.
        """
        self.add_log("Validating inputs.")

        # Create directories: root_dir/quivered and root_dir/log_dir/quivered
        try:
            mkdir(self.quivered_dir)
            mkdir(self.quivered_log_dir)
        except OSError:
            # Multiple ice_quiver_i jobs may run at the same time and try to
            # mkdir, race condition may happen, so ignore OSError here.
            pass

        errMsg = ""

        if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
            errMsg = "Log dir {l} is not an existing directory.".\
                format(l=self.log_dir)
        elif self.bas_fofn is None:
            errMsg = "Please specify subreads file (e.g., --bas_fofn=input.fofn|subreadset.xml)."
        elif not nfs_exists(self.bas_fofn):
            errMsg = "Specified subreads file (bas_fofn={f}) does not exist.".format(f=self.bas_fofn)
        elif not nfs_exists(self.nfl_all_pickle_fn):
            # e.g. "output/map_noFL/noFL.ALL.partial_uc.pickle"
            errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                     "which assigns all non-full-length reads to isoforms " + \
                     "does not exist. Please check 'ice_partial.py *' are " + \
                     "all done."
        elif not nfs_exists(self.final_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                     "which assigns full-length non-chimeric reads to " + \
                     "isoforms does not exist."

        # Run the fasta_fofn checks only if nothing failed above, so the
        # first error is the one reported and bas_fofn is known to be set
        # and present before its format is inspected.
        if errMsg == "" and \
                guess_file_format(self.bas_fofn) != FILE_FORMATS.BAM:
            # No need to convert subreads.bam to fasta
            if self.fasta_fofn is None:
                errMsg = "Please make sure ice_make_fasta_fofn has " + \
                         "been called, and specify fasta_fofn."
            elif not nfs_exists(self.fasta_fofn):
                errMsg = "Input fasta_fofn {f} does not exists.".\
                         format(f=self.fasta_fofn)
            else:
                # The FOFN exists: make sure each fasta it lists does too,
                # stopping at the first one that is missing.
                for fasta_file in get_files_from_file_or_fofn(self.fasta_fofn):
                    if not nfs_exists(fasta_file):
                        errMsg = "A file {f} in fasta_fofn does not exist.".\
                                 format(f=fasta_file)
                        break

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
예제 #7
0
    def index_input_subreads(self):
        """Build and return an indexed reader over the input subreads.

        Returns a BamCollection over self.bas_fofn when that input is BAM,
        otherwise a MetaSubreadFastaReader over the files listed in
        self.fasta_fofn. Progress is reported through self.add_log.
        """
        is_bam = guess_file_format(self.bas_fofn) == FILE_FORMATS.BAM
        source = self.bas_fofn if is_bam else self.fasta_fofn
        self.add_log("Indexing files in %s, please wait." % source)

        if is_bam:
            indexed = BamCollection(self.bas_fofn)
        else:
            fasta_files = get_files_from_file_or_fofn(self.fasta_fofn)
            indexed = MetaSubreadFastaReader(fasta_files)

        self.add_log("File indexing done.")
        return indexed
예제 #8
0
    def index_input_subreads(self):
        """Index the input subreads and return the resulting reader.

        BAM input (self.bas_fofn) is wrapped in a BamCollection; any other
        format is read from the fasta files named by self.fasta_fofn through
        a MetaSubreadFastaReader. Start and end are logged with add_log.
        """
        announce = "Indexing files in %s, please wait."
        if guess_file_format(self.bas_fofn) != FILE_FORMATS.BAM:
            self.add_log(announce % self.fasta_fofn)
            subreads = MetaSubreadFastaReader(
                get_files_from_file_or_fofn(self.fasta_fofn))
        else:
            self.add_log(announce % self.bas_fofn)
            subreads = BamCollection(self.bas_fofn)

        self.add_log("File indexing done.")
        return subreads
예제 #9
0
    def __init__(self, *args):
        """Open the input BAM data and collect the headers of all readers.

        Parameters:
          *args -- input file names passed to openDataFile. A single
                   argument is first expanded with
                   get_files_from_file_or_fofn.

        Raises:
          ValueError -- a resource reader is not an IndexedBamReader.
          IOError    -- a read group's platform ('PL') is not "PACBIO".
        """
        if len(args) == 1:
            args = get_files_from_file_or_fofn(args[0])
        self._dataset = openDataFile(*args)
        # Implementation notes: find all the bam files, and group
        # them together by movieName
        # NOTE(review): the grouping is not visible in this method;
        # presumably handled by BamHeader.add -- confirm.
        self._header = BamHeader(ignore_pg=True)

        for bam in self._dataset.resourceReaders():
            if not isinstance(bam, IndexedBamReader):
                # Format the message here: exception constructors do not
                # interpolate extra arguments the way logging calls do.
                raise ValueError("%s in %s must have pbi index generated" %
                                 (bam.filename, str(self._dataset)))
            self._header.add(bam.peer.header)
            for rg in bam.peer.header["RG"]: #readGroupTable:
                if rg['PL'] != "PACBIO":
                    raise IOError("Input BAM file %s for %s must be PacBio BAM." %
                                  (bam.filename, self.__class__.__name__))
            for rg in bam.readGroupTable:
                assert rg.ReadType in ["CCS", "SUBREAD"]
예제 #10
0
    def __init__(self, *args):
        """Open the input BAM data and collect the headers of all readers.

        Parameters:
          *args -- input file names passed to openDataFile. A single
                   argument is first expanded with
                   get_files_from_file_or_fofn.

        Raises:
          ValueError -- a resource reader is not an IndexedBamReader.
          IOError    -- a read group's platform ('PL') is not "PACBIO".
        """
        if len(args) == 1:
            args = get_files_from_file_or_fofn(args[0])
        self._dataset = openDataFile(*args)
        # Implementation notes: find all the bam files, and group
        # them together by movieName
        # NOTE(review): the grouping is not visible in this method;
        # presumably handled by BamHeader.add -- confirm.
        self._header = BamHeader(ignore_pg=True)

        for bam in self._dataset.resourceReaders():
            if not isinstance(bam, IndexedBamReader):
                # Build the final message explicitly; passing the format
                # string and its values as separate exception arguments
                # leaves the placeholders unfilled.
                raise ValueError(
                    "%s in %s must have pbi index generated" %
                    (bam.filename, str(self._dataset)))
            self._header.add(bam.peer.header)
            for rg in bam.peer.header["RG"]:  #readGroupTable:
                if rg['PL'] != "PACBIO":
                    raise IOError(
                        "Input BAM file %s for %s must be PacBio BAM." %
                        (bam.filename, self.__class__.__name__))
            for rg in bam.readGroupTable:
                assert rg.ReadType in ["CCS", "SUBREAD"]