示例#1
0
def convert_fofn_to_fasta(fofn_filename,
                          out_filename,
                          fasta_out_dir,
                          force_overwrite=False):
    """
    Convert each bax/bas.h5 file listed in fofn_filename to FASTA by running
    pls2fasta, and save the FASTA paths to out_filename, which should
    usually be 'input.fasta.fofn'.

    An input named X.bax.h5 is written to fasta_out_dir/X.bax.h5.fasta.
    A FASTA file that already exists is reused unless force_overwrite is
    set. ValueError is raised at the first listed file that is neither
    .bax.h5 nor .bas.h5; files listed before it have already been handled.

    Modified: 09/14/2015, both ends of subreads in fasta files will
    be trimmed in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    h5_paths = get_files_from_file_or_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # Filtering/trimming options passed unchanged to every pls2fasta call.
    pls2fasta_options = "-minSubreadLength 300 -minReadScore 750 -trimByRegion"

    fasta_paths = []
    for h5_path in h5_paths:
        logging.debug("converting h5 file: {f}.".format(f=h5_path))
        if not h5_path.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==> <fasta_out_dir>/m111xxxx.1.bax.h5.fasta
        fasta_path = op.join(fasta_out_dir, op.basename(h5_path) + '.fasta')
        if not op.exists(fasta_path) or force_overwrite:
            cmd = ("pls2fasta {in_fn} ".format(in_fn=real_upath(h5_path)) +
                   " {out} ".format(out=real_upath(fasta_path)) +
                   pls2fasta_options)
            execute(cmd=cmd)
        else:
            logging.debug(
                "File {0} already exists. skipping.".format(fasta_path))
        # Reused and freshly converted files are both listed in the output.
        fasta_paths.append(fasta_path)
    write_files_to_fofn(fasta_paths, out_filename)
    def testAll(self):
        """Test MetaSubreadFastaReader.__getitem__ with subread and ZMW names."""
        def seq_md5(record):
            # Hex MD5 of the record's sequence, compared to known digests below.
            return hashlib.md5(record.sequence).hexdigest()

        # The fofn is written here; the reader itself is built from the
        # list of fasta files directly.
        write_files_to_fofn([self.fa1, self.fa2], self.fofn)
        reader = MetaSubreadFastaReader([self.fa1, self.fa2])

        # Lookup by full subread name: first hit must carry that exact name.
        name_1 = "m130812_185809_42141_c100533960310000001823079711101380_s1_p0/59/0_5071"
        hit_1 = reader[name_1][0]
        self.assertEqual(hit_1.name, name_1)
        self.assertEqual(seq_md5(hit_1), "8128261dd851ae285d029618739559e9")

        name_2 = "m130812_random_random_s1_p0/440/13280_16126"
        hit_2 = reader[name_2][0]
        self.assertEqual(hit_2.name, name_2)
        self.assertEqual(seq_md5(hit_2), "451e5798a7f21cce80da27a03a8cb2c7")

        # Lookup by movie/zmw prefix: this one must yield exactly two subreads.
        first_70, second_70 = reader["m130812_185809_42141_c100533960310000001823079711101380_s1_p0/70"]
        self.assertEqual(first_70.name, "m130812_185809_42141_c100533960310000001823079711101380_s1_p0/70/0_5538")
        self.assertEqual(second_70.name, "m130812_185809_42141_c100533960310000001823079711101380_s1_p0/70/5587_5982")
        self.assertEqual(seq_md5(first_70), "4db2f6e35c83dd279a8f71a51ac50445")
        self.assertEqual(seq_md5(second_70), "1c1d080e9362a73ea2074f9a62fbd45e")

        hit_249 = reader["m130812_random_random_s1_p0/249"][0]
        self.assertEqual(hit_249.name, "m130812_random_random_s1_p0/249/0_1339")
        self.assertEqual(seq_md5(hit_249), "b20d3723a136aedc2f96f6f498ad3da0")
示例#3
0
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False):
    """
    Run pls2fasta on every bax/bas.h5 file listed in fofn_filename, placing
    '<basename>.fasta' files in fasta_out_dir, then save the FASTA paths to
    out_filename, which should usually be 'input.fasta.fofn'.

    Existing FASTA files are skipped (but still listed) unless
    force_overwrite is true. Raises ValueError as soon as a listed file
    does not end with .bax.h5 or .bas.h5.

    Modified: 09/14/2015, both ends of subreads in fasta files will
    be trimmed in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    h5_files = get_files_from_file_or_fofn(fofn_filename)
    fasta_files = []
    mkdir(fasta_out_dir)
    for h5_file in h5_files:
        logging.debug("converting h5 file: {f}.".format(f=h5_file))
        is_bax = h5_file.endswith('.bax.h5')
        if not is_bax and not h5_file.endswith('.bas.h5'):
            message = "fofn file {fofn} ".format(fofn=fofn_filename) + \
                      "should only contain bax/bas.h5 files."
            raise ValueError(message)

        # e.g. m111xxxx.1.bax.h5 ==> m111xxxx.1.bax.h5.fasta in fasta_out_dir
        fasta_name = op.basename(h5_file) + '.fasta'
        fasta_file = op.join(fasta_out_dir, fasta_name)
        # The path is recorded whether or not a conversion is needed.
        fasta_files.append(fasta_file)

        if op.exists(fasta_file) and not force_overwrite:
            logging.debug("File {0} already exists. skipping.".format(fasta_file))
            continue

        # Pieces are concatenated as is; each carries its own spacing.
        cmd_pieces = [
            "pls2fasta {in_fn} ".format(in_fn=real_upath(h5_file)),
            " {out} ".format(out=real_upath(fasta_file)),
            "-minSubreadLength 300 -minReadScore 750 -trimByRegion",
        ]
        execute(cmd="".join(cmd_pieces))
    write_files_to_fofn(fasta_files, out_filename)
示例#4
0
    def testAll(self):
        """Test MetaSubreadFastaReader.__getitem__ with subread and ZMW names."""
        # The fofn is written here; the reader is given the fasta list directly.
        write_files_to_fofn([self.fa1, self.fa2], self.fofn)
        reader = MetaSubreadFastaReader([self.fa1, self.fa2])
        subread_1 = "m130812_185809_42141_c100533960310000001823079711101380_s1_p0/59/0_5071"
        subread_2 = "m130812_random_random_s1_p0/440/13280_16126"
        zmw_3 = "m130812_185809_42141_c100533960310000001823079711101380_s1_p0/70"
        zmw_4 = "m130812_random_random_s1_p0/249"

        # Full subread names: the first record returned must have that exact
        # name and the expected sequence digest.
        by_subread_name = (
            (subread_1, "8128261dd851ae285d029618739559e9"),
            (subread_2, "451e5798a7f21cce80da27a03a8cb2c7"),
        )
        for subread_name, digest in by_subread_name:
            record = reader[subread_name][0]
            self.assertEqual(record.name, subread_name)
            self.assertEqual(
                hashlib.md5(record.sequence).hexdigest(), digest)

        # A movie/zmw prefix: this one must yield exactly two subreads.
        # Names are checked first, then sequence digests.
        first, second = reader[zmw_3]
        pair = (first, second)
        expected_names = (
            "m130812_185809_42141_c100533960310000001823079711101380_s1_p0/70/0_5538",
            "m130812_185809_42141_c100533960310000001823079711101380_s1_p0/70/5587_5982",
        )
        expected_digests = (
            "4db2f6e35c83dd279a8f71a51ac50445",
            "1c1d080e9362a73ea2074f9a62fbd45e",
        )
        for record, name in zip(pair, expected_names):
            self.assertEqual(record.name, name)
        for record, digest in zip(pair, expected_digests):
            self.assertEqual(
                hashlib.md5(record.sequence).hexdigest(), digest)

        only = reader[zmw_4][0]
        self.assertEqual(only.name, "m130812_random_random_s1_p0/249/0_1339")
        self.assertEqual(
            hashlib.md5(only.sequence).hexdigest(),
            "b20d3723a136aedc2f96f6f498ad3da0")