Exemplo n.º 1
0
    def test_renamed(self):
        """Exercise ``NcbiFastaSeqRenamer`` over pairs of name formats.

        For every (source, target) pair drawn from ``self.__formats``
        (the last format is never used as a source), the accession number
        files are loaded, ``ncbi_<source>.fa`` is renamed into the
        temporary output file and its sequence names are compared with
        those of ``ncbi_<target>.fa``.  The test then checks the errors
        raised for bad arguments and the ``remove_seq_version`` option.
        """
        name_formats = self.__formats
        for source_fmt in name_formats[:-1]:
            for target_fmt in name_formats:
                renamer = bioformats.seqname.NcbiFastaSeqRenamer()
                for acc_num_file in self.__acc_num_files:
                    renamer.read_ncbi_acc_num(acc_num_file, source_fmt,
                                              target_fmt)

                # write the renamed sequences to the temporary output file
                input_file = os.path.join(self.__test_dir,
                                          'ncbi_' + source_fmt + '.fa')
                with open(self.__output, 'w') as renamed_handle:
                    renamed_handle.writelines(renamer.renamed(input_file))

                example_file = os.path.join(self.__test_dir,
                                            'ncbi_' + target_fmt + '.fa')

                # drop index files left over from previous iterations
                index_paths = (self.__output + '.fai', example_file + '.fai')
                for index_path in index_paths:
                    if os.path.isfile(index_path):
                        os.unlink(index_path)

                # the renamed file must list the same sequence names as
                # the example file
                obtained = Fasta(self.__output)
                expected = Fasta(example_file)
                self.assertEqual(obtained.keys(), expected.keys())

        # an unsupported argument must be rejected
        with self.assertRaises(SeqRenameError):
            renamer = bioformats.seqname.NcbiFastaSeqRenamer()
            renamer.read_ncbi_acc_num(
                'unknown', 'chr_refseq',
                os.path.join(self.__test_dir, 'ncbi_chr_refseq.fa'))
        with self.assertRaises(SeqRenameError):
            renamer.read_ncbi_acc_num(
                'chr_refseq', 'unknown',
                os.path.join(self.__test_dir, 'ncbi_chr_refseq.fa'))

        # a malformed NCBI accession number dictionary must be rejected
        with self.assertRaises(IncorrectDictError):
            renamer.read_ncbi_acc_num(self.__chr_incorrect, 'refseq_full',
                                      'chr_refseq')

        # renaming with sequence versions removed
        renamer = bioformats.seqname.NcbiFastaSeqRenamer()
        for acc_num_file in self.__acc_num_files:
            renamer.read_ncbi_acc_num(acc_num_file, 'chr', 'genbank',
                                      remove_seq_version=True)
        input_file = os.path.join(self.__test_dir, 'ncbi_chr.fa')
        example_file = os.path.join(self.__test_dir, 'ncbi_genbank_nover.fa')
        with open(self.__output, 'w') as renamed_handle:
            renamed_handle.writelines(renamer.renamed(input_file))

        index_paths = (self.__output + '.fai', example_file + '.fai')
        for index_path in index_paths:
            if os.path.isfile(index_path):
                os.unlink(index_path)

        obtained = Fasta(self.__output)
        expected = Fasta(example_file)
        self.assertEqual(obtained.keys(), expected.keys())
        os.unlink(example_file + '.fai')
Exemplo n.º 2
0
    def test_renamed(self):
        """Check sequence renaming between every pair of name formats.

        For each (source, target) pair of formats (``"chr"`` is never
        used as a source), the accession number files are loaded,
        ``ncbi_<source>.fa`` is renamed into the temporary output file and
        its sequence names are compared with those of
        ``ncbi_<target>.fa``.  The test also checks the errors raised for
        bad arguments, the ``remove_seq_version`` option, and finally
        removes the temporary output file and the FASTA index files.
        """
        formats = (
            "refseq_full",
            "genbank_full",
            "refseq_gi",
            "genbank_gi",
            "refseq",
            "genbank",
            "chr_refseq",
            "chr_genbank",
            "chr",
        )
        for i, j in itertools.product(formats[:-1], formats):
            renamer = bioformats.seqname.NcbiFastaSeqRenamer()
            for k in self.__acc_num_files:
                renamer.read_ncbi_acc_num(k, i, j)
            # convert sequence IDs
            input_file = os.path.join(self.__test_dir, "ncbi_" + i + ".fa")
            with open(self.__output, "w") as output_fasta:
                for line in renamer.renamed(input_file):
                    output_fasta.write(line)

            example_file = os.path.join(self.__test_dir, "ncbi_" + j + ".fa")

            # remove index files left over from previous iterations
            for k in (self.__output + ".fai", example_file + ".fai"):
                if os.path.isfile(k):
                    os.unlink(k)

            output_fasta = Fasta(self.__output)
            example_fasta = Fasta(example_file)
            # compare the obtained file to the example; keys() must be
            # called so that the sequence names are compared rather than
            # the attribute objects themselves
            self.assertEqual(output_fasta.keys(), example_fasta.keys())

        # test for an incorrect format; the renamer is created outside the
        # assertRaises block so that only the call under test can satisfy
        # the assertion
        renamer = bioformats.seqname.NcbiFastaSeqRenamer()
        with self.assertRaises(SeqRenameError):
            renamer.read_ncbi_acc_num("unknown", "chr_refseq", os.path.join(self.__test_dir, "ncbi_chr_refseq.fa"))
        with self.assertRaises(SeqRenameError):
            renamer.read_ncbi_acc_num("chr_refseq", "unknown", os.path.join(self.__test_dir, "ncbi_chr_refseq.fa"))

        # test for an incorrect NCBI accession number dictionary
        with self.assertRaises(IncorrectDictError):
            renamer.read_ncbi_acc_num(self.__chr_incorrect, "refseq_full", "chr_refseq")

        # check if sequence versions are removed
        renamer = bioformats.seqname.NcbiFastaSeqRenamer()
        for k in self.__acc_num_files:
            renamer.read_ncbi_acc_num(k, "chr", "genbank", remove_seq_version=True)
        input_file = os.path.join(self.__test_dir, "ncbi_chr.fa")
        example_file = os.path.join(self.__test_dir, "ncbi_genbank_nover.fa")
        with open(self.__output, "w") as output_fasta:
            for line in renamer.renamed(input_file):
                output_fasta.write(line)

        for k in (self.__output + ".fai", example_file + ".fai"):
            if os.path.isfile(k):
                os.unlink(k)

        output_fasta = Fasta(self.__output)
        example_fasta = Fasta(example_file)
        self.assertEqual(output_fasta.keys(), example_fasta.keys())
        os.unlink(example_file + ".fai")

        # remove temporary files and FASTA indices
        os.unlink(self.__output)
        os.unlink(self.__output + ".fai")
        for i in formats:
            os.unlink(os.path.join(self.__test_dir, "ncbi_" + i + ".fa.fai"))