Пример #1
0
    def run(self):
        """Fragment the input assembly and write a default profile.

        Runs barrnap on ``self.input_file``, reads the boundaries it reports,
        splits the chromosome into fragments at those boundaries, writes one
        FASTA per fragment into ``self.output_directory`` and finally writes
        a default profile via ``ProfileGenerator``.
        """
        import os  # local import: only needed to release the mkstemp handle

        # todo: filter fasta to keep just the largest contig.

        # run the fasta through barrnap
        fd, barrnap_outputfile = mkstemp()
        # mkstemp() hands back an already-open OS-level descriptor; only the
        # path is used here, so close it straight away instead of leaking it.
        os.close(fd)
        self.files_to_cleanup.append(barrnap_outputfile)
        b = Barrnap(self.input_file, self.threads)
        # NOTE(review): shell=True is kept because the command comes from
        # construct_barrnap_command(); confirm the input path is trusted.
        subprocess.check_output(
            b.construct_barrnap_command(barrnap_outputfile), shell=True)

        boundries = b.read_barrnap_output(barrnap_outputfile)

        f = Fasta(self.input_file)
        fragments = f.calc_fragment_coords(boundries)
        f.populate_fragments_from_chromosome(fragments,
                                             self.max_bases_from_ends)

        ff = FragmentFiles(fragments,
                           self.output_directory,
                           fragment_order=self.fragment_order)
        ff.create_fragment_fastas()

        # create a default profile.txt file
        default_profile = ProfileGenerator(self.output_directory,
                                           len(ff.ordered_fragments),
                                           self.dnaa_fasta, self.threads)
        default_profile.write_output_file()
Пример #2
0
    def test_calc_fragment_coords_gz(self):
        """Fragment coordinates are computed from a gzipped FASTA input."""
        fasta_path = os.path.join(data_dir, 'calc_fragment_coords.fa.gz')
        fasta = Fasta(fasta_path, False)
        fragments = fasta.calc_fragment_coords(
            [[45, 55], [90, 110], [150, 180]])

        # The first fragment is expected to carry two coordinate pairs.
        expected = [[[180, 200], [0, 45]], [[55, 90]], [[110, 150]]]
        observed = [fragment.coords for fragment in fragments]
        self.assertEqual(observed, expected)
Пример #3
0
    def test_chop_from_ends(self):
        """Populating with a limit of 5 keeps 5 bases per end around 'NNN'."""
        fasta = Fasta(os.path.join(data_dir, 'calc_fragment_coords.fa'), False)
        fragments = fasta.calc_fragment_coords(
            [[45, 55], [90, 110], [150, 180]])

        # Second argument is the limit on bases taken from each fragment end.
        fasta.populate_fragments_from_chromosome(fragments, 5)

        # A distinct loop name avoids shadowing the Fasta object above.
        sequences = [str(fragment.sequence) for fragment in fragments]
        self.assertEqual(sequences,
                         ['TTTTTNNNAAAAA', 'CCCCCNNNCCCCC', 'GGGGGNNNGGGGG'])
Пример #4
0
    def test_calc_fragment_coords(self):
        """Fragment coordinates are computed from an uncompressed FASTA."""
        fasta = Fasta(os.path.join(data_dir, 'calc_fragment_coords.fa'), False)
        fragments = fasta.calc_fragment_coords(
            [[45, 55], [90, 110], [150, 180]])

        expected = [[[180, 200], [0, 45]], [[55, 90]], [[110, 150]]]
        observed = [fragment.coords for fragment in fragments]
        self.assertEqual(observed, expected)

        # Smoke check only: populating without an end limit must not raise.
        fasta.populate_fragments_from_chromosome(fragments, None)
Пример #5
0
    def run_analysis(self, input_file, p, d):
        """Type one assembly against a database and return its profile line.

        The input FASTA is run through barrnap, split into fragments at the
        reported boundaries, and each fragment FASTA is blasted against the
        database identified by ``d.db_prefix``. Fragments without an
        acceptable hit are recorded as ``'?'`` and their sequence is appended
        to ``self.new_fragments``; other hits are appended to
        ``self.top_results``. Profiles not previously seen are appended to
        ``self.novel_profiles``.

        Args:
            input_file: path of the FASTA file to analyse.
            p: passed through to ``TypeGenerator`` together with the profile.
            d: database object; only ``d.db_prefix`` is read here.

        Returns:
            The calculated type and the string form of the profile, joined
            by a tab.
        """
        import os  # local import: used only for temp-file housekeeping

        # run the fasta through barrnap
        fd, barrnap_outputfile = mkstemp()
        # mkstemp() returns an open descriptor; only the path is needed.
        os.close(fd)
        try:
            b = Barrnap(input_file, self.threads)
            # NOTE(review): shell=True is kept because the command comes from
            # construct_barrnap_command(); confirm the input path is trusted.
            subprocess.check_output(
                b.construct_barrnap_command(barrnap_outputfile),
                shell=True)

            boundries = b.read_barrnap_output(barrnap_outputfile)
        finally:
            # Nothing below reads the barrnap output again, so remove it here
            # rather than leaving a stray temp file per analysed assembly.
            # NOTE(review): assumes read_barrnap_output() parses eagerly.
            if os.path.exists(barrnap_outputfile):
                os.remove(barrnap_outputfile)

        f = Fasta(input_file, is_circular=self.is_circular)
        fragments = f.calc_fragment_coords(boundries)
        f.populate_fragments_from_chromosome(fragments,
                                             self.max_bases_from_ends)

        tmpdir = mkdtemp()
        self.dirs_to_cleanup.append(tmpdir)

        ff = FragmentFiles(fragments, tmpdir)
        ff.create_fragment_fastas()

        # take each fasta file and blast it against the database
        blast = Blast(d.db_prefix, self.threads)

        gat_profile = GATProfile(fragments=[])
        for fasta_file in ff.output_filenames:
            blast_results = blast.run_blast(fasta_file)
            fb = FilterBlast(blast_results, self.min_bit_score,
                             self.min_alignment_length)
            top_result = fb.return_top_result()
            if top_result is None:
                # No usable hit: mark the slot as unknown and keep the
                # fragment sequence for later inspection.
                gat_profile.fragments.append('?')
                with open(fasta_file, "r") as fasta_file_fh:
                    with open(self.new_fragments, "a+") as newfrag_fh:
                        newfrag_fh.write(fasta_file_fh.read())
                continue

            self.top_results.append(top_result)

            # Hits that are not forward are marked with a trailing apostrophe.
            if top_result.is_forward():
                gat_profile.fragments.append(str(top_result.subject))
            else:
                gat_profile.fragments.append(str(top_result.subject) + "'")

        gat_profile.orientate_for_dnaA()
        # lookup the gat_profile to get the number
        tg = TypeGenerator(p, gat_profile)
        type_output_string = tg.calculate_type() + "\t" + str(gat_profile)
        if not tg.has_previously_seen:
            with open(self.novel_profiles, "a+") as output_fh:
                output_fh.write(
                    self.db_dir + "\t" + type_output_string + "\n")

        return type_output_string
Пример #6
0
    def test_populate_fragments_from_chromosome(self):
        """Fragments start empty and are filled in full when no limit is set."""
        fasta = Fasta(os.path.join(data_dir, 'calc_fragment_coords.fa'), False)
        fragments = fasta.calc_fragment_coords(
            [[45, 55], [90, 110], [150, 180]])

        # Before population every fragment stringifies to an empty sequence.
        before = [str(fragment.sequence) for fragment in fragments]
        self.assertEqual(before, ["", "", ""])

        fasta.populate_fragments_from_chromosome(fragments, None)

        after = [str(fragment.sequence) for fragment in fragments]
        expected = [
            "TTTTTTTTTTTTTTTTTTTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC",
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG",
        ]
        self.assertEqual(after, expected)
Пример #7
0
    def test_chop_from_ends(self):
        """Populating with a limit of 5 keeps 5 bases per end around 'NNN'."""
        fasta = Fasta(os.path.join(data_dir, 'calc_fragment_coords.fa'), False)
        boundries = [
            Operon(45, 55, True),
            Operon(90, 110, False),
            Operon(150, 180, True)
        ]
        fragments = fasta.calc_fragment_coords(boundries)

        # Second argument is the limit on bases taken from each fragment end.
        fasta.populate_fragments_from_chromosome(fragments, 5)

        # A distinct loop name avoids shadowing the Fasta object above.
        sequences = [str(fragment.sequence) for fragment in fragments]
        self.assertEqual(sequences,
                         ['TTTTTNNNAAAAA', 'CCCCCNNNCCCCC', 'GGGGGNNNGGGGG'])
Пример #8
0
 def shrink_files(self):
     """Copy the database to the output location, trimming large fragments.

     The two profile files are copied unchanged. A fragment FASTA whose
     chromosome is shorter than ``self.target_bases`` is copied as-is (with a
     ``.gz`` suffix when it came from the compressed file list). Any other
     fragment is rewritten from the regions returned by
     ``FilterBlast.identify_regions``. The written paths are passed to
     ``self.compress_files`` and its result is returned.
     """
     # copy profile files to destination
     for profile_name in ('profile.txt', 'profile.txt.yml'):
         shutil.copy(os.path.join(self.input_database, profile_name),
                     self.output_database)

     compressed_names = self.get_database_files_compressed()
     uncompressed_names = self.get_database_files()
     all_names = uncompressed_names + compressed_names

     # Every Fasta object is built up front, before any file is written.
     fasta_objects = [Fasta(name) for name in all_names]

     written = []
     for fasta in fasta_objects:
         blast_filter = FilterBlast(self.blast_results, 1, 1)
         destination = os.path.join(
             self.output_database, str(fasta.fragment_number()) + '.fa')

         if len(fasta.chromosome.seq) >= self.target_bases:
             # Too large: keep only the identified regions, concatenated.
             blocks = blast_filter.identify_regions(
                 fasta.fragment_number(), self.target_bases)
             sequence = ""
             for block in blocks:
                 sequence += fasta.chromosome.seq[(block[0]):(block[1])]
             record = [
                 SeqRecord(sequence, str(fasta.fragment_number()), '', '')
             ]
             SeqIO.write(record, destination, "fasta")
             written.append(destination)
             continue

         # Small enough: copy verbatim, keeping the compressed suffix.
         if fasta.input_file in compressed_names:
             target = destination + '.gz'
         else:
             target = destination
         shutil.copy(fasta.input_file, target)
         written.append(target)

     return self.compress_files(written)
Пример #9
0
    def find_boundries(self, coords):
        """Pair operon start and end coordinates into ``[start, end]`` lists.

        Each entry of ``coords`` is indexed as ``c[0]`` (start position),
        ``c[1]`` (end position), ``c[2]`` (compared against 16 and against the
        value returned by ``self.five_or_23s``) and ``c[3]`` (``'+'`` or
        ``'-'``, presumably the strand).

        A start is paired with the first unused end lying less than
        ``self.len_70s`` after it. Leftover starts and ends are then checked
        for a pair that wraps over the end of the chromosome.

        Args:
            coords: sequence of indexable records as described above.

        Returns:
            A list of ``[start, end]`` pairs. A wrap-around pair has
            ``start > end``.
        """
        starting_coords = []
        ending_coords = []
        variable_s = self.five_or_23s(coords)
        for c in coords:
            if (c[2] == 16 and c[3] == '+') or (c[2] == variable_s
                                                and c[3] == '-'):
                # start of ribo
                starting_coords.append(c[0])
            elif (c[2] == 16 and c[3] == '-') or (c[2] == variable_s
                                                  and c[3] == '+'):
                # end of ribo
                ending_coords.append(c[1])

        starting_coords = self.filter_out_close_start_coords(starting_coords)
        ending_coords = self.filter_out_close_end_coords(ending_coords)

        boundries = []
        for start_index, start in enumerate(starting_coords):
            if start < 0:
                continue
            for end_index, end in enumerate(ending_coords):
                if end < 0:
                    # already consumed by an earlier start
                    continue
                if 0 < end - start < self.len_70s:
                    boundries.append([start, end])
                    ending_coords[end_index] = -1
                    starting_coords[start_index] = -1
                    # This start is consumed; stop so it cannot pair with a
                    # second end that also falls inside the window.
                    break

        # check for 70S that goes over the end of the genome and for errors
        remaining_start_coords = [s for s in starting_coords if s >= 0]
        remaining_end_coords = [e for e in ending_coords if e >= 0]
        if remaining_start_coords and remaining_end_coords:
            chromosome_length = self.chromosome_length
            if chromosome_length <= 0:
                # Length not supplied: read it from the FASTA on demand.
                chromosome_length = len(
                    Fasta(self.input_file, self.verbose).chromosome)

            for start in remaining_start_coords:
                bases_to_end = chromosome_length - start
                for end_index, end in enumerate(remaining_end_coords):
                    if end < 0:
                        continue
                    if (bases_to_end < self.len_70s
                            and end < self.len_70s
                            and bases_to_end + end < self.len_70s):
                        boundries.append([start, end])
                        remaining_end_coords[end_index] = -1
                        # one wrap-around pairing per start
                        break

        return boundries
Пример #10
0
    def test_populate_fragments_from_chromosome(self):
        """Fragments start empty and are filled in full when no limit is set."""
        fasta = Fasta(os.path.join(data_dir, 'calc_fragment_coords.fa'), False)
        operons = [
            Operon(45, 55, True),
            Operon(90, 110, False),
            Operon(150, 180, True)
        ]
        fragments = fasta.calc_fragment_coords(operons)

        # Before population every fragment stringifies to an empty sequence.
        before = [str(fragment.sequence) for fragment in fragments]
        self.assertEqual(before, ["", "", ""])

        fasta.populate_fragments_from_chromosome(fragments, None)

        after = [str(fragment.sequence) for fragment in fragments]
        expected = [
            "TTTTTTTTTTTTTTTTTTTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC",
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG",
        ]
        self.assertEqual(after, expected)
Пример #11
0
    def test_fasta_get_largest_contig(self):
        """The record returned for the fixture has a 60-base sequence."""
        fixture = os.path.join(data_dir, 'get_largest_contig.fa')
        record = Fasta(fixture, False).get_chromosome_from_fasta()

        self.assertEqual(len(record.seq), 60)
Пример #12
0
 def populate_fragments_from_chromosome(self, input_file, boundries):
     """Build fragments for ``input_file`` and fill in their sequences.

     Args:
         input_file: path handed to ``Fasta``.
         boundries: boundaries handed to ``Fasta.calc_fragment_coords``.

     Returns:
         The fragments from ``calc_fragment_coords``, after they have been
         populated using ``self.max_bases_from_ends``.
     """
     fasta = Fasta(input_file, self.verbose, is_circular=self.is_circular)
     fragments = fasta.calc_fragment_coords(boundries)
     fasta.populate_fragments_from_chromosome(
         fragments, self.max_bases_from_ends)
     return fragments