예제 #1
0
 def test_filter_out_alignments_with_too_much_missing_data(self):
     """Filter missing_data.aln and compare the result with the expected fixture.

     The preprocessor is built with ``False`` and ``5`` as its second and
     third arguments and asked to write ``output.aln`` with ``1`` as the
     second argument of the filtering call.
     """
     source_alignment = 'gubbins/tests/data/preprocessfasta/missing_data.aln'
     expected_alignment = 'gubbins/tests/data/preprocessfasta/expected_missing_data.aln'
     written_alignment = 'output.aln'

     filterer = PreProcessFasta(source_alignment, False, 5)
     filterer.remove_duplicate_sequences_and_sequences_missing_too_much_data(
         written_alignment, 1)

     files_match = filecmp.cmp(written_alignment, expected_alignment)
     self.assertTrue(files_match)
예제 #2
0
 def test_input_file_with_multiple_duplicate_sequences(self):
     """Check hashing, duplicate detection and output for multiple_duplicates.aln."""
     fixture_dir = 'gubbins/tests/data/preprocessfasta/'
     filterer = PreProcessFasta(fixture_dir + 'multiple_duplicates.aln')

     # Two distinct sequence hashes, each shared by a pair of samples.
     expected_hashes = {
         b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1', 'sample3'],
         b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample2', 'sample4'],
     }
     self.assertEqual(filterer.hash_sequences(), expected_hashes)

     expected_duplicates = ['sample1', 'sample2']
     self.assertEqual(filterer.taxa_of_duplicate_sequences(), expected_duplicates)

     filterer.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
     files_match = filecmp.cmp('output.aln', fixture_dir + 'expected_multiple_duplicates.aln')
     self.assertTrue(files_match)
예제 #3
0
 def test_filter_out_alignments_with_too_much_missing_data(self):
     """Filter missing_data.aln from ``data_dir`` and compare with the expected fixture."""
     source_alignment = os.path.join(data_dir, 'preprocessfasta/missing_data.aln')
     written_alignment = 'output.aln'

     filterer = PreProcessFasta(source_alignment, False, 5)
     filterer.remove_duplicate_sequences_and_sequences_missing_too_much_data(
         written_alignment, 1)

     expected_alignment = os.path.join(
         data_dir, 'preprocessfasta/expected_missing_data.aln')
     files_match = filecmp.cmp(written_alignment, expected_alignment)
     self.assertTrue(files_match)
예제 #4
0
    def test_input_file_with_all_duplicate_sequences(self):
        """Check hashing and duplicate detection when every sample shares one sequence."""
        source_alignment = os.path.join(
            data_dir, 'preprocessfasta/all_same_sequence.aln')
        filterer = PreProcessFasta(source_alignment)

        # A single hash is expected, listing all four samples.
        expected_hashes = {
            b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": [
                'sample1', 'sample2', 'sample3', 'sample4'
            ],
        }
        self.assertEqual(filterer.hash_sequences(), expected_hashes)

        expected_duplicates = ['sample1', 'sample2', 'sample3']
        self.assertEqual(filterer.taxa_of_duplicate_sequences(),
                         expected_duplicates)
예제 #5
0
 def test_input_file_with_all_duplicate_sequences(self):
     """Check hashing and duplicate detection when every sample shares one sequence."""
     source_alignment = 'gubbins/tests/data/preprocessfasta/all_same_sequence.aln'
     filterer = PreProcessFasta(source_alignment)

     # A single hash is expected, listing all four samples.
     every_sample = ['sample1', 'sample2', 'sample3', 'sample4']
     expected_hashes = {b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": every_sample}
     self.assertEqual(filterer.hash_sequences(), expected_hashes)

     expected_duplicates = ['sample1', 'sample2', 'sample3']
     self.assertEqual(filterer.taxa_of_duplicate_sequences(), expected_duplicates)
예제 #6
0
    def test_input_file_with_no_duplicate_sequences(self):
        """Check that an alignment without duplicates is written back unchanged.

        Also verifies the per-sequence hashes, the missing-data percentages
        and that no taxa are reported as duplicates.
        """
        source_alignment = os.path.join(data_dir,
                                        'preprocessfasta/no_duplicates.aln')
        filterer = PreProcessFasta(source_alignment)

        # Each of the four samples is expected under its own hash.
        expected_hashes = {
            b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1'],
            b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample4'],
            b'\xc3n`\xf5t\x00\x1e\xf3\xde\nU\x1f\x95\x0b\xdb9': ['sample2'],
            b'\xdf\xedM\xdf1\xf2PPO\xc1\xf54T?\xdb\xa2': ['sample3'],
        }
        self.assertEqual(filterer.hash_sequences(), expected_hashes)

        expected_missing = {
            'sample1': 0.0,
            'sample2': 0.0,
            'sample3': 0.0,
            'sample4': 0.0,
        }
        self.assertEqual(
            filterer.calculate_sequences_missing_data_percentage(),
            expected_missing)

        self.assertEqual(filterer.taxa_of_duplicate_sequences(), [])

        filterer.remove_duplicate_sequences_and_sequences_missing_too_much_data(
            'output.aln', 1)
        # The expected output is the input file itself.
        reference_alignment = os.path.join(data_dir,
                                           'preprocessfasta/no_duplicates.aln')
        self.assertTrue(filecmp.cmp('output.aln', reference_alignment))
예제 #7
0
 def test_dont_filter_input_file_with_multiple_duplicate_sequences(self):
     """Check that passing ``0`` to the filtering call leaves the alignment unchanged."""
     source_alignment = 'gubbins/tests/data/preprocessfasta/multiple_duplicates.aln'
     filterer = PreProcessFasta(source_alignment)

     expected_hashes = {
         b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1', 'sample3'],
         b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample2', 'sample4'],
     }
     self.assertEqual(filterer.hash_sequences(), expected_hashes)

     expected_duplicates = ['sample1', 'sample2']
     self.assertEqual(filterer.taxa_of_duplicate_sequences(), expected_duplicates)

     filterer.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln', 0)
     # The expected output is the input file itself.
     files_match = filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/multiple_duplicates.aln')
     self.assertTrue(files_match)
예제 #8
0
 def test_input_file_with_one_duplicate_sequences(self):
     """Check hashing, duplicate detection and output for one_duplicate.aln."""
     fixture_dir = 'gubbins/tests/data/preprocessfasta/'
     filterer = PreProcessFasta(fixture_dir + 'one_duplicate.aln')

     # sample1 and sample3 share a hash; the other two samples are unique.
     expected_hashes = {
         b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1', 'sample3'],
         b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample4'],
         b'\xc3n`\xf5t\x00\x1e\xf3\xde\nU\x1f\x95\x0b\xdb9': ['sample2'],
     }
     self.assertEqual(filterer.hash_sequences(), expected_hashes)

     self.assertEqual(filterer.taxa_of_duplicate_sequences(), ['sample1'])

     filterer.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln', 1)
     files_match = filecmp.cmp('output.aln', fixture_dir + 'expected_one_duplicate.aln')
     self.assertTrue(files_match)
예제 #9
0
  def test_input_file_with_no_duplicate_sequences(self):
      """Check that an alignment without duplicates is written back unchanged.

      Also verifies the per-sequence hashes, the missing-data percentages
      and that no taxa are reported as duplicates.
      """
      source_alignment = 'gubbins/tests/data/preprocessfasta/no_duplicates.aln'
      filterer = PreProcessFasta(source_alignment)

      # Each of the four samples is expected under its own hash.
      expected_hashes = {
          b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1'],
          b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample4'],
          b'\xc3n`\xf5t\x00\x1e\xf3\xde\nU\x1f\x95\x0b\xdb9': ['sample2'],
          b'\xdf\xedM\xdf1\xf2PPO\xc1\xf54T?\xdb\xa2': ['sample3'],
      }
      self.assertEqual(filterer.hash_sequences(), expected_hashes)

      expected_missing = {
          'sample1': 0.0,
          'sample2': 0.0,
          'sample3': 0.0,
          'sample4': 0.0,
      }
      self.assertEqual(filterer.calculate_sequences_missing_data_percentage(), expected_missing)

      self.assertEqual(filterer.taxa_of_duplicate_sequences(), [])

      filterer.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
      # The expected output is the input file itself.
      files_match = filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/no_duplicates.aln')
      self.assertTrue(files_match)
예제 #10
0
    def parse_and_run(self):
        """Validate the inputs in ``self.args`` and run the iterative Gubbins pipeline.

        Steps, in order:

        1. Locate the ``gubbins`` executable (and FastTree when the tree
           builder is "fasttree" or "hybrid") and validate the convergence
           method, the input alignment and the optional starting tree.
        2. Write a filtered copy of the alignment into a temporary directory
           created under the current working directory and continue from
           that copy.
        3. Run the ``gubbins`` executable once on the filtered alignment.
        4. For up to ``self.args.iterations`` rounds: build (or copy) a tree,
           reroot it, reconstruct ancestral sequences, reinsert gaps, run
           ``gubbins`` with the generated command, and stop early once the
           selected convergence check reports a repeat.
        5. Unless ``self.args.no_cleanup`` is set, delete intermediate files
           and the temporary directory.
        6. Rename the outputs of the last iteration to ``self.args.prefix``
           names and rewrite the final tree without internal node labels.

        Any validation or subprocess failure terminates the process through
        ``sys.exit`` with a message. ``self.args.alignment_filename``,
        ``self.args.starting_tree`` and (when it is ``None``)
        ``self.args.prefix`` are overwritten along the way.
        """
        # Default parameters
        raxml_executable_obj = RAxMLExecutable(self.args.threads, self.args.raxml_model, self.args.verbose)

        fasttree_executables = ["FastTree", "fasttree"]
        FASTTREE_EXEC = GubbinsCommon.choose_executable(fasttree_executables)

        FASTTREE_PARAMS = "-nosupport -gtr -gamma -nt"
        GUBBINS_EXEC = "gubbins"

        GUBBINS_BUNDLED_EXEC = "../src/gubbins"

        # check that all the external executable dependencies are available
        if GubbinsCommon.which(GUBBINS_EXEC) is None:
            # Fall back to the bundled executable before giving up.
            GUBBINS_EXEC = GubbinsCommon.use_bundled_exec(GUBBINS_EXEC, GUBBINS_BUNDLED_EXEC)
            if GubbinsCommon.which(GUBBINS_EXEC) is None:
                sys.exit(GUBBINS_EXEC + " is not in your path")

        # FastTree is only required by the "fasttree" and "hybrid" builders.
        if self.args.tree_builder == "fasttree" or self.args.tree_builder == "hybrid":
            if GubbinsCommon.which(FASTTREE_EXEC) is None:
                sys.exit("FastTree is not in your path")

        if self.args.converge_method not in ["weighted_robinson_foulds", "robinson_foulds", "recombination"]:
            sys.exit(
                "Please choose weighted_robinson_foulds, robinson_foulds or recombination for the --converge_method option"
            )

        if (
            GubbinsCommon.does_file_exist(self.args.alignment_filename, "Alignment File") == 0
            or not ValidateFastaAlignment(self.args.alignment_filename).is_input_fasta_file_valid()
        ):
            sys.exit("There is a problem with your input fasta file so nothing can be done until you fix it")

        # NOTE(review): the validity check below is used without a comparison,
        # so the run aborts when is_input_starting_tree_valid() returns a
        # truthy value. Confirm against that helper's return convention that
        # this is not inverted.
        if (
            self.args.starting_tree is not None
            and self.args.starting_tree != ""
            and (
                GubbinsCommon.does_file_exist(self.args.starting_tree, "Starting Tree") == 0
                or GubbinsCommon.is_input_starting_tree_valid(self.args.starting_tree)
            )
        ):
            sys.exit("The starting tree is invalid")

        if (
            self.args.starting_tree is not None
            and self.args.starting_tree != ""
            and GubbinsCommon.do_the_names_match_the_fasta_file(self.args.starting_tree, self.args.alignment_filename)
            == 0
        ):
            sys.exit("The names in the starting tree dont match the names in the fasta file")

        GubbinsCommon.check_and_fix_window_size(self)

        # Optional "<epoch seconds>." fragment that is embedded in generated
        # file names; it stays empty when time stamps are disabled.
        current_time = ""
        if self.args.use_time_stamp > 0:
            current_time = str(int(time.time())) + "."
            if self.args.verbose > 0:
                print(current_time)

        # get the base filename
        (base_directory, base_filename) = os.path.split(self.args.alignment_filename)
        (base_filename_without_ext, extension) = os.path.splitext(base_filename)
        starting_base_filename = base_filename

        # put a filtered copy into a temp directory and work from that
        temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())

        pre_process_fasta = PreProcessFasta(
            self.args.alignment_filename, self.args.verbose, self.args.filter_percentage
        )
        taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
            temp_working_dir + "/" + starting_base_filename, self.args.remove_identical_sequences
        )

        # From here on the filtered copy replaces the user-supplied alignment.
        self.args.alignment_filename = temp_working_dir + "/" + starting_base_filename

        # If a starting tree has been provided make sure that taxa filtered out in the previous step are removed from the tree
        self.args.starting_tree = GubbinsCommon.filter_out_removed_taxa_from_tree_and_return_new_file(
            self.args.starting_tree, temp_working_dir, taxa_removed
        )

        # get the base filename
        # (recomputed from the filtered copy; the file name component is the
        # same one that was appended to temp_working_dir above)
        (base_directory, base_filename) = os.path.split(self.args.alignment_filename)
        (base_filename_without_ext, extension) = os.path.splitext(base_filename)
        starting_base_filename = base_filename

        if len(base_filename) > 115:
            sys.exit(
                "Your filename is too long for RAxML at "
                + str(len(base_filename))
                + " characters, please shorten it to less than 115 characters"
            )

        # find all snp sites
        if self.args.verbose > 0:
            print(GUBBINS_EXEC + " " + self.args.alignment_filename)
        # NOTE(review): the bare except clauses in this method also catch
        # KeyboardInterrupt and SystemExit and hide the original error.
        try:
            subprocess.check_call([GUBBINS_EXEC, self.args.alignment_filename])
        except:
            sys.exit("Gubbins crashed, please ensure you have enough free memory")

        if self.args.verbose > 0:
            print(int(time.time()))

        # Produces the ".start" file that is copied back over
        # ".gaps.snp_sites.aln" at the start of every iteration's gap
        # reinsertion step further down.
        GubbinsCommon.reconvert_fasta_file(
            starting_base_filename + ".gaps.snp_sites.aln", starting_base_filename + ".start"
        )

        # NOTE(review): the condition rejects fewer than 3 sequences while
        # the message asks for 4 or more; confirm which one is intended.
        number_of_sequences = GubbinsCommon.number_of_sequences_in_alignment(self.args.alignment_filename)
        if number_of_sequences < 3:
            sys.exit("4 or more sequences are required.")

        latest_file_name = "latest_tree." + base_filename_without_ext + "." + str(current_time) + "tre"
        tree_file_names = []

        tree_building_command = ""
        gubbins_command = ""
        previous_tree_name = ""
        current_tree_name = ""
        # Incremented at the top of every iteration, so after the loop
        # (max_iteration - 1) is the number of the last iteration started.
        max_iteration = 1

        raxml_files_to_delete = GubbinsCommon.raxml_regex_for_file_deletions(
            base_filename_without_ext, current_time, starting_base_filename, self.args.iterations
        )
        # cleanup RAxML intermediate files
        if self.args.no_cleanup == 0 or self.args.no_cleanup is None:
            GubbinsCommon.delete_files_based_on_list_of_regexes(".", raxml_files_to_delete, self.args.verbose)

        # Refuse to run on top of leftovers from an earlier run.
        if GubbinsCommon.check_file_exist_based_on_list_of_regexes(".", raxml_files_to_delete, self.args.verbose) == 1:
            sys.exit(
                "Intermediate files from a previous run exist. Please rerun without the --no_cleanup option to automatically delete them or with the --use_time_stamp to add a unique prefix."
            )

        for i in range(1, self.args.iterations + 1):
            max_iteration += 1

            # Select tree names and the two shell commands for this iteration
            # according to the configured tree builder.
            if self.args.tree_builder == "hybrid":
                # Hybrid: FastTree for the first iteration, RAxML afterwards.
                if i == 1:
                    previous_tree_name = GubbinsCommon.fasttree_previous_tree_name(base_filename, i)
                    current_tree_name = GubbinsCommon.fasttree_current_tree_name(base_filename, i)
                    tree_building_command = GubbinsCommon.fasttree_tree_building_command(
                        i,
                        self.args.starting_tree,
                        current_tree_name,
                        base_filename,
                        previous_tree_name,
                        FASTTREE_EXEC,
                        FASTTREE_PARAMS,
                        base_filename,
                    )
                    gubbins_command = GubbinsCommon.fasttree_gubbins_command(
                        base_filename,
                        starting_base_filename + ".gaps",
                        i,
                        self.args.alignment_filename,
                        GUBBINS_EXEC,
                        self.args.min_snps,
                        self.args.alignment_filename,
                        self.args.min_window_size,
                        self.args.max_window_size,
                    )

                elif i == 2:
                    # The second iteration starts RAxML from the tree of the
                    # FastTree iteration.
                    previous_tree_name = current_tree_name
                    current_tree_name = GubbinsCommon.raxml_current_tree_name(
                        base_filename_without_ext, current_time, i
                    )
                    tree_building_command = GubbinsCommon.raxml_tree_building_command(
                        i,
                        base_filename_without_ext,
                        base_filename,
                        current_time,
                        raxml_executable_obj.tree_building_command(),
                        previous_tree_name,
                        self.args.verbose,
                    )
                    gubbins_command = GubbinsCommon.raxml_gubbins_command(
                        base_filename_without_ext,
                        starting_base_filename + ".gaps",
                        current_time,
                        i,
                        self.args.alignment_filename,
                        GUBBINS_EXEC,
                        self.args.min_snps,
                        self.args.alignment_filename,
                        self.args.min_window_size,
                        self.args.max_window_size,
                    )
                else:
                    previous_tree_name = GubbinsCommon.raxml_previous_tree_name(
                        base_filename_without_ext, base_filename, current_time, i
                    )
                    current_tree_name = GubbinsCommon.raxml_current_tree_name(
                        base_filename_without_ext, current_time, i
                    )
                    tree_building_command = GubbinsCommon.raxml_tree_building_command(
                        i,
                        base_filename_without_ext,
                        base_filename,
                        current_time,
                        raxml_executable_obj.tree_building_command(),
                        previous_tree_name,
                        self.args.verbose,
                    )
                    gubbins_command = GubbinsCommon.raxml_gubbins_command(
                        base_filename_without_ext,
                        starting_base_filename + ".gaps",
                        current_time,
                        i,
                        self.args.alignment_filename,
                        GUBBINS_EXEC,
                        self.args.min_snps,
                        self.args.alignment_filename,
                        self.args.min_window_size,
                        self.args.max_window_size,
                    )

            elif self.args.tree_builder == "raxml":
                previous_tree_name = GubbinsCommon.raxml_previous_tree_name(
                    base_filename_without_ext, base_filename, current_time, i
                )
                current_tree_name = GubbinsCommon.raxml_current_tree_name(base_filename_without_ext, current_time, i)
                tree_building_command = GubbinsCommon.raxml_tree_building_command(
                    i,
                    base_filename_without_ext,
                    base_filename,
                    current_time,
                    raxml_executable_obj.tree_building_command(),
                    previous_tree_name,
                    self.args.verbose,
                )

                gubbins_command = GubbinsCommon.raxml_gubbins_command(
                    base_filename_without_ext,
                    starting_base_filename + ".gaps",
                    current_time,
                    i,
                    self.args.alignment_filename,
                    GUBBINS_EXEC,
                    self.args.min_snps,
                    self.args.alignment_filename,
                    self.args.min_window_size,
                    self.args.max_window_size,
                )

            elif self.args.tree_builder == "fasttree":
                previous_tree_name = GubbinsCommon.fasttree_previous_tree_name(base_filename, i)
                # On the first iteration the base file name is used in place
                # of a previous tree name.
                if i == 1:
                    previous_tree_name = base_filename
                current_tree_name = GubbinsCommon.fasttree_current_tree_name(base_filename, i)

                tree_building_command = GubbinsCommon.fasttree_tree_building_command(
                    i,
                    self.args.starting_tree,
                    current_tree_name,
                    previous_tree_name,
                    previous_tree_name,
                    FASTTREE_EXEC,
                    FASTTREE_PARAMS,
                    base_filename,
                )
                gubbins_command = GubbinsCommon.fasttree_gubbins_command(
                    base_filename,
                    starting_base_filename + ".gaps",
                    i,
                    self.args.alignment_filename,
                    GUBBINS_EXEC,
                    self.args.min_snps,
                    self.args.alignment_filename,
                    self.args.min_window_size,
                    self.args.max_window_size,
                )

            if self.args.verbose > 0:
                print(tree_building_command)

            # A supplied starting tree replaces the tree build on the first
            # iteration only.
            if self.args.starting_tree is not None and i == 1:
                shutil.copyfile(self.args.starting_tree, current_tree_name)
            else:
                try:
                    subprocess.check_call(tree_building_command, shell=True)
                except:
                    sys.exit("Failed while building the tree.")

            if self.args.verbose > 0:
                print(int(time.time()))

            GubbinsCommon.reroot_tree(str(current_tree_name), self.args.outgroup)

            try:
                # current_tree_name is passed twice (second and fourth
                # argument).
                raxml_seq_recon = RAxMLSequenceReconstruction(
                    starting_base_filename + ".snp_sites.aln",
                    current_tree_name,
                    starting_base_filename + ".seq.joint.txt",
                    current_tree_name,
                    raxml_executable_obj.internal_sequence_reconstruction_command(),
                    self.args.verbose,
                )
                raxml_seq_recon.reconstruct_ancestor_sequences()

            except:
                sys.exit("Failed while running RAxML internal sequence reconstruction")

            # Restore the saved ".start" copy before gaps are reinserted.
            shutil.copyfile(starting_base_filename + ".start", starting_base_filename + ".gaps.snp_sites.aln")
            GubbinsCommon.reinsert_gaps_into_fasta_file(
                starting_base_filename + ".seq.joint.txt",
                starting_base_filename + ".gaps.vcf",
                starting_base_filename + ".gaps.snp_sites.aln",
            )

            # Re-validate the rebuilt alignment before handing it to gubbins.
            if (
                GubbinsCommon.does_file_exist(starting_base_filename + ".gaps.snp_sites.aln", "Alignment File") == 0
                or not ValidateFastaAlignment(
                    starting_base_filename + ".gaps.snp_sites.aln"
                ).is_input_fasta_file_valid()
            ):
                sys.exit(
                    "There is a problem with your FASTA file after running RAxML internal sequence reconstruction. Please check this intermediate file is valid: "
                    + str(starting_base_filename)
                    + ".gaps.snp_sites.aln"
                )

            if self.args.verbose > 0:
                print(int(time.time()))

            if self.args.verbose > 0:
                print(gubbins_command)
            try:
                subprocess.check_call(gubbins_command, shell=True)
            except:
                sys.exit("Failed while running Gubbins. Please ensure you have enough free memory")
            if self.args.verbose > 0:
                print(int(time.time()))

            tree_file_names.append(current_tree_name)
            # Convergence is only checked from the third iteration onwards.
            if i > 2:
                if self.args.converge_method == "recombination":
                    current_recomb_file, previous_recomb_files = GubbinsCommon.get_recombination_files(
                        base_filename_without_ext,
                        current_time,
                        max_iteration - 1,
                        starting_base_filename,
                        self.args.tree_builder,
                    )

                    if GubbinsCommon.have_recombinations_been_seen_before(current_recomb_file, previous_recomb_files):
                        if self.args.verbose > 0:
                            print("Recombinations observed before so stopping: " + str(current_tree_name))
                        break
                else:
                    if GubbinsCommon.has_tree_been_seen_before(tree_file_names, self.args.converge_method):
                        if self.args.verbose > 0:
                            print("Tree observed before so stopping: " + str(current_tree_name))
                        break

        # cleanup intermediate files
        # The temporary working directory is removed only in this branch, so
        # it is left in place when no_cleanup is set.
        if self.args.no_cleanup == 0 or self.args.no_cleanup is None:
            max_intermediate_iteration = max_iteration - 1

            raxml_files_to_delete = GubbinsCommon.raxml_regex_for_file_deletions(
                base_filename_without_ext, current_time, starting_base_filename, max_intermediate_iteration
            )
            GubbinsCommon.delete_files_based_on_list_of_regexes(".", raxml_files_to_delete, self.args.verbose)

            fasttree_files_to_delete = GubbinsCommon.fasttree_regex_for_file_deletions(
                starting_base_filename, max_intermediate_iteration
            )
            GubbinsCommon.delete_files_based_on_list_of_regexes(".", fasttree_files_to_delete, self.args.verbose)
            shutil.rmtree(temp_working_dir)

            GubbinsCommon.delete_files_based_on_list_of_regexes(
                ".", [GubbinsCommon.starting_files_regex("^" + starting_base_filename), "^log.txt"], self.args.verbose
            )

        # Map the last iteration's output files onto their final names; the
        # prefix defaults to the alignment's base name without extension.
        output_filenames_to_final_filenames = {}
        if self.args.prefix is None:
            self.args.prefix = base_filename_without_ext
        if self.args.tree_builder == "hybrid" or self.args.tree_builder == "raxml":
            output_filenames_to_final_filenames = GubbinsCommon.translation_of_raxml_filenames_to_final_filenames(
                base_filename_without_ext, current_time, max_iteration - 1, self.args.prefix
            )
        else:
            output_filenames_to_final_filenames = GubbinsCommon.translation_of_fasttree_filenames_to_final_filenames(
                starting_base_filename, max_iteration - 1, self.args.prefix
            )
        GubbinsCommon.rename_files(output_filenames_to_final_filenames)
        # Write a label-free copy of the final tree, then move it over the
        # original final tree file.
        GubbinsCommon.remove_internal_node_labels_from_tree(
            str(self.args.prefix) + ".final_tree.tre", str(self.args.prefix) + ".no_internal_labels.final_tree.tre"
        )
        shutil.move(
            str(self.args.prefix) + ".no_internal_labels.final_tree.tre", str(self.args.prefix) + ".final_tree.tre"
        )
예제 #11
0
 def test_filter_out_alignments_with_too_much_missing_data(self):
   """Filter missing_data.aln, compare with the expected fixture, then clean up."""
   source_alignment = 'gubbins/tests/data/preprocessfasta/missing_data.aln'
   expected_alignment = 'gubbins/tests/data/preprocessfasta/expected_missing_data.aln'

   filterer = PreProcessFasta(source_alignment, False, 5)
   filterer.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')

   files_match = filecmp.cmp('output.aln', expected_alignment)
   self.assertTrue(files_match)
   self.cleanup()
예제 #12
0
파일: common.py 프로젝트: puethe/gubbins
def parse_and_run(input_args, program_description=""):
    """Main function of the Gubbins program.

    Validates the inputs, filters the alignment into a temporary working directory, runs the
    Gubbins executable once to find SNP sites, and then iterates (tree building -> rooting ->
    ancestral sequence reconstruction -> gap reinsertion -> recombination detection) until
    convergence or until ``input_args.iterations`` is reached. Finally the output files of the
    last iteration are renamed to their final names and intermediate files are removed.

    Args:
        input_args: Parsed command-line options. The attributes read here are
            ``alignment_filename``, ``starting_tree``, ``tree_builder``, ``threads``,
            ``raxml_model``, ``verbose``, ``use_time_stamp``, ``no_cleanup``,
            ``filter_percentage``, ``remove_identical_sequences``, ``iterations``, ``outgroup``,
            ``min_snps``, ``min_window_size``, ``max_window_size``, ``converge_method`` and
            ``prefix``. NOTE: ``alignment_filename``, ``starting_tree`` and ``prefix`` are
            overwritten in place (the first two are redirected to filtered copies in the
            temporary working directory).
        program_description: Free text printed below the welcome banner.

    Raises:
        SystemExit: Via ``sys.exit`` when the Gubbins executable cannot be found, when an input
            file is missing or invalid, when intermediate files of a previous run are in the
            way, or when one of the external programs fails.
    """
    start_time = time.time()
    current_directory = os.getcwd()
    printer = utils.VerbosePrinter(True, "\n")

    # Check if the Gubbins C-program is available. If so, print a welcome message. Otherwise exit.
    gubbins_exec = 'gubbins'
    if utils.which(gubbins_exec) is None:
        # Check if the Gubbins C-program is available in its source directory (for tests/Travis)
        gubbins_bundled_exec = os.path.abspath(os.path.join(current_directory, '../src/gubbins'))
        if utils.which(gubbins_bundled_exec) is None:
            sys.exit(gubbins_exec + " is not in your path")
        else:
            gubbins_exec = utils.replace_executable(gubbins_exec, gubbins_bundled_exec)
    program_version = ""
    try:
        program_version = str(pkg_resources.get_distribution(gubbins_exec).version)
    except (pkg_resources.RequirementParseError, pkg_resources.DistributionNotFound):
        # The version is only used in the banner; an executable without an installed
        # distribution (or one referenced by path) must not abort the run.
        pass
    printer.print(["\n--- Gubbins " + program_version + " ---\n", program_description])

    # Initialize tree builder and ancestral sequence reconstructor; check if all required dependencies are available
    printer.print("\nChecking dependencies...")
    current_tree_name = input_args.starting_tree
    tree_file_names = []
    internal_node_label_prefix = "internal_"
    if input_args.tree_builder == "fasttree" or input_args.tree_builder == "hybrid":
        tree_builder = FastTree(input_args.verbose)
        sequence_reconstructor = RAxML(input_args.threads, input_args.raxml_model, internal_node_label_prefix,
                                       input_args.verbose)
        alignment_suffix = ".snp_sites.aln"
    elif input_args.tree_builder == "raxml":
        tree_builder = RAxML(input_args.threads, input_args.raxml_model, internal_node_label_prefix, input_args.verbose)
        sequence_reconstructor = tree_builder
        alignment_suffix = ".phylip"
    else:
        tree_builder = IQTree(input_args.threads, internal_node_label_prefix, input_args.verbose)
        sequence_reconstructor = tree_builder
        alignment_suffix = ".phylip"
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

    # Check if the input files exist and have the right format
    printer.print("\nChecking input files...")
    if not os.path.exists(input_args.alignment_filename) \
            or not ValidateFastaAlignment(input_args.alignment_filename).is_input_fasta_file_valid():
        sys.exit("The input alignment file does not exist or has an invalid format")
    if input_args.starting_tree is not None and input_args.starting_tree != "" \
            and (not os.path.exists(input_args.starting_tree) or not is_starting_tree_valid(input_args.starting_tree)):
        sys.exit("The starting tree does not exist or has an invalid format")
    if input_args.starting_tree is not None and input_args.starting_tree != "" \
            and not do_the_names_match_the_fasta_file(input_args.starting_tree, input_args.alignment_filename):
        sys.exit("The names in the starting tree do not match the names in the alignment file")
    if number_of_sequences_in_alignment(input_args.alignment_filename) < 3:
        sys.exit("3 or more sequences are required.")

    # Check - and potentially correct - further input parameters
    check_and_fix_window_size(input_args)

    # Get the base filename. Note that `base_filename` keeps the extension and never carries the
    # time stamp, whereas `basename` has the extension stripped and optionally a time stamp added.
    (base_directory, base_filename) = os.path.split(input_args.alignment_filename)
    (basename, extension) = os.path.splitext(base_filename)
    if input_args.use_time_stamp:
        time_stamp = str(int(time.time()))
        basename = basename + "." + time_stamp
    snp_alignment_filename = base_filename + ".snp_sites.aln"
    gaps_alignment_filename = base_filename + ".gaps.snp_sites.aln"
    gaps_vcf_filename = base_filename + ".gaps.vcf"
    joint_sequences_filename = base_filename + ".seq.joint.aln"

    # Check if intermediate files from a previous run exist
    intermediate_files = [basename + ".iteration_"]
    if not input_args.no_cleanup:
        utils.delete_files(".", intermediate_files, "", input_args.verbose)
    if utils.do_files_exist(".", intermediate_files, "", input_args.verbose):
        sys.exit("Intermediate files from a previous run exist. Please rerun without the --no_cleanup option "
                 "to automatically delete them or with the --use_time_stamp to add a unique prefix.")
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

    # Filter the input alignment and save as temporary alignment file
    printer.print("\nFiltering input alignment...")
    temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())
    temp_alignment_filename = temp_working_dir + "/" + base_filename

    pre_process_fasta = PreProcessFasta(input_args.alignment_filename, input_args.verbose,
                                        input_args.filter_percentage)
    taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
        temp_alignment_filename, input_args.remove_identical_sequences)
    # From here on the filtered copy is the alignment everything else works with.
    input_args.alignment_filename = temp_alignment_filename

    # If a starting tree has been provided make sure that taxa filtered out in the previous step are removed from it
    if input_args.starting_tree:
        (tree_base_directory, tree_base_filename) = os.path.split(input_args.starting_tree)
        temp_starting_tree = temp_working_dir + '/' + tree_base_filename
        filter_out_removed_taxa_from_tree(input_args.starting_tree, temp_starting_tree, taxa_removed)
        input_args.starting_tree = temp_starting_tree
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

    # Find all SNP sites with Gubbins
    gubbins_command = " ".join([gubbins_exec, input_args.alignment_filename])
    printer.print(["\nRunning Gubbins to detect SNPs...", gubbins_command])
    try:
        subprocess.check_call(gubbins_command, shell=True)
    except subprocess.SubprocessError:
        sys.exit("Gubbins crashed, please ensure you have enough free memory")
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))
    reconvert_fasta_file(snp_alignment_filename, snp_alignment_filename)
    # Keep a pristine copy of the gaps alignment; it is restored at the start of every gap-reinsertion step.
    reconvert_fasta_file(gaps_alignment_filename, base_filename + ".start")

    # Start the main loop
    printer.print("\nEntering the main loop.")
    for i in range(1, input_args.iterations+1):
        printer.print("\n*** Iteration " + str(i) + " ***")

        # 1.1. Construct the tree-building command depending on the iteration and employed options
        if i == 2 and input_args.tree_builder == "hybrid":
            # Switch to RAxML
            tree_builder = sequence_reconstructor
            alignment_suffix = ".phylip"

        if i == 1:
            previous_tree_name = input_args.starting_tree
            alignment_filename = base_filename + alignment_suffix
        else:
            previous_tree_name = current_tree_name
            alignment_filename = previous_tree_name + alignment_suffix

        current_basename = basename + ".iteration_" + str(i)
        current_tree_name = current_basename + ".tre"
        if previous_tree_name:
            tree_building_command = tree_builder.tree_building_command(
                os.path.abspath(alignment_filename), os.path.abspath(previous_tree_name), current_basename)
        else:
            tree_building_command = tree_builder.tree_building_command(
                os.path.abspath(alignment_filename), "", current_basename)
        built_tree = temp_working_dir + "/" + tree_builder.tree_prefix + current_basename + tree_builder.tree_suffix

        # 1.2. Construct the phylogenetic tree
        # Truthiness (not `is not None`) so that an empty-string starting tree is treated as
        # "no starting tree", consistent with the input validation above.
        if input_args.starting_tree and i == 1:
            printer.print("\nCopying the starting tree...")
            shutil.copyfile(input_args.starting_tree, current_tree_name)
        else:
            printer.print(["\nConstructing the phylogenetic tree with " + tree_builder.executable + "...",
                           tree_building_command])
            # Run inside the temporary directory without changing this process's own working
            # directory, so an early exit cannot leave the caller stranded in the temp directory.
            try:
                subprocess.check_call(tree_building_command, shell=True, cwd=temp_working_dir)
            except subprocess.SubprocessError:
                sys.exit("Failed while building the tree.")
            shutil.copyfile(built_tree, current_tree_name)
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

        # 2. Re-root the tree
        reroot_tree(str(current_tree_name), input_args.outgroup)
        temp_rooted_tree = temp_working_dir + "/" + current_tree_name + ".rooted"
        if input_args.tree_builder == "iqtree":
            shutil.copyfile(current_tree_name, temp_rooted_tree)
        else:
            root_tree(current_tree_name, temp_rooted_tree)

        # 3.1. Construct the command for ancestral state reconstruction depending on the iteration and employed options
        ancestral_sequence_basename = current_basename + ".internal"
        sequence_reconstruction_command = sequence_reconstructor.internal_sequence_reconstruction_command(
            os.path.abspath(base_filename + alignment_suffix), os.path.abspath(temp_rooted_tree),
            ancestral_sequence_basename)
        raw_internal_sequence_filename \
            = temp_working_dir + "/" + sequence_reconstructor.asr_prefix \
            + ancestral_sequence_basename + sequence_reconstructor.asr_suffix
        processed_internal_sequence_filename = temp_working_dir + "/" + ancestral_sequence_basename + ".aln"
        raw_internal_rooted_tree_filename \
            = temp_working_dir + "/" + sequence_reconstructor.asr_tree_prefix \
            + ancestral_sequence_basename + sequence_reconstructor.asr_tree_suffix

        # 3.2. Reconstruct the ancestral sequence
        printer.print(["\nReconstructing ancestral sequences with " + sequence_reconstructor.executable + "...",
                       sequence_reconstruction_command])
        try:
            subprocess.check_call(sequence_reconstruction_command, shell=True, cwd=temp_working_dir)
        except subprocess.SubprocessError:
            sys.exit("Failed while reconstructing the ancestral sequences.")

        # 3.3. Join ancestral sequences with given sequences
        current_tree_name_with_internal_nodes = current_tree_name + ".internal"
        sequence_reconstructor.convert_raw_ancestral_states_to_fasta(raw_internal_sequence_filename,
                                                                     processed_internal_sequence_filename)
        concatenate_fasta_files([snp_alignment_filename, processed_internal_sequence_filename],
                                joint_sequences_filename)
        transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, temp_rooted_tree,
                                              current_tree_name_with_internal_nodes, sequence_reconstructor)
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

        # 4. Reinsert gaps (cp15 note: something is wonky here, the process is at the very least terribly inefficient)
        printer.print("\nReinserting gaps into the alignment...")
        shutil.copyfile(base_filename + ".start", gaps_alignment_filename)
        reinsert_gaps_into_fasta_file(joint_sequences_filename, gaps_vcf_filename, gaps_alignment_filename)
        if not os.path.exists(gaps_alignment_filename) \
                or not ValidateFastaAlignment(gaps_alignment_filename).is_input_fasta_file_valid():
            sys.exit("There is a problem with your FASTA file after running internal sequence reconstruction. "
                     "Please check this intermediate file is valid: " + gaps_alignment_filename)
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

        # 5. Detect recombination sites with Gubbins (cp15 note: copy file with internal nodes back and forth to
        # ensure all created files have the desired name structure and to avoid fiddling with the Gubbins C program)
        shutil.copyfile(current_tree_name_with_internal_nodes, current_tree_name)
        gubbins_command = create_gubbins_command(
            gubbins_exec, gaps_alignment_filename, gaps_vcf_filename, current_tree_name,
            input_args.alignment_filename, input_args.min_snps, input_args.min_window_size, input_args.max_window_size)
        printer.print(["\nRunning Gubbins to detect recombinations...", gubbins_command])
        try:
            subprocess.check_call(gubbins_command, shell=True)
        except subprocess.SubprocessError:
            sys.exit("Failed while running Gubbins. Please ensure you have enough free memory")
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))
        shutil.copyfile(current_tree_name, current_tree_name_with_internal_nodes)

        # 6. Check for convergence
        printer.print("\nChecking for convergence...")
        remove_internal_node_labels_from_tree(current_tree_name_with_internal_nodes, current_tree_name)
        tree_file_names.append(current_tree_name)
        if i > 1:
            if input_args.converge_method == 'recombination':
                current_recomb_file, previous_recomb_files = get_recombination_files(tree_file_names)
                if have_recombinations_been_seen_before(current_recomb_file, previous_recomb_files):
                    printer.print("Convergence after " + str(i) + " iterations: Recombinations observed before.")
                    break
            else:
                if has_tree_been_seen_before(tree_file_names, input_args.converge_method):
                    printer.print("Convergence after " + str(i) + " iterations: Tree observed before.")
                    break
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))
    else:
        # Only reached when the loop ran to completion without a convergence `break`.
        printer.print("Maximum number of iterations (" + str(input_args.iterations) + ") reached.")
    printer.print("\nExiting the main loop.")

    # Create the final output
    printer.print("\nCreating the final output...")
    if input_args.prefix is None:
        input_args.prefix = basename
    output_filenames_to_final_filenames = translation_of_filenames_to_final_filenames(
        current_tree_name, input_args.prefix)
    utils.rename_files(output_filenames_to_final_filenames)

    # Cleanup intermediate files
    if not input_args.no_cleanup:
        shutil.rmtree(temp_working_dir)
        # The last tree's files were renamed to the final outputs above, hence `[:-1]`.
        utils.delete_files(".", tree_file_names[:-1], intermediate_files_regex(), input_args.verbose)
        utils.delete_files(".", [base_filename], starting_files_regex(), input_args.verbose)
    printer.print("...finished. Total run time: {:.2f} s".format(time.time() - start_time))