def test_input_file_with_no_duplicate_sequences(self):
    """Check that an alignment without duplicates is written out unchanged."""
    alignment_path = os.path.join(data_dir, 'preprocessfasta/no_duplicates.aln')
    preprocessfasta = PreProcessFasta(alignment_path)

    # Each hash key is expected to map to exactly one sample.
    expected_hashes = {
        b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1'],
        b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample4'],
        b'\xc3n`\xf5t\x00\x1e\xf3\xde\nU\x1f\x95\x0b\xdb9': ['sample2'],
        b'\xdf\xedM\xdf1\xf2PPO\xc1\xf54T?\xdb\xa2': ['sample3'],
    }
    self.assertEqual(preprocessfasta.hash_sequences(), expected_hashes)

    # No sample is expected to contain any missing data.
    expected_missing = {
        sample: 0.0
        for sample in ('sample1', 'sample2', 'sample3', 'sample4')
    }
    self.assertEqual(
        preprocessfasta.calculate_sequences_missing_data_percentage(),
        expected_missing)

    self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(), [])

    # The filtered output must be identical to the input alignment.
    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
        'output.aln', 1)
    output_matches_input = filecmp.cmp('output.aln', alignment_path)
    self.assertTrue(output_matches_input)
def test_dont_filter_input_file_with_multiple_duplicate_sequences(self):
    """Check that duplicates are kept when the second argument is 0."""
    alignment_path = 'gubbins/tests/data/preprocessfasta/multiple_duplicates.aln'
    preprocessfasta = PreProcessFasta(alignment_path)

    # Two hash keys, each shared by a pair of samples.
    expected_hashes = {
        b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1', 'sample3'],
        b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample2', 'sample4'],
    }
    self.assertEqual(preprocessfasta.hash_sequences(), expected_hashes)

    expected_duplicates = ['sample1', 'sample2']
    self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(), expected_duplicates)

    # With 0 as second argument the output is expected to equal the input file.
    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
        'output.aln', 0)
    output_matches_input = filecmp.cmp('output.aln', alignment_path)
    self.assertTrue(output_matches_input)
def test_filter_out_alignments_with_too_much_missing_data(self):
    """Check the filtered output against the expected missing-data fixture."""
    fixture_dir = 'gubbins/tests/data/preprocessfasta/'
    input_alignment = fixture_dir + 'missing_data.aln'
    expected_alignment = fixture_dir + 'expected_missing_data.aln'

    # Second and third constructor arguments: False and 5, as in the fixture setup.
    preprocessfasta = PreProcessFasta(input_alignment, False, 5)
    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
        'output.aln', 1)

    output_matches_expected = filecmp.cmp('output.aln', expected_alignment)
    self.assertTrue(output_matches_expected)
def test_input_file_with_multiple_duplicate_sequences(self):
    """Check hashing, duplicate detection and filtering with two duplicate pairs."""
    fixture_dir = 'gubbins/tests/data/preprocessfasta/'
    preprocessfasta = PreProcessFasta(fixture_dir + 'multiple_duplicates.aln')

    # Two hash keys, each shared by a pair of samples.
    expected_hashes = {
        b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1', 'sample3'],
        b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample2', 'sample4'],
    }
    self.assertEqual(preprocessfasta.hash_sequences(), expected_hashes)

    expected_duplicates = ['sample1', 'sample2']
    self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(), expected_duplicates)

    # Filtering is called with its default second argument here.
    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
    output_matches_expected = filecmp.cmp(
        'output.aln', fixture_dir + 'expected_multiple_duplicates.aln')
    self.assertTrue(output_matches_expected)
def test_filter_out_alignments_with_too_much_missing_data(self):
    """Check the filtered output against the expected missing-data fixture."""
    input_alignment = os.path.join(data_dir, 'preprocessfasta/missing_data.aln')

    # Second and third constructor arguments: False and 5, as in the fixture setup.
    preprocessfasta = PreProcessFasta(input_alignment, False, 5)
    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
        'output.aln', 1)

    expected_alignment = os.path.join(
        data_dir, 'preprocessfasta/expected_missing_data.aln')
    output_matches_expected = filecmp.cmp('output.aln', expected_alignment)
    self.assertTrue(output_matches_expected)
def test_input_file_with_one_duplicate_sequences(self):
    """Check hashing, duplicate detection and filtering with one duplicate pair."""
    fixture_dir = 'gubbins/tests/data/preprocessfasta/'
    preprocessfasta = PreProcessFasta(fixture_dir + 'one_duplicate.aln')

    # sample1 and sample3 share a hash key; the other samples have their own.
    expected_hashes = {
        b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1', 'sample3'],
        b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample4'],
        b'\xc3n`\xf5t\x00\x1e\xf3\xde\nU\x1f\x95\x0b\xdb9': ['sample2'],
    }
    self.assertEqual(preprocessfasta.hash_sequences(), expected_hashes)

    expected_duplicates = ['sample1']
    self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(), expected_duplicates)

    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
        'output.aln', 1)
    output_matches_expected = filecmp.cmp(
        'output.aln', fixture_dir + 'expected_one_duplicate.aln')
    self.assertTrue(output_matches_expected)
def test_input_file_with_no_duplicate_sequences(self):
    """Check that an alignment without duplicates is written out unchanged."""
    alignment_path = 'gubbins/tests/data/preprocessfasta/no_duplicates.aln'
    preprocessfasta = PreProcessFasta(alignment_path)

    # Each hash key is expected to map to exactly one sample.
    expected_hashes = {
        b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1'],
        b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample4'],
        b'\xc3n`\xf5t\x00\x1e\xf3\xde\nU\x1f\x95\x0b\xdb9': ['sample2'],
        b'\xdf\xedM\xdf1\xf2PPO\xc1\xf54T?\xdb\xa2': ['sample3'],
    }
    self.assertEqual(preprocessfasta.hash_sequences(), expected_hashes)

    # No sample is expected to contain any missing data.
    expected_missing = {
        sample: 0.0
        for sample in ('sample1', 'sample2', 'sample3', 'sample4')
    }
    self.assertEqual(
        preprocessfasta.calculate_sequences_missing_data_percentage(),
        expected_missing)

    self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(), [])

    # Filtering is called with its default second argument here.
    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
    output_matches_input = filecmp.cmp('output.aln', alignment_path)
    self.assertTrue(output_matches_input)
def parse_and_run(self):
    """Run the whole Gubbins workflow for the options held in ``self.args``.

    Steps, in order:

    1. Check that the ``gubbins`` executable (installed or bundled) and, for
       the ``fasttree``/``hybrid`` tree builders, FastTree can be found.
    2. Validate the convergence method, the input alignment and the optional
       starting tree.
    3. Write a filtered copy of the alignment into a temporary directory
       created inside the current working directory and work from that copy.
    4. Run the ``gubbins`` executable on the filtered alignment.
    5. For each iteration: build (or copy) a tree, re-root it, reconstruct
       internal sequences, reinsert gaps, run ``gubbins`` again and stop early
       once a tree/recombination set is seen again (checked from the third
       iteration onwards).
    6. Delete intermediate files unless ``self.args.no_cleanup`` is set,
       rename the outputs using ``self.args.prefix`` and strip internal node
       labels from the final tree.

    Side effects: ``self.args.alignment_filename``, ``self.args.starting_tree``
    and (when it is ``None``) ``self.args.prefix`` are overwritten.

    Raises:
        SystemExit: via ``sys.exit`` with an explanatory message whenever a
            dependency, an input file or an external command fails.
    """
    # Default parameters
    raxml_executable_obj = RAxMLExecutable(self.args.threads, self.args.raxml_model, self.args.verbose)

    fasttree_executables = ["FastTree", "fasttree"]
    FASTTREE_EXEC = GubbinsCommon.choose_executable(fasttree_executables)
    FASTTREE_PARAMS = "-nosupport -gtr -gamma -nt"
    GUBBINS_EXEC = "gubbins"
    GUBBINS_BUNDLED_EXEC = "../src/gubbins"

    # Check that all the external executable dependencies are available
    if GubbinsCommon.which(GUBBINS_EXEC) is None:
        GUBBINS_EXEC = GubbinsCommon.use_bundled_exec(GUBBINS_EXEC, GUBBINS_BUNDLED_EXEC)
        if GubbinsCommon.which(GUBBINS_EXEC) is None:
            sys.exit(GUBBINS_EXEC + " is not in your path")

    if self.args.tree_builder == "fasttree" or self.args.tree_builder == "hybrid":
        if GubbinsCommon.which(FASTTREE_EXEC) is None:
            sys.exit("FastTree is not in your path")

    if self.args.converge_method not in ["weighted_robinson_foulds", "robinson_foulds", "recombination"]:
        sys.exit(
            "Please choose weighted_robinson_foulds, robinson_foulds or recombination for the --converge_method option"
        )

    if (
        GubbinsCommon.does_file_exist(self.args.alignment_filename, "Alignment File") == 0
        or not ValidateFastaAlignment(self.args.alignment_filename).is_input_fasta_file_valid()
    ):
        sys.exit("There is a problem with your input fasta file so nothing can be done until you fix it")

    starting_tree_given = self.args.starting_tree is not None and self.args.starting_tree != ""

    # Reject the starting tree when it is missing or fails validation.
    # The check previously exited when the validator returned a truthy value;
    # this assumes is_input_starting_tree_valid is truthy for a valid tree, as
    # its name indicates.
    if starting_tree_given and (
        GubbinsCommon.does_file_exist(self.args.starting_tree, "Starting Tree") == 0
        or not GubbinsCommon.is_input_starting_tree_valid(self.args.starting_tree)
    ):
        sys.exit("The starting tree is invalid")

    if (
        starting_tree_given
        and GubbinsCommon.do_the_names_match_the_fasta_file(self.args.starting_tree, self.args.alignment_filename)
        == 0
    ):
        sys.exit("The names in the starting tree dont match the names in the fasta file")

    GubbinsCommon.check_and_fix_window_size(self)

    # Optional timestamp component (with trailing dot) used in output file names
    current_time = ""
    if self.args.use_time_stamp > 0:
        current_time = str(int(time.time())) + "."
        if self.args.verbose > 0:
            print(current_time)

    # The filtered copy keeps the file name of the original alignment
    starting_base_filename = os.path.split(self.args.alignment_filename)[1]

    # Put a filtered copy into a temp directory and work from that
    temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())

    pre_process_fasta = PreProcessFasta(
        self.args.alignment_filename, self.args.verbose, self.args.filter_percentage
    )
    taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
        temp_working_dir + "/" + starting_base_filename, self.args.remove_identical_sequences
    )
    self.args.alignment_filename = temp_working_dir + "/" + starting_base_filename

    # If a starting tree has been provided make sure that taxa filtered out in the previous step
    # are removed from the tree
    self.args.starting_tree = GubbinsCommon.filter_out_removed_taxa_from_tree_and_return_new_file(
        self.args.starting_tree, temp_working_dir, taxa_removed
    )

    # Get the base filename of the (now temporary) alignment
    base_filename = os.path.split(self.args.alignment_filename)[1]
    base_filename_without_ext = os.path.splitext(base_filename)[0]
    starting_base_filename = base_filename

    if len(base_filename) > 115:
        sys.exit(
            "Your filename is too long for RAxML at "
            + str(len(base_filename))
            + " characters, please shorten it to less than 115 characters"
        )

    # Find all snp sites
    if self.args.verbose > 0:
        print(GUBBINS_EXEC + " " + self.args.alignment_filename)
    try:
        subprocess.check_call([GUBBINS_EXEC, self.args.alignment_filename])
    except (subprocess.CalledProcessError, OSError):
        # OSError covers an executable that cannot be started at all
        sys.exit("Gubbins crashed, please ensure you have enough free memory")
    if self.args.verbose > 0:
        print(int(time.time()))

    GubbinsCommon.reconvert_fasta_file(
        starting_base_filename + ".gaps.snp_sites.aln", starting_base_filename + ".start"
    )

    number_of_sequences = GubbinsCommon.number_of_sequences_in_alignment(self.args.alignment_filename)
    if number_of_sequences < 3:
        # The message now states the threshold the condition actually enforces
        sys.exit("3 or more sequences are required.")

    tree_file_names = []
    tree_building_command = ""
    gubbins_command = ""
    previous_tree_name = ""
    current_tree_name = ""
    # Always one ahead of the number of iterations started so far
    max_iteration = 1

    raxml_files_to_delete = GubbinsCommon.raxml_regex_for_file_deletions(
        base_filename_without_ext, current_time, starting_base_filename, self.args.iterations
    )
    # Cleanup RAxML intermediate files
    if self.args.no_cleanup == 0 or self.args.no_cleanup is None:
        GubbinsCommon.delete_files_based_on_list_of_regexes(".", raxml_files_to_delete, self.args.verbose)

    if GubbinsCommon.check_file_exist_based_on_list_of_regexes(".", raxml_files_to_delete, self.args.verbose) == 1:
        sys.exit(
            "Intermediate files from a previous run exist. Please rerun without the --no_cleanup option to automatically delete them or with the --use_time_stamp to add a unique prefix."
        )

    for i in range(1, self.args.iterations + 1):
        max_iteration += 1

        if self.args.tree_builder == "hybrid":
            # Hybrid mode: FastTree for the first iteration, RAxML afterwards
            if i == 1:
                previous_tree_name = GubbinsCommon.fasttree_previous_tree_name(base_filename, i)
                current_tree_name = GubbinsCommon.fasttree_current_tree_name(base_filename, i)
                tree_building_command = GubbinsCommon.fasttree_tree_building_command(
                    i,
                    self.args.starting_tree,
                    current_tree_name,
                    base_filename,
                    previous_tree_name,
                    FASTTREE_EXEC,
                    FASTTREE_PARAMS,
                    base_filename,
                )
                gubbins_command = GubbinsCommon.fasttree_gubbins_command(
                    base_filename,
                    starting_base_filename + ".gaps",
                    i,
                    self.args.alignment_filename,
                    GUBBINS_EXEC,
                    self.args.min_snps,
                    self.args.alignment_filename,
                    self.args.min_window_size,
                    self.args.max_window_size,
                )
            elif i == 2:
                # The FastTree tree of iteration 1 seeds the first RAxML run
                previous_tree_name = current_tree_name
                current_tree_name = GubbinsCommon.raxml_current_tree_name(
                    base_filename_without_ext, current_time, i
                )
                tree_building_command = GubbinsCommon.raxml_tree_building_command(
                    i,
                    base_filename_without_ext,
                    base_filename,
                    current_time,
                    raxml_executable_obj.tree_building_command(),
                    previous_tree_name,
                    self.args.verbose,
                )
                gubbins_command = GubbinsCommon.raxml_gubbins_command(
                    base_filename_without_ext,
                    starting_base_filename + ".gaps",
                    current_time,
                    i,
                    self.args.alignment_filename,
                    GUBBINS_EXEC,
                    self.args.min_snps,
                    self.args.alignment_filename,
                    self.args.min_window_size,
                    self.args.max_window_size,
                )
            else:
                previous_tree_name = GubbinsCommon.raxml_previous_tree_name(
                    base_filename_without_ext, base_filename, current_time, i
                )
                current_tree_name = GubbinsCommon.raxml_current_tree_name(
                    base_filename_without_ext, current_time, i
                )
                tree_building_command = GubbinsCommon.raxml_tree_building_command(
                    i,
                    base_filename_without_ext,
                    base_filename,
                    current_time,
                    raxml_executable_obj.tree_building_command(),
                    previous_tree_name,
                    self.args.verbose,
                )
                gubbins_command = GubbinsCommon.raxml_gubbins_command(
                    base_filename_without_ext,
                    starting_base_filename + ".gaps",
                    current_time,
                    i,
                    self.args.alignment_filename,
                    GUBBINS_EXEC,
                    self.args.min_snps,
                    self.args.alignment_filename,
                    self.args.min_window_size,
                    self.args.max_window_size,
                )

        elif self.args.tree_builder == "raxml":
            previous_tree_name = GubbinsCommon.raxml_previous_tree_name(
                base_filename_without_ext, base_filename, current_time, i
            )
            current_tree_name = GubbinsCommon.raxml_current_tree_name(base_filename_without_ext, current_time, i)
            tree_building_command = GubbinsCommon.raxml_tree_building_command(
                i,
                base_filename_without_ext,
                base_filename,
                current_time,
                raxml_executable_obj.tree_building_command(),
                previous_tree_name,
                self.args.verbose,
            )
            gubbins_command = GubbinsCommon.raxml_gubbins_command(
                base_filename_without_ext,
                starting_base_filename + ".gaps",
                current_time,
                i,
                self.args.alignment_filename,
                GUBBINS_EXEC,
                self.args.min_snps,
                self.args.alignment_filename,
                self.args.min_window_size,
                self.args.max_window_size,
            )

        elif self.args.tree_builder == "fasttree":
            previous_tree_name = GubbinsCommon.fasttree_previous_tree_name(base_filename, i)
            if i == 1:
                previous_tree_name = base_filename
            current_tree_name = GubbinsCommon.fasttree_current_tree_name(base_filename, i)
            tree_building_command = GubbinsCommon.fasttree_tree_building_command(
                i,
                self.args.starting_tree,
                current_tree_name,
                previous_tree_name,
                previous_tree_name,
                FASTTREE_EXEC,
                FASTTREE_PARAMS,
                base_filename,
            )
            gubbins_command = GubbinsCommon.fasttree_gubbins_command(
                base_filename,
                starting_base_filename + ".gaps",
                i,
                self.args.alignment_filename,
                GUBBINS_EXEC,
                self.args.min_snps,
                self.args.alignment_filename,
                self.args.min_window_size,
                self.args.max_window_size,
            )

        if self.args.verbose > 0:
            print(tree_building_command)

        # A supplied starting tree replaces tree building in the first iteration
        if self.args.starting_tree is not None and i == 1:
            shutil.copyfile(self.args.starting_tree, current_tree_name)
        else:
            try:
                subprocess.check_call(tree_building_command, shell=True)
            except (subprocess.CalledProcessError, OSError):
                sys.exit("Failed while building the tree.")

        if self.args.verbose > 0:
            print(int(time.time()))

        GubbinsCommon.reroot_tree(str(current_tree_name), self.args.outgroup)

        try:
            raxml_seq_recon = RAxMLSequenceReconstruction(
                starting_base_filename + ".snp_sites.aln",
                current_tree_name,
                starting_base_filename + ".seq.joint.txt",
                current_tree_name,
                raxml_executable_obj.internal_sequence_reconstruction_command(),
                self.args.verbose,
            )
            raxml_seq_recon.reconstruct_ancestor_sequences()
        except Exception:
            # Exception (not a bare except) so Ctrl-C and sys.exit still propagate
            sys.exit("Failed while running RAxML internal sequence reconstruction")

        # Start again from the pristine gaps alignment saved before the loop
        shutil.copyfile(starting_base_filename + ".start", starting_base_filename + ".gaps.snp_sites.aln")
        GubbinsCommon.reinsert_gaps_into_fasta_file(
            starting_base_filename + ".seq.joint.txt",
            starting_base_filename + ".gaps.vcf",
            starting_base_filename + ".gaps.snp_sites.aln",
        )

        if (
            GubbinsCommon.does_file_exist(starting_base_filename + ".gaps.snp_sites.aln", "Alignment File") == 0
            or not ValidateFastaAlignment(
                starting_base_filename + ".gaps.snp_sites.aln"
            ).is_input_fasta_file_valid()
        ):
            sys.exit(
                "There is a problem with your FASTA file after running RAxML internal sequence reconstruction. "
                "Please check this intermediate file is valid: "
                + str(starting_base_filename)
                + ".gaps.snp_sites.aln"
            )

        if self.args.verbose > 0:
            print(int(time.time()))

        if self.args.verbose > 0:
            print(gubbins_command)
        try:
            subprocess.check_call(gubbins_command, shell=True)
        except (subprocess.CalledProcessError, OSError):
            sys.exit("Failed while running Gubbins. Please ensure you have enough free memory")
        if self.args.verbose > 0:
            print(int(time.time()))

        tree_file_names.append(current_tree_name)
        # Convergence is only tested from the third iteration onwards
        if i > 2:
            if self.args.converge_method == "recombination":
                current_recomb_file, previous_recomb_files = GubbinsCommon.get_recombination_files(
                    base_filename_without_ext,
                    current_time,
                    max_iteration - 1,
                    starting_base_filename,
                    self.args.tree_builder,
                )
                if GubbinsCommon.have_recombinations_been_seen_before(current_recomb_file, previous_recomb_files):
                    if self.args.verbose > 0:
                        print("Recombinations observed before so stopping: " + str(current_tree_name))
                    break
            else:
                if GubbinsCommon.has_tree_been_seen_before(tree_file_names, self.args.converge_method):
                    if self.args.verbose > 0:
                        print("Tree observed before so stopping: " + str(current_tree_name))
                    break

    # Cleanup intermediate files
    if self.args.no_cleanup == 0 or self.args.no_cleanup is None:
        max_intermediate_iteration = max_iteration - 1

        raxml_files_to_delete = GubbinsCommon.raxml_regex_for_file_deletions(
            base_filename_without_ext, current_time, starting_base_filename, max_intermediate_iteration
        )
        GubbinsCommon.delete_files_based_on_list_of_regexes(".", raxml_files_to_delete, self.args.verbose)

        fasttree_files_to_delete = GubbinsCommon.fasttree_regex_for_file_deletions(
            starting_base_filename, max_intermediate_iteration
        )
        GubbinsCommon.delete_files_based_on_list_of_regexes(".", fasttree_files_to_delete, self.args.verbose)

        shutil.rmtree(temp_working_dir)
        GubbinsCommon.delete_files_based_on_list_of_regexes(
            ".", [GubbinsCommon.starting_files_regex("^" + starting_base_filename), "^log.txt"], self.args.verbose
        )

    # Rename the outputs of the last iteration to their final names
    if self.args.prefix is None:
        self.args.prefix = base_filename_without_ext

    if self.args.tree_builder == "hybrid" or self.args.tree_builder == "raxml":
        output_filenames_to_final_filenames = GubbinsCommon.translation_of_raxml_filenames_to_final_filenames(
            base_filename_without_ext, current_time, max_iteration - 1, self.args.prefix
        )
    else:
        output_filenames_to_final_filenames = GubbinsCommon.translation_of_fasttree_filenames_to_final_filenames(
            starting_base_filename, max_iteration - 1, self.args.prefix
        )
    GubbinsCommon.rename_files(output_filenames_to_final_filenames)

    # Strip internal node labels by writing to a scratch file, then move it over the final tree
    final_tree = str(self.args.prefix) + ".final_tree.tre"
    unlabelled_tree = str(self.args.prefix) + ".no_internal_labels.final_tree.tre"
    GubbinsCommon.remove_internal_node_labels_from_tree(final_tree, unlabelled_tree)
    shutil.move(unlabelled_tree, final_tree)
def test_filter_out_alignments_with_too_much_missing_data(self):
    """Check the filtered output against the expected missing-data fixture."""
    fixture_dir = 'gubbins/tests/data/preprocessfasta/'

    # Second and third constructor arguments: False and 5, as in the fixture setup.
    preprocessfasta = PreProcessFasta(fixture_dir + 'missing_data.aln', False, 5)
    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')

    output_matches_expected = filecmp.cmp(
        'output.aln', fixture_dir + 'expected_missing_data.aln')
    self.assertTrue(output_matches_expected)

    self.cleanup()
def parse_and_run(input_args, program_description=""):
    """Main function of the Gubbins program.

    Checks dependencies and input files, filters the alignment into a
    temporary directory, runs the Gubbins executable to find SNP sites and
    then iterates tree building, re-rooting, ancestral sequence
    reconstruction, gap reinsertion and recombination detection until either
    convergence or ``input_args.iterations`` is reached. Finally the outputs
    of the last iteration are renamed and intermediate files are removed
    unless ``input_args.no_cleanup`` is set.

    Args:
        input_args: Options object. The attributes read here are
            ``alignment_filename``, ``starting_tree``, ``tree_builder``,
            ``threads``, ``raxml_model``, ``verbose``, ``iterations``,
            ``converge_method``, ``outgroup``, ``min_snps``,
            ``min_window_size``, ``max_window_size``, ``prefix``,
            ``no_cleanup``, ``use_time_stamp``, ``filter_percentage`` and
            ``remove_identical_sequences``. ``alignment_filename``,
            ``starting_tree`` and ``prefix`` are overwritten during the run.
        program_description: Text printed below the welcome banner.

    Raises:
        SystemExit: via ``sys.exit`` with an explanatory message whenever a
            dependency, an input file or an external command fails.
    """
    start_time = time.time()
    current_directory = os.getcwd()
    printer = utils.VerbosePrinter(True, "\n")

    # Check if the Gubbins C-program is available. If so, print a welcome message. Otherwise exit.
    gubbins_exec = 'gubbins'
    if utils.which(gubbins_exec) is None:
        # Check if the Gubbins C-program is available in its source directory (for tests/Travis)
        gubbins_bundled_exec = os.path.abspath(os.path.join(current_directory, '../src/gubbins'))
        if utils.which(gubbins_bundled_exec) is None:
            sys.exit(gubbins_exec + " is not in your path")
        else:
            gubbins_exec = utils.replace_executable(gubbins_exec, gubbins_bundled_exec)
    program_version = ""
    try:
        program_version = str(pkg_resources.get_distribution(gubbins_exec).version)
    except (pkg_resources.RequirementParseError, pkg_resources.DistributionNotFound):
        # The version is only cosmetic: run without it when the name cannot be parsed as a
        # requirement or no such distribution is installed (e.g. when running from source).
        pass
    printer.print(["\n--- Gubbins " + program_version + " ---\n", program_description])

    # Initialize tree builder and ancestral sequence reconstructor; check if all required dependencies are available
    printer.print("\nChecking dependencies...")
    current_tree_name = input_args.starting_tree
    tree_file_names = []
    internal_node_label_prefix = "internal_"
    if input_args.tree_builder == "fasttree" or input_args.tree_builder == "hybrid":
        tree_builder = FastTree(input_args.verbose)
        sequence_reconstructor = RAxML(input_args.threads, input_args.raxml_model, internal_node_label_prefix,
                                       input_args.verbose)
        alignment_suffix = ".snp_sites.aln"
    elif input_args.tree_builder == "raxml":
        tree_builder = RAxML(input_args.threads, input_args.raxml_model, internal_node_label_prefix,
                             input_args.verbose)
        sequence_reconstructor = tree_builder
        alignment_suffix = ".phylip"
    else:
        # Any other tree builder value falls back to IQ-TREE
        tree_builder = IQTree(input_args.threads, internal_node_label_prefix, input_args.verbose)
        sequence_reconstructor = tree_builder
        alignment_suffix = ".phylip"
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

    # Check if the input files exist and have the right format
    printer.print("\nChecking input files...")
    if not os.path.exists(input_args.alignment_filename) \
            or not ValidateFastaAlignment(input_args.alignment_filename).is_input_fasta_file_valid():
        sys.exit("The input alignment file does not exist or has an invalid format")
    if input_args.starting_tree is not None and input_args.starting_tree != "" \
            and (not os.path.exists(input_args.starting_tree) or not is_starting_tree_valid(input_args.starting_tree)):
        sys.exit("The starting tree does not exist or has an invalid format")
    if input_args.starting_tree is not None and input_args.starting_tree != "" \
            and not do_the_names_match_the_fasta_file(input_args.starting_tree, input_args.alignment_filename):
        sys.exit("The names in the starting tree do not match the names in the alignment file")
    if number_of_sequences_in_alignment(input_args.alignment_filename) < 3:
        sys.exit("3 or more sequences are required.")

    # Check - and potentially correct - further input parameters
    check_and_fix_window_size(input_args)

    # Get the base filename
    base_filename = os.path.split(input_args.alignment_filename)[1]
    basename = os.path.splitext(base_filename)[0]
    if input_args.use_time_stamp:
        time_stamp = str(int(time.time()))
        basename = basename + "." + time_stamp
    snp_alignment_filename = base_filename + ".snp_sites.aln"
    gaps_alignment_filename = base_filename + ".gaps.snp_sites.aln"
    gaps_vcf_filename = base_filename + ".gaps.vcf"
    joint_sequences_filename = base_filename + ".seq.joint.aln"

    # Check if intermediate files from a previous run exist
    intermediate_files = [basename + ".iteration_"]
    if not input_args.no_cleanup:
        utils.delete_files(".", intermediate_files, "", input_args.verbose)
    if utils.do_files_exist(".", intermediate_files, "", input_args.verbose):
        sys.exit("Intermediate files from a previous run exist. Please rerun without the --no_cleanup option "
                 "to automatically delete them or with the --use_time_stamp to add a unique prefix.")
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

    # Filter the input alignment and save as temporary alignment file
    printer.print("\nFiltering input alignment...")
    temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())
    temp_alignment_filename = temp_working_dir + "/" + base_filename

    pre_process_fasta = PreProcessFasta(input_args.alignment_filename, input_args.verbose,
                                        input_args.filter_percentage)
    taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
        temp_alignment_filename, input_args.remove_identical_sequences)
    input_args.alignment_filename = temp_alignment_filename

    # If a starting tree has been provided make sure that taxa filtered out in the previous step are removed from it
    if input_args.starting_tree:
        tree_base_filename = os.path.split(input_args.starting_tree)[1]
        temp_starting_tree = temp_working_dir + '/' + tree_base_filename
        filter_out_removed_taxa_from_tree(input_args.starting_tree, temp_starting_tree, taxa_removed)
        input_args.starting_tree = temp_starting_tree
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

    # Find all SNP sites with Gubbins
    gubbins_command = " ".join([gubbins_exec, input_args.alignment_filename])
    printer.print(["\nRunning Gubbins to detect SNPs...", gubbins_command])
    try:
        subprocess.check_call(gubbins_command, shell=True)
    except subprocess.SubprocessError:
        sys.exit("Gubbins crashed, please ensure you have enough free memory")
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))
    reconvert_fasta_file(snp_alignment_filename, snp_alignment_filename)
    # Keep a pristine copy of the gaps alignment; it is restored at the start of every gap reinsertion
    reconvert_fasta_file(gaps_alignment_filename, base_filename + ".start")

    # Start the main loop
    printer.print("\nEntering the main loop.")
    for i in range(1, input_args.iterations+1):
        printer.print("\n*** Iteration " + str(i) + " ***")

        # 1.1. Construct the tree-building command depending on the iteration and employed options
        if i == 2 and input_args.tree_builder == "hybrid":
            # Switch to RAxML
            tree_builder = sequence_reconstructor
            alignment_suffix = ".phylip"

        if i == 1:
            previous_tree_name = input_args.starting_tree
            alignment_filename = base_filename + alignment_suffix
        else:
            previous_tree_name = current_tree_name
            alignment_filename = previous_tree_name + alignment_suffix

        current_basename = basename + ".iteration_" + str(i)
        current_tree_name = current_basename + ".tre"
        if previous_tree_name:
            tree_building_command = tree_builder.tree_building_command(
                os.path.abspath(alignment_filename), os.path.abspath(previous_tree_name), current_basename)
        else:
            tree_building_command = tree_builder.tree_building_command(
                os.path.abspath(alignment_filename), "", current_basename)
        built_tree = temp_working_dir + "/" + tree_builder.tree_prefix + current_basename + tree_builder.tree_suffix

        # 1.2. Construct the phylogenetic tree
        # Truthiness test so that an empty-string starting tree counts as "not provided",
        # matching the input validation above.
        if input_args.starting_tree and i == 1:
            printer.print("\nCopying the starting tree...")
            shutil.copyfile(input_args.starting_tree, current_tree_name)
        else:
            printer.print(["\nConstructing the phylogenetic tree with " + tree_builder.executable + "...",
                           tree_building_command])
            # The command is run from inside the temporary directory; built_tree is read from there afterwards
            os.chdir(temp_working_dir)
            try:
                subprocess.check_call(tree_building_command, shell=True)
            except subprocess.SubprocessError:
                sys.exit("Failed while building the tree.")
            finally:
                # Always return to the original directory, also on failure
                os.chdir(current_directory)
            shutil.copyfile(built_tree, current_tree_name)
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

        # 2. Re-root the tree
        reroot_tree(str(current_tree_name), input_args.outgroup)
        temp_rooted_tree = temp_working_dir + "/" + current_tree_name + ".rooted"
        if input_args.tree_builder == "iqtree":
            shutil.copyfile(current_tree_name, temp_rooted_tree)
        else:
            root_tree(current_tree_name, temp_rooted_tree)

        # 3.1. Construct the command for ancestral state reconstruction depending on the iteration and employed options
        ancestral_sequence_basename = current_basename + ".internal"
        sequence_reconstruction_command = sequence_reconstructor.internal_sequence_reconstruction_command(
            os.path.abspath(base_filename + alignment_suffix), os.path.abspath(temp_rooted_tree),
            ancestral_sequence_basename)
        raw_internal_sequence_filename \
            = temp_working_dir + "/" + sequence_reconstructor.asr_prefix \
            + ancestral_sequence_basename + sequence_reconstructor.asr_suffix
        processed_internal_sequence_filename = temp_working_dir + "/" + ancestral_sequence_basename + ".aln"
        raw_internal_rooted_tree_filename \
            = temp_working_dir + "/" + sequence_reconstructor.asr_tree_prefix \
            + ancestral_sequence_basename + sequence_reconstructor.asr_tree_suffix

        # 3.2. Reconstruct the ancestral sequence
        printer.print(["\nReconstructing ancestral sequences with " + sequence_reconstructor.executable + "...",
                       sequence_reconstruction_command])
        os.chdir(temp_working_dir)
        try:
            subprocess.check_call(sequence_reconstruction_command, shell=True)
        except subprocess.SubprocessError:
            sys.exit("Failed while reconstructing the ancestral sequences.")
        finally:
            # Always return to the original directory, also on failure
            os.chdir(current_directory)

        # 3.3. Join ancestral sequences with given sequences
        current_tree_name_with_internal_nodes = current_tree_name + ".internal"
        sequence_reconstructor.convert_raw_ancestral_states_to_fasta(raw_internal_sequence_filename,
                                                                     processed_internal_sequence_filename)
        concatenate_fasta_files([snp_alignment_filename, processed_internal_sequence_filename],
                                joint_sequences_filename)
        transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, temp_rooted_tree,
                                              current_tree_name_with_internal_nodes, sequence_reconstructor)
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

        # 4. Reinsert gaps (cp15 note: something is wonky here, the process is at the very least terribly inefficient)
        printer.print("\nReinserting gaps into the alignment...")
        shutil.copyfile(base_filename + ".start", gaps_alignment_filename)
        reinsert_gaps_into_fasta_file(joint_sequences_filename, gaps_vcf_filename, gaps_alignment_filename)
        if not os.path.exists(gaps_alignment_filename) \
                or not ValidateFastaAlignment(gaps_alignment_filename).is_input_fasta_file_valid():
            sys.exit("There is a problem with your FASTA file after running internal sequence reconstruction. "
                     "Please check this intermediate file is valid: " + gaps_alignment_filename)
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

        # 5. Detect recombination sites with Gubbins (cp15 note: copy file with internal nodes back and forth to
        # ensure all created files have the desired name structure and to avoid fiddling with the Gubbins C program)
        shutil.copyfile(current_tree_name_with_internal_nodes, current_tree_name)
        gubbins_command = create_gubbins_command(
            gubbins_exec, gaps_alignment_filename, gaps_vcf_filename, current_tree_name,
            input_args.alignment_filename, input_args.min_snps, input_args.min_window_size,
            input_args.max_window_size)
        printer.print(["\nRunning Gubbins to detect recombinations...", gubbins_command])
        try:
            subprocess.check_call(gubbins_command, shell=True)
        except subprocess.SubprocessError:
            sys.exit("Failed while running Gubbins. Please ensure you have enough free memory")
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))
        shutil.copyfile(current_tree_name, current_tree_name_with_internal_nodes)

        # 6. Check for convergence
        printer.print("\nChecking for convergence...")
        remove_internal_node_labels_from_tree(current_tree_name_with_internal_nodes, current_tree_name)
        tree_file_names.append(current_tree_name)
        if i > 1:
            if input_args.converge_method == 'recombination':
                current_recomb_file, previous_recomb_files = get_recombination_files(tree_file_names)
                if have_recombinations_been_seen_before(current_recomb_file, previous_recomb_files):
                    printer.print("Convergence after " + str(i) + " iterations: Recombinations observed before.")
                    break
            else:
                if has_tree_been_seen_before(tree_file_names, input_args.converge_method):
                    printer.print("Convergence after " + str(i) + " iterations: Tree observed before.")
                    break
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))
    else:
        # for/else: only reached when the loop finished without a convergence break
        printer.print("Maximum number of iterations (" + str(input_args.iterations) + ") reached.")
    printer.print("\nExiting the main loop.")

    # Create the final output
    printer.print("\nCreating the final output...")
    if input_args.prefix is None:
        input_args.prefix = basename
    output_filenames_to_final_filenames = translation_of_filenames_to_final_filenames(
        current_tree_name, input_args.prefix)
    utils.rename_files(output_filenames_to_final_filenames)

    # Cleanup intermediate files
    if not input_args.no_cleanup:
        shutil.rmtree(temp_working_dir)
        # The last tree's files were renamed above, so only earlier iterations are deleted
        utils.delete_files(".", tree_file_names[:-1], intermediate_files_regex(), input_args.verbose)
        utils.delete_files(".", [base_filename], starting_files_regex(), input_args.verbose)
    printer.print("...finished. Total run time: {:.2f} s".format(time.time() - start_time))