Exemplo n.º 1
0
    def test_are_all_sequence_names_unique(self):
        """Sequence-name uniqueness: the all-unique fixture passes, the non-unique one fails."""
        unique_path = 'gubbins/tests/data/all_unique_sequence_names.fa'
        checker = ValidateFastaAlignment(unique_path)
        self.assertTrue(checker.are_sequence_names_unique())
        self.assertTrue(checker.is_input_fasta_file_valid())

        # Same two checks, now expecting rejection for the non-unique fixture.
        duplicated_path = 'gubbins/tests/data/non_unique_sequence_names.fa'
        checker = ValidateFastaAlignment(duplicated_path)
        self.assertFalse(checker.are_sequence_names_unique())
        self.assertFalse(checker.is_input_fasta_file_valid())
    def test_are_all_sequence_names_unique(self):
        """Sequence-name uniqueness, with fixtures resolved relative to ``data_dir``."""
        unique_fixture = os.path.join(data_dir, 'all_unique_sequence_names.fa')
        duplicate_fixture = os.path.join(data_dir, 'non_unique_sequence_names.fa')

        accepted = ValidateFastaAlignment(unique_fixture)
        self.assertTrue(accepted.are_sequence_names_unique())
        self.assertTrue(accepted.is_input_fasta_file_valid())

        rejected = ValidateFastaAlignment(duplicate_fixture)
        self.assertFalse(rejected.are_sequence_names_unique())
        self.assertFalse(rejected.is_input_fasta_file_valid())
Exemplo n.º 3
0
    def test_are_all_sequences_the_same_length(self):
        """Equal-length check: the valid alignment passes, the different-lengths fixture fails."""
        aligned_path = 'gubbins/tests/data/valid_alignment.aln'
        checker = ValidateFastaAlignment(aligned_path)
        same_length = checker.does_each_sequence_have_the_same_length()
        self.assertTrue(same_length)
        self.assertTrue(checker.is_input_fasta_file_valid())

        # Fixture whose name indicates sequences of differing lengths.
        ragged_path = 'gubbins/tests/data/sequences_of_different_lengths.fa'
        checker = ValidateFastaAlignment(ragged_path)
        same_length = checker.does_each_sequence_have_the_same_length()
        self.assertFalse(same_length)
        self.assertFalse(checker.is_input_fasta_file_valid())
    def test_are_all_sequences_the_same_length(self):
        """Equal-length check, with fixtures resolved relative to ``data_dir``."""
        aligned_fixture = os.path.join(data_dir, 'valid_alignment.aln')
        accepted = ValidateFastaAlignment(aligned_fixture)
        lengths_agree = accepted.does_each_sequence_have_the_same_length()
        self.assertTrue(lengths_agree)
        self.assertTrue(accepted.is_input_fasta_file_valid())

        ragged_fixture = os.path.join(data_dir, 'sequences_of_different_lengths.fa')
        rejected = ValidateFastaAlignment(ragged_fixture)
        lengths_agree = rejected.does_each_sequence_have_the_same_length()
        self.assertFalse(lengths_agree)
        self.assertFalse(rejected.is_input_fasta_file_valid())
 def test_are_all_sequence_names_unique(self):
     """Sequence-name uniqueness: the all-unique fixture passes, the non-unique one fails."""
     fixture_with_unique_names = 'gubbins/tests/data/all_unique_sequence_names.fa'
     fixture_with_repeated_names = 'gubbins/tests/data/non_unique_sequence_names.fa'

     validator = ValidateFastaAlignment(fixture_with_unique_names)
     self.assertTrue(validator.are_sequence_names_unique())
     self.assertTrue(validator.is_input_fasta_file_valid())

     validator = ValidateFastaAlignment(fixture_with_repeated_names)
     self.assertFalse(validator.are_sequence_names_unique())
     self.assertFalse(validator.is_input_fasta_file_valid())
 def test_are_all_sequences_the_same_length(self):
     """Equal-length check: the valid alignment passes, the different-lengths fixture fails."""
     fixture_with_equal_lengths = 'gubbins/tests/data/valid_alignment.aln'
     fixture_with_mixed_lengths = 'gubbins/tests/data/sequences_of_different_lengths.fa'

     validator = ValidateFastaAlignment(fixture_with_equal_lengths)
     self.assertTrue(validator.does_each_sequence_have_the_same_length())
     self.assertTrue(validator.is_input_fasta_file_valid())

     validator = ValidateFastaAlignment(fixture_with_mixed_lengths)
     self.assertFalse(validator.does_each_sequence_have_the_same_length())
     self.assertFalse(validator.is_input_fasta_file_valid())
Exemplo n.º 7
0
 def test_does_the_sequence_have_sensible_characters(self):
     """The odd-characters fixture must fail both the name/data check and overall validation."""
     odd_chars_path = 'gubbins/tests/data/sequence_with_odd_chars.fa'
     checker = ValidateFastaAlignment(odd_chars_path)
     has_name_and_data = checker.does_each_sequence_have_a_name_and_genomic_data()
     self.assertFalse(has_name_and_data)
     self.assertFalse(checker.is_input_fasta_file_valid())
Exemplo n.º 8
0
 def test_does_each_sequence_have_a_name(self):
     """The nameless-sequence fixture must fail both the name/data check and overall validation."""
     nameless_path = 'gubbins/tests/data/sequence_without_a_name.fa'
     checker = ValidateFastaAlignment(nameless_path)
     has_name_and_data = checker.does_each_sequence_have_a_name_and_genomic_data()
     self.assertFalse(has_name_and_data)
     self.assertFalse(checker.is_input_fasta_file_valid())
Exemplo n.º 9
0
 def test_valid_fasta_file(self):
     """The multiple-recombinations alignment fixture is expected to validate."""
     alignment_path = 'gubbins/tests/data/multiple_recombinations.aln'
     checker = ValidateFastaAlignment(alignment_path)
     self.assertTrue(checker.is_input_fasta_file_valid())
 def test_does_each_sequence_have_a_name(self):
     """The nameless-sequence fixture must fail both the name/data check and overall validation."""
     fixture = 'gubbins/tests/data/sequence_without_a_name.fa'
     validator = ValidateFastaAlignment(fixture)
     self.assertFalse(
         validator.does_each_sequence_have_a_name_and_genomic_data())
     self.assertFalse(
         validator.is_input_fasta_file_valid())
Exemplo n.º 11
0
 def test_valid_fasta_file(self):
     """The multiple-recombinations alignment (resolved under ``data_dir``) is expected to validate."""
     alignment_fixture = os.path.join(data_dir, 'multiple_recombinations.aln')
     checker = ValidateFastaAlignment(alignment_fixture)
     self.assertTrue(checker.is_input_fasta_file_valid())
Exemplo n.º 12
0
def parse_and_run(input_args, program_description=""):
    """Main function of the Gubbins program.

    Runs the full pipeline: locates the Gubbins executable, selects the tree builder and ancestral
    sequence reconstructor, validates and filters the input alignment, then iterates
    tree building -> re-rooting -> ancestral reconstruction -> gap reinsertion -> recombination
    detection until convergence or until ``input_args.iterations`` is reached. Finally renames the
    output files and (unless ``input_args.no_cleanup`` is set) removes intermediate files.

    Args:
        input_args: Parsed command-line options. Attributes read here: ``starting_tree``,
            ``tree_builder``, ``verbose``, ``threads``, ``raxml_model``, ``alignment_filename``,
            ``use_time_stamp``, ``no_cleanup``, ``filter_percentage``, ``remove_identical_sequences``,
            ``iterations``, ``outgroup``, ``min_snps``, ``min_window_size``, ``max_window_size``,
            ``converge_method`` and ``prefix``. Note that ``alignment_filename``, ``starting_tree``
            and ``prefix`` are overwritten on this object during the run.
        program_description: Text printed below the welcome banner.

    Raises:
        SystemExit: Via ``sys.exit`` when the executable is missing, the inputs are invalid,
            stale intermediate files exist, or any external command fails.
    """
    start_time = time.time()
    current_directory = os.getcwd()
    printer = utils.VerbosePrinter(True, "\n")

    # Check if the Gubbins C-program is available. If so, print a welcome message. Otherwise exit.
    gubbins_exec = 'gubbins'
    if utils.which(gubbins_exec) is None:
        # Check if the Gubbins C-program is available in its source directory (for tests/Travis)
        gubbins_bundled_exec = os.path.abspath(os.path.join(current_directory, '../src/gubbins'))
        if utils.which(gubbins_bundled_exec) is None:
            sys.exit(gubbins_exec + " is not in your path")
        else:
            gubbins_exec = utils.replace_executable(gubbins_exec, gubbins_bundled_exec)
    program_version = ""
    try:
        program_version = str(pkg_resources.get_distribution(gubbins_exec).version)
    except (pkg_resources.RequirementParseError, pkg_resources.DistributionNotFound):
        # The version is only used for the banner: an unparsable name (e.g. a bundled executable
        # path) or a distribution that is not installed must not abort the run.
        pass
    printer.print(["\n--- Gubbins " + program_version + " ---\n", program_description])

    # Initialize tree builder and ancestral sequence reconstructor; check if all required dependencies are available
    printer.print("\nChecking dependencies...")
    current_tree_name = input_args.starting_tree
    tree_file_names = []
    internal_node_label_prefix = "internal_"
    if input_args.tree_builder == "fasttree" or input_args.tree_builder == "hybrid":
        tree_builder = FastTree(input_args.verbose)
        sequence_reconstructor = RAxML(input_args.threads, input_args.raxml_model, internal_node_label_prefix,
                                       input_args.verbose)
        alignment_suffix = ".snp_sites.aln"
    elif input_args.tree_builder == "raxml":
        tree_builder = RAxML(input_args.threads, input_args.raxml_model, internal_node_label_prefix, input_args.verbose)
        sequence_reconstructor = tree_builder
        alignment_suffix = ".phylip"
    else:
        tree_builder = IQTree(input_args.threads, internal_node_label_prefix, input_args.verbose)
        sequence_reconstructor = tree_builder
        alignment_suffix = ".phylip"
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

    # Check if the input files exist and have the right format
    printer.print("\nChecking input files...")
    if not os.path.exists(input_args.alignment_filename) \
            or not ValidateFastaAlignment(input_args.alignment_filename).is_input_fasta_file_valid():
        sys.exit("The input alignment file does not exist or has an invalid format")
    if input_args.starting_tree is not None and input_args.starting_tree != "" \
            and (not os.path.exists(input_args.starting_tree) or not is_starting_tree_valid(input_args.starting_tree)):
        sys.exit("The starting tree does not exist or has an invalid format")
    if input_args.starting_tree is not None and input_args.starting_tree != "" \
            and not do_the_names_match_the_fasta_file(input_args.starting_tree, input_args.alignment_filename):
        sys.exit("The names in the starting tree do not match the names in the alignment file")
    if number_of_sequences_in_alignment(input_args.alignment_filename) < 3:
        sys.exit("3 or more sequences are required.")

    # Check - and potentially correct - further input parameters
    check_and_fix_window_size(input_args)

    # Get the base filename
    (base_directory, base_filename) = os.path.split(input_args.alignment_filename)
    (basename, extension) = os.path.splitext(base_filename)
    if input_args.use_time_stamp:
        time_stamp = str(int(time.time()))
        basename = basename + "." + time_stamp
    snp_alignment_filename = base_filename + ".snp_sites.aln"
    gaps_alignment_filename = base_filename + ".gaps.snp_sites.aln"
    gaps_vcf_filename = base_filename + ".gaps.vcf"
    joint_sequences_filename = base_filename + ".seq.joint.aln"

    # Check if intermediate files from a previous run exist
    intermediate_files = [basename + ".iteration_"]
    if not input_args.no_cleanup:
        utils.delete_files(".", intermediate_files, "", input_args.verbose)
    if utils.do_files_exist(".", intermediate_files, "", input_args.verbose):
        sys.exit("Intermediate files from a previous run exist. Please rerun without the --no_cleanup option "
                 "to automatically delete them or with the --use_time_stamp to add a unique prefix.")
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

    # Filter the input alignment and save as temporary alignment file
    printer.print("\nFiltering input alignment...")
    temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())
    temp_alignment_filename = temp_working_dir + "/" + base_filename

    pre_process_fasta = PreProcessFasta(input_args.alignment_filename, input_args.verbose,
                                        input_args.filter_percentage)
    taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
        temp_alignment_filename, input_args.remove_identical_sequences)
    input_args.alignment_filename = temp_alignment_filename

    # If a starting tree has been provided make sure that taxa filtered out in the previous step are removed from it
    if input_args.starting_tree:
        (tree_base_directory, tree_base_filename) = os.path.split(input_args.starting_tree)
        temp_starting_tree = temp_working_dir + '/' + tree_base_filename
        filter_out_removed_taxa_from_tree(input_args.starting_tree, temp_starting_tree, taxa_removed)
        input_args.starting_tree = temp_starting_tree
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

    # Find all SNP sites with Gubbins
    gubbins_command = " ".join([gubbins_exec, input_args.alignment_filename])
    printer.print(["\nRunning Gubbins to detect SNPs...", gubbins_command])
    try:
        subprocess.check_call(gubbins_command, shell=True)
    except subprocess.SubprocessError:
        sys.exit("Gubbins crashed, please ensure you have enough free memory")
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))
    reconvert_fasta_file(snp_alignment_filename, snp_alignment_filename)
    reconvert_fasta_file(gaps_alignment_filename, base_filename + ".start")

    # Start the main loop
    printer.print("\nEntering the main loop.")
    for i in range(1, input_args.iterations+1):
        printer.print("\n*** Iteration " + str(i) + " ***")

        # 1.1. Construct the tree-building command depending on the iteration and employed options
        if i == 2 and input_args.tree_builder == "hybrid":
            # Switch to RAxML
            tree_builder = sequence_reconstructor
            alignment_suffix = ".phylip"

        if i == 1:
            previous_tree_name = input_args.starting_tree
            alignment_filename = base_filename + alignment_suffix
        else:
            previous_tree_name = current_tree_name
            alignment_filename = previous_tree_name + alignment_suffix

        current_basename = basename + ".iteration_" + str(i)
        current_tree_name = current_basename + ".tre"
        if previous_tree_name:
            tree_building_command = tree_builder.tree_building_command(
                os.path.abspath(alignment_filename), os.path.abspath(previous_tree_name), current_basename)
        else:
            tree_building_command = tree_builder.tree_building_command(
                os.path.abspath(alignment_filename), "", current_basename)
        built_tree = temp_working_dir + "/" + tree_builder.tree_prefix + current_basename + tree_builder.tree_suffix

        # 1.2. Construct the phylogenetic tree
        # Truthiness test (not "is not None") so that an empty string is treated as "no starting
        # tree", matching the input checks and the taxa-filtering step above.
        if input_args.starting_tree and i == 1:
            printer.print("\nCopying the starting tree...")
            shutil.copyfile(input_args.starting_tree, current_tree_name)
        else:
            printer.print(["\nConstructing the phylogenetic tree with " + tree_builder.executable + "...",
                           tree_building_command])
            os.chdir(temp_working_dir)
            try:
                subprocess.check_call(tree_building_command, shell=True)
            except subprocess.SubprocessError:
                sys.exit("Failed while building the tree.")
            finally:
                # Restore the working directory even when exiting, so a caller that catches
                # SystemExit is not left inside the temporary directory.
                os.chdir(current_directory)
            shutil.copyfile(built_tree, current_tree_name)
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

        # 2. Re-root the tree
        reroot_tree(str(current_tree_name), input_args.outgroup)
        temp_rooted_tree = temp_working_dir + "/" + current_tree_name + ".rooted"
        if input_args.tree_builder == "iqtree":
            shutil.copyfile(current_tree_name, temp_rooted_tree)
        else:
            root_tree(current_tree_name, temp_rooted_tree)

        # 3.1. Construct the command for ancestral state reconstruction depending on the iteration and employed options
        ancestral_sequence_basename = current_basename + ".internal"
        sequence_reconstruction_command = sequence_reconstructor.internal_sequence_reconstruction_command(
            os.path.abspath(base_filename + alignment_suffix), os.path.abspath(temp_rooted_tree),
            ancestral_sequence_basename)
        raw_internal_sequence_filename \
            = temp_working_dir + "/" + sequence_reconstructor.asr_prefix \
            + ancestral_sequence_basename + sequence_reconstructor.asr_suffix
        processed_internal_sequence_filename = temp_working_dir + "/" + ancestral_sequence_basename + ".aln"
        raw_internal_rooted_tree_filename \
            = temp_working_dir + "/" + sequence_reconstructor.asr_tree_prefix \
            + ancestral_sequence_basename + sequence_reconstructor.asr_tree_suffix

        # 3.2. Reconstruct the ancestral sequence
        printer.print(["\nReconstructing ancestral sequences with " + sequence_reconstructor.executable + "...",
                       sequence_reconstruction_command])
        os.chdir(temp_working_dir)
        try:
            subprocess.check_call(sequence_reconstruction_command, shell=True)
        except subprocess.SubprocessError:
            sys.exit("Failed while reconstructing the ancestral sequences.")
        finally:
            # See step 1.2: always return to the original working directory.
            os.chdir(current_directory)

        # 3.3. Join ancestral sequences with given sequences
        current_tree_name_with_internal_nodes = current_tree_name + ".internal"
        sequence_reconstructor.convert_raw_ancestral_states_to_fasta(raw_internal_sequence_filename,
                                                                     processed_internal_sequence_filename)
        concatenate_fasta_files([snp_alignment_filename, processed_internal_sequence_filename],
                                joint_sequences_filename)
        transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, temp_rooted_tree,
                                              current_tree_name_with_internal_nodes, sequence_reconstructor)
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

        # 4. Reinsert gaps (cp15 note: something is wonky here, the process is at the very least terribly inefficient)
        printer.print("\nReinserting gaps into the alignment...")
        shutil.copyfile(base_filename + ".start", gaps_alignment_filename)
        reinsert_gaps_into_fasta_file(joint_sequences_filename, gaps_vcf_filename, gaps_alignment_filename)
        if not os.path.exists(gaps_alignment_filename) \
                or not ValidateFastaAlignment(gaps_alignment_filename).is_input_fasta_file_valid():
            sys.exit("There is a problem with your FASTA file after running internal sequence reconstruction. "
                     "Please check this intermediate file is valid: " + gaps_alignment_filename)
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

        # 5. Detect recombination sites with Gubbins (cp15 note: copy file with internal nodes back and forth to
        # ensure all created files have the desired name structure and to avoid fiddling with the Gubbins C program)
        shutil.copyfile(current_tree_name_with_internal_nodes, current_tree_name)
        gubbins_command = create_gubbins_command(
            gubbins_exec, gaps_alignment_filename, gaps_vcf_filename, current_tree_name,
            input_args.alignment_filename, input_args.min_snps, input_args.min_window_size, input_args.max_window_size)
        printer.print(["\nRunning Gubbins to detect recombinations...", gubbins_command])
        try:
            subprocess.check_call(gubbins_command, shell=True)
        except subprocess.SubprocessError:
            sys.exit("Failed while running Gubbins. Please ensure you have enough free memory")
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))
        shutil.copyfile(current_tree_name, current_tree_name_with_internal_nodes)

        # 6. Check for convergence
        printer.print("\nChecking for convergence...")
        remove_internal_node_labels_from_tree(current_tree_name_with_internal_nodes, current_tree_name)
        tree_file_names.append(current_tree_name)
        if i > 1:
            if input_args.converge_method == 'recombination':
                current_recomb_file, previous_recomb_files = get_recombination_files(tree_file_names)
                if have_recombinations_been_seen_before(current_recomb_file, previous_recomb_files):
                    printer.print("Convergence after " + str(i) + " iterations: Recombinations observed before.")
                    break
            else:
                if has_tree_been_seen_before(tree_file_names, input_args.converge_method):
                    printer.print("Convergence after " + str(i) + " iterations: Tree observed before.")
                    break
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))
    else:
        # for/else: only reached when the loop finished without a convergence "break".
        printer.print("Maximum number of iterations (" + str(input_args.iterations) + ") reached.")
    printer.print("\nExiting the main loop.")

    # Create the final output
    printer.print("\nCreating the final output...")
    if input_args.prefix is None:
        input_args.prefix = basename
    output_filenames_to_final_filenames = translation_of_filenames_to_final_filenames(
        current_tree_name, input_args.prefix)
    utils.rename_files(output_filenames_to_final_filenames)

    # Cleanup intermediate files
    if not input_args.no_cleanup:
        shutil.rmtree(temp_working_dir)
        utils.delete_files(".", tree_file_names[:-1], intermediate_files_regex(), input_args.verbose)
        utils.delete_files(".", [base_filename], starting_files_regex(), input_args.verbose)
    printer.print("...finished. Total run time: {:.2f} s".format(time.time() - start_time))
 def test_are_there_enough_sequences_to_build_a_tree(self):
     """The three-sequence alignment fixture is expected to be rejected by overall validation."""
     small_alignment = 'gubbins/tests/data/alignment_with_3_sequences.aln'
     validator = ValidateFastaAlignment(small_alignment)
     self.assertFalse(validator.is_input_fasta_file_valid())
 def test_does_the_sequence_have_sensible_characters(self):
     """The odd-characters fixture must fail both the name/data check and overall validation."""
     fixture = 'gubbins/tests/data/sequence_with_odd_chars.fa'
     validator = ValidateFastaAlignment(fixture)
     self.assertFalse(
         validator.does_each_sequence_have_a_name_and_genomic_data())
     self.assertFalse(
         validator.is_input_fasta_file_valid())
Exemplo n.º 15
0
 def test_does_each_sequence_have_a_name(self):
     """The nameless-sequence fixture (resolved under ``data_dir``) must fail validation."""
     nameless_fixture = os.path.join(data_dir, 'sequence_without_a_name.fa')
     checker = ValidateFastaAlignment(nameless_fixture)
     has_name_and_data = checker.does_each_sequence_have_a_name_and_genomic_data()
     self.assertFalse(has_name_and_data)
     self.assertFalse(checker.is_input_fasta_file_valid())
Exemplo n.º 16
0
 def test_are_there_enough_sequences_to_build_a_tree(self):
     """The three-sequence alignment fixture is expected to be rejected by overall validation."""
     three_sequence_path = 'gubbins/tests/data/alignment_with_3_sequences.aln'
     checker = ValidateFastaAlignment(three_sequence_path)
     is_valid = checker.is_input_fasta_file_valid()
     self.assertFalse(is_valid)
 def test_valid_fasta_file(self):
     """The multiple-recombinations alignment fixture is expected to validate."""
     fixture = 'gubbins/tests/data/multiple_recombinations.aln'
     validator = ValidateFastaAlignment(fixture)
     is_valid = validator.is_input_fasta_file_valid()
     self.assertTrue(is_valid)