Exemplo n.º 1
0
    def extract_sequences_from_selected_clusters(
            self,
            clusters_id_file,
            cluster_file,
            seq_file,
            output_dir="./",
            seq_format="fasta",
            out_prefix=None,
            create_dir_for_each_cluster=False,
            skip_cluster_if_no_sequence_for_element=True):
        """
        Extract sequences of cluster members into per-cluster FASTA files.

        Args:
            clusters_id_file: file with ids of clusters to extract; if falsy,
                all clusters from cluster_file are processed.
            cluster_file: file with clusters (id<TAB>comma-separated elements).
            seq_file: file(s)/dir(s) with sequences of cluster elements.
            output_dir: directory to write output files to.
            seq_format: format of input/output sequence files.
            out_prefix: prefix for output file names; forces per-cluster
                directories (otherwise all clusters would share one file name).
            create_dir_for_each_cluster: create a subdirectory per cluster.
            skip_cluster_if_no_sequence_for_element: skip clusters with at
                least one element absent from the sequence files.

        Returns:
            Number of skipped clusters.
        """
        from Routines import SequenceRoutines, FileRoutines
        cluster_id_list = IdList()
        cluster_dict = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        # check_path() returns the path with a trailing slash
        out_dir = FileRoutines.check_path(output_dir)
        # A common out_prefix only makes sense with per-cluster directories,
        # otherwise every cluster would be written to the same file
        create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
        if clusters_id_file:
            cluster_id_list.read(clusters_id_file)
        cluster_dict.read(cluster_file,
                          split_values=True,
                          values_separator=",")
        protein_dict = SeqIO.index_db(
            "tmp.idx",
            FileRoutines.make_list_of_path_to_files(seq_file),
            format=seq_format)

        number_of_skipped_clusters = 0
        for fam_id in cluster_id_list if clusters_id_file else cluster_dict:

            if skip_cluster_if_no_sequence_for_element:
                absent_elements = self.check_absence_of_cluster_elements(
                    cluster_dict[fam_id], protein_dict)
                if absent_elements:
                    # print() call form works under both Python 2 and 3
                    print("Skipping cluster %s due to absent element(%s)" % (
                        fam_id, ",".join(absent_elements)))
                    number_of_skipped_clusters += 1
                    continue

            if fam_id in cluster_dict:
                if create_directory_for_each_cluster:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.fasta" % (fam_dir, out_prefix
                                               if out_prefix else fam_id)
                else:
                    # out_dir already ends with "/" (check_path), so no extra
                    # separator is inserted here
                    out_file = "%s%s.fasta" % (out_dir, out_prefix
                                               if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, cluster_dict[fam_id], verbose=True),
                            out_file,
                            format=seq_format)

        # remove the temporary SeqIO index built above
        os.remove("tmp.idx")
        print("%i of %i clusters were skipped due to absent elements" % (
            number_of_skipped_clusters, len(cluster_dict)))

        return number_of_skipped_clusters
Exemplo n.º 2
0
    def split_proteins_per_species(dir_with_proteins,
                                   output_dir,
                                   input_format="fasta",
                                   output_format="fasta"):
        """
        Split protein sequences into per-species files.

        Record ids are expected to look like "<taxa_id>.<protein_id>"; records
        are grouped by the taxa prefix and written to "<output_dir>/<taxa>.pep"
        with the taxa prefix stripped from the record id.

        Args:
            dir_with_proteins: directory (or list of files/dirs) with proteins.
            output_dir: directory to write per-species files to.
            input_format: format of input sequence files.
            output_format: format of output sequence files.
        """
        import os
        input_files = FileRoutines.make_list_of_path_to_files(
            [dir_with_proteins] if isinstance(dir_with_proteins, str
                                              ) else dir_with_proteins)

        out_dir = FileRoutines.check_path(output_dir)
        FileRoutines.safe_mkdir(out_dir)

        protein_dict = SeqIO.index_db("temp.idx",
                                      input_files,
                                      format=input_format)

        syn_dict = SynDict()

        for protein_id in protein_dict:
            # everything before the first "." is treated as the taxa id
            taxa_id = protein_id.split(".")[0]
            if taxa_id not in syn_dict:
                syn_dict[taxa_id] = []
            syn_dict[taxa_id].append(protein_id)

        def renamed_records_generator(record_dict, taxa_id):
            # yield copies of the records with the taxa prefix removed from id
            for record_id in syn_dict[taxa_id]:
                record = deepcopy(record_dict[record_id])
                record.id = ".".join(record_id.split(".")[1:])
                yield record

        for taxa_id in syn_dict:
            out_file = "%s%s.pep" % (out_dir, taxa_id)
            SeqIO.write(renamed_records_generator(protein_dict, taxa_id),
                        out_file,
                        format=output_format)

        # remove the temporary SeqIO index (the sibling extraction routine
        # cleans up its index the same way)
        os.remove("temp.idx")
Exemplo n.º 3
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import MultipleAlignmentRoutines, FileRoutines

# Command-line interface: merge a set of alignments into one and record the
# coordinates each alignment occupies in the merged result.
parser = argparse.ArgumentParser()

parser.add_argument(
    "-i",
    "--input",
    action="store",
    dest="input",
    required=True,
    # expand "path1,path2,dir3" into a flat list of alignment file paths
    type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")),
    help="Comma-separated list of files/directories with alignments")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="File to write merged alignment")
parser.add_argument(
    "-c",
    "--coordinates_file",
    action="store",
    dest="coords_file",
    required=True,
    help="File to write coordinates of alignments in merged alignment")
Exemplo n.º 4
0
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="File to write clusters with single-copy clusters")
parser.add_argument(
    "-p",
    # was "--label position": option strings must not contain spaces, the
    # option was impossible to pass on the command line; dest is unchanged
    "--label_position",
    action="store",
    dest="label_position",
    default="first",
    help="Position of label. Allowed - first, last. Default - first")
parser.add_argument("-s",
                    "--separator",
                    action="store",
                    dest="separator",
                    default="@",
                    help="Separator to use. default - '@'")

args = parser.parse_args()

list_of_cluster_files = FileRoutines.make_list_of_path_to_files(args.input)

single_copy_clusters = SequenceClusterRoutines.extract_single_copy_clusters_from_files(
    list_of_cluster_files,
    args.output,
    label_elements=args.label,
    separator=args.separator,
    label_position=args.label_position)

# print() call form works under both Python 2 and 3
print("Was found %i single-copy clusters" % len(single_copy_clusters))
Exemplo n.º 5
0
    def parallel_positive_selection_test(self,
                                         in_dir,
                                         tree_file,
                                         out_dir,
                                         results_file,
                                         seq_type="codons",
                                         codon_frequency="F3X4",
                                         noisy=3,
                                         verbose="concise",
                                         runmode=0,
                                         clock=0,
                                         aminoacid_distance=None,
                                         genetic_code=0,
                                         fix_kappa=False,
                                         kappa=5,
                                         getSE=0,
                                         RateAncestor=0,
                                         small_difference=0.000001,
                                         clean_data=True,
                                         method=0):
        """
        This function implements positive selection test (branch-site model)
        for branch labeled in tree file using model_A vs model_A_null(omega fixed to 1) comparison

        For every alignment in in_dir two codeml runs are prepared (Model_A
        with free omega and Model_A_null with omega fixed to 1), executed in
        parallel, and compared via a likelihood-ratio test; FDR-adjusted
        p-values are written to results_file.
        Most keyword arguments are forwarded verbatim to generate_ctl_file().
        """

        FileRoutines.safe_mkdir(out_dir)
        alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
        tree_file_abs_path = os.path.abspath(tree_file)
        options_list = []
        dir_list = []
        basename_dir_list = []
        model_list = ["Model_A", "Model_A_null"]
        # Model_A_null fixes omega (to 1, see omega=1 below); Model_A estimates it
        fix_omega_dict = {"Model_A": False, "Model_A_null": True}

        # Prepare a working directory and a .ctl file per (alignment, model) pair
        for filename in alignment_files_list:
            directory, basename, extension = FileRoutines.split_filename(
                filename)
            filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
            basename_dir_list.append(basename)
            FileRoutines.safe_mkdir(filename_out_dir)

            for model in model_list:
                model_dir = "%s/%s/" % (filename_out_dir, model)
                FileRoutines.safe_mkdir(model_dir)
                out_file = "%s/%s/%s.out" % (filename_out_dir, model, basename)
                ctl_file = "%s/%s/%s.ctl" % (filename_out_dir, model, basename)

                # options are relative .ctl names; dir_list supplies the cwd
                # for each run
                options_list.append("%s.ctl" % basename)
                dir_list.append(model_dir)

                self.generate_ctl_file(os.path.abspath(filename),
                                       tree_file_abs_path,
                                       out_file,
                                       ctl_file,
                                       seq_type=seq_type,
                                       codon_frequency=codon_frequency,
                                       noisy=noisy,
                                       verbose=verbose,
                                       runmode=runmode,
                                       clock=clock,
                                       aminoacid_distance=aminoacid_distance,
                                       model=2,
                                       nssites=2,
                                       genetic_code=genetic_code,
                                       fix_kappa=fix_kappa,
                                       kappa=kappa,
                                       fix_omega=fix_omega_dict[model],
                                       omega=1,
                                       getSE=getSE,
                                       RateAncestor=RateAncestor,
                                       Mgene=0,
                                       small_difference=small_difference,
                                       clean_data=clean_data,
                                       method=method)

        # run all codeml jobs in parallel
        self.parallel_execute(options_list, dir_list=dir_list)

        results_dict = OrderedDict()
        double_delta_dict = OrderedDict()
        raw_pvalues_dict = OrderedDict()
        raw_pvalues_list = []

        # Collect log-likelihoods (LnL) from each codeml report
        for basename in basename_dir_list:
            results_dict[basename] = OrderedDict()
            for model in model_list:
                output_file = "%s/%s/%s/%s.out" % (out_dir, basename, model,
                                                   basename)
                codeml_report = CodeMLReport(output_file)
                results_dict[basename][model] = codeml_report.LnL

        # Likelihood-ratio test: 2*(LnL_A - LnL_A_null) ~ chi^2 with 1 df.
        # Genes with a missing LnL are skipped (for/else: the else branch runs
        # only when the inner loop did NOT break).
        skipped_genes_set = set()
        for basename in basename_dir_list:
            for model in model_list:
                if results_dict[basename][model] is None:
                    print("LnL was not calculated for %s" % basename)
                    skipped_genes_set.add(basename)
                    break
            else:
                doubled_delta = 2 * (results_dict[basename]["Model_A"] -
                                     results_dict[basename]["Model_A_null"])
                p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1

                double_delta_dict[basename] = doubled_delta
                raw_pvalues_dict[basename] = p_value
                raw_pvalues_list.append(p_value)

        # Benjamini-Hochberg FDR correction; [1] holds the adjusted p-values.
        # NOTE(review): chisqprob/fdrcorrection0 are presumably imported at
        # module level (scipy/statsmodels) - not visible in this chunk.
        adjusted_pvalues_list = fdrcorrection0(raw_pvalues_list)[1]
        # i indexes adjusted_pvalues_list; it stays aligned with the loop
        # below because the same genes are skipped by the same condition
        i = 0
        with open(results_file, "w") as out_fd:
            out_fd.write(
                "id\tmodel_a_null,LnL\tmodel_a,LnL\t2*delta\traw p-value\tadjusted p-value\n"
            )
            for basename in basename_dir_list:
                for model in model_list:
                    if results_dict[basename][model] is None:
                        print("LnL was not calculated for %s" % basename)
                        break
                else:
                    out_fd.write(
                        "%s\t%f\t%f\t%f\t%f\t%f\n" %
                        (basename, results_dict[basename]["Model_A_null"],
                         results_dict[basename]["Model_A"],
                         double_delta_dict[basename],
                         raw_pvalues_dict[basename], adjusted_pvalues_list[i]))
                    i += 1
Exemplo n.º 6
0
    def parallel_codeml(self,
                        in_dir,
                        tree_file,
                        out_dir,
                        seq_type="codons",
                        codon_frequency="F3X4",
                        noisy=0,
                        verbose="concise",
                        runmode=0,
                        clock=0,
                        aminoacid_distance=None,
                        model=1,
                        nssites=0,
                        genetic_code=0,
                        fix_kappa=False,
                        kappa=5,
                        fix_omega=False,
                        omega=0.2,
                        getSE=0,
                        RateAncestor=0,
                        small_difference=0.000001,
                        clean_data=True,
                        method=0,
                        Mgene=None):
        """
        Run codeml in parallel on every alignment found in in_dir.

        A working directory and control file is prepared per alignment; all
        keyword arguments are forwarded to generate_ctl_file(), and the runs
        are then dispatched through parallel_execute().
        """
        FileRoutines.safe_mkdir(out_dir)
        alignments = FileRoutines.make_list_of_path_to_files(in_dir)
        abs_tree_path = os.path.abspath(tree_file)

        ctl_paths = []
        work_dirs = []
        for alignment_path in alignments:
            sample_name = FileRoutines.split_filename(alignment_path)[1]
            sample_dir = os.path.abspath("%s/%s/" % (out_dir, sample_name))
            FileRoutines.safe_mkdir(sample_dir)

            report_path = "%s/%s.out" % (sample_dir, sample_name)
            control_path = "%s/%s.ctl" % (sample_dir, sample_name)
            ctl_paths.append(control_path)
            work_dirs.append(sample_dir)

            # write the codeml control file for this alignment
            self.generate_ctl_file(os.path.abspath(alignment_path),
                                   abs_tree_path,
                                   report_path,
                                   control_path,
                                   seq_type=seq_type,
                                   codon_frequency=codon_frequency,
                                   noisy=noisy,
                                   verbose=verbose,
                                   runmode=runmode,
                                   clock=clock,
                                   aminoacid_distance=aminoacid_distance,
                                   model=model,
                                   nssites=nssites,
                                   genetic_code=genetic_code,
                                   fix_kappa=fix_kappa,
                                   kappa=kappa,
                                   fix_omega=fix_omega,
                                   omega=omega,
                                   getSE=getSE,
                                   RateAncestor=RateAncestor,
                                   Mgene=Mgene,
                                   small_difference=small_difference,
                                   clean_data=clean_data,
                                   method=method)

        self.parallel_execute(ctl_paths, dir_list=work_dirs)
Exemplo n.º 7
0
    def mask(self,
             list_of_fasta_files,
             output_dir="./",
             soft_masking=True,
             engine="ncbi",
             slow_search=True,
             quick_search=False,
             rush_search=False,
             no_low_complexity=None,
             only_low_complexity=None,
             no_interspersed=None,
             only_interspersed=None,
             no_rna=None,
             only_alu=None,
             custom_library=None,
             species=None,
             html_output=False,
             ace_output=False,
             gff_output=False):
        """
        Run RepeatMasker on the given FASTA file(s).

        Exactly one of slow_search(-s)/quick_search(-q)/rush_search(-qq) may
        be set. When both species and custom_library are given, the species
        repeats are extracted from the database and concatenated with the
        custom library into a combined library.

        Raises:
            ValueError: if more than one search-speed option is set.
        """

        if (slow_search and quick_search) or (
                rush_search and quick_search) or (slow_search and rush_search):
            # the old message only mentioned -q/-s even when the actual
            # conflict involved -qq
            raise ValueError(
                "More than one of slow(-s), quick(-q) and rush(-qq) search options were set. Choose ONE!"
            )

        if species and custom_library:
            # TODO(review): these tmp files are never removed afterwards
            tmp_repeat_file = "%s.repeats.tmp.fa" % species
            tmp_repeats_all_file = "all.repeats.tmp.fasta"
            self.extract_repeats_from_database(tmp_repeat_file,
                                               species=species)

            cmd = "cat %s %s > %s" % (tmp_repeat_file, custom_library,
                                      tmp_repeats_all_file)
            self.execute(cmd=cmd)

        options = " -pa %i" % self.threads
        options += " -e %s" % engine
        options += " -s" if slow_search else ""
        options += " -q" if quick_search else ""
        options += " -qq" if rush_search else ""
        options += " -nolow" if no_low_complexity else ""
        options += " -low" if only_low_complexity else ""
        options += " -noint" if no_interspersed else ""
        options += " -int" if only_interspersed else ""
        options += " -norna" if no_rna else ""
        options += " -alu" if only_alu else ""

        # -lib takes precedence over -species; the combined library is used
        # when both were supplied
        if species and custom_library:
            options += " -lib %s" % tmp_repeats_all_file
        elif custom_library:
            options += " -lib %s" % custom_library
        elif species:
            options += " -species %s" % species

        options += " -dir %s" % output_dir
        options += " -html" if html_output else ""
        options += " -ace" if ace_output else ""
        options += " -gff" if gff_output else ""
        options += " -xsmall" if soft_masking else ""

        options += " " + (list_of_fasta_files if isinstance(
            list_of_fasta_files, str) else " ".join(
                FileRoutines.make_list_of_path_to_files(list_of_fasta_files)))

        self.execute(options=options)
        """
Exemplo n.º 8
0
    def mask(self,
             list_of_fasta_files,
             output_dir="./",
             soft_masking=True,
             engine="ncbi",
             search_speed="normal",
             no_low_complexity=None,
             only_low_complexity=None,
             no_interspersed=None,
             only_interspersed=None,
             no_rna=None,
             only_alu=None,
             custom_library=None,
             species=None,
             html_output=False,
             ace_output=False,
             gff_output=False):
        """
        Run RepeatMasker on the given FASTA file(s).

        search_speed selects -s/-q/-qq ("slow"/"quick"/"rush"); any other
        value (e.g. the default "normal") passes no speed flag. When both
        species and custom_library are given, the species repeats are
        extracted from the database and concatenated with the custom library
        into a combined library inside output_dir.
        """

        if species and custom_library:
            # TODO(review): these tmp files are never removed afterwards
            tmp_repeat_file = "%s/%s.repeats.tmp.fa" % (output_dir, species)
            tmp_repeats_all_file = "%s/all.repeats.tmp.fasta" % output_dir
            self.extract_repeats_from_database(tmp_repeat_file,
                                               species=species)

            cmd = "cat %s %s > %s" % (tmp_repeat_file, custom_library,
                                      tmp_repeats_all_file)
            self.execute(cmd=cmd)

        options = " -pa %i" % self.threads
        options += " -e %s" % engine

        if search_speed == "slow":
            options += " -s"
        elif search_speed == "quick":
            options += " -q"
        elif search_speed == "rush":
            options += " -qq"

        options += " -nolow" if no_low_complexity else ""
        options += " -low" if only_low_complexity else ""
        options += " -noint" if no_interspersed else ""
        options += " -int" if only_interspersed else ""
        options += " -norna" if no_rna else ""
        options += " -alu" if only_alu else ""

        # -lib takes precedence over -species; the combined library is used
        # when both were supplied
        if species and custom_library:
            options += " -lib %s" % tmp_repeats_all_file
        elif custom_library:
            options += " -lib %s" % custom_library
        elif species:
            options += " -species %s" % species

        options += " -dir %s" % output_dir
        options += " -html" if html_output else ""
        options += " -ace" if ace_output else ""
        options += " -gff" if gff_output else ""
        options += " -xsmall" if soft_masking else ""

        options += " " + (list_of_fasta_files if isinstance(
            list_of_fasta_files, str) else " ".join(
                FileRoutines.make_list_of_path_to_files(list_of_fasta_files)))

        self.execute(options=options)
        """
Exemplo n.º 9
0
#!/usr/bin/env python
import os
from Bio import SeqIO

from Routines import FileRoutines

# Hard-coded project paths for this analysis run
workdir = "/home/mahajrod/Genetics/Projects/nxf/nxf_arthropoda/"
data_dir = "/home/mahajrod/Genetics/Projects/nxf/nxf_arthropoda/data/"

os.chdir(workdir)

# collect all files under data_dir (GenBank records)
data_files = FileRoutines.make_list_of_path_to_files([data_dir])

# build an on-disk SeqIO index over the GenBank files; "tmp.idx" is created
# in workdir
record_dict = SeqIO.index_db("tmp.idx", data_files, format="genbank")

# header of the tab-separated table printed by the loop below
print("#organism\ttaxonomy\tregion_id\ttranscript_id\tproduct\texon_len")
for record_id in record_dict:
    for feature in record_dict[record_id].features:
        if feature.type == "mRNA":
            mRNA_string = ""
            mRNA_string += "%s" % record_dict[record_id].annotations["organism"]
            mRNA_string += "\t%s" % (";".join(
                record_dict[record_id].annotations["taxonomy"]))
            mRNA_string += "\t%s" % record_id
            mRNA_string += "\t%s" % (feature.qualifiers["transcript_id"][0]
                                     if "transcript_id" in feature.qualifiers
                                     else ".")
            mRNA_string += "\t%s" % (feature.qualifiers["product"][0] if
                                     "product" in feature.qualifiers else ".")

            location_lenths = []