Пример #1
0
 def __init__(self, cmd, path="", max_threads=4, jar_path=None, jar=None,
              max_memory="500m", timelog="tool_time.log"):
     """
     Store the launch settings of a wrapped command.

     cmd, max_threads, jar, max_memory and timelog are kept as given
     (max_threads is stored as ``self.threads``). ``path`` is passed
     through ``self.check_path``; ``jar_path`` is passed through it only
     when truthy, otherwise ``self.jar_path`` is None.
     """
     SequenceRoutines.__init__(self)

     self.path = self.check_path(path)
     self.cmd = cmd
     self.threads = max_threads

     # An empty or missing jar_path is recorded as None rather than checked.
     if jar_path:
         self.jar_path = self.check_path(jar_path)
     else:
         self.jar_path = None

     self.jar = jar
     self.max_memory = max_memory
     self.timelog = timelog
Пример #2
0
    def __init__(self):
        """Initialise the parent routines and record GFF/BED column indices."""
        SequenceRoutines.__init__(self)

        # GFF columns receive indices 0-8 in the order listed here.
        (self.GFF_SCAFFOLD_COLUMN,
         self.GFF_SOURCE_COLUMN,
         self.GFF_FEATURETYPE_COLUMN,
         self.GFF_START_COLUMN,
         self.GFF_END_COLUMN,
         self.GFF_SCORE_COLUMN,
         self.GFF_STRAND_COLUMN,
         self.GFF_PHASE_COLUMN,
         self.GFF_ATTRIBUTE_COLUMN) = range(9)

        # BED columns receive indices 0-2 in the order listed here.
        (self.BED_SCAFFOLD_COLUMN,
         self.BED_START_COLUMN,
         self.BED_END_COLUMN) = range(3)
Пример #3
0
    def extract_sequences_from_selected_clusters(
            self,
            clusters_id_file,
            cluster_file,
            seq_file,
            output_dir="./",
            seq_format="fasta",
            out_prefix=None,
            create_dir_for_each_cluster=False,
            skip_cluster_if_no_sequence_for_element=True):
        """
        Write the sequences of selected clusters into one file per cluster.

        clusters_id_file: file with cluster ids to extract; when falsy,
            every cluster found in cluster_file is processed.
        cluster_file: cluster description read with ``SynDict.read``
            (values split on ",").
        seq_file: handed to ``FileRoutines.make_list_of_path_to_files`` and
            indexed with ``SeqIO.index_db`` (temporary index "tmp.idx" in
            the current directory).
        output_dir: directory for the results; created if necessary.
        seq_format: format used both for reading and for writing.
        out_prefix: if set, used as the output file basename instead of the
            cluster id, and forces one directory per cluster.
        create_dir_for_each_cluster: put each cluster into its own directory.
        skip_cluster_if_no_sequence_for_element: skip clusters for which
            ``self.check_absence_of_cluster_elements`` reports absent ids.

        Output files always carry the ".fasta" extension, whatever
        seq_format is. Ids that are requested but not present in
        cluster_file are silently ignored.

        Returns the number of clusters skipped because of absent elements.
        """
        from Routines import SequenceRoutines, FileRoutines

        cluster_id_list = IdList()
        cluster_dict = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        # With a fixed out_prefix every cluster gets the same file name, so
        # per-cluster directories are always used in that case.
        create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
        if clusters_id_file:
            cluster_id_list.read(clusters_id_file)
        cluster_dict.read(cluster_file,
                          split_values=True,
                          values_separator=",")
        protein_dict = SeqIO.index_db(
            "tmp.idx",
            FileRoutines.make_list_of_path_to_files(seq_file),
            format=seq_format)

        number_of_skipped_clusters = 0
        try:
            for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
                # Check membership before any lookup: a requested id that is
                # missing from the cluster file must not raise KeyError.
                if fam_id not in cluster_dict:
                    continue
                cluster_elements = cluster_dict[fam_id]

                if skip_cluster_if_no_sequence_for_element:
                    absent_elements = self.check_absence_of_cluster_elements(
                        cluster_elements, protein_dict)
                    if absent_elements:
                        print("Skipping cluster %s due to absent element(%s)" % (
                            fam_id, ",".join(absent_elements)))
                        number_of_skipped_clusters += 1
                        continue

                if create_directory_for_each_cluster:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.fasta" % (fam_dir, out_prefix
                                               if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.fasta" % (out_dir, out_prefix
                                                if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, cluster_elements, verbose=True),
                            out_file,
                            format=seq_format)
        finally:
            # Release and delete the temporary index even if extraction fails.
            protein_dict.close()
            os.remove("tmp.idx")

        print("%i of %i clusters were skipped due to absent elements" % (
            number_of_skipped_clusters, len(cluster_dict)))

        return number_of_skipped_clusters
Пример #4
0
 def __init__(self,
              cmd,
              path="",
              max_threads=4,
              jar_path="",
              jar=None,
              max_memory="500m",
              max_per_thread_memory="500m",
              timelog=None,
              tmp_dir=None):
     """
     Store the launch settings of a wrapped command.

     ``path`` is passed through ``self.check_path``; ``jar_path`` is passed
     through it only when truthy, otherwise ``self.jar_path`` is "".
     max_threads is stored as ``self.threads``; every other argument is
     stored under its own name.
     """
     SequenceRoutines.__init__(self)

     self.path = self.check_path(path)
     self.cmd = cmd
     self.threads = max_threads

     # An empty jar_path stays an empty string instead of being checked.
     if jar_path:
         self.jar_path = self.check_path(jar_path)
     else:
         self.jar_path = ""

     self.jar = jar
     self.max_memory = max_memory
     self.timelog = timelog
     self.max_per_thread_memory = max_per_thread_memory
     self.tmp_dir = tmp_dir
Пример #5
0
from Routines.Matplotlib import MatplotlibRoutines
from Routines.Annotations import AnnotationsRoutines
from Routines.Phylogenetics import PhylogeneticsRoutines
from Routines.SequenceCluster import SequenceClusterRoutines
from Routines.MultipleAlignment import MultipleAlignmentRoutines

# Rebind every routines class name to an instance of that class. After these
# lines each name refers to the object, and the class is no longer reachable
# under its own name in this namespace.
# NOTE(review): presumably this lets importers call e.g.
# FileRoutines.safe_mkdir(...) without instantiating anything - confirm.
GORoutines = GORoutines()
VCFRoutines = VCFRoutines()
MathRoutines = MathRoutines()
FileRoutines = FileRoutines()
TreeRoutines = TreeRoutines()
NCBIRoutines = NCBIRoutines()
MtDNARoutines = MtDNARoutines()
FastQRoutines = FastQRoutines()
SmoothRoutines = SmoothRoutines()
PrimerRoutines = PrimerRoutines()
EggNOGRoutines = EggNOGRoutines()
ProjectRoutines = ProjectRoutines()
EnsemblRoutines = EnsemblRoutines()
TreeFamRoutines = TreeFamRoutines()
DrawingRoutines = DrawingRoutines()
SequenceRoutines = SequenceRoutines()
EvolutionRoutines = EvolutionRoutines()
AlignmentRoutines = AlignmentRoutines()
ExpressionRoutines = ExpressionRoutines()
MatplotlibRoutines = MatplotlibRoutines()
AnnotationsRoutines = AnnotationsRoutines()
PhylogeneticsRoutines = PhylogeneticsRoutines()
SequenceClusterRoutines = SequenceClusterRoutines()
MultipleAlignmentRoutines = MultipleAlignmentRoutines()
Пример #6
0
parser.add_argument("-f",
                    "--format",
                    action="store",
                    dest="format",
                    default="fasta",
                    help="format of file with sequences - default: fasta.")
parser.add_argument("-i",
                    "--input",
                    action="store",
                    dest="input",
                    help="file with sequences")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    default="out.t",
                    help="output file - default: out.t.")

args = parser.parse_args()

# The literal output name "stdout" selects standard output instead of a file.
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
try:
    record_dict = SeqIO.index_db("temp_index.idx", [args.input],
                                 format=args.format)
    try:
        lengths_dict = SequenceRoutines.get_lengths(record_dict,
                                                    out_file=out_fd)
        lengths = list(lengths_dict.values())
        # max()/min() raise ValueError on an empty sequence, so the extremes
        # are reported only when the input actually contains records.
        if lengths:
            print("Longest sequence: %i" % max(lengths))
            print("Shortest sequence: %i" % min(lengths))
        print("Total length: %i" % sum(lengths))
    finally:
        # Release and delete the temporary index even when processing fails.
        record_dict.close()
        os.remove("temp_index.idx")
finally:
    # Never close the interpreter's stdout; only a file opened here.
    if out_fd is not sys.stdout:
        out_fd.close()
Пример #7
0
 def __init__(self):
     """Initialise the instance by delegating to SequenceRoutines.__init__."""
     SequenceRoutines.__init__(self)
Пример #8
0
    def __init__(self):
        """
        Initialise the parent routines and the mitochondrial gene name tables.

        Attributes set here:
            mithochondrion_synonym_table - list of lists; each inner list
                groups alternative spellings of one gene name (misspellings
                included on purpose). The first entry of each rRNA and
                protein group is the name that also appears in
                rRNA_gene_list / protein_gene_list.
            protein_gene_list - names of the protein-coding genes.
            rRNA_gene_list - names of the rRNA genes.
        """
        SequenceRoutines.__init__(self)
        # Every synonym must be its own comma-separated literal: adjacent
        # string literals without a comma are silently concatenated.
        self.mithochondrion_synonym_table = [
            [
                "12S_rRNA", "12S rRNA", "12S ribosomal RNA",
                "small subunit ribosomal RNA", "s-rRNA",
                "s-rRNA; 12S ribosomal RNA", "small ribosomal RNA subunit RNA",
                "12S ribosomal RNA", "12S ribosomal RNA subunit",
                "12S rivbosomal RNA", "12S ribosamal RNA", "l2S ribosomal RNA",
                "12 ribosomal RNA", "12S ribosormal RNA",
                "12 rRNA", "s-RNA"
            ],
            [
                "16S_rRNA", "16S rRNA", "16S ribosomal RNA",
                "large subunit ribosomal RNA", "l-rRNA",
                "l-rRNA; 16S ribosomal RNA", "large ribosomal RNA subunit RNA",
                "16S ribosomal RNA", "16S ribosomal RNA subunit",
                "16S rivbosomal RNA", "16S ribosamal RNA", "l6S ribosomal RNA",
                "16 ribosomal RNA", "16S ribosormal RNA", "16 rRNA", "l-RNA"
            ],
            [
                "ATP6", "atp6", "ATPase6", "ATPase 6", "ATPase subunit 6",
                "ATP synthase F0 subunit 6", "ATP synthetase F0 subunit 6",
                "ATP synthase subunit 6",
                "ATPase subunits 6", "adenosine triphosphatase subunit 6",
                "ATPase subunit-6"
            ],
            [
                "ATP8", "atp8", "ATPase8", "ATPase 8", "ATPase subunit 8",
                "ATP synthase F0 subunit 8", "ATP synthetase F0 subunit 8",
                "ATP synthase subunit 8", "ATPase subunits 8",
                "adenosine triphosphatase subunit 8",
                "adenosine triphosphate subunit 8", "ATPase subunit-8"
            ],
            [
                "COX1", "COXI", "cytochrome c oxidase subunit 1",
                "cytochrome c oxidase subunit I",
                "Cytochrome c oxidase subunit 1",
                "cytochrome oxidase subunit I",
                "chytochrome c oxidase subunit I", "COI", "CO1", "CO 1",
                "CO I", "coi", "product: cytochrome c oxidase subunit I",
                "cytochrome oxidase subunit 1"
            ],
            [
                "COX2", "COXII", "cytochrome c oxidase subunit 2",
                "cytochrome c oxidase subunit II",
                "Cytochrome c oxidase subunit 2",
                "cytochrome oxidase subunit II",
                "chytochrome c oxidase subunit II", "COII", "CO2", "CO 2",
                "CO II", "coii", "cytochrome oxidase subunit 2"
            ],
            [
                "COX3", "COXIII", "cytochrome c oxidase subunit 3",
                "cytochrome c oxidase subunit III",
                "Cytochrome c oxidase subunit 3",
                "cytochrome oxidase subunit III",
                "chytochrome c oxidase subunit III", "COIII", "CO3", "CO 3",
                "CO III", "coiii", "cytochrome oxidase subunit 3"
            ],
            [
                "CYTB", "cytochrome b", "Cytochrome b", "cytb", "Cytb",
                "Cyt b", "Cytochrome b apoenzyme", "cytochrome b apoenzyme",
                "cytochrome b; TAA stop codon appears afterpolyadenylation"
            ],
            [
                "ND1", "nd1", "nd 1", "ND 1", "Nd 1",
                "NADH dehydrogenase subunit 1", "NADH hydrogenase subunit 1",
                "subunit 1 of the NADH ubiquinone oxidoreductase complex",
                "NADH-1", "NADH1"
            ],
            [
                "ND2", "nd2", "nd 2", "ND 2", "Nd 2",
                "NADH dehydrogenase subunit 2", "NADH hydrogenase subunit 2",
                "subunit 2 of the NADH ubiquinone oxidoreductase complex",
                "#NADH dehydrogenase subunit 2", "NADH-2", "NADH2"
            ],
            [
                "ND3", "nd3", "nd 3", "ND 3", "Nd 3",
                "NADH dehydrogenase subunit 3", "NADH hydrogenase subunit 3",
                "subunit 3 of the NADH ubiquinone oxidoreductase complex",
                "NADH-3", "NADH3"
            ],
            [
                "ND4", "nd4", "nd 4", "ND 4", "Nd 4",
                "NADH dehydrogenase subunit 4", "NADH hydrogenase subunit 4",
                "subunit 4 of the NADH ubiquinone oxidoreductase complex",
                "NADH-4", "NADH4"
            ],
            [
                "ND4L", "nd4l", "nd 4l", "ND 4l", "Nd 4l",
                "NADH dehydrogenase subunit 4L", "NADH hydrogenase subunit 4L",
                "NADH-4L", "NADH4L"
            ],
            [
                "ND5", "nd5", "nd 5", "ND 5", "Nd 5",
                "NADH dehydrogenase subunit 5", "NADH hydrogenase subunit 5",
                "subunit 5 of the NADH ubiquinone oxidoreductase complex",
                "NADH-5", "NADH5"
            ],
            [
                "ND6", "nd6", "nd 6", "ND 6", "Nd 6",
                "NADH dehydrogenase subunit 6", "NADH hydrogenase subunit 6",
                "subunit 6 of the NADH ubiquinone oxidoreductase complex",
                "NADH-6", "NADH6", "NADH dehydrogenase subunit-6"
            ], ["tRNA-Val"], ["tRNA-Leu"], ["tRNA-Phe"], ["tRNA-Pro"],
            ["tRNA-Thr"], ["tRNA-Glu"], ["tRNA-Ser"], ["tRNA-His"],
            ["tRNA-Arg"], ["tRNA-Gly"], ["tRNA-Lys"], ["tRNA-Asp"],
            ["tRNA-Tyr"], ["tRNA-Cys"], ["tRNA-Asn"], ["tRNA-Ala"],
            ["tRNA-Trp"], ["tRNA-Met"], ["tRNA-Ile"], ["tRNA-Gln"]
        ]
        self.protein_gene_list = [
            "ATP6",
            "ATP8",
            "COX1",
            "COX2",
            "COX3",
            "CYTB",
            "ND1",
            "ND2",
            "ND3",
            "ND4L",
            "ND4",
            "ND5",
            "ND6",
        ]

        self.rRNA_gene_list = ["12S_rRNA", "16S_rRNA"]
Пример #9
0
)
parser.add_argument(
    "-s",
    dest="use_strand",
    action="store_true",
    help="Define -l and -r based on strand. E.g. if used, -l 500 for a "
    "negative-stranded feature, it will start the flank 500 bp downstream.  Default = false."
)

args = parser.parse_args()

# Index the input FASTA, then have get_lengths write "fasta_lengths.t".
record_dict = SeqIO.index_db("temp_index.idx",
                             [args.input_fasta],
                             format="fasta")
SequenceRoutines.get_lengths(record_dict,
                             write=True,
                             out_file="fasta_lengths.t")

# Flank sizes are floats in fraction mode and integers otherwise.
if args.fraction_mode:
    left, right = float(args.left), float(args.right)
else:
    left, right = int(args.left), int(args.right)

Flank.get(args.bed,
          "fasta_lengths.t",
          left,
          right,
          fraction_mode=args.fraction_mode,
          strand_based=args.use_strand,
Пример #10
0
    def extract_sequences_by_clusters(self,
                                      dir_with_cluster_files,
                                      dir_with_sequence_files,
                                      output_dir,
                                      file_with_white_list_cluster_ids=None,
                                      mode="families",
                                      sequence_file_extension="fasta",
                                      sequence_file_format="fasta",
                                      label_species=False,
                                      separator_for_labeling="@",
                                      species_label_first=True):
        """
        Extract the sequences referenced by per-species cluster files.

        Basenames of cluster and sequence files must be the same: for every
        species key returned by ``self.read_cluster_files_from_dir`` the file
        ``<dir_with_sequence_files><species>.<sequence_file_extension>`` is
        indexed (temporary index ``<species>_tmp.idx`` in the current
        directory, removed before returning, also on failure).

        mode:
            families (alias: clusters) - write one file per cluster to
                output_dir, gathering the cluster's sequences from all
                species;
            species - write one file per species with the sequences named
                by ``self.get_sequence_names``.
            Any other value writes nothing.

        file_with_white_list_cluster_ids: optional file read into an IdSet
            and forwarded as ``white_list_ids`` to the name-collecting
            helpers.
        label_species / separator_for_labeling / species_label_first:
            in families mode, when label_species is true each record id is
            rewritten on a copy of the record as
            ``<species><separator><id>`` or, with species_label_first false,
            ``<id><separator><species>``.
        """
        white_list_ids = None
        if file_with_white_list_cluster_ids:
            white_list_ids = IdSet()
            white_list_ids.read(file_with_white_list_cluster_ids)

        clusters_dict = self.read_cluster_files_from_dir(
            dir_with_cluster_files)
        cluster_names = self.get_cluster_names(clusters_dict,
                                               white_list_ids=white_list_ids)

        sequence_super_dict = OrderedDict()
        out_dir = FileRoutines.check_path(output_dir)
        seq_dir = FileRoutines.check_path(dir_with_sequence_files)
        idx_files = []

        def labelled_id(species, record_id):
            # Compose the species-tagged record id in the requested order.
            if species_label_first:
                return "%s%s%s" % (species, separator_for_labeling, record_id)
            return "%s%s%s" % (record_id, separator_for_labeling, species)

        def per_family_record_generator(seq_super_dict, clust_dict,
                                        cluster_id):
            # Yield the records of one cluster, species by species.
            for species in seq_super_dict:
                for record_id in clust_dict[species][cluster_id]:
                    if label_species:
                        # Copy first so the indexed record is not modified.
                        record = deepcopy(seq_super_dict[species][record_id])
                        record.id = labelled_id(species, record_id)
                        yield record
                    else:
                        yield seq_super_dict[species][record_id]

        try:
            for species in clusters_dict:
                idx_file = "%s_tmp.idx" % species
                sequence_file = "%s%s.%s" % (seq_dir, species,
                                             sequence_file_extension)
                # Registered before indexing so a partially built index is
                # cleaned up as well.
                idx_files.append(idx_file)
                sequence_super_dict[species] = SeqIO.index_db(
                    idx_file, sequence_file, format=sequence_file_format)

            if mode == "species":
                sequence_names = self.get_sequence_names(
                    clusters_dict,
                    write_ids=False,
                    out_prefix=None,
                    white_list_ids=white_list_ids)
                for species in sequence_names:
                    out_file = "%s%s.%s" % (out_dir, species,
                                            sequence_file_extension)
                    SeqIO.write(SequenceRoutines.record_by_id_generator(
                        sequence_super_dict[species],
                        sequence_names[species]),
                                out_file,
                                format=sequence_file_format)
            elif mode in ("families", "clusters"):
                for cluster_name in cluster_names:
                    out_file = "%s%s.%s" % (out_dir, cluster_name,
                                            sequence_file_extension)
                    SeqIO.write(per_family_record_generator(
                        sequence_super_dict, clusters_dict, cluster_name),
                                out_file,
                                format=sequence_file_format)
        finally:
            # Always release the indices and delete their temporary files.
            for index in sequence_super_dict.values():
                index.close()
            for idx_file in idx_files:
                if os.path.exists(idx_file):
                    os.remove(idx_file)