Python Sequence.combineSequenceFilesBySpecies 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: sequences

클래스/타입: Sequence

메소드/함수: combineSequenceFilesBySpecies

hotexamples.com에서의 예제들: 2

Python Sequence.combineSequenceFilesBySpecies - 2개의 예제가 발견되었습니다. 아래는 오픈소스 프로젝트에서 추출한, sequences.Sequence.combineSequenceFilesBySpecies의 실제 사용 예제 중 가장 높은 평가를 받은 Python 예제들입니다. 예제를 평가하면 예제 품질 향상에 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Sequence(17)

outputSequencesToFasta(5)

readSequences(4)

combineSequenceFilesBySpecies(2)

import_from_data(1)

예제 #1

파일 보기

    def predict_orthologs(self, files, workdir, speciestree_file):
        """Infer orthologs with OMA and produce oma.clusters / oma.relations.

        The sequences in ``files`` are regrouped into one FASTA file per
        species under ``<workdir>/DB``, OMA is configured and run from inside
        ``workdir``, and its output is translated by ``self.parse_groups`` and
        ``self.parse_relations``.

        Args:
            files: sequence files, handed to
                ``Sequence.combineSequenceFilesBySpecies``.
            workdir: directory in which OMA is run and where the output files
                are placed.
            speciestree_file: path of a Newick species tree file, or "" to run
                without a species tree.

        Options read from ``self.other_args``: "species_separator" and
        "species_index" (always read), plus the optional "use_cache",
        "species_prefix", "convertToAA" and "seqtype".

        Side effects: sets ``self.clusters_filenames`` and
        ``self.relations_filenames``; temporarily changes the process working
        directory to ``workdir``. Returns None.
        """
        # Function-scope import: only the OCR invocation below needs it.
        import subprocess

        # A previous OMA run is reused only when its output exists AND the
        # caller opted in through the "use_cache" option.
        use_cached_run = (
            os.path.isfile(join(workdir, "Output/OrthologousGroups.txt"))
            and "use_cache" in self.other_args)
        if use_cached_run:
            print("Will use cached output files from oma")
        runOMA = not use_cached_run

        # Quite a few preprocessing steps are needed for OMA.
        # First, create the directories and files it requires.
        dbdir = join(workdir, "DB")
        if not os.path.exists(dbdir):
            os.mkdir(dbdir)

        # OMA requires one file per species, so regroup the input accordingly.
        seqs_by_species = Sequence.combineSequenceFilesBySpecies(
            files, self.other_args["species_separator"],
            int(self.other_args["species_index"]))

        if speciestree_file == "":
            print("OMA is unpredicatable without a species tree.")

        spprefix = ""
        if "species_prefix" in self.other_args:
            spprefix = self.other_args["species_prefix"]

        # The mere presence of the option enables the conversion.
        doAA = "convertToAA" in self.other_args

        species_names = []
        genes_list = []
        for key in seqs_by_species:
            species_seqs = seqs_by_species[key]
            outfile = join(dbdir, spprefix + key + ".fa")
            Sequence.outputSequencesToFasta(sequences=species_seqs,
                                            filename=outfile,
                                            name_suffix="",
                                            aligned=False,
                                            convertToAA=doAA,
                                            name_prefix=spprefix)
            species_names.append(key)
            genes_list.extend(seq.name for seq in species_seqs)
        species_list = ",".join(species_names)

        # With a species tree, OMA requires removing the species that do not
        # appear in the files; the external OCR tool restricts the tree to
        # the species at hand.
        speciestree_newick_restricted = ""
        if speciestree_file != "":
            oma_sptree_file = join(workdir, "oma_species_tree.nw")

            with open(speciestree_file, 'r') as myfile:
                speciestree_newick = myfile.read().replace('\n', '')

            # The string form is kept for the log line only.
            cmd = "OCR -m restrict_species_tree -l \"" + species_list + "\" -s \"" + speciestree_newick + "\" -o \"" + oma_sptree_file + "\""
            print("EXEC " + cmd)
            # Run without a shell: the Newick text and species names come from
            # input files and must not be interpreted as shell syntax.
            subprocess.call([
                "OCR", "-m", "restrict_species_tree",
                "-l", species_list,
                "-s", speciestree_newick,
                "-o", oma_sptree_file,
            ])

            with open(oma_sptree_file) as f:
                speciestree_newick_restricted = f.readline().replace("\n", "")

            # Special case: OMA fails when only one species is present (no
            # comma in the restricted tree). Then every gene becomes its own
            # cluster and every pair of genes is reported as paralogs.
            if "," not in speciestree_newick_restricted:
                print("Only one species found.  Will make all genes paralogs.")
                self.clusters_filenames = [join(workdir, "oma.clusters")]
                self.relations_filenames = [join(workdir, "oma.relations")]

                write_clusters(self.clusters_filenames[0],
                               [[g] for g in genes_list])

                # Relations are ";;"-separated with no trailing separator.
                relations = [
                    genes_list[a] + "\t" + genes_list[b] + "\t" + "Paralogs"
                    for a in range(len(genes_list))
                    for b in range(a + 1, len(genes_list))
                ]
                with open(self.relations_filenames[0], 'w') as f:
                    f.write(";;".join(relations))

                return

        input_type = "DNA"
        if "seqtype" in self.other_args and self.other_args["seqtype"] == "AA":
            print("Input type set to AA")
            input_type = self.other_args["seqtype"]

        # Now get into the OMA directory and execute it. The previous working
        # directory is restored even if one of the steps below raises.
        cwd = os.getcwd()
        os.chdir(workdir)
        try:
            # Best-effort removal of a stale config, like the `rm` it replaces.
            try:
                os.remove("parameters.drw")
            except OSError:
                pass
            os.system("OMA -p")  # this creates a default config file.

            # Edit the config to set the input data type and, when a species
            # tree is used, the tree restricted to the species at hand.
            cfg_lines = []
            with open("parameters.drw") as f:
                for line in f:
                    line = line.replace("\n", "")
                    if line.startswith("InputDataType"):
                        line = "InputDataType := '" + input_type + "';"
                    elif line.startswith("SpeciesTree") and speciestree_file != "":
                        line = "SpeciesTree := '" + speciestree_newick_restricted + "';"
                    cfg_lines.append(line + "\n")
            with open("parameters.drw", 'w') as f:
                f.write("".join(cfg_lines))
            print("Config edited")

            cmd = "OMA -n 7"
            print("EXEC " + cmd)

            if runOMA:
                os.system(cmd)
            else:
                print("Not really - using cache instead.")
        finally:
            os.chdir(cwd)

        # Now that the inference is done, translate the OMA output into our
        # format: an oma.clusters file and an oma.relations file.
        self.parse_groups(genes_list, workdir)
        self.parse_relations(genes_list, workdir)

        self.clusters_filenames = [join(workdir, "oma.clusters")]
        self.relations_filenames = [join(workdir, "oma.relations")]

예제 #2

파일 보기

    def predict_orthologs(self, files, workdir, speciestree_file):
        """Cluster genes with orthomcl-pipeline and write clusters/relations.

        On the first call (``self.cached_clusters`` empty) the sequences are
        written one FASTA file per species to ``<workdir>/in``, the
        orthomcl-pipeline script is run with output in ``<workdir>/out``, and
        ``<workdir>/out/groups/groups.txt`` is parsed into clusters. Sequences
        that appear in no group become singleton clusters. The result is
        stored in ``self.cached_clusters`` and reused by later calls.

        Args:
            files: sequence files, read with ``Sequence.readSequences``.
            workdir: directory holding the "in"/"out" subdirectories and the
                resulting orthomcl.clusters / orthomcl.relations files.
            speciestree_file: not used by this implementation.

        Options read from ``self.other_args``: "species_separator" and
        "species_index", plus the optional "seqtype".

        Side effects: sets ``self.clusters_filenames`` and
        ``self.relations_filenames`` and may update ``self.cached_clusters``.
        Returns None.
        """
        # Function-scope import: only the pair enumeration below needs it.
        import itertools

        workdir_seqs = join(workdir, "in")
        workdir_out = join(workdir, "out")

        allseqs = []
        for f in files:
            allseqs.extend(Sequence.readSequences(f))

        if len(self.cached_clusters) == 0:
            # One file per species is needed as input.
            seqs_by_species = Sequence.combineSequenceFilesBySpecies(
                files, self.other_args["species_separator"],
                int(self.other_args["species_index"]))

            # Input is treated as DNA (and converted to AA on output) unless
            # the "seqtype" option says it is already AA.
            isdna = True
            if "seqtype" in self.other_args and self.other_args[
                    "seqtype"] == "AA":
                print("Input type set to AA")
                isdna = False

            if not os.path.exists(workdir_seqs):
                os.mkdir(workdir_seqs)
            for key in seqs_by_species:
                outfile = join(workdir_seqs, key + ".fasta")
                Sequence.outputSequencesToFasta(sequences=seqs_by_species[key],
                                                filename=outfile,
                                                name_suffix="",
                                                aligned=False,
                                                convertToAA=isdna,
                                                name_prefix="")

            # NOTE(review): the tool location is hard-coded and the paths are
            # interpolated unquoted, so a workdir containing spaces will break
            # this command.
            cmd = "/u/lafonman/src/orthomcl-pipeline/bin/orthomcl-pipeline -i " + workdir_seqs + " -o " + workdir_out + " -m /u/lafonman/src/orthomcl-pipeline/orthomcl.conf --nocompliant --yes"
            print("EXEC " + cmd)
            os.system(cmd)

            seen_genes = set()
            clusters = []
            clfile = join(workdir_out, "groups/groups.txt")
            with open(clfile, 'r') as f:
                for line in f:
                    line = line.replace("\n", "")
                    if line == "":
                        continue
                    # Members are the whitespace-separated tokens after the
                    # first ':'; the gene name is the field after the first
                    # '|' of each token.
                    cluster = set()
                    for member in line.split(":")[1].split():
                        gname = member.split("|")[1]
                        cluster.add(gname)
                        seen_genes.add(gname)
                    clusters.append(cluster)

            # Every input gene missing from the groups file gets its own
            # singleton cluster.
            for s in allseqs:
                if s.name not in seen_genes:
                    clusters.append({s.name})

            self.cached_clusters = clusters
        else:
            print("USING CACHED CLUSTERS")
            clusters = self.cached_clusters

        # NOTE: clusters are written as-is; they are not restricted to the
        # genes of the current call.
        self.clusters_filenames = [join(workdir, "orthomcl.clusters")]
        self.relations_filenames = [join(workdir, "orthomcl.relations")]

        write_clusters(self.clusters_filenames[0], clusters)

        # Output relations. Each relation is "gene1<TAB>gene2<TAB>type"; all
        # of them are collected first and joined once with ";;", so every
        # entry is properly separated and there is no trailing separator.
        relations = []
        seen_keys = set()
        for c in clusters:
            for c1, c2 in itertools.combinations(c, 2):
                if c1 == c2:
                    continue
                key1 = c1 + ";;" + c2
                key2 = c2 + ";;" + c1
                if key1 in seen_keys or key2 in seen_keys:
                    continue
                seen_keys.add(key1)
                seen_keys.add(key2)

                # The species is one field of the gene name, selected by the
                # configured separator and index.
                sp1 = c1.split(self.other_args["species_separator"])[int(
                    self.other_args["species_index"])]
                sp2 = c2.split(self.other_args["species_separator"])[int(
                    self.other_args["species_index"])]

                # Same cluster, different species: orthologs; else paralogs.
                if sp1 != sp2:
                    reltype = "Orthologs"
                else:
                    reltype = "Paralogs"
                relations.append(c1 + "\t" + c2 + "\t" + reltype)

        # Any remaining pair of input genes (never seen together in a
        # cluster) is reported as paralogs.
        for s1, s2 in itertools.combinations(allseqs, 2):
            n1 = s1.name
            n2 = s2.name
            if n1 + ";;" + n2 not in seen_keys:
                relations.append(n1 + "\t" + n2 + "\t" + "Paralogs")

        with open(self.relations_filenames[0], 'w') as f:
            f.write(";;".join(relations))