Пример #1
0
    def load_from_alignment_file(self, filename):
        """
        Remember ``filename`` as this object's sequence file and read it.

        The path is stored in ``self.seq_file`` and the value returned by
        ``Sequence.readSequences(filename)`` is stored in ``self.sequences``.
        The file does not need to be an aligned file.
        """
        # The path is recorded before reading, so it is set even if the
        # read below raises.
        self.seq_file = filename

        loaded = Sequence.readSequences(filename)
        self.sequences = loaded
Пример #2
0
    def load_scores(self, gene_family):
        """
        Fill ``gene_family.scores`` from the output of the external ``fsa`` tool.

        The family's sequences are written to ``<seq_file>.dealigned.fa``,
        the ``fsa`` command is run on that file with its standard output
        redirected to ``<seq_file>.fsa.fa``, and the sequences read back from
        that output are handed to ``Distances.getPairwisePctID``.

        The ``fsa`` run is skipped only when all of the following hold: the
        output file already exists, ``"use_cache"`` is present in
        ``self.other_args``, and the output file is larger than 10 bytes.

        gene_family: object providing ``seq_file`` and ``sequences``; its
            ``scores`` attribute is overwritten.
        """
        import shlex  # local import: only needed to quote paths for the shell

        dealignedfile = gene_family.seq_file + ".dealigned.fa"
        Sequence.outputSequencesToFasta(gene_family.sequences, dealignedfile)

        fsafile = gene_family.seq_file + ".fsa.fa"

        # Quote both paths: the command goes through a shell (the redirection
        # needs one), so spaces or metacharacters in a path would otherwise
        # split or alter the command.
        cmd = "fsa " + shlex.quote(dealignedfile) + " > " + shlex.quote(fsafile)

        print("EXEC " + cmd)

        # Same evaluation order as before: existence, then the cache flag,
        # then the size (getsize is only called on an existing file).
        can_reuse = (os.path.isfile(fsafile)
                     and "use_cache" in self.other_args
                     and os.path.getsize(fsafile) > 10)

        if can_reuse:
            print("Actually, file exists and we'll use it.")
        else:
            os.system(cmd)

        seqs = Sequence.readSequences(fsafile)

        gene_family.scores = Distances.getPairwisePctID(sequences=seqs,
                                                        verbose=False,
                                                        run_nw_algorithm=False)
Пример #3
0
# Here's an annoying problem: with multiple input files, some (distinct) genes
# will have the same name (this happens with simphy).
# So here we copy the alignment files, but rename the genes with their file
# index. The index is passed to Sequence.outputSequencesToFasta as a string
# (presumably used as a name suffix -- confirm against that helper).
newinfiles = ""
files_list = infiles.split(",")

# Maps each file used from here on to the original input file it came from.
newfile_to_old_file = {}

if rename_genes:
    print("Copying alignment files, ensuring gene name uniqueness...")
    renamed_files = []
    for i, f in enumerate(files_list):
        # Split the extension off the base name itself. The previous
        # basename.replace(ext, ...) rewrote every occurrence of the extension
        # text and, for a file without extension (ext == ""), inserted the
        # suffix between every character of the name.
        stem, ext = os.path.splitext(os.path.basename(f))
        newfile = join(workdir, stem + "_" + str(i) + ".fa")

        sequences = Sequence.readSequences(f)
        Sequence.outputSequencesToFasta(sequences, newfile, str(i), True)

        renamed_files.append(newfile)
        newfile_to_old_file[newfile] = f

    # From here on, the renamed copies replace the original inputs.
    newinfiles = ",".join(renamed_files)
    infiles = newinfiles
else:
    # No copy is made: every input file maps to itself.
    for f in files_list:
        newfile_to_old_file[f] = f

#############################################################################################
Пример #4
0
    def predict_orthologs(self, files, workdir, speciestree_file):
        """
        Cluster the genes of ``files`` with the external orthomcl-pipeline
        command and write the clusters and pairwise relations under ``workdir``.

        When ``self.cached_clusters`` is empty, the sequences are regrouped
        into one FASTA file per species in ``<workdir>/in``, the pipeline is
        run with its output in ``<workdir>/out``, and ``groups/groups.txt``
        is parsed into clusters (sets of gene names). Every sequence whose
        name appears in no group becomes a singleton cluster. The result is
        stored in ``self.cached_clusters`` and reused by later calls.

        Outputs:
          * ``self.clusters_filenames`` is set to
            ``[<workdir>/orthomcl.clusters]``, written via ``write_clusters``.
          * ``self.relations_filenames`` is set to
            ``[<workdir>/orthomcl.relations]``. That file holds
            ``gene1<TAB>gene2<TAB>relation`` entries separated by ``;;``.
            Two genes of the same cluster are "Orthologs" when their species
            differ and "Paralogs" otherwise. Every remaining pair of input
            sequences whose ``name1;;name2`` key was not already emitted is
            written as "Paralogs".

        files: iterable of sequence file paths accepted by
            ``Sequence.readSequences``.
        workdir: directory under which the "in"/"out" sub-directories and
            the two result files are created.
        speciestree_file: not used by this method.

        Reads ``species_separator``, ``species_index`` and, optionally,
        ``seqtype`` from ``self.other_args``.
        """
        import shlex  # local imports: keep the block self-contained
        from itertools import combinations

        workdir_seqs = join(workdir, "in")
        workdir_out = join(workdir, "out")

        allseqs = []
        for f in files:
            allseqs.extend(Sequence.readSequences(f))

        def species_of(gene_name):
            # The species is one field of the gene name; separator and field
            # index come from other_args (looked up lazily, as before).
            return gene_name.split(self.other_args["species_separator"])[int(
                self.other_args["species_index"])]

        #one file per species, see OMA comments above
        if not self.cached_clusters:
            seqs_by_species = Sequence.combineSequenceFilesBySpecies(
                files, self.other_args["species_separator"],
                int(self.other_args["species_index"]))

            # Input is treated as DNA (and flagged for conversion to AA on
            # output) unless seqtype is explicitly "AA".
            isdna = True
            if "seqtype" in self.other_args and self.other_args[
                    "seqtype"] == "AA":
                print("Input type set to AA")
                isdna = False

            # makedirs also creates a missing workdir; plain mkdir failed there.
            os.makedirs(workdir_seqs, exist_ok=True)
            for key in seqs_by_species:
                outfile = join(workdir_seqs, key + ".fasta")
                Sequence.outputSequencesToFasta(sequences=seqs_by_species[key],
                                                filename=outfile,
                                                name_suffix="",
                                                aligned=False,
                                                convertToAA=isdna,
                                                name_prefix="")

            # NOTE(review): the pipeline and config locations are hard-coded
            # absolute paths. The directories are quoted because the command
            # is interpreted by a shell.
            cmd = ("/u/lafonman/src/orthomcl-pipeline/bin/orthomcl-pipeline -i "
                   + shlex.quote(workdir_seqs) + " -o "
                   + shlex.quote(workdir_out)
                   + " -m /u/lafonman/src/orthomcl-pipeline/orthomcl.conf"
                   + " --nocompliant --yes")
            print("EXEC " + cmd)
            os.system(cmd)

            seen_genes = set()
            clusters = []
            clfile = join(workdir_out, "groups/groups.txt")
            # 'with' guarantees the file is closed even if a line is malformed.
            with open(clfile, 'r') as groups:
                for line in groups:
                    line = line.replace("\n", "")
                    if line == "":
                        continue
                    # Members follow the first ":"; each member is split on
                    # "|" and its second field is taken as the gene name.
                    cluster = {
                        g.split("|")[1]
                        for g in line.split(":")[1].split()
                    }
                    seen_genes.update(cluster)
                    clusters.append(cluster)

            # Genes absent from every group get a cluster of their own.
            for s in allseqs:
                if s.name not in seen_genes:
                    clusters.append({s.name})

            self.cached_clusters = clusters
        else:
            print("USING CACHED CLUSTERS")
            clusters = self.cached_clusters

        # NOTE: clusters are written as-is; they are not restricted to a
        # single gene family here.
        self.clusters_filenames = [join(workdir, "orthomcl.clusters")]
        self.relations_filenames = [join(workdir, "orthomcl.relations")]

        write_clusters(self.clusters_filenames[0], clusters)

        #output relations
        # Entries are collected in a list and joined once. Previously the
        # trailing ";;" was stripped between the two loops, which glued the
        # first entry of the second loop onto the last entry of the first
        # and left a dangling ";;" at the end of the file.
        relations = []
        seen_keys = set()
        for c in clusters:
            # combinations() yields each unordered pair once, in the same
            # order the nested loops used to find it first.
            for c1, c2 in combinations(c, 2):
                if c1 == c2:
                    continue

                key1 = c1 + ";;" + c2
                key2 = c2 + ";;" + c1
                if key1 in seen_keys or key2 in seen_keys:
                    continue
                seen_keys.add(key1)
                seen_keys.add(key2)

                if species_of(c1) != species_of(c2):
                    relation = "Orthologs"
                else:
                    relation = "Paralogs"
                relations.append(c1 + "\t" + c2 + "\t" + relation)

        # Every remaining pair of input sequences is reported as paralogous.
        for s1, s2 in combinations(allseqs, 2):
            n1 = s1.name
            n2 = s2.name
            if n1 + ";;" + n2 not in seen_keys:
                relations.append(n1 + "\t" + n2 + "\t" + "Paralogs")

        with open(self.relations_filenames[0], 'w') as out:
            out.write(";;".join(relations))