Example No. 1
    def finish(self):
        # Once executed, alignment is converted into relaxed
        # interleaved phylip format. Both files, fasta and phylip,
        # remain accessible.

        # Set Task specific attributes
        main_job = self.jobs[0]
        fasta_path = pjoin(main_job.jobdir, "clean.alg.fasta")
        alg = SeqGroup(fasta_path)
        if len(alg) != self.size:
            log.warning(
                "Trimming was to aggressive and it tried"
                " to remove one or more sequences."
                " Alignment trimming will be disabled for this dataset.")
            self.clean_alg_fasta_file = db.register_task_data(
                self.taskid, DATATYPES.clean_alg_fasta, self.alg_fasta_file)
            self.clean_alg_phylip_file = db.register_task_data(
                self.taskid, DATATYPES.clean_alg_phylip, self.alg_phylip_file)
        else:
            for line in open(self.jobs[0].stdout_file):
                line = line.strip()
                if line.startswith("#ColumnsMap"):
                    kept_columns = map(int, line.split("\t")[1].split(","))
            fasta = alg.write(format="fasta")
            phylip = alg.write(format="iphylip_relaxed")
            AlgCleanerTask.store_data(self, fasta, phylip, kept_columns)
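The snippet above recovers the indices of the alignment columns that trimal kept by scanning its standard output for a "#ColumnsMap" line. A minimal standalone sketch of that parsing step, assuming the line layout inferred from the code (a tab after "#ColumnsMap", then a comma-separated list of integers):

def parse_kept_columns(stdout_path):
    # Hypothetical helper, not part of the task class above.
    kept_columns = []
    for line in open(stdout_path):
        line = line.strip()
        if line.startswith("#ColumnsMap"):
            # second tab-separated field: comma-separated column indices
            kept_columns = [int(col) for col in line.split("\t")[1].split(",")]
    return kept_columns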
Example No. 2
def switch_to_codon(alg_fasta_file,  kept_columns=None):
    # Check conservation of columns. If too many identities,
    # switch to codon alignment and make the tree with DNA. 
    # Mixed models are another possibility.
    if kept_columns:
        kept_columns = set(map(int, kept_columns))
    else:
        kept_columns = []

    #all_nt_alg = SeqGroup(nt_seed_file)
    aa_alg = SeqGroup(alg_fasta_file)
    nt_alg = SeqGroup()

    for seqname, aaseq, comments in aa_alg.iter_entries():
        #ntseq = all_nt_alg.get_seq(seqname).upper()
        ntseq = db.get_seq(seqname, "nt").upper()
        ntalgseq = []
        nt_pos = 0
        for pos, ch in enumerate(aaseq):
            if ch in GAP_CHARS:
                codon = "---"
            else:
                codon = ntseq[nt_pos:nt_pos+3]
                nt_pos += 3

            if not kept_columns or pos in kept_columns: 
                # we trust the sequence in DB, consistency should have been
                # checked during the start up
                ntalgseq.append(codon)

        ntalgseq = "".join(ntalgseq)
        nt_alg.set_seq(seqname, ntalgseq)

    return nt_alg
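A hypothetical usage sketch for switch_to_codon (file names and column indices are placeholders): it assumes the pipeline's sequence database is already loaded, since the function fetches nucleotide sequences through db.get_seq, and that SeqGroup.write(outfile=..., format=...) behaves as in the other examples on this page.

# Hypothetical call; input/output paths and kept_columns are placeholders.
nt_alg = switch_to_codon("clean.alg.fasta", kept_columns=[0, 1, 2, 5, 6])
nt_alg.write(outfile="clean.alg.nt.fasta", format="fasta")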
Example No. 3
    def finish(self):
        # Once executed, alignment is converted into relaxed
        # interleaved phylip format.
        alg = SeqGroup(self.jobs[0].stdout_file)
        fasta = alg.write(format="fasta")
        phylip = alg.write(format="iphylip_relaxed")
        AlgTask.store_data(self, fasta, phylip)
Example No. 4
    def finish(self):
        # Once executed, alignment is converted into relaxed
        # interleaved phylip format.
        final_job = self.jobs[2]
        alg = SeqGroup(os.path.join(final_job.jobdir, "alg.fasta"))
        alg.write(outfile=self.alg_fasta_file, format="fasta")
        alg.write(outfile=self.alg_phylip_file, format="iphylip_relaxed")
        AlgTask.finish(self)
Example No. 5
    def finish(self):
        # Once executed, alignment is converted into relaxed
        # interleaved phylip format.
        alg = SeqGroup(os.path.join(self.jobs[0].jobdir, "mcoffee.fasta"))
        fasta = alg.write(format="fasta")
        phylip = alg.write(format="iphylip_relaxed")

        alg_list_string = '\n'.join([pjoin(GLOBALS["input_dir"],
                                           aname) for aname in self.all_alg_files])
        db.add_task_data(self.taskid, DATATYPES.alg_list, alg_list_string)
        
        AlgTask.store_data(self, fasta, phylip)
Example No. 6
    def finish(self):
        if self.conf[self.confname]["_alg_trimming"]:
            # If trimming happened after mcoffee, let's save the
            # resulting output
            trim_job = self.jobs[-1]
            alg = SeqGroup(pjoin(trim_job.jobdir, trim_job.alg_fasta_file))
            fasta = alg.write(format="fasta")
            phylip = alg.write(format="iphylip_relaxed")
            AlgTask.store_data(self, fasta, phylip)
        else:
            # If no post trimming, output is just what Mcoffee
            # produced, so we can recycle its data ids.
            mc_task = self.jobs[-1]
            fasta_id = db.get_dataid(mc_task.taskid, DATATYPES.alg_fasta)
            phylip_id = db.get_dataid(mc_task.taskid, DATATYPES.alg_phylip)
            db.register_task_data(self.taskid, DATATYPES.alg_fasta, fasta_id)
            db.register_task_data(self.taskid, DATATYPES.alg_phylip, phylip_id)
Example No. 7
def switch_to_codon(alg_fasta_file, kept_columns=None):
    # Check conservation of columns. If too many identities,
    # switch to codon alignment and make the tree with DNA.
    # Mixed models are another possibility.
    if kept_columns:
        kept_columns = set(map(int, kept_columns))
    else:
        kept_columns = []

    #all_nt_alg = SeqGroup(nt_seed_file)
    aa_alg = SeqGroup(alg_fasta_file)
    nt_alg = SeqGroup()

    for seqname, aaseq, comments in aa_alg.iter_entries():
        #ntseq = all_nt_alg.get_seq(seqname).upper()
        ntseq = db.get_seq(seqname, "nt").upper()
        ntalgseq = []
        nt_pos = 0
        for pos, ch in enumerate(aaseq):
            if ch in GAP_CHARS:
                codon = "---"
            else:
                codon = ntseq[nt_pos:nt_pos + 3]
                nt_pos += 3

            if not kept_columns or pos in kept_columns:
                # we trust the sequence in DB, consistency should have been
                # checked during the start up
                ntalgseq.append(codon)

        ntalgseq = "".join(ntalgseq)
        nt_alg.set_seq(seqname, ntalgseq)

    return nt_alg
Example No. 8
    def finish(self):
        # Once executed, alignment is converted into relaxed
        # interleaved phylip format.
        final_job = self.jobs[2]
        alg = SeqGroup(os.path.join(final_job.jobdir, "alg.fasta"))
        alg.write(outfile=self.alg_fasta_file, format="fasta")
        alg.write(outfile=self.alg_phylip_file, format="iphylip_relaxed")
        AlgTask.finish(self)
Example No. 9
File: trimal.py Project: a1an77/ete
    def finish(self):
        # Once executed, alignment is converted into relaxed
        # interleaved phylip format. Both files, fasta and phylip,
        # remain accessible.

        # Set Task specific attributes
        main_job = self.jobs[0]
        fasta_path = pjoin(main_job.jobdir, "clean.alg.fasta")
        alg = SeqGroup(fasta_path)
        if len(alg) != self.size:
            log.warning("Trimming was to aggressive and it tried"
                        " to remove one or more sequences."
                        " Alignment trimming will be disabled for this dataset."
                        )
            self.clean_alg_fasta_file = db.register_task_data(self.taskid, DATATYPES.clean_alg_fasta, self.alg_fasta_file)
            self.clean_alg_phylip_file = db.register_task_data(self.taskid, DATATYPES.clean_alg_phylip, self.alg_phylip_file)
        else:
            for line in open(self.jobs[0].stdout_file):
                line = line.strip()
                if line.startswith("#ColumnsMap"):
                    kept_columns = map(int, line.split("\t")[1].split(","))
            fasta = alg.write(format="fasta")
            phylip = alg.write(format="iphylip_relaxed")
            AlgCleanerTask.store_data(self, fasta, phylip, kept_columns)
Example No. 10
    def finish(self):
        alg = SeqGroup(os.path.join(self.jobs[0].jobdir, "alg.fasta"))
        fasta = alg.write(format="fasta")
        phylip = alg.write(format="iphylip_relaxed")
        AlgTask.store_data(self, fasta, phylip)
Example No. 11
                    log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                            threadname, past_threads[configid])


                    log.log(28, "Writing final tree for @@13:%s@@1:\n   %s\n   %s",
                            threadname, final_tree_file+".nw",
                            final_tree_file+".nwx (newick extended)")
                    main_tree.write(outfile=final_tree_file+".nw")
                    main_tree.write(outfile=final_tree_file+ ".nwx", features=[],
                                    format_root_node=True)

                    if hasattr(main_tree, "alg_path"):
                        log.log(28, "Writing root node alignment @@13:%s@@1:\n   %s",
                                threadname, final_tree_file+".fa")

                        alg = SeqGroup(get_stored_data(main_tree.alg_path))
                        OUT = open(final_tree_file+".fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print >>OUT, ">%s\n%s" %(realname, seq)
                        OUT.close()

                    if hasattr(main_tree, "clean_alg_path"):
                        log.log(28, "Writing root node trimmed alignment @@13:%s@@1:\n   %s",
                                threadname, final_tree_file+".trimmed.fa")

                        alg = SeqGroup(get_stored_data(main_tree.clean_alg_path))
                        OUT = open(final_tree_file+".trimmed.fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print >>OUT, ">%s\n%s" %(realname, seq)
                        OUT.close()
Example No. 12
def get_concatenated_alg(alg_filenames,
                         models=None,
                         sp_field=0,
                         sp_delimiter="_",
                         kill_thr=0.0,
                         keep_species=set()):
    # Concat alg container
    concat = SeqGroup()
    # Used to store different model partitions
    concat.id2partition = {}

    if not models:
        models = ["None"] * len(alg_filenames)
    else:
        if len(models) != len(alg_filenames):
            raise ValueError(
                "Different number of algs and model names was found!")

    expected_total_length = 0
    # Check algs and gets the whole set of species
    alg_objects = []
    sp2alg = defaultdict(list)

    for algfile, matrix in zip(alg_filenames, models):
        alg = SeqGroup(algfile, "fasta")
        alg_objects.append(alg)
        lenseq = None
        browsed_species = set()
        alg.sp2seq = {}
        # Set best matrix for this alignment
        alg.matrix = matrix
        # Change seq names to contain only species names
        for i, seq in alg.id2seq.iteritems():
            name = db.get_seq_name(alg.id2name[i])
            taxid = get_species_code(name,
                                     splitter=sp_delimiter,
                                     field=sp_field)
            if lenseq is not None and len(seq) != lenseq:
                raise Exception(
                    "Inconsistent alignment when concatenating: Unequal length"
                )
            elif lenseq is None:
                lenseq = len(seq)
                alg.seqlength = len(seq)
                expected_total_length += len(seq)
            if taxid in browsed_species:
                raise Exception(
                    "Inconsistent alignment when concatenating: Repeated species"
                )
            browsed_species.add(
                taxid)  # Check no duplicated species in the same alg
            sp2alg[taxid].append(alg)  # Records all species seen in all algs.
            alg.sp2seq[taxid] = seq

    valid_species = [sp for sp in sp2alg.iterkeys() \
                         if sp in keep_species or \
                         len(sp2alg[sp])/float(len(alg_objects)) > kill_thr]

    log.info("%d out of %d will be kept (missing factor threshold=%g, %d species forced to kept)" %\
                 (len(valid_species), len(sp2alg), kill_thr, len(keep_species)))

    def sort_single_algs(alg1, alg2):
        r = cmp(alg1.matrix, alg2.matrix)
        if r == 0:
            return cmp(sorted(alg1.id2name.values()),
                       sorted(alg2.id2name.values()))
        else:
            return r

    sorted_algs = sorted(alg_objects, sort_single_algs)
    concat_alg_lengths = [alg.seqlength for alg in sorted_algs]
    model2win = {}
    model2size = {}
    for alg in sorted_algs:
        model2size[alg.matrix] = model2size.get(alg.matrix, 0) + alg.seqlength

    # Create concat alg
    concat.id2seq = defaultdict(list)
    for sp in sorted(valid_species):
        log.log(20, "Concatenating sequences of [%s]" % sp)
        for alg in sorted_algs:
            seq = alg.sp2seq.get(sp, "-" * alg.seqlength)
            concat.id2seq[sp].append(seq)
            #current_seq = concat.id2seq.get(sp, "")
            #concat.id2seq[sp] = current_seq + seq.strip()
            concat.id2name[sp] = sp
            concat.name2id[sp] = sp
            concat.id2comment[sp] = [""]
        concat.id2seq[sp] = ''.join(concat.id2seq[sp])

    current_pos = 0
    partitions = []
    for model in sorted(model2size.keys()):
        size = model2size[model]
        part = "%s, %s = %d-%d" % (model, model+"_genes", \
                                       current_pos + 1,\
                                       current_pos + size)
        current_pos += size
        partitions.append(part)

    # Basic Checks
    seq_sizes = [len(seq) for seq in concat.id2seq.values()]
    if len(set(seq_sizes)) != 1:
        raise Exception(
            "Concatenated alignment is not consistent: unequal seq length ")
    if seq_sizes[0] != expected_total_length:
        raise Exception("The size of concatenated alg is not what expected")
    return concat, partitions, sp2alg, valid_species, concat_alg_lengths
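The partition bookkeeping at the end of get_concatenated_alg is easiest to follow with concrete numbers. A self-contained sketch, assuming (hypothetically) that the JTT-assigned genes add up to 420 columns and the WAG-assigned genes to 615:

# Hypothetical sizes per model; in the function these come from summing
# alg.seqlength over the alignments assigned to each matrix.
model2size = {"JTT": 420, "WAG": 615}
current_pos = 0
partitions = []
for model in sorted(model2size):
    size = model2size[model]
    partitions.append("%s, %s = %d-%d" % (model, model + "_genes",
                                          current_pos + 1, current_pos + size))
    current_pos += size
# partitions == ["JTT, JTT_genes = 1-420", "WAG, WAG_genes = 421-1035"]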
Example No. 13
                    log.log(
                        28, "Writing final tree for @@13:%s@@1:\n   %s\n   %s",
                        threadname, final_tree_file + ".nw",
                        final_tree_file + ".nwx (newick extended)")
                    main_tree.write(outfile=final_tree_file + ".nw")
                    main_tree.write(outfile=final_tree_file + ".nwx",
                                    features=[],
                                    format_root_node=True)

                    if hasattr(main_tree, "alg_path"):
                        log.log(
                            28,
                            "Writing root node alignment @@13:%s@@1:\n   %s",
                            threadname, final_tree_file + ".fa")

                        alg = SeqGroup(get_stored_data(main_tree.alg_path))
                        OUT = open(final_tree_file + ".fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print >> OUT, ">%s\n%s" % (realname, seq)
                        OUT.close()

                    if hasattr(main_tree, "clean_alg_path"):
                        log.log(
                            28,
                            "Writing root node trimmed alignment @@13:%s@@1:\n   %s",
                            threadname, final_tree_file + ".trimmed.fa")

                        alg = SeqGroup(
                            get_stored_data(main_tree.clean_alg_path))
                        OUT = open(final_tree_file + ".trimmed.fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print >> OUT, ">%s\n%s" % (realname, seq)
                        OUT.close()
Example No. 14
def get_concatenated_alg(alg_filenames, models=None, 
                        sp_field=0, sp_delimiter="_", 
                        kill_thr=0.0, 
                        keep_species=set()):
    # Concat alg container 
    concat = SeqGroup()
    # Used to store different model partitions
    concat.id2partition = {}

    if not models: 
        models = ["None"]*len(alg_filenames)
    else:
        if len(models) != len(alg_filenames):
            raise ValueError("Different number of algs and model names was found!")

    expected_total_length = 0
    # Check algs and gets the whole set of species
    alg_objects = []
    sp2alg = defaultdict(list)
    
    for algfile, matrix in zip(alg_filenames, models):
        alg = SeqGroup(algfile, "fasta")
        alg_objects.append(alg)
        lenseq = None
        browsed_species = set()
        alg.sp2seq = {}
        # Set best matrix for this alignment
        alg.matrix = matrix
        # Change seq names to contain only species names
        for i, seq in alg.id2seq.iteritems():
            name = db.get_seq_name(alg.id2name[i])
            taxid = get_species_code(name, splitter=sp_delimiter, field=sp_field)
            if lenseq is not None and len(seq) != lenseq:
                raise Exception("Inconsistent alignment when concatenating: Unequal length")
            elif lenseq is None:
                lenseq = len(seq)
                alg.seqlength = len(seq)
                expected_total_length += len(seq)
            if taxid in browsed_species:
                raise Exception("Inconsistent alignment when concatenating: Repeated species")
            browsed_species.add(taxid) # Check no duplicated species in the same alg
            sp2alg[taxid].append(alg) # Records all species seen in all algs.
            alg.sp2seq[taxid] = seq

    valid_species = [sp for sp in sp2alg.iterkeys() \
                         if sp in keep_species or \
                         len(sp2alg[sp])/float(len(alg_objects)) > kill_thr]

    log.info("%d out of %d will be kept (missing factor threshold=%g, %d species forced to kept)" %\
                 (len(valid_species), len(sp2alg), kill_thr, len(keep_species)))

    def sort_single_algs(alg1, alg2):
        r = cmp(alg1.matrix, alg2.matrix)
        if r == 0:
            return cmp(sorted(alg1.id2name.values()),
                       sorted(alg2.id2name.values()))
        else:
            return r
           
    sorted_algs = sorted(alg_objects, sort_single_algs)
    concat_alg_lengths = [alg.seqlength for alg in sorted_algs]
    model2win = {}
    model2size = {}
    for alg in sorted_algs:
        model2size[alg.matrix] = model2size.get(alg.matrix, 0) + alg.seqlength

    # Create concat alg
    concat.id2seq = defaultdict(list)
    for sp in sorted(valid_species):
        log.log(20, "Concatenating sequences of [%s]" %sp)
        for alg in sorted_algs:
            seq = alg.sp2seq.get(sp, "-" * alg.seqlength)
            concat.id2seq[sp].append(seq)
            #current_seq = concat.id2seq.get(sp, "")
            #concat.id2seq[sp] = current_seq + seq.strip()
            concat.id2name[sp] = sp 
            concat.name2id[sp] = sp
            concat.id2comment[sp] = [""]
        concat.id2seq[sp] = ''.join(concat.id2seq[sp])

    current_pos = 0
    partitions = []
    for model in sorted(model2size.keys()):
        size = model2size[model]
        part = "%s, %s = %d-%d" % (model, model+"_genes", \
                                       current_pos + 1,\
                                       current_pos + size)
        current_pos += size
        partitions.append(part)

    # Basic Checks
    seq_sizes = [len(seq) for seq in concat.id2seq.values()]
    if len(set(seq_sizes)) != 1:
        raise Exception("Concatenated alignment is not consistent: unequal seq length ")
    if seq_sizes[0] != expected_total_length:
        raise Exception("The size of concatenated alg is not what expected")
    return concat, partitions, sp2alg, valid_species, concat_alg_lengths