def testAlignmentReadFasta(self):
    """Read the mock FASTA alignment and check its size and columns."""
    fasta_path = get_data_path("mock/pyrg/sate.fasta")
    alignment = MutableAlignment()
    alignment.read_filepath(fasta_path)

    num_sequences = len(alignment)
    assert num_sequences == 65, \
        "MutableAlignment length is %s" % num_sequences

    # Collect the per-site verdicts first, then require that none of the
    # columns is made up of gaps only.
    gap_only_flags = [alignment.is_all_gap(site)
                      for site in range(alignment.get_length())]
    assert not any(gap_only_flags)
def check_options(self):
    """Validate the input-file options and fill in derived size options.

    Accepted combinations are (tree + alignment + sequences), in which
    case the sequence file is used as the fragment file, or (sequences
    only), in which case ``generate_backbone`` is invoked. Any other
    combination is logged as an error and the process exits with -1.

    The backbone alignment is then read to determine its number of taxa,
    which becomes the default for ``backbone_size``; ``placement_size``
    defaults to ``backbone_size``.

    Returns:
        Whatever ``ExhaustiveAlgorithm.check_options`` returns.

    Raises:
        AssertionError: if a user-supplied ``backbone_size`` differs from
            the number of taxa in the backbone alignment.
    """
    self.check_outputprefix()
    options().info_file = "A_dummy_value"

    has_tree = options().tree_file is not None
    has_alignment = options().alignment_file is not None
    has_sequences = options().sequence_file is not None

    # Check to see if tree/alignment/fragment file provided, if not,
    # generate it from sequence file
    if has_tree and has_alignment and has_sequences:
        options().fragment_file = options().sequence_file
    elif not has_tree and not has_alignment and has_sequences:
        self.generate_backbone()
    else:
        _LOG.error(
            ("Either specify the backbone alignment and tree and query "
             "sequences or only the query sequences. Any other "
             "combination is invalid"))
        exit(-1)

    sequences = MutableAlignment()
    # The backbone is read through a fresh handle opened by name, leaving
    # the handle stored in the options untouched. The fresh handle is
    # closed as soon as the taxa have been counted.
    with open(self.options.alignment_file.name) as backbone_file:
        sequences.read_file_object(backbone_file)
        backbone_size = sequences.get_num_taxa()

    if options().backbone_size is None:
        options().backbone_size = backbone_size
    assert options().backbone_size == backbone_size, (
        ("Backbone parameter needs to match actual size of backbone; "
         "backbone parameter:%s backbone_size:%s")
        % (options().backbone_size, backbone_size))
    if options().placement_size is None:
        options().placement_size = options().backbone_size
    return ExhaustiveAlgorithm.check_options(self)
def check_options(self):
    """Require a backbone tree and alignment, then defer to the base check.

    Logs an error and exits with -1 when either the tree file or the
    alignment file option is missing. The backbone alignment is parsed
    once (the parsed result is not used further here).

    Returns:
        Whatever ``ExhaustiveAlgorithm.check_options`` returns.
    """
    options().info_file = "A_dummy_value"

    if options().tree_file is None or options().alignment_file is None:
        _LOG.error(
            "Specify the backbone alignment and tree and query sequences")
        exit(-1)

    sequences = MutableAlignment()
    # Read through a fresh handle opened by name and close it afterwards
    # instead of leaking it.
    with open(self.options.alignment_file.name) as backbone_file:
        sequences.read_file_object(backbone_file)
    return ExhaustiveAlgorithm.check_options(self)
def hmmer_to_markers(input, temp_dir):
    """Bin fragments to their best-scoring marker gene using HMMER.

    Every fragment and its reversed counterpart (named ``<name>_rev``) is
    written to ``<temp_dir>/frags.fas`` and searched against the profile
    of each gene. For every fragment the gene/direction with the highest
    bit score wins; fragments without any hit are dropped.

    Args:
        input: path of the fragment file to read.
        temp_dir: directory that receives the intermediate files.

    Returns:
        dict mapping gene name -> {fragment name -> sequence}, where the
        sequence is passed through ``reverse_sequence`` when the reverse
        direction scored best. Each gene's fragments are also written to
        ``<temp_dir>/<gene>.frags.fas.fixed``.
    """
    global marker_genes
    rev_suffix = '_rev'

    fragments = MutableAlignment()
    fragments.read_filepath(input)
    reverse = dict([(name + rev_suffix, reverse_sequence(seq))
                    for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best hit so far per fragment: [bit score, gene, direction].
    frag_scores = dict([(name, [-10000, 'NA', 'NA'])
                        for name in fragments.keys()])
    gene_set = marker_genes
    align_name = 'sate'
    if options().genes == 'cogs':
        gene_set = cog_genes
        align_name = 'pasta'

    for gene in gene_set:
        # Now run HMMER search
        hmmer_output = temp_dir + "/%s.out" % gene
        # The original format string was '%.profile', which is not a valid
        # conversion and made this expression raise; '%s.profile' is
        # the intended "<align_name>.profile".
        profile = os.path.join(
            options().__getattribute__('reference').path,
            'refpkg/%s.refpkg/%s.profile' % (gene, align_name))
        hmmer_search(frag_file, profile, hmmer_output)
        results = read_hmmsearch_results(hmmer_output)

        # Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            direction = 'forward'
            true_name = name
            # Only a trailing '_rev' marks the reversed copy; matching it
            # anywhere in the name would corrupt names that merely
            # contain that substring.
            if name.endswith(rev_suffix):
                true_name = name[:-len(rev_suffix)]
                direction = 'reverse'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = {}
    for name, val in frag_scores.items():
        gene_bin = genes.setdefault(val[1], {})
        if val[2] == 'forward':
            gene_bin[name] = fragments[name]
        else:
            gene_bin[name] = reverse_sequence(fragments[name])
    # Fragments that never scored keep the placeholder gene 'NA'.
    genes.pop("NA", None)

    for gene, seq in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(seq, gene_file + ".fixed")
    return genes
def testAlignmentReadFasta(self):
    """Read the mock FASTA alignment and check its size and columns.

    Uses the ``print`` function with a single argument and ``range`` so
    the test parses and runs on both Python 2 and Python 3 while
    producing the same output as the former ``print`` statements.
    """
    print("====== starting testAlignmentReadFasta ==========")
    alg = MutableAlignment()
    alg.read_filepath("data/mock/pyrg/sate.fasta")
    # A single pre-joined string keeps the output identical on both
    # Python versions (no tuple repr, no separator).
    print("Maing alignment is:\n\n" + str(alg))
    assert len(alg) == 65, "MutableAlignment length is %s" % len(alg)
    assert all([not alg.is_all_gap(i)
                for i in range(0, alg.get_length())])
def hmmer_to_markers(input, temp_dir):
    """Bin fragments to their best-scoring marker gene using HMMER.

    Every fragment and its reversed counterpart (named ``<name>_rev``) is
    written to ``<temp_dir>/frags.fas`` and searched against the profile
    of each gene. For every fragment the gene/direction with the highest
    bit score wins; fragments without any hit are dropped.

    Args:
        input: path of the fragment file to read.
        temp_dir: directory that receives the intermediate files.

    Returns:
        dict mapping gene name -> {fragment name -> sequence}, where the
        sequence is passed through ``reverse_sequence`` when the reverse
        direction scored best. Each gene's fragments are also written to
        ``<temp_dir>/<gene>.frags.fas.fixed``.
    """
    global marker_genes
    rev_suffix = "_rev"

    fragments = MutableAlignment()
    fragments.read_filepath(input)
    reverse = dict([(name + rev_suffix, reverse_sequence(seq))
                    for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best hit so far per fragment: [bit score, gene, direction].
    frag_scores = dict([(name, [-10000, "NA", "NA"])
                        for name in fragments.keys()])
    gene_set = marker_genes
    align_name = "sate"
    if options().genes == "cogs":
        gene_set = cog_genes
        align_name = "pasta"

    for gene in gene_set:
        # Now run HMMER search
        hmmer_output = temp_dir + "/%s.out" % gene
        # "%.profile" is not a valid conversion and raised at runtime;
        # "%s.profile" yields the intended "<align_name>.profile".
        profile = os.path.join(
            options().__getattribute__("reference").path,
            "refpkg/%s.refpkg/%s.profile" % (gene, align_name),
        )
        hmmer_search(frag_file, profile, hmmer_output)
        results = read_hmmsearch_results(hmmer_output)

        # Now select best direction for each frag
        for name, hit in results.items():
            bitscore = hit[1]
            direction = "forward"
            true_name = name
            # Treat "_rev" as a marker only when it is the suffix that
            # this function appended, not when it appears mid-name.
            if name.endswith(rev_suffix):
                true_name = name[:-len(rev_suffix)]
                direction = "reverse"
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = {}
    for name, score in frag_scores.items():
        gene_bin = genes.setdefault(score[1], {})
        if score[2] == "forward":
            gene_bin[name] = fragments[name]
        else:
            gene_bin[name] = reverse_sequence(fragments[name])
    # Fragments that never scored keep the placeholder gene "NA".
    genes.pop("NA", None)

    for gene, gene_fragments in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_fragments, gene_file + ".fixed")
    return genes
def blast_to_markers(input, temp_dir):
    """Bin fragments to genes (via BLAST unless a gene is forced) and
    orient each fragment using HMMER.

    Args:
        input: path of the fragment file to read.
        temp_dir: directory that receives the intermediate files.

    Returns:
        dict mapping gene name -> {fragment name -> sequence}; a
        fragment's sequence is replaced by its ``_rev`` counterpart when
        that one obtained the strictly higher HMMER score. Each gene's
        final fragments are also written to
        ``<temp_dir>/<gene>.frags.fas.fixed``.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if options().gene is not None:
        # A gene was requested explicitly: every fragment goes to it.
        gene_binning = {options().gene: list(fragments.keys())}
    else:
        # First blast sequences against all markers
        blast_results = temp_dir + "/blast.out"
        if options().blast_file is not None:
            blast_results = options().blast_file
        else:
            print("Blasting fragments against marker dataset\n")
            blast_fragments(input, blast_results)
        # Next bin the blast hits to the best gene
        gene_binning = bin_blast_results(blast_results)

    # Now figure out direction of fragments
    binned_fragments = {}
    for gene in gene_binning:
        binned_fragments[gene] = dict(
            [(seq_name, fragments[seq_name])
             for seq_name in gene_binning[gene]])

    print("Finding best orientation of reads\n")
    align_name = 'pasta' if options().genes == 'cogs' else 'sate'

    for (gene, frags) in binned_fragments.items():
        # Add reverse complement sequence
        frags_rev = {}
        for (name, seq) in frags.items():
            frags_rev[name + '_rev'] = reverse_sequence(seq)
        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(frags_rev)
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_frags, gene_file)

        # Now run HMMER search
        hmm_path = os.path.join(
            options().__getattribute__('reference').path,
            'refpkg/%s.refpkg/%s.hmm' % (gene, align_name))
        hmmer_search(gene_file, hmm_path, temp_dir + "/%s.out" % gene)
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        # Now select best direction for each frag
        for key in frags:
            rev_key = key + "_rev"
            forward_score = results[key][1] if key in results else -10000
            backward_score = (
                results[rev_key][1] if rev_key in results else -10000)
            if backward_score > forward_score:
                frags[key] = gene_frags[rev_key]

        # Now write to file
        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags
    return binned_fragments
def read_alignment_and_tree(self):
    """Read the backbone alignment and backbone tree named in the options.

    Only the alignment and tree are read here; the fragment file is not
    touched.

    Returns:
        tuple ``(alignment, tree)`` where ``alignment`` is a
        ``MutableAlignment`` filled from ``options.alignment_file`` and
        ``tree`` is a ``PhylogeneticTree`` wrapping the newick tree read
        from ``options.tree_file``.
    """
    _LOG.info("Reading input alignment: %s" % (self.options.alignment_file))
    alignment = MutableAlignment()
    alignment.read_file_object(self.options.alignment_file)

    _LOG.info("Reading input tree: %s" % (self.options.tree_file))
    # Use the factory method rather than the legacy
    # ``Tree(stream=..., schema=...)`` constructor form, which newer
    # DendroPy releases no longer accept.
    tree = PhylogeneticTree(
        dendropy.Tree.get_from_stream(self.options.tree_file,
                                      schema="newick",
                                      preserve_underscores=True))
    return (alignment, tree)
def testReadOnlySubAlignment(self):
    """Exercise ``ReadonlySubalignment`` over three named sequences.

    Checks key/name consistency, that item assignment raises
    ``TypeError``, the gap status of two sites, writing to
    ``self.fp_dummy1`` and that the mutable copy has no all-gap column
    after ``delete_all_gap``.
    """
    alg = MutableAlignment()
    alg.read_filepath(get_data_path("mock/pyrg/sate.fasta"))
    subset = [
        'NC_008701_720717_722309',
        'NC_013156_149033_150643',
        'NC_013887_802739_801129',
    ]
    readonly_subalignment = ReadonlySubalignment(subset, alg)

    assert len(readonly_subalignment) == 3, len(readonly_subalignment)
    assert set(readonly_subalignment.keys()) == set(
        readonly_subalignment.get_sequence_names()) == set(subset), \
        "Subalignment keys not matching given keys %s vs %s" % (
            list(readonly_subalignment.keys()), subset)
    for (k, s) in list(readonly_subalignment.items()):
        assert k in subset, \
            "%s not found in subset but returned by subalignment" % k
        assert s == alg[k], \
            "sequence associated with %s not matching parent alignment" % k

    # Assignment must be rejected with TypeError; reaching the assert
    # means the write went through.
    try:
        readonly_subalignment[2] = "ACGT"
        assert False, "Readonly alignment is successfully modified. "
    except TypeError:
        pass

    assert readonly_subalignment.get_length() == alg.get_length(), \
        "alignment length should not change"
    assert readonly_subalignment.is_aligned() is True
    assert readonly_subalignment.is_all_gap(2) is True, \
        "Site 2 should be all gaps"
    # The message previously named site 100 although site 150 is tested.
    assert readonly_subalignment.is_all_gap(150) is False, \
        "Site 150 should not be all gaps"

    readonly_subalignment.write_to_path(
        self.fp_dummy1)  # "mock/pyrg/sate.sub.fasta"
    mutable_subalignment = readonly_subalignment.get_mutable_alignment()
    mutable_subalignment.delete_all_gap()
    assert all([
        not mutable_subalignment.is_all_gap(i)
        for i in range(0, mutable_subalignment.get_length())
    ])
def read_alignment_and_tree(self):
    """Read the backbone alignment and backbone tree named in the options.

    Only the alignment and tree are read here; the fragment file is not
    touched.

    Returns:
        tuple ``(alignment, tree)``: a ``MutableAlignment`` filled from
        ``options.alignment_file`` and a ``PhylogeneticTree`` wrapping
        the newick tree read from ``options.tree_file``.
    """
    alignment_source = self.options.alignment_file
    _LOG.info("Reading input alignment: %s" % (alignment_source))
    backbone_alignment = MutableAlignment()
    backbone_alignment.read_file_object(alignment_source)

    tree_source = self.options.tree_file
    _LOG.info("Reading input tree: %s" % tree_source)
    newick_tree = dendropy.Tree.get_from_stream(
        tree_source, schema="newick", preserve_underscores=True)
    backbone_tree = PhylogeneticTree(newick_tree)

    return (backbone_alignment, backbone_tree)
def main():
    """Split the input sequences into fragments and full-length sets.

    Sequences whose length is at most ``args.threshold`` are written to
    ``<output>.frag.fas``; all others go to ``<output>.full.fas``.
    """
    args = parse_args()
    input_path = args.input

    input_is_readable = (os.path.isfile(input_path)
                         and os.access(input_path, os.R_OK))
    assert input_is_readable, \
        "Input file %s does not exist\n" % input_path

    sequences = MutableAlignment()
    sequences.read_file_object(input_path)

    short_sequences = MutableAlignment()
    long_sequences = MutableAlignment()
    for (name, sequence) in sequences.items():
        is_fragment = len(sequence) <= args.threshold
        destination = short_sequences if is_fragment else long_sequences
        destination[name] = sequence

    short_sequences.write_to_path("%s.frag.fas" % args.output)
    long_sequences.write_to_path("%s.full.fas" % args.output)
def read_and_divide_fragments(self, chunks, extra_frags={}):
    """Load the fragment file, add extra fragments and split into chunks.

    Args:
        chunks: number of chunks passed to ``divide_to_equal_chunks``.
        extra_frags: mapping name -> sequence of additional fragments;
            dashes are stripped from each sequence before it is stored.
            The mapping is only read, never modified.

    Returns:
        list with one entry per chunk: the temp-file path the chunk was
        written to, or ``None`` for a chunk that is empty.

    Raises:
        ValueError: if any fragment name also occurs in the reference
            subalignment.
    """
    chunk_limit = self.options.max_chunk_size
    _LOG.debug(
        "start reading fragment files and breaking to at least %s chunks but at most %s sequences "
        % (str(chunks), str(chunk_limit)))

    root = self.root_problem
    root.fragments = MutableAlignment()
    root.fragments.read_file_object(self.options.fragment_file)

    # test if input fragment names might collide with reference names.
    # code contribution by Stefan Janssen (June 13th, 2018)
    ids_reference = set(root.subalignment.keys())
    ids_inputfragments = set(root.fragments.keys())
    ids_overlap = ids_reference & ids_inputfragments
    if ids_overlap:
        message = (
            "Your input fragment file contains %i sequences, whose names "
            "overlap with names in your reference. Please rename your inp"
            "ut fragments and re-start. Duplicate names are:\n '%s'")
        raise ValueError(
            message % (len(ids_overlap), "'\n '".join(ids_overlap)))

    for (name, sequence) in extra_frags.items():
        root.fragments[name] = sequence.replace("-", "")

    alg_chunks = root.fragments.divide_to_equal_chunks(chunks, chunk_limit)

    chunk_files = []
    for index, chunk in enumerate(alg_chunks):
        if chunk:
            chunk_path = get_temp_file("fragment_chunk_%d" % index,
                                       "fragment_chunks", ".fasta")
            chunk.write_to_path(chunk_path)
        else:
            # Empty chunks get no file; keep a placeholder so positions
            # in the result still line up with chunk indices.
            chunk_path = None
        chunk_files.append(chunk_path)

    _LOG.debug("fragment files read and divided.")
    return chunk_files
def figureout_fragment_subset(self):
    ''' Figure out which fragment should go to which subproblem.

    Collects, for every fragment, the "hmmsearch" bit scores reported by
    the leaf problems together with the parent (alignment) problem of
    each leaf, converts the scores into weights scaled to 1,000,000,
    keeps the highest-weighted subproblems according to
    self.alignment_threshold (always at least one) and stores the
    fragment in each selected subproblem under the name
    "<fragment>_<subproblem label>_<weight>".

    Does nothing if the "fragments.distribution.done" annotation is
    already present on the root problem; sets that annotation at the end.
    '''
    # We need to keep and check the following flag because of checkpoining scenarios (join already done before!)
    # NOTE(review): has_key / iteritems below are Python 2 dict methods --
    # assumes annotations and bitscores behave like Python 2 dicts.
    if self.root_problem.annotations.has_key(
            "fragments.distribution.done"):
        return
    # One (initially empty) list of (bit score, alignment problem) hits
    # per fragment name.
    bitscores = dict([(name, [])
                      for name in self.root_problem.fragments.keys()])
    for fragment_chunk_problem in self.root_problem.iter_leaves():
        align_problem = fragment_chunk_problem.get_parent()
        assert isinstance(align_problem, SeppProblem)
        '''For each subproblem start with an empty set of fragments, and add to them as we encounter new best hits for that subproblem'''
        if align_problem.fragments is None:
            align_problem.fragments = MutableAlignment()
        search_res = fragment_chunk_problem.get_job_result_by_name(
            "hmmsearch")
        for key in search_res.keys():
            ''' keep a list of all hits, and their bit scores'''
            # Index 1 of each search result entry is used as the bit score.
            bitscores[key].append((search_res[key][1], align_problem))
    for frag, tuplelist in bitscores.iteritems():
        ''' TODO: what to do with those that are not? 
        For now, only output warning message'''
        #TODO: Need to double check and fix the math
        if len(tuplelist) == 0:
            _LOG.warning("Fragment %s is not scored against any subset" %
                         str(frag))
            continue
        ''' convert bit scores to probabilities '''
        # Each score is capped at 1022 before exponentiation (presumably
        # to keep 2**score within float range -- confirm), normalized by
        # the sum over all hits and scaled to parts per million.
        denum = sum(math.pow(2, min(x[0], 1022)) for x in tuplelist)
        tuplelist = [((math.pow(2, min(x[0], 1022)) / denum * 1000000),
                      x[1]) for x in tuplelist]
        ''' Sort subsets by their probability'''
        # Tuples are compared, so equal weights fall back to comparing
        # the alignment problem objects themselves.
        tuplelist.sort(reverse=True)
        ''' Find enough subsets to reach the threshold '''
        # The fold walks (index, weight) pairs keeping a running total in
        # slot 1. Once the total is no longer below
        # int(1000000 * alignment_threshold), slot 1 becomes None and the
        # index in slot 0 stops advancing; that index is the slice end.
        # max(1, ...) guarantees at least one subset is selected.
        # NOTE(review): if the total never reaches the threshold the fold
        # ends on the last index, so the slice leaves out the final entry
        # -- confirm this is intended (see the TODO above).
        selected = tuplelist[0:max(
            1,
            reduce(
                lambda x, y: (x[0], None) if x[1] is None else
                (y[0], x[1] + y[1]) if x[1] < int(
                    1000000 * self.alignment_threshold) else
                (y[0], None),
                enumerate([x[0] for x in tuplelist]))[0])]
        ''' Renormalize the selected list to add up to 1'''
        renorm = 0
        for (prob, align_problem) in selected:
            renorm = renorm + prob / 1000000
        renorm = 1 / renorm
        _LOG.debug("Fragment %s assigned to %d subsets" %
                   (frag, len(selected)))
        ''' Rename the fragment and assign it to the respective subsets'''
        for (prob, align_problem) in selected:
            # The name suffix is the renormalized weight when the
            # weight_placement_by_alignment option equals "true"
            # (case-insensitive), otherwise the constant 1000000; "%d"
            # below truncates it to an integer.
            postfix = prob * renorm if options(
            ).exhaustive.weight_placement_by_alignment.lower(
            ) == "true" else 1000000
            frag_rename = "%s_%s_%d" % (frag, align_problem.label, postfix)
            align_problem.fragments[
                frag_rename] = self.root_problem.fragments[frag]
    # Mark the distribution as done so a later call returns early.
    self.root_problem.annotations["fragments.distribution.done"] = 1
def read_and_divide_fragments(self, chunks, extra_frags={}):
    """Load the fragment file, validate names and split into chunks.

    Args:
        chunks: number of chunks passed to ``divide_to_equal_chunks``.
        extra_frags: mapping name -> sequence of additional fragments;
            dashes are stripped from each sequence before it is stored.
            The mapping is only read, never modified.

    Returns:
        list with one entry per chunk: the temp-file path the chunk was
        written to, or ``None`` for a chunk that is empty.

    Raises:
        ValueError: if fragment names collide with reference names and
            ``options.ignore_overlap`` is not set, or if any fragment
            name contains a space or a tab.
    """
    chunk_limit = self.options.max_chunk_size
    _LOG.debug(
        ("start reading fragment files and breaking to at least %s chunks"
         " but at most %s sequences ") % (str(chunks), str(chunk_limit)))

    root = self.root_problem
    root.fragments = MutableAlignment()
    root.fragments.read_file_object(self.options.fragment_file)

    # test if input fragment names might collide with reference names.
    # code contribution by Stefan Janssen (June 13th, 2018)
    ids_reference = set(root.subalignment.keys())
    ids_inputfragments = set(root.fragments.keys())
    ids_overlap = ids_reference & ids_inputfragments
    if ids_overlap:
        if not self.options.ignore_overlap:
            collision_message = (
                "Your input fragment file contains %i sequences, whose names "
                "overlap with names in your reference. Please rename your inp"
                "ut fragments and re-start. Duplicate names are:\n '%s'")
            raise ValueError(
                collision_message
                % (len(ids_overlap), "'\n '".join(ids_overlap)))
        # Overlaps are tolerated: keep only the fragments whose names do
        # not occur in the reference.
        _LOG.debug("Ignoring following %i query sequences present "
                   "in the backbone: \n '%s'"
                   % (len(ids_overlap), "' , '".join(ids_overlap)))
        root.fragments = root.fragments.get_soft_sub_alignment(
            ids_inputfragments - ids_reference)

    # test if input fragment names contain whitespaces / tabs which would
    # cause hmmsearch to fail.
    # code contribution by Stefan Janssen (June 22nd, 2018)
    ids_inputfragments_spaces = []
    for id_ in ids_inputfragments:
        if (' ' in id_) or ('\t' in id_):
            ids_inputfragments_spaces.append(id_)
    if ids_inputfragments_spaces:
        whitespace_message = (
            "Your input fragment file contains %i sequences, whose names "
            "contain either whitespaces: ' ' or tabulator '\\t' symbols. "
            "Please rename your input fragments and re-start. Affected "
            "names are:\n '%s'")
        raise ValueError(
            whitespace_message
            % (len(ids_inputfragments_spaces),
               "'\n '".join(ids_inputfragments_spaces)))

    for (name, sequence) in extra_frags.items():
        root.fragments[name] = sequence.replace("-", "")

    alg_chunks = root.fragments.divide_to_equal_chunks(chunks, chunk_limit)

    chunk_files = []
    for index, chunk in enumerate(alg_chunks):
        if chunk:
            chunk_path = get_temp_file("fragment_chunk_%d" % index,
                                       "fragment_chunks", ".fasta")
            chunk.write_to_path(chunk_path)
        else:
            # Empty chunks get no file; keep a placeholder so positions
            # in the result still line up with chunk indices.
            chunk_path = None
        chunk_files.append(chunk_path)

    _LOG.debug("fragment files read and divided.")
    return chunk_files
def generate_backbone(self):
    """Build a backbone alignment and tree from the input sequences.

    A random sample of ``backbone_size`` sequences (default: 20% of the
    input, capped at 100) is aligned with ``SateAlignJob``; the
    remaining sequences are written to a temp file that becomes the
    fragment file. On return the options' ``alignment_file`` and
    ``tree_file`` hold open handles on ``<outdir>/sate.fasta`` and
    ``<outdir>/sate.fasttree``, and ``placement_size`` equals
    ``backbone_size``.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)

    if options().backbone_size is None:
        options().backbone_size = min(
            100, int(.20 * sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))

    # random.sample needs a sequence; on Python 3 ``keys()`` is a view,
    # so materialize it first (a no-op copy on Python 2).
    backbone_names = random.sample(list(sequences.keys()),
                                   options().backbone_size)
    backbone_sequences = sequences.get_hard_sub_alignment(backbone_names)
    # Whatever is left in ``sequences`` afterwards is the query set.
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing query and backbone set. ")
    query = get_temp_file("query", "backbone", ".fas")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(sequences, query)
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating sate backbone alignment and tree. ")
    satealignJob = SateAlignJob()
    moleculeType = options().molecule
    # The alignment job is given 'protein' where the option says 'amino'.
    if options().molecule == 'amino':
        moleculeType = 'protein'
    satealignJob.setup(backbone, options().backbone_size,
                       self.options.outdir, moleculeType, options().cpu)
    satealignJob.run()
    satealignJob.read_results()

    options().placement_size = self.options.backbone_size
    # These handles are stored in the options on purpose and are
    # therefore not closed here.
    options().alignment_file = open(self.options.outdir + "/sate.fasta")
    options().tree_file = open(self.options.outdir + "/sate.fasttree")
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))
    options().fragment_file = query
def testReadOnlySubAlignment(self):
    """Exercise ``ReadonlySubalignment`` over three named sequences.

    Checks key/name consistency, that item assignment raises
    ``TypeError``, the gap status of two sites, writing to
    ``self.fp_dummy1`` and that the mutable copy has no all-gap column
    after ``delete_all_gap``.
    """
    alg = MutableAlignment()
    alg.read_filepath(get_data_path("mock/pyrg/sate.fasta"))
    subset = ['NC_008701_720717_722309',
              'NC_013156_149033_150643',
              'NC_013887_802739_801129']
    readonly_subalignment = ReadonlySubalignment(subset, alg)

    assert len(readonly_subalignment) == 3, len(readonly_subalignment)
    assert set(readonly_subalignment.keys()) == set(
        readonly_subalignment.get_sequence_names()) == set(subset), \
        "Subalignment keys not matching given keys %s vs %s" % (
            list(readonly_subalignment.keys()), subset)
    for (k, s) in list(readonly_subalignment.items()):
        assert k in subset, \
            "%s not found in subset but returned by subalignment" % k
        # The message used '%k', an invalid conversion that raised
        # ValueError instead of reporting the mismatch.
        assert s == alg[k], \
            "sequence associated with %s not matching parent alignment" % k

    # Assignment must be rejected with TypeError; reaching the assert
    # means the write went through.
    try:
        readonly_subalignment[2] = "ACGT"
        assert False, "Readonly alignment is successfully modified. "
    except TypeError:
        pass

    assert readonly_subalignment.get_length() == alg.get_length(), \
        "alignment length should not change"
    assert readonly_subalignment.is_aligned() is True
    assert readonly_subalignment.is_all_gap(2) is True, \
        "Site 2 should be all gaps"
    # The message previously named site 100 although site 150 is tested.
    assert readonly_subalignment.is_all_gap(150) is False, \
        "Site 150 should not be all gaps"

    readonly_subalignment.write_to_path(
        self.fp_dummy1)  # "mock/pyrg/sate.sub.fasta"
    mutable_subalignment = readonly_subalignment.get_mutable_alignment()
    mutable_subalignment.delete_all_gap()
    assert all([not mutable_subalignment.is_all_gap(i)
                for i in range(0, mutable_subalignment.get_length())])
def testReadOnlySubAlignment(self):
    """Exercise ``ReadonlySubalignment`` over sequences 9..11 of the mock
    alignment.

    Written so that it parses and runs on both Python 2 and Python 3:
    ``print`` is called with a single argument, ``keys()`` is
    materialized before slicing, keys are compared as sets and
    ``range`` replaces ``xrange``.
    """
    print("======= starting testReadOnlySubAlignment =========")
    alg = MutableAlignment()
    alg.read_filepath("data/mock/pyrg/sate.fasta")
    # ``keys()`` is not sliceable on Python 3, so copy it to a list.
    subset = list(alg.keys())[9:12]
    readonly_subalignment = ReadonlySubalignment(subset, alg)
    print("subalignment is:\n\n" + str(readonly_subalignment))

    assert len(readonly_subalignment) == 3, len(readonly_subalignment)
    # Compare as sets: key views never equal lists on Python 3 and the
    # order of keys is not what is being tested.
    assert set(readonly_subalignment.keys()) == set(
        readonly_subalignment.get_sequence_names()) == set(subset), \
        "Subalignment keys not matching given keys %s vs %s" % (
            list(readonly_subalignment.keys()), subset)
    for (k, s) in list(readonly_subalignment.items()):
        assert k in subset, \
            "%s not found in subset but returned by subalignment" % k
        # The message used '%k', an invalid conversion that raised
        # ValueError instead of reporting the mismatch.
        assert s == alg[k], \
            "sequence associated with %s not matching parent alignment" % k

    # Assignment must be rejected with TypeError; reaching the assert
    # means the write went through.
    try:
        readonly_subalignment[2] = "ACGT"
        assert False, "Readonly alignment is successfully modified. "
    except TypeError:
        pass

    assert readonly_subalignment.get_length() == alg.get_length(), \
        "alignment length should not change"
    assert readonly_subalignment.is_aligned() == True
    assert readonly_subalignment.is_all_gap(2) == True, \
        "Site 2 should be all gaps"
    # The message previously named site 100 although site 150 is tested.
    assert readonly_subalignment.is_all_gap(150) == False, \
        "Site 150 should not be all gaps"

    readonly_subalignment.write_to_path("data/mock/pyrg/sate.sub.fasta")
    mutable_subalignment = readonly_subalignment.get_mutable_alignment()
    mutable_subalignment.delete_all_gap()
    assert all([not mutable_subalignment.is_all_gap(i)
                for i in range(0, mutable_subalignment.get_length())])
    print("======= finishing testReadOnlySubAlignment =========")
def check_options(self):
    """Validate the input-file options and fill in derived size options.

    Accepted combinations are (tree + alignment + sequences), in which
    case the sequence file is used as the fragment file, or (sequences
    only), in which case ``generate_backbone`` is invoked. Any other
    combination is logged as an error and the process exits with -1.

    The backbone alignment is then read to determine its number of taxa,
    which becomes the default for ``backbone_size``; ``placement_size``
    defaults to ``backbone_size``.

    Returns:
        Whatever ``ExhaustiveAlgorithm.check_options`` returns.

    Raises:
        AssertionError: if a user-supplied ``backbone_size`` differs from
            the number of taxa in the backbone alignment.
    """
    options().info_file = "A_dummy_value"

    has_tree = options().tree_file is not None
    has_alignment = options().alignment_file is not None
    has_sequences = options().sequence_file is not None

    # Check to see if tree/alignment/fragment file provided, if not,
    # generate it from sequence file
    if has_tree and has_alignment and has_sequences:
        options().fragment_file = options().sequence_file
    elif not has_tree and not has_alignment and has_sequences:
        self.generate_backbone()
    else:
        _LOG.error("Either specify the backbone alignment and tree and query sequences or only the query sequences. Any other combination is invalid")
        exit(-1)

    sequences = MutableAlignment()
    # Read through a fresh handle opened by name and close it once the
    # taxa have been counted instead of leaking it.
    with open(self.options.alignment_file.name) as backbone_file:
        sequences.read_file_object(backbone_file)
        backbone_size = sequences.get_num_taxa()

    if options().backbone_size is None:
        options().backbone_size = backbone_size
    assert options().backbone_size == backbone_size, (
        "Backbone parameter needs to match actual size of backbone; backbone parameter:%s backbone_size:%s"
        % (options().backbone_size, backbone_size))
    if options().placement_size is None:
        options().placement_size = options().backbone_size
    return ExhaustiveAlgorithm.check_options(self)
def output_results(self):
    """Write the extended alignment and its derived files.

    Written, in order: the unmasked alignment, the insertion-column
    index file, optionally the backtranslated alignment (unmasked, then
    masked) and finally the masked alignment. ``self.results`` has its
    insertion columns removed as a side effect.

    A failure inside ``backtranslate`` is logged as a warning and the
    backtranslated files are skipped; errors while writing them are not
    caught.
    """
    _LOG.info("Generating output. ")
    alignment = self.results

    unmasked_path = self.get_output_filename("alignment.fasta")
    alignment.write_to_path(unmasked_path)
    _LOG.info("Unmasked alignment written to %s" % unmasked_path)

    columns_path = self.get_output_filename("insertion_columns.txt")
    alignment.write_insertion_column_indexes(columns_path)
    _LOG.info("The index of insertion columns written to %s" % columns_path)

    if self.options.backtranslation_sequence_file:
        bt_path = self.get_output_filename("backtranslated_alignment.fasta")
        dna_sequences = MutableAlignment()
        dna_sequences.read_file_object(
            self.options.backtranslation_sequence_file)
        try:
            bt_alignment = backtranslate(self.results, dna_sequences)
        except Exception as error:
            # Best effort: the remaining outputs are still produced.
            _LOG.warning(
                "Backtranslation failed due to following error: "
                + str(error)
                + ".\nNo translated DNA sequence will be written to a file.")
        else:
            bt_alignment.write_to_path(bt_path)
            _LOG.info("Backtranslated alignment written to %s" % bt_path)

            bt_alignment.remove_insertion_columns()
            bt_masked_path = self.get_output_filename(
                "backtranslated_alignment_masked.fasta")
            bt_alignment.write_to_path(bt_masked_path)
            _LOG.info("Backtranslated masked alignment written to %s"
                      % bt_masked_path)

    alignment.remove_insertion_columns()
    masked_path = self.get_output_filename("alignment_masked.fasta")
    alignment.write_to_path(masked_path)
    _LOG.info("Masked alignment written to %s" % masked_path)
def testAlignmentReadFasta(self):
    """Read the mock FASTA alignment, print it and check size/columns."""
    print("====== starting testAlignmentReadFasta ==========")
    alignment = MutableAlignment()
    alignment.read_filepath("data/mock/pyrg/sate.fasta")
    print("Maing alignment is:\n\n", alignment)

    num_sequences = len(alignment)
    assert num_sequences == 65, \
        "MutableAlignment length is %s" % num_sequences

    # Collect the per-site verdicts first, then require that none of the
    # columns is made up of gaps only.
    gap_only_flags = [alignment.is_all_gap(site)
                      for site in range(alignment.get_length())]
    assert not any(gap_only_flags)
def read_and_divide_fragments(self, chunks, extra_frags={}):
    """Load the fragment file, add extra fragments and split into chunks.

    Uses ``items()`` and ``range`` rather than the Python-2-only
    ``iteritems()`` and ``xrange`` so the method runs on both Python 2
    and Python 3.

    Args:
        chunks: number of chunks to produce; exactly this many chunk
            files are written.
        extra_frags: mapping name -> sequence of additional fragments;
            dashes are stripped from each sequence before it is stored.
            The mapping is only read, never modified.

    Returns:
        list of ``chunks`` temp-file paths, one per written chunk.
    """
    _LOG.debug("start reading fragment files and breaking to chunks: %d"
               % chunks)
    self.root_problem.fragments = MutableAlignment()
    self.root_problem.fragments.read_file_object(
        self.options.fragment_file)

    for (k, v) in extra_frags.items():
        self.root_problem.fragments[k] = v.replace("-", "")

    alg_chunks = self.root_problem.fragments.divide_to_equal_chunks(chunks)
    ret = []
    for i in range(0, chunks):
        temp_file = get_temp_file("fragment_chunk_%d" % i,
                                  "fragment_chunks", ".fasta")
        alg_chunks[i].write_to_path(temp_file)
        ret.append(temp_file)
    _LOG.debug("fragment files read and divided.")
    return ret
def check_options(self):
    """Validate the input-file options and fill in derived size options.

    Accepted combinations are (tree + alignment + sequences), in which
    case the sequence file is used as the fragment file, or (sequences
    only), in which case ``generate_backbone`` is invoked. Any other
    combination is logged as an error and the process exits with -1.

    Defaults derived from the backbone alignment:
      * ``backbone_size``: number of taxa in the backbone.
      * ``placement_size``: ``backbone_size``.
      * ``alignment_size``: 10 for amino-acid input; otherwise 10, or,
        when the average p-distance exceeds 0.60, the largest value
        obtained by doubling 10 while the doubled value stays below the
        number of taxa.

    Returns:
        Whatever ``ExhaustiveAlgorithm.check_options`` returns.

    Raises:
        AssertionError: if a user-supplied ``backbone_size`` differs from
            the number of taxa in the backbone alignment.
    """
    options().info_file = "A_dummy_value"

    has_tree = options().tree_file is not None
    has_alignment = options().alignment_file is not None
    has_sequences = options().sequence_file is not None

    # Check to see if tree/alignment/fragment file provided, if not,
    # generate it from sequence file
    if has_tree and has_alignment and has_sequences:
        options().fragment_file = options().sequence_file
    elif not has_tree and not has_alignment and has_sequences:
        self.generate_backbone()
    else:
        _LOG.error("Either specify the backbone alignment and tree and query sequences or only the query sequences. Any other combination is invalid")
        exit(-1)

    sequences = MutableAlignment()
    # Read through a fresh handle opened by name and close it once the
    # taxa have been counted instead of leaking it.
    with open(self.options.alignment_file.name) as backbone_file:
        sequences.read_file_object(backbone_file)
        backbone_size = sequences.get_num_taxa()

    if options().backbone_size is None:
        options().backbone_size = backbone_size
    assert options().backbone_size == backbone_size, (
        "Backbone parameter needs to match actual size of backbone; backbone parameter:%s backbone_size:%s"
        % (options().backbone_size, backbone_size))
    if options().placement_size is None:
        options().placement_size = options().backbone_size

    if options().alignment_size is None:
        _LOG.info(
            "Alignment subset size not given. Calculating subset size. ")
        if options().molecule == 'amino':
            _LOG.warning("Automated alignment subset selection not implemented for protein alignment. Setting to 10.")
            options().alignment_size = 10
        else:
            # ``sequences`` already holds the backbone alignment read
            # above, so it is reused instead of parsing the same file a
            # second time through another unclosed handle.
            (averagep, _maxp) = sequences.get_p_distance()
            align_size = 10
            if averagep > .60:
                while align_size * 2 < sequences.get_num_taxa():
                    align_size = align_size * 2
            # '%0.2f' replaces the mistyped '%f0.2', which printed the
            # full float followed by a literal "0.2".
            _LOG.info("Average p-distance of backbone is %0.2f. Alignment subset size set to %d. " % (averagep, align_size))
            options().alignment_size = align_size
    return ExhaustiveAlgorithm.check_options(self)
def testReadOnlySubAlignment(self):
    """Exercise ``ReadonlySubalignment`` over sequences 9..11 of the mock
    alignment.

    Checks key/name consistency, that item assignment raises
    ``TypeError``, the gap status of two sites, writing the subalignment
    to disk and that the mutable copy has no all-gap column after
    ``delete_all_gap``.
    """
    print("======= starting testReadOnlySubAlignment =========")
    alg = MutableAlignment()
    alg.read_filepath("data/mock/pyrg/sate.fasta")
    subset = list(alg.keys())[9:12]
    readonly_subalignment = ReadonlySubalignment(subset, alg)
    print("subalignment is:\n\n", readonly_subalignment)

    assert len(readonly_subalignment) == 3, len(readonly_subalignment)
    assert set(readonly_subalignment.keys()) == set(
        readonly_subalignment.get_sequence_names()) == set(subset), \
        "Subalignment keys not matching given keys %s vs %s" % (
            list(readonly_subalignment.keys()), subset)
    for (k, s) in list(readonly_subalignment.items()):
        assert k in subset, \
            "%s not found in subset but returned by subalignment" % k
        # The message used '%k', an invalid conversion that raised
        # ValueError instead of reporting the mismatch.
        assert s == alg[k], \
            "sequence associated with %s not matching parent alignment" % k

    # Assignment must be rejected with TypeError; reaching the assert
    # means the write went through.
    try:
        readonly_subalignment[2] = "ACGT"
        assert False, "Readonly alignment is successfully modified. "
    except TypeError:
        pass

    assert readonly_subalignment.get_length() == alg.get_length(), \
        "alignment length should not change"
    assert readonly_subalignment.is_aligned() is True
    assert readonly_subalignment.is_all_gap(2) is True, \
        "Site 2 should be all gaps"
    # The message previously named site 100 although site 150 is tested.
    assert readonly_subalignment.is_all_gap(150) is False, \
        "Site 150 should not be all gaps"

    readonly_subalignment.write_to_path("data/mock/pyrg/sate.sub.fasta")
    mutable_subalignment = readonly_subalignment.get_mutable_alignment()
    mutable_subalignment.delete_all_gap()
    assert all([
        not mutable_subalignment.is_all_gap(i)
        for i in range(0, mutable_subalignment.get_length())
    ])
    print("======= finishing testReadOnlySubAlignment =========")
def generate_backbone(self):
    """Build a backbone alignment and tree from the input sequences.

    A random sample of ``backbone_size`` sequences (default: 20% of the
    input, capped at 100) is aligned with ``SateAlignJob``; the
    remaining sequences are written to a temp file that becomes the
    fragment file. On return the options' ``alignment_file`` and
    ``tree_file`` hold open handles on ``<outdir>/sate.fasta`` and
    ``<outdir>/sate.fasttree``, and ``placement_size`` equals
    ``backbone_size``.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)

    if options().backbone_size is None:
        options().backbone_size = min(
            100, int(.20 * sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))

    # random.sample requires a sequence; ``keys()`` is only a view on
    # Python 3, so it is copied into a list first.
    candidate_names = list(sequences.keys())
    backbone_sequences = sequences.get_hard_sub_alignment(
        random.sample(candidate_names, options().backbone_size))
    # Remove the sampled sequences; the remainder is the query set.
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing query and backbone set. ")
    query = get_temp_file("query", "backbone", ".fas")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(sequences, query)
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating sate backbone alignment and tree. ")
    satealignJob = SateAlignJob()
    moleculeType = options().molecule
    # The alignment job is given 'protein' where the option says 'amino'.
    if options().molecule == 'amino':
        moleculeType = 'protein'
    satealignJob.setup(backbone, options().backbone_size,
                       self.options.outdir, moleculeType, options().cpu)
    satealignJob.run()
    satealignJob.read_results()

    options().placement_size = self.options.backbone_size
    # These handles are stored in the options on purpose and are
    # therefore not closed here.
    options().alignment_file = open(self.options.outdir + "/sate.fasta")
    options().tree_file = open(self.options.outdir + "/sate.fasttree")
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s" %
        (options().alignment_file, options().tree_file))
    options().fragment_file = query
def hmmer_to_markers(input, temp_dir):
    """Bin fragments to the best-scoring gene of the reference package.

    Every fragment and its reversed counterpart (named ``<name>_rev``) is
    written to ``<temp_dir>/frags.fas`` and searched against the HMM of
    each gene listed in ``refpkg["genes"]``. For every fragment the
    gene/direction with the highest bit score wins; fragments without
    any hit are dropped.

    Args:
        input: path of the fragment file to read.
        temp_dir: directory that receives the intermediate files.

    Returns:
        dict mapping gene name -> ``{"file": path, "nfrags": count}``
        where ``path`` is ``<temp_dir>/<gene>.frags.fas.fixed`` and
        ``count`` is the number of fragments binned to that gene.
    """
    global refpkg

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reversed_frags = {}
    for (frag_name, frag_seq) in fragments.items():
        reversed_frags[frag_name + '_rev'] = reverse_sequence(frag_seq)

    both_directions = MutableAlignment()
    both_directions.set_alignment(fragments)
    both_directions.set_alignment(reversed_frags)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(both_directions, frag_file)

    # Best hit so far per fragment: [bit score, gene, direction].
    best_hits = {}
    for frag_name in fragments.keys():
        best_hits[frag_name] = [-10000, 'NA', 'NA']

    for gene in refpkg["genes"]:
        # Now run HMMER search
        search_output = temp_dir + '/' + gene + ".out"
        hmmer_search(frag_file, refpkg[gene]["hmm"], search_output)
        hits = read_hmmsearch_results(search_output)

        # Now select best direction for each frag
        for hit_name, hit in hits.items():
            score = hit[1]
            if '_rev' in hit_name:
                original_name = hit_name.replace('_rev', '')
                orientation = 'reverse'
            else:
                original_name = hit_name
                orientation = 'forward'
            if best_hits[original_name][0] < score:
                best_hits[original_name] = [score, gene, orientation]

    # Now bin the fragments
    genes = dict([])
    for frag_name, best in best_hits.items():
        best_gene, orientation = best[1], best[2]
        gene_bin = genes.setdefault(best_gene, {})
        if orientation == 'forward':
            gene_bin[frag_name] = fragments[frag_name]
        else:
            gene_bin[frag_name] = reverse_sequence(fragments[frag_name])
    # Fragments that never scored keep the placeholder gene 'NA'.
    genes.pop("NA", None)

    for gene, gene_frags in genes.items():
        _write_fasta(gene_frags, temp_dir + '/' + gene + ".frags.fas.fixed")

    binned_fragments = {}
    for gene, gene_frags in genes.items():
        binned_fragments[gene] = {
            "file": temp_dir + '/' + gene + ".frags.fas.fixed",
            "nfrags": len(gene_frags.keys()),
        }
    return binned_fragments
def generate_backbone(self):
    """Select a backbone from the input sequences and align it with PASTA.

    Steps:
      1. Read and degap ``self.options.sequence_file``.
      2. If ``options().median_full_length`` is set, move every sequence
         whose length falls outside
         ``median * (1 -/+ options().backbone_threshold)`` into the
         fragment set; a value of ``-1`` means "use the median length of
         the input".
      3. Sample ``options().backbone_size`` of the remaining sequences
         (default ``min(1000, number of sequences)``, clamped to what is
         available) and run ``PastaAlignJob`` on them.
      4. Copy the job's alignment and tree to the output location and store
         open handles to them in ``options().alignment_file`` and
         ``options().tree_file``.
      5. Write the non-backbone sequences plus the fragments as the query
         file (``options().fragment_file``).  If there is nothing left to
         align, the backbone alignment is output as the final result and
         the process exits with status 0.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    sequences.degap()
    fragments = MutableAlignment()
    if (options().median_full_length is not None):
        if (options().median_full_length == -1):
            seq_lengths = sorted(len(seq) for seq in sequences.values())
            lengths = len(seq_lengths)
            l2 = lengths // 2
            # Odd count: the middle element is the median.  Even count:
            # mean of the two central elements.  (The branches used to be
            # the wrong way round and indexed one past the middle.)
            if lengths % 2:
                options().median_full_length = seq_lengths[l2]
            else:
                options().median_full_length = (
                    seq_lengths[l2 - 1] + seq_lengths[l2]) / 2.0

        (min_length, max_length) = (
            int(options().median_full_length * (
                1 - options().backbone_threshold)),
            int(options().median_full_length * (
                1 + options().backbone_threshold)))
        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length or
            len(sequences[name]) < min_length]
        if (len(frag_names) > 0):
            _LOG.info(
                "Detected %d fragmentary sequences" % len(frag_names))
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in list(fragments.keys()):
                sequences.pop(name)

    if (options().backbone_size is None):
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))
    if (options().backbone_size > len(list(sequences.keys()))):
        options().backbone_size = len(list(sequences.keys()))

    # Sampling from the sorted names makes the draw depend only on the
    # random state, not on the container's iteration order.
    sample = sorted(random.sample(
        sorted(sequences.keys()), options().backbone_size))
    backbone_sequences = sequences.get_hard_sub_alignment(sample)
    _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if (options().molecule == 'amino'):
        # The job is handed 'protein' where the options say 'amino'.
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size,
                        moleculeType, options().cpu)
    pastaalignJob.run()
    (a_file, t_file) = pastaalignJob.read_results()

    shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
    shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

    options().placement_size = self.options.backbone_size
    # These handles are stored on the options object for later stages, so
    # they are intentionally left open here.
    options().alignment_file = open(
        self.get_output_filename("pasta.fasta"))
    options().tree_file = open(self.get_output_filename("pasta.fasttree"))
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))

    # Fragments set aside earlier are aligned as queries as well.
    sequences.set_alignment(fragments)
    if (len(sequences) == 0):
        # Everything ended up in the backbone: its alignment is the result.
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as backbone_handle:
            sequences.read_file_object(backbone_handle)
        self.results = ExtendedAlignment(fragment_names=[])
        self.results.set_alignment(sequences)
        _LOG.info(
            "No query sequences to align.  Final alignment saved as %s"
            % self.get_output_filename("alignment.fasta"))
        self.output_results()
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)
def main():
    """Split the input sequences into a fragment file and a full-length file.

    Sequences no longer than ``args.threshold`` are written to
    ``<output>.frag.fas``; all others go to ``<output>.full.fas``.
    """
    args = parse_args()
    sequences = MutableAlignment()
    assert os.path.isfile(args.input) and os.access(args.input, os.R_OK), \
        "Input file %s does not exist\n" % args.input
    sequences.read_file_object(args.input)

    frag = MutableAlignment()
    full = MutableAlignment()
    for (key, seq) in sequences.items():
        # Route each sequence to exactly one of the two output sets.
        destination = frag if len(seq) <= args.threshold else full
        destination[key] = seq

    output_prefix = args.output
    frag.write_to_path("%s.frag.fas" % output_prefix)
    full.write_to_path("%s.full.fas" % output_prefix)
def check_options(self):
    """Validate the input combination and fill in size defaults.

    Accepted inputs are either backbone tree + backbone alignment + query
    sequences, or query sequences only (in which case the backbone is
    generated via ``self.generate_backbone()``).  Any other combination is
    logged as an error and the process exits with status -1.

    Defaults filled in on ``options()``:
      * ``backbone_size``: number of taxa in the backbone alignment (an
        explicit value must match it),
      * ``placement_size``: ``backbone_size``,
      * ``alignment_size``: 10, doubled while ``2 * size`` stays below the
        number of backbone taxa when the average p-distance exceeds 0.60;
        always 10 for amino-acid data.

    Returns:
        Whatever ``ExhaustiveAlgorithm.check_options(self)`` returns.
    """
    options().info_file = "A_dummy_value"

    # Check to see if tree/alignment/fragment file provided, if not,
    # generate it from sequence file
    have_tree = options().tree_file is not None
    have_alignment = options().alignment_file is not None
    have_sequences = options().sequence_file is not None
    if have_tree and have_alignment and have_sequences:
        options().fragment_file = options().sequence_file
    elif not have_tree and not have_alignment and have_sequences:
        self.generate_backbone()
    else:
        _LOG.error(
            "Either specify the backbone alignment and tree and query sequences or only the query sequences.  Any other combination is invalid"
        )
        exit(-1)

    # Load the backbone once and close the handle right away; the same
    # alignment is reused below for the p-distance computation.
    sequences = MutableAlignment()
    with open(self.options.alignment_file.name) as backbone_handle:
        sequences.read_file_object(backbone_handle)
    backbone_size = sequences.get_num_taxa()
    if options().backbone_size is None:
        options().backbone_size = backbone_size
    assert options().backbone_size == backbone_size, (
        "Backbone parameter needs to match actual size of backbone; backbone parameter:%s backbone_size:%s"
        % (options().backbone_size, backbone_size))
    if options().placement_size is None:
        options().placement_size = options().backbone_size

    if options().alignment_size is None:
        _LOG.info(
            "Alignment subset size not given.  Calculating subset size. ")
        if (options().molecule == 'amino'):
            _LOG.warning(
                "Automated alignment subset selection not implemented for protein alignment.  Setting to 10."
            )
            options().alignment_size = 10
        else:
            (averagep, _maxp) = sequences.get_p_distance()
            align_size = 10
            if (averagep > .60):
                while (align_size * 2 < sequences.get_num_taxa()):
                    align_size = align_size * 2
            # '%.2f' prints two decimals; the former '%f0.2' printed the
            # full float followed by a literal "0.2".
            _LOG.info(
                "Average p-distance of backbone is %.2f.  Alignment subset size set to %d. "
                % (averagep, align_size))
            options().alignment_size = align_size
    return ExhaustiveAlgorithm.check_options(self)
# Make sure to do this before the last line relabeling columns, since that's usually the line that errors. # backbone alignment original_backbone_file = ( '/Users/gillianchu/warnow/bin/gitrepos/smirarab-sepp-17a33aa/trial/orig_backbone.txt' ) new_backbone_file = "new_backbone_file.txt" with open(original_backbone_file, "r") as reader: with open(new_backbone_file, "w+") as writer: for line in reader.readlines(): if line[0] == ">": writer.write(line.upper()) else: writer.write(line) original_backbone = MutableAlignment() done = original_backbone.read_filepath(new_backbone_file) # all query sequences original_frag_file = ( '/Users/gillianchu/warnow/bin/gitrepos/smirarab-sepp-17a33aa/trial/all_query.txt' ) original_frag = MutableAlignment() done = original_frag.read_filepath(original_frag_file) # First build extended alignment on entire fragment set extendedAlignment = ExtendedAlignment(original_frag.get_sequence_names()) dir = '/Users/gillianchu/warnow/bin/gitrepos/smirarab-sepp-17a33aa/trial/' for a in [1, 2]: a = str(a)
def blast_to_markers(input, temp_dir):
    """Bin fragments to marker genes and orient each one with HMMER.

    Binning: when ``options().gene`` is set, every fragment goes to that
    gene.  Otherwise the fragments are binned from a BLAST result file:
    ``options().blast_file`` if given, else ``<temp_dir>/blast.out`` produced
    by ``blast_fragments``.

    Orientation: for every gene, the fragments and their reverse copies
    (suffix ``_rev``) are searched against the gene's HMM; a fragment is
    replaced by its reverse copy when that copy scores strictly higher.

    Returns:
        dict mapping gene -> {fragment name -> oriented sequence}.  The
        oriented fragments are also written to
        ``<temp_dir>/<gene>.frags.fas.fixed``.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if (options().gene is not None):
        gene_binning = {options().gene: list(fragments.keys())}
    else:
        # First blast sequences against all markers
        if (options().blast_file is None):
            blast_results = temp_dir + "/blast.out"
            print("Blasting fragments against marker dataset\n")
            blast_fragments(input, blast_results)
        else:
            blast_results = options().blast_file
        # Next bin the blast hits to the best gene
        gene_binning = bin_blast_results(blast_results)

    # Now figure out direction of fragments
    binned_fragments = {
        gene: {seq_name: fragments[seq_name] for seq_name in members}
        for (gene, members) in gene_binning.items()
    }
    print("Finding best orientation of reads\n")

    align_name = 'pasta' if (options().genes == 'cogs') else 'sate'
    no_hit = -10000

    for (gene, frags) in binned_fragments.items():
        # Add reverse complement sequence
        frags_rev = {name + '_rev': reverse_sequence(seq)
                     for (name, seq) in frags.items()}
        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(frags_rev)
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_frags, gene_file)

        # Now run HMMER search
        hmm_path = os.path.join(
            options().__getattribute__('reference').path,
            'refpkg/%s.refpkg/%s.hmm' % (gene, align_name))
        search_output = temp_dir + "/%s.out" % gene
        hmmer_search(gene_file, hmm_path, search_output)
        results = read_hmmsearch_results(search_output)

        # Now select best direction for each frag
        for key in frags:
            rev_key = key + "_rev"
            forward_score = results[key][1] if key in results else no_hit
            backward_score = (
                results[rev_key][1] if rev_key in results else no_hit)
            if (backward_score > forward_score):
                frags[key] = gene_frags[rev_key]

        # Now write to file
        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags
    return binned_fragments
def hmmer_to_markers(input, temp_dir):
    """Assign each fragment to its best-scoring gene and orientation.

    Every fragment read from ``input`` is searched, together with its
    reverse (stored under ``<name>_rev``), against the profile of each gene
    in the active gene set: ``cog_genes`` with alignment name ``pasta`` when
    ``options().genes == 'cogs'``, otherwise ``marker_genes`` with alignment
    name ``sate``.  A fragment is binned to the gene and direction with the
    highest score (``value[1]`` of the search result); fragments without
    any hit are dropped.

    Parameters:
        input: path of the fragment file, read with ``read_filepath``.
        temp_dir: directory receiving ``frags.fas``, the ``<gene>.out``
            search outputs and the ``<gene>.frags.fas.fixed`` bins.

    Returns:
        dict mapping gene -> {fragment name -> oriented sequence}.
    """
    global marker_genes
    rev_suffix = '_rev'

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = {name + rev_suffix: reverse_sequence(seq)
               for (name, seq) in fragments.items()}
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best [score, gene, direction] seen so far for each fragment.
    frag_scores = {name: [-10000, 'NA', 'NA'] for name in fragments.keys()}
    gene_set = marker_genes
    align_name = 'sate'
    if (options().genes == 'cogs'):
        gene_set = cog_genes
        align_name = 'pasta'

    for gene in gene_set:
        # Now run HMMER search
        # The profile name needs '%s'; the former '%.profile' is not a
        # valid conversion and made the formatting raise ValueError.
        profile_path = os.path.join(
            options().__getattribute__('reference').path,
            'refpkg/%s.refpkg/%s.profile' % (gene, align_name))
        search_output = temp_dir + "/%s.out" % gene
        hmmer_search(frag_file, profile_path, search_output)
        results = read_hmmsearch_results(search_output)

        # Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            # Only a trailing suffix on a name that is not itself a
            # fragment marks a reversed copy; matching '_rev' anywhere in
            # the name (and stripping every occurrence) mis-bins fragments
            # whose own name happens to contain '_rev'.
            if name not in frag_scores and name.endswith(rev_suffix):
                true_name = name[:-len(rev_suffix)]
                direction = 'reverse'
            else:
                true_name = name
                direction = 'forward'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = {}
    for name, (_score, gene, direction) in frag_scores.items():
        if gene == 'NA':
            # No hit for this fragment: it belongs to no bin.
            continue
        if direction == 'forward':
            sequence = fragments[name]
        else:
            sequence = reverse[name + rev_suffix]
        genes.setdefault(gene, {})[name] = sequence

    for gene, seq in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(seq, gene_file + ".fixed")
    return genes
def testExtendedAlignment(self):
    """Merge two extended alignments and check the result is consistent.

    The simulated backbone is split into two sub-problems (``subset`` and
    its complement), each with its own fragments.  An extended alignment is
    built for each from existing ``data/tmp/cp*.extended.sto`` files, both
    are merged, and the test verifies that the merged alignment is aligned,
    that column counts and insertion-column labels add up, and that the
    base alignment still equals the original one.
    """
    print("======= starting testExtendedAlignment =========")

    subset = [
        "SFIF", "SFII", "SCFC", "SGHD", "SDCC", "SBGE", "SFBB", "SDI",
        "SCGB", "SJGF", "SGBI", "SCJA", "SGAD", "SHEB", "SFHB", "SDJI",
        "SHED", "SJJJ", "SBBE", "SCCH", "SDJB", "SDAC", "SHEH", "SFDC",
        "SFEI", "SHHB", "SC", "SIAB", "SDDI", "SBCB", "SJB", "SEBD",
        "SFGD", "SHA", "SIDA", "SGHI", "SGIB", "SBFJ", "SFIE", "SCJF",
        "SJHJ", "SJBG", "SEJI", "SFFF", "SJ", "SIII", "SJHH", "SEIH",
        "SBDC", "SHDJ", "SJDD", "SGDB", "SIHA", "SIBB", "SECC", "SCAD",
        "SGBB", "SGIF", "SJHC", "SFCD", "SEAA", "SEFF", "SDFG", "SDJE",
        "SCFG", "SFH", "SCJ", "SDDD", "SEGD", "SCIH", "SDAG", "SCJE",
        "SFAJ", "SIDJ", "SE", "SHBC", "SJFF", "SCHD", "SBHA", "SEDF",
        "SFAF", "SEDD", "SDHD", "SGJD", "SIBH", "SGDF", "SIFA", "SJGA",
        "SIJB", "SFI", "SGA", "SBFC", "SBJA", "SFFC", "SFDH", "SFEE",
        "SBDF", "SGBJ", "SDHE", "SJIB", "SHHI", "SIDE", "SJII"
    ]

    alg = MutableAlignment()
    alg.read_filepath("data/simulated/test.fasta")
    alg.delete_all_gap()
    tlen = alg.get_length()

    frg = MutableAlignment()
    frg.read_filepath("data/simulated/test.fas")

    pp = SeppProblem(alg.keys())
    pp.fragments = frg
    pp.subalignment = alg

    cp1 = SeppProblem(subset, pp)
    cp2 = SeppProblem(list(set(alg.keys()) - set(subset)), pp)
    # Fragments are split between the two sub-problems by the last digit
    # of their name.
    cp1.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) >= 9], frg)
    cp2.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) <= 1], frg)

    cp1labels = cp1.write_subalignment_without_allgap_columns(
        "data/tmp/cp1.fasta")
    cp2labels = cp2.write_subalignment_without_allgap_columns(
        "data/tmp/cp2.fasta")
    tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta")
    assert all(
        not tmp.is_all_gap(pos) for pos in range(tmp.get_length()))
    tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta")
    assert all(
        not tmp.is_all_gap(pos) for pos in range(tmp.get_length()))

    cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
    cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")

    # We have done the hmmalign before. don't worry about that right now

    ext1 = ExtendedAlignment(cp1.fragments)
    ext1.build_extended_alignment("data/tmp/cp1.fasta",
                                  "data/tmp/cp1.extended.sto")
    ext1.relabel_original_columns(cp1labels)
    ext2 = ExtendedAlignment(cp2.fragments)
    ext2.build_extended_alignment("data/tmp/cp2.fasta",
                                  "data/tmp/cp2.extended.sto")
    ext2.relabel_original_columns(cp2labels)

    extmerger = ExtendedAlignment([])
    extmerger.merge_in(ext1)
    mixed = extmerger.merge_in(ext2)

    extmerger.write_to_path("data/tmp/extended.merged.fasta")

    assert extmerger.is_aligned(), "Merged alignment is not aligned"
    # Negative column labels are counted as insertion columns.
    in1 = len([x for x in ext1._col_labels if x < 0])
    in2 = len([x for x in ext2._col_labels if x < 0])
    print("Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % (
        extmerger.get_length(), in1, in2, tlen))
    assert (in1 + in2 + tlen - mixed) == extmerger.get_length(), (
        "Lengths don't match up after merging. Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d"
        % (extmerger.get_length(), in1, in2, tlen, mixed))
    insertion_count = len(list(extmerger.iter_insertion_columns()))
    # The message reports in1 and in2 (it used to print in1 twice).
    assert (in1 + in2 - mixed) == insertion_count, (
        "Columns are not correctly labeled after merging. Merged insertion count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d"
        % (insertion_count, in1, in2, mixed))

    tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
    tmp.delete_all_gap()
    assert tmp.is_aligned(), "merged alignment should be aligned!"
    assert tmp.get_length() == tlen, "merged alignment has wrong length"
    assert all(alg[k] == s for (k, s) in tmp.items()), \
        "merged alignment should match original alignment"

    print("======= finished testExtendedAlignment =========")
from sepp.alignment import MutableAlignment, ExtendedAlignment, _write_fasta
from sepp.exhaustive import JoinAlignJobs, ExhaustiveAlgorithm
from sepp.jobs import PplacerJob, MafftAlignJob, FastTreeJob, PastaAlignJob
from sepp.filemgr import get_temp_file
from sepp.config import options
import sepp.config
from sepp.math_utils import lcm
from sepp.problem import SeppProblem
from sepp.scheduler import JobPool
from multiprocessing import Pool, Manager
from sepp.alignment import ExtendedAlignment
import glob

job_joiner = JoinAlignJobs

# NOTE(review): all paths below are hard-coded, machine-specific locations.
# Backbone alignment.
original_backbone_file = '/projects/sate8/namphuon/ultra_large/1000000/sate.fasta'
original_backbone = MutableAlignment()
done = original_backbone.read_filepath(original_backbone_file)

# Query (fragment) sequences.
original_frag_file = '/projects/sate8/namphuon/ultra_large/1000000/initial.fas.100'
original_frag = MutableAlignment()
# `done` is rebound here; the value from the backbone read is discarded.
done = original_frag.read_filepath(original_frag_file)

# First build extended alignment on entire fragment set
extendedAlignment = ExtendedAlignment(original_frag.get_sequence_names())

# Visit the A_0_* directories in reverse glob order.
dirs = glob.glob('/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/temp/upp.1_HNlM/root/P_0/A_0_*/')
dirs.reverse()
# NOTE(review): the loop variable shadows the builtin `dir`; the name is kept
# because the loop body appears to continue outside this excerpt.
for dir in dirs:
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the print statement is a SyntaxError on Python 3.
    print("Working on %s\n" % dir)
    aligned_files = glob.glob('%sFC_*/hmmalign.results.*' % dir)
def generate_backbone(self):
    """Select a backbone from the input sequences and align it with PASTA.

    Steps:
      1. Read ``self.options.sequence_file``.
      2. If ``options().median_full_length`` is set, move every sequence
         whose length falls outside
         ``median * (1 -/+ options().backbone_threshold)`` into the
         fragment set; a value of ``-1`` means "use the median length of
         the input".
      3. Sample ``options().backbone_size`` of the remaining sequences
         (default ``min(1000, number of sequences)``, clamped to what is
         available) and run ``PastaAlignJob`` on them.
      4. Store open handles to ``<outdir>/pasta.fasta`` and
         ``<outdir>/pasta.fasttree`` in ``options().alignment_file`` and
         ``options().tree_file``.
      5. Write the non-backbone sequences plus the fragments as the query
         file (``options().fragment_file``).  If there is nothing left to
         align, the backbone alignment is copied to the final alignment
         file and the process exits with status 0.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    fragments = MutableAlignment()
    if (options().median_full_length is not None):
        if (options().median_full_length == -1):
            seq_lengths = sorted(len(seq) for seq in sequences.values())
            lengths = len(seq_lengths)
            # Floor division keeps the index an int on Python 3 as well.
            middle = lengths // 2
            # Odd count: the middle element is the median.  Even count:
            # mean of the two central elements.  (The two branches used to
            # be swapped.)
            if lengths % 2:
                options().median_full_length = seq_lengths[middle]
            else:
                options().median_full_length = (
                    seq_lengths[middle - 1] + seq_lengths[middle]) / 2.0

        (min_length, max_length) = (
            int(options().median_full_length * (
                1 - options().backbone_threshold)),
            int(options().median_full_length * (
                1 + options().backbone_threshold)))
        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length or
            len(sequences[name]) < min_length]
        if (len(frag_names) > 0):
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in list(fragments.keys()):
                sequences.pop(name)

    if (options().backbone_size is None):
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))

    # random.sample() needs a real sequence: a dict keys view is rejected
    # on Python 3.11+, so materialise the names first.
    names = list(sequences.keys())
    if (options().backbone_size > len(names)):
        options().backbone_size = len(names)
    backbone_sequences = sequences.get_hard_sub_alignment(
        random.sample(names, options().backbone_size))
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if (options().molecule == 'amino'):
        # The job is handed 'protein' where the options say 'amino'.
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size,
                        self.options.outdir, moleculeType, options().cpu)
    pastaalignJob.run()
    pastaalignJob.read_results()

    options().placement_size = self.options.backbone_size
    # These handles are stored on the options object for later stages, so
    # they are intentionally left open here.
    options().alignment_file = open(self.options.outdir + "/pasta.fasta")
    options().tree_file = open(self.options.outdir + "/pasta.fasttree")
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))

    # Fragments set aside earlier are aligned as queries as well.
    sequences.set_alignment(fragments)
    if (len(sequences) == 0):
        # Everything ended up in the backbone: its alignment is the result.
        _LOG.info(
            "No query sequences to align.  Final alignment saved as %s"
            % self.get_output_filename("alignment.fasta"))
        shutil.copyfile(self.options.outdir + "/pasta.fasta",
                        self.get_output_filename("alignment.fasta"))
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)
def generate_backbone(self):
    """Select a backbone from the input sequences and align it with PASTA.

    Steps:
      1. Read and degap ``self.options.sequence_file``.
      2. If ``options().median_full_length`` or
         ``options().full_length_range`` is set, move every sequence whose
         length is outside the full-length window into the fragment set.
         The window is the two whitespace-separated integers of
         ``full_length_range`` when given, otherwise
         ``median * (1 -/+ options().backbone_threshold)``; a median of
         ``-1`` means "use the median length of the input".
      3. Sample ``options().backbone_size`` of the remaining sequences
         (default ``min(1000, number of sequences)``, clamped to what is
         available) and run ``PastaAlignJob`` on them, forwarding the
         attributes of ``options().pasta`` as keyword arguments.
      4. Copy the job's alignment and tree to the output location and store
         open handles to them in ``options().alignment_file`` and
         ``options().tree_file``.
      5. Write the non-backbone sequences plus the fragments as the query
         file (``options().fragment_file``).  If there is nothing left to
         align, the backbone alignment is output as the final result and
         the process exits with status 0.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    sequences.degap()
    fragments = MutableAlignment()
    if (options().median_full_length is not None
            or options().full_length_range is not None):
        if (options().median_full_length == -1):
            seq_lengths = sorted(len(seq) for seq in sequences.values())
            lengths = len(seq_lengths)
            l2 = lengths // 2
            # Odd count: the middle element is the median.  Even count:
            # mean of the two central elements.  (The branches used to be
            # the wrong way round and indexed one past the middle.)
            if lengths % 2:
                options().median_full_length = seq_lengths[l2]
            else:
                options().median_full_length = (
                    seq_lengths[l2 - 1] + seq_lengths[l2]) / 2.0

        if options().full_length_range is not None:
            # An explicit range takes precedence over the median window.
            bounds = sorted(
                int(x) for x in options().full_length_range.split())
            min_length = bounds[0]
            max_length = bounds[1]
        else:
            (min_length, max_length) = (
                int(options().median_full_length * (
                    1 - options().backbone_threshold)),
                int(options().median_full_length * (
                    1 + options().backbone_threshold)))
        _LOG.info(
            "Full length sequences are set to be from %d to %d character long"
            % (min_length, max_length))
        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length or
            len(sequences[name]) < min_length
        ]
        if (len(frag_names) > 0):
            _LOG.info("Detected %d fragmentary sequences" % len(frag_names))
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in list(fragments.keys()):
                sequences.pop(name)

    if (options().backbone_size is None):
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))
    if (options().backbone_size > len(list(sequences.keys()))):
        options().backbone_size = len(list(sequences.keys()))

    # Sampling from the sorted names makes the draw depend only on the
    # random state, not on the container's iteration order.
    sample = sorted(random.sample(
        sorted(sequences.keys()), options().backbone_size))
    backbone_sequences = sequences.get_hard_sub_alignment(sample)
    _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if (options().molecule == 'amino'):
        # The job is handed 'protein' where the options say 'amino'.
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size, moleculeType,
                        options().cpu, **vars(options().pasta))
    pastaalignJob.run()
    (a_file, t_file) = pastaalignJob.read_results()

    shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
    shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

    options().placement_size = self.options.backbone_size
    # These handles are stored on the options object for later stages, so
    # they are intentionally left open here.
    options().alignment_file = open(
        self.get_output_filename("pasta.fasta"))
    options().tree_file = open(self.get_output_filename("pasta.fasttree"))
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))

    # Fragments set aside earlier are aligned as queries as well.
    sequences.set_alignment(fragments)
    if (len(sequences) == 0):
        # Everything ended up in the backbone: its alignment is the result.
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as backbone_handle:
            sequences.read_file_object(backbone_handle)
        self.results = ExtendedAlignment(fragment_names=[])
        self.results.set_alignment(sequences)
        _LOG.info(
            "No query sequences to align.  Final alignment saved as %s"
            % self.get_output_filename("alignment.fasta"))
        self.output_results()
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)
def testExtendedAlignment(self):
    """Merge two extended alignments and check the result is consistent.

    The simulated backbone is split into two sub-problems (``subset`` and
    its complement), each with its own fragments.  An extended alignment is
    built for each from existing ``data/tmp/cp*.extended.sto`` files, both
    are merged, and the test verifies that the merged alignment is aligned,
    that column counts and insertion-column labels add up, and that the
    base alignment still equals the original one.
    """
    print("======= starting testExtendedAlignment =========")

    subset = [
        "SFIF", "SFII", "SCFC", "SGHD", "SDCC", "SBGE", "SFBB", "SDI",
        "SCGB", "SJGF", "SGBI", "SCJA", "SGAD", "SHEB", "SFHB", "SDJI",
        "SHED", "SJJJ", "SBBE", "SCCH", "SDJB", "SDAC", "SHEH", "SFDC",
        "SFEI", "SHHB", "SC", "SIAB", "SDDI", "SBCB", "SJB", "SEBD",
        "SFGD", "SHA", "SIDA", "SGHI", "SGIB", "SBFJ", "SFIE", "SCJF",
        "SJHJ", "SJBG", "SEJI", "SFFF", "SJ", "SIII", "SJHH", "SEIH",
        "SBDC", "SHDJ", "SJDD", "SGDB", "SIHA", "SIBB", "SECC", "SCAD",
        "SGBB", "SGIF", "SJHC", "SFCD", "SEAA", "SEFF", "SDFG", "SDJE",
        "SCFG", "SFH", "SCJ", "SDDD", "SEGD", "SCIH", "SDAG", "SCJE",
        "SFAJ", "SIDJ", "SE", "SHBC", "SJFF", "SCHD", "SBHA", "SEDF",
        "SFAF", "SEDD", "SDHD", "SGJD", "SIBH", "SGDF", "SIFA", "SJGA",
        "SIJB", "SFI", "SGA", "SBFC", "SBJA", "SFFC", "SFDH", "SFEE",
        "SBDF", "SGBJ", "SDHE", "SJIB", "SHHI", "SIDE", "SJII"
    ]

    # Backbone alignment without all-gap columns; tlen is its width.
    alg = MutableAlignment()
    alg.read_filepath("data/simulated/test.fasta")
    alg.delete_all_gap()
    tlen = alg.get_length()

    frg = MutableAlignment()
    frg.read_filepath("data/simulated/test.fas")

    # Parent problem over all backbone taxa, with two child problems.
    pp = SeppProblem(alg.keys())
    pp.fragments = frg
    pp.subalignment = alg

    remaining = list(set(alg.keys()) - set(subset))
    cp1 = SeppProblem(subset, pp)
    cp2 = SeppProblem(remaining, pp)
    # Fragments are split between the two children by the last digit of
    # their name.
    cp1.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) >= 9], frg)
    cp2.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) <= 1], frg)

    cp1labels = cp1.write_subalignment_without_allgap_columns(
        "data/tmp/cp1.fasta")
    cp2labels = cp2.write_subalignment_without_allgap_columns(
        "data/tmp/cp2.fasta")
    for written in ("data/tmp/cp1.fasta", "data/tmp/cp2.fasta"):
        tmp = MutableAlignment().read_filepath(written)
        assert all(
            not tmp.is_all_gap(pos) for pos in range(tmp.get_length()))

    cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
    cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")

    # We have done the hmmalign before.
    # don't worry about that right now

    ext1 = ExtendedAlignment(cp1.fragments)
    ext1.build_extended_alignment("data/tmp/cp1.fasta",
                                  "data/tmp/cp1.extended.sto")
    ext1.relabel_original_columns(cp1labels)
    ext2 = ExtendedAlignment(cp2.fragments)
    ext2.build_extended_alignment("data/tmp/cp2.fasta",
                                  "data/tmp/cp2.extended.sto")
    ext2.relabel_original_columns(cp2labels)

    extmerger = ExtendedAlignment([])
    extmerger.merge_in(ext1)
    mixed = extmerger.merge_in(ext2)

    extmerger.write_to_path("data/tmp/extended.merged.fasta")

    assert extmerger.is_aligned(), "Merged alignment is not aligned"
    # Negative column labels are counted as insertion columns.
    in1 = len([x for x in ext1._col_labels if x < 0])
    in2 = len([x for x in ext2._col_labels if x < 0])
    print("Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % (
        extmerger.get_length(), in1, in2, tlen))
    assert (in1 + in2 + tlen - mixed) == extmerger.get_length(), (
        "Lengths don't match up after merging. Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d"
        % (extmerger.get_length(), in1, in2, tlen, mixed))
    insertion_count = len(list(extmerger.iter_insertion_columns()))
    # The message reports in1 and in2 (it used to print in1 twice).
    assert (in1 + in2 - mixed) == insertion_count, (
        "Columns are not correctly labeled after merging. Merged insertion count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d"
        % (insertion_count, in1, in2, mixed))

    tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
    tmp.delete_all_gap()
    assert tmp.is_aligned(), "merged alignment should be aligned!"
    assert tmp.get_length() == tlen, "merged alignment has wrong length"
    assert all(alg[k] == s for (k, s) in tmp.items()), \
        "merged alignment should match original alignment"

    print("======= finished testExtendedAlignment =========")