def merge_subalignments(self):
    '''
    Build one extended alignment for the current placement subset by
    merging the extended alignments of all of its alignment subsets.

    Side effect: ``self.placement_problem.fragments`` is replaced by the
    result of ``parent.fragments.get_soft_sub_alignment([])`` and its
    ``seq_names`` is grown to the union of the fragments of every child.

    Returns the merged ExtendedAlignment, after
    ``from_bytearray_to_string()`` has been called on it.
    '''
    placement = self.placement_problem
    _LOG.info(
        "Merging sub-alignments for placement problem : %s."
        % (placement.label))

    # Step 1: gather the fragments of every alignment subset on the
    # placement problem itself.
    placement.fragments = \
        placement.parent.fragments.get_soft_sub_alignment([])
    for subset in placement.get_children():
        placement.fragments.seq_names |= set(subset.fragments)

    # Step 2: fold the hmmalign output of each alignment subset into a
    # single extended alignment.
    merged = ExtendedAlignment(placement.fragments.seq_names)
    for subset in placement.children:
        assert isinstance(subset, SeppProblem)

        # Fragment chunks that have no hmmalign result are left out.
        chunk_results = []
        for chunk in subset.children:
            if chunk.get_job_result_by_name('hmmalign') is not None:
                chunk_results.append(
                    chunk.get_job_result_by_name('hmmalign'))

        _LOG.info(
            "Merging fragment chunks for subalignment : %s."
            % (subset.label))
        subset_alignment = \
            subset.read_extendend_alignment_and_relabel_columns(
                subset.jobs["hmmbuild"].infile, chunk_results)

        _LOG.info(
            "Merging alignment subset into placement subset: %s."
            % (subset.label))
        merged.merge_in(subset_alignment, convert_to_string=False)
        # Release the per-subset alignment before reading the next one.
        del subset_alignment

    merged.from_bytearray_to_string()
    return merged
def merge_subalignments(self):
    '''
    Merge the extended alignments of all alignment subsets into a single
    extended alignment for the current placement subset.

    Side effect: ``self.placement_problem.fragments`` is replaced by the
    result of ``parent.fragments.get_soft_sub_alignment([])`` and its
    ``seq_names`` is updated with the fragments of every child.

    Returns the merged ExtendedAlignment, after
    ``from_bytearray_to_string()`` has been called on it.
    '''
    placement = self.placement_problem
    _LOG.info(
        "Merging sub-alignments for placement problem : %s."
        % (placement.label))

    # Step 1: collect every child's fragment names first, then add them
    # to the placement problem in one update.
    placement.fragments = \
        placement.parent.fragments.get_soft_sub_alignment([])
    fragment_names = [
        name
        for subset in placement.get_children()
        for name in subset.fragments
    ]
    placement.fragments.seq_names.update(fragment_names)

    # Step 2: merge the hmmalign output of each alignment subset.
    merged = ExtendedAlignment(placement.fragments.seq_names)
    for subset in placement.children:
        assert isinstance(subset, SeppProblem)

        # Fragment chunks that have no hmmalign result are left out.
        chunk_results = []
        for chunk in subset.children:
            if chunk.get_job_result_by_name('hmmalign') is not None:
                chunk_results.append(
                    chunk.get_job_result_by_name('hmmalign'))

        _LOG.info(
            "Merging fragment chunks for subalignment : %s."
            % (subset.label))
        subset_alignment = \
            subset.read_extendend_alignment_and_relabel_columns(
                subset.jobs["hmmbuild"].infile, chunk_results)

        _LOG.info(
            "Merging alignment subset into placement subset: %s."
            % (subset.label))
        merged.merge_in(subset_alignment, convert_to_string=False)
        # Release the per-subset alignment before reading the next one.
        del subset_alignment

    merged.from_bytearray_to_string()
    return merged
def merge_subalignments(self):
    '''
    Build one extended alignment per fragment chunk for the current
    placement subset.

    For chunk number i, the hmmalign result of chunk i of every alignment
    subset is merged into a fresh ExtendedAlignment; fragments that are
    missing from that chunk are then removed.

    Side effect: ``self.placement_problem.fragments`` is replaced and its
    ``seq_names`` grown to the union of the children's fragments.

    Returns a list holding ``self.root_problem.fragment_chunks`` extended
    alignments, in chunk order.
    '''
    placement = self.placement_problem
    _LOG.info(
        "Merging sub-alignments for placement problem : %s."
        % (placement.label))

    # Step 1: find the fragments assigned to this placement problem.
    placement.fragments = \
        placement.parent.fragments.get_soft_sub_alignment([])
    for subset in placement.get_children():
        placement.fragments.seq_names |= set(subset.fragments)

    # Step 2: remember, per alignment subset, the hmmalign result of each
    # of its fragment chunks (entries may be falsy when there is none).
    results_by_subset = {}
    for subset in placement.children:
        assert isinstance(subset, SeppProblem)
        results_by_subset[subset] = [
            chunk.get_job_result_by_name('hmmalign')
            for chunk in subset.children
        ]

    # Step 3: for every chunk index, merge that chunk's result from every
    # alignment subset.
    per_chunk_alignments = []
    for i in range(self.root_problem.fragment_chunks):
        merged = ExtendedAlignment(placement.fragments.seq_names)
        for subset in placement.children:
            if results_by_subset[subset][i]:
                chunk_inputs = [results_by_subset[subset][i]]
            else:
                # No result for this chunk: read the backbone only.
                chunk_inputs = []
            subset_alignment = \
                subset.read_extendend_alignment_and_relabel_columns(
                    subset.jobs["hmmbuild"].infile, chunk_inputs)
            _LOG.debug(
                ("Merging alignment subset into placement subset for "
                 "chunk %d: %s.") % (i, subset.label))
            merged.merge_in(subset_alignment, convert_to_string=False)

        # The alignment was created with all fragment names; drop those
        # that do not belong to this chunk.
        merged.remove_missing_fragments()
        merged.from_bytearray_to_string()
        per_chunk_alignments.append(merged)

    return per_chunk_alignments
def merge_subalignments(self):
    '''
    Produce a list of extended alignments, one for each fragment chunk of
    the current placement subset.

    Each list entry merges, across all alignment subsets, the hmmalign
    result of the fragment chunk with the same index; fragments missing
    from that chunk are removed afterwards.

    Side effect: ``self.placement_problem.fragments`` is replaced and its
    ``seq_names`` grown to the union of the children's fragments.

    Returns the list of ExtendedAlignment objects in chunk order
    (``self.root_problem.fragment_chunks`` entries).
    '''
    problem = self.placement_problem
    _LOG.info(
        "Merging sub-alignments for placement problem : %s."
        % (problem.label))

    # Fragments assigned to this placement problem.
    problem.fragments = problem.parent.fragments.get_soft_sub_alignment([])
    for child in problem.get_children():
        problem.fragments.seq_names |= set(child.fragments)

    # hmmalign results of every fragment chunk, keyed by alignment subset.
    hmmalign_results = dict()
    for child in problem.children:
        assert isinstance(child, SeppProblem)
        per_chunk = []
        for chunk in child.children:
            per_chunk.append(chunk.get_job_result_by_name('hmmalign'))
        hmmalign_results[child] = per_chunk

    alignments = []
    chunk_count = self.root_problem.fragment_chunks
    for i in range(0, chunk_count):
        chunk_alignment = ExtendedAlignment(problem.fragments.seq_names)
        for child in problem.children:
            # A falsy entry means this subset has no result for chunk i,
            # in which case only the backbone is read.
            extension_files = (
                [hmmalign_results[child][i]]
                if hmmalign_results[child][i] else [])
            child_alignment = \
                child.read_extendend_alignment_and_relabel_columns(
                    child.jobs["hmmbuild"].infile, extension_files)
            _LOG.debug(
                ("Merging alignment subset into placement subset for "
                 "chunk %d: %s.") % (i, child.label))
            chunk_alignment.merge_in(
                child_alignment, convert_to_string=False)

        # Every alignment starts with all fragment names; keep only the
        # ones that are present in this chunk.
        chunk_alignment.remove_missing_fragments()
        chunk_alignment.from_bytearray_to_string()
        alignments.append(chunk_alignment)

    return alignments
def merge_results(self):
    '''
    Merge the per-chunk results of every placement subset.

    1. For each placement subset and fragment chunk, unpickle the object
       stored in the placement job's ``full_extended_alignment_file`` and
       merge it into one ExtendedAlignment, stored in ``self.results``.
    2. Assemble the merge-job input: the main tree, followed by the tree
       and placement result of every (subset, chunk) that has a result,
       followed by two empty lines. The job returned by
       ``self.get_merge_job`` is then run with that input.

    Returns None.
    '''
    assert isinstance(self.root_problem, SeppProblem)
    root = self.root_problem

    # Generate a single extended alignment.
    fullExtendedAlignment = ExtendedAlignment(root.fragments.keys())
    for pp in root.get_children():
        for i in range(0, root.fragment_chunks):
            alignment_path = \
                pp.jobs[get_placement_job_name(i)].full_extended_alignment_file
            # NOTE(review): pickle.load can execute arbitrary code from the
            # file; this assumes the file was produced by this pipeline's
            # own placement job -- confirm it is never user-supplied.
            # The context manager closes the file even if loading fails.
            with open(alignment_path, 'rb') as align_input:
                extended_alignment = pickle.load(align_input)
            fullExtendedAlignment.merge_in(
                extended_alignment, convert_to_string=True)
    self.results = fullExtendedAlignment

    # The main tree is the first entry of the merge input.
    mergeinput = [
        "%s;" % (root.subtree.compose_newick(labels=True))]
    for pp in root.get_children():
        assert isinstance(pp, SeppProblem)
        for i in range(0, root.fragment_chunks):
            placement_result = pp.get_job_result_by_name(
                get_placement_job_name(i))
            if placement_result is None:
                # This chunk produced no placement result; skip it.
                continue
            # Subset tree followed by the location of its json result.
            mergeinput.append(
                "%s;\n%s" % (
                    pp.subtree.compose_newick(labels=True),
                    placement_result))
    # Two trailing empty entries end the input with blank lines.
    mergeinput.append("")
    mergeinput.append("")
    merge_input_string = "\n".join(mergeinput)

    mergeJsonJob = self.get_merge_job(merge_input_string)
    mergeJsonJob.run()
def merge_results(self):
    '''
    Merge the per-chunk results of every placement subset.

    1. For each placement subset and fragment chunk, unpickle the object
       stored in the placement job's ``full_extended_alignment_file`` and
       merge it into one ExtendedAlignment, stored in ``self.results``.
    2. Assemble the merge-job input: the main tree, followed by the tree
       and placement result of every (subset, chunk) that has a result,
       followed by two empty lines. The job returned by
       ``self.get_merge_job`` is then run with that input.

    Returns None.
    '''
    assert isinstance(self.root_problem, RootProblem)
    root = self.root_problem

    # Generate a single extended alignment.
    fullExtendedAlignment = ExtendedAlignment(root.fragments.keys())
    for pp in root.get_children():
        for i in range(0, root.fragment_chunks):
            alignment_path = \
                pp.jobs[get_placement_job_name(i)].full_extended_alignment_file
            # NOTE(review): pickle.load can execute arbitrary code from the
            # file; this assumes the file was produced by this pipeline's
            # own placement job -- confirm it is never user-supplied.
            # The context manager closes the file even if loading fails.
            with open(alignment_path, 'rb') as align_input:
                extended_alignment = pickle.load(align_input)
            fullExtendedAlignment.merge_in(
                extended_alignment, convert_to_string=True)
    self.results = fullExtendedAlignment

    # The main tree is the first entry of the merge input.
    mergeinput = [
        "%s;" % (root.subtree.compose_newick(labels=True))]
    for pp in root.get_children():
        assert isinstance(pp, SeppProblem)
        for i in range(0, root.fragment_chunks):
            placement_result = pp.get_job_result_by_name(
                get_placement_job_name(i))
            if placement_result is None:
                # This chunk produced no placement result; skip it.
                continue
            # Subset tree followed by the location of its json result.
            mergeinput.append(
                "%s;\n%s" % (
                    pp.subtree.compose_newick(labels=True),
                    placement_result))
    # Two trailing empty entries end the input with blank lines.
    mergeinput.append("")
    mergeinput.append("")
    merge_input_string = "\n".join(mergeinput)

    merge_json_job = self.get_merge_job(merge_input_string)
    merge_json_job.run()
def merge_results(self):
    '''
    Merge the extended alignments of all alignment subsets of the single
    placement subset and store the outcome in ``self.results``.

    Exactly one placement subset is expected under ``self.root_problem``
    (checked with an assertion). As a side effect the ``fragments`` of
    that placement subset are replaced and its ``seq_names`` grown to the
    union of the fragments of every alignment subset.

    Returns None.
    '''
    assert \
        len(self.root_problem.get_children()) == 1, \
        "Currently UPP works with only one placement subset."

    placement = self.root_problem.get_children()[0]
    _LOG.info(
        "Merging sub-alignments for placement problem : %s."
        % (placement.label))

    # Step 1: assign the fragments of every alignment subset to the
    # placement problem.
    placement.fragments = \
        placement.parent.fragments.get_soft_sub_alignment([])
    for subset in placement.get_children():
        placement.fragments.seq_names |= set(subset.fragments)

    # Step 2: build one extended alignment out of all hmmalign results.
    _LOG.debug(
        "fragments are %d:\n %s" % (
            len(placement.fragments.seq_names),
            placement.fragments.seq_names))
    merged = ExtendedAlignment(placement.fragments.seq_names)
    for subset in placement.children:
        assert isinstance(subset, SeppProblem)

        # Fragment chunks that have no hmmalign result are left out.
        chunk_results = []
        for chunk in subset.children:
            if chunk.get_job_result_by_name('hmmalign') is not None:
                chunk_results.append(
                    chunk.get_job_result_by_name('hmmalign'))

        _LOG.debug(
            "Merging fragment chunks for subalignment : %s."
            % (subset.label))
        subset_alignment = \
            subset.read_extendend_alignment_and_relabel_columns(
                subset.jobs["hmmbuild"].infile, chunk_results)

        _LOG.debug(
            "Merging alignment subset into placement subset: %s."
            % (subset.label))
        merged.merge_in(subset_alignment, convert_to_string=False)

    merged.from_bytearray_to_string()
    self.results = merged
def testExtendedAlignment(self):
    '''
    Check that merging two extended alignments gives a consistent result.

    The backbone read from data/simulated/test.fasta is split into two
    sub-problems. For each, an ExtendedAlignment is built from the
    sub-backbone written to data/tmp and an existing .sto file (the
    hmmalign step is assumed to have been run beforehand). The two are
    merged and the merged alignment is checked for: being aligned, the
    expected total and insertion column counts, and agreement with the
    original backbone once all-gap columns are removed.
    '''
    print("======= starting testExtendedAlignment =========")

    subset = [
        "SFIF", "SFII", "SCFC", "SGHD", "SDCC", "SBGE", "SFBB", "SDI",
        "SCGB", "SJGF", "SGBI", "SCJA", "SGAD", "SHEB", "SFHB", "SDJI",
        "SHED", "SJJJ", "SBBE", "SCCH", "SDJB", "SDAC", "SHEH", "SFDC",
        "SFEI", "SHHB", "SC", "SIAB", "SDDI", "SBCB", "SJB", "SEBD",
        "SFGD", "SHA", "SIDA", "SGHI", "SGIB", "SBFJ", "SFIE", "SCJF",
        "SJHJ", "SJBG", "SEJI", "SFFF", "SJ", "SIII", "SJHH", "SEIH",
        "SBDC", "SHDJ", "SJDD", "SGDB", "SIHA", "SIBB", "SECC", "SCAD",
        "SGBB", "SGIF", "SJHC", "SFCD", "SEAA", "SEFF", "SDFG", "SDJE",
        "SCFG", "SFH", "SCJ", "SDDD", "SEGD", "SCIH", "SDAG", "SCJE",
        "SFAJ", "SIDJ", "SE", "SHBC", "SJFF", "SCHD", "SBHA", "SEDF",
        "SFAF", "SEDD", "SDHD", "SGJD", "SIBH", "SGDF", "SIFA", "SJGA",
        "SIJB", "SFI", "SGA", "SBFC", "SBJA", "SFFC", "SFDH", "SFEE",
        "SBDF", "SGBJ", "SDHE", "SJIB", "SHHI", "SIDE", "SJII",
    ]

    # Backbone alignment; tlen is its length without all-gap columns.
    alg = MutableAlignment()
    alg.read_filepath("data/simulated/test.fasta")
    alg.delete_all_gap()
    tlen = alg.get_length()

    # Fragments to be distributed over the two sub-problems.
    frg = MutableAlignment()
    frg.read_filepath("data/simulated/test.fas")

    pp = SeppProblem(alg.keys())
    pp.fragments = frg
    pp.subalignment = alg

    # cp1 holds `subset`, cp2 holds the rest of the backbone.
    cp1 = SeppProblem(subset, pp)
    cp2 = SeppProblem(list(set(alg.keys()) - set(subset)), pp)

    # Fragments are split on the last character of their name.
    cp1.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) >= 9], frg)
    cp2.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) <= 1], frg)

    cp1labels = cp1.write_subalignment_without_allgap_columns(
        "data/tmp/cp1.fasta")
    cp2labels = cp2.write_subalignment_without_allgap_columns(
        "data/tmp/cp2.fasta")

    # The written sub-alignments must not contain any all-gap column.
    # range() (instead of xrange) keeps this valid on Python 2 and 3.
    tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta")
    assert all(
        [not tmp.is_all_gap(pos) for pos in range(0, tmp.get_length())])
    tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta")
    assert all(
        [not tmp.is_all_gap(pos) for pos in range(0, tmp.get_length())])

    cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
    cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")

    # hmmalign has been run before; its .sto output is read from data/tmp.
    ext1 = ExtendedAlignment(cp1.fragments)
    ext1.build_extended_alignment("data/tmp/cp1.fasta",
                                  "data/tmp/cp1.extended.sto")
    ext1.relabel_original_columns(cp1labels)
    ext2 = ExtendedAlignment(cp2.fragments)
    ext2.build_extended_alignment("data/tmp/cp2.fasta",
                                  "data/tmp/cp2.extended.sto")
    ext2.relabel_original_columns(cp2labels)

    # Merge both into an initially empty extended alignment; the value
    # returned by the second merge is used below as the number of
    # insertion columns counted in both inputs.
    extmerger = ExtendedAlignment([])
    extmerger.merge_in(ext1)
    mixed = extmerger.merge_in(ext2)
    extmerger.write_to_path("data/tmp/extended.merged.fasta")

    assert extmerger.is_aligned(), "Merged alignment is not aligned"

    # Negative column labels are counted as insertion columns.
    in1 = len([x for x in ext1._col_labels if x < 0])
    in2 = len([x for x in ext2._col_labels if x < 0])
    merged_len = extmerger.get_length()
    print("Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % (
        merged_len, in1, in2, tlen))

    assert (in1 + in2 + tlen - mixed) == merged_len, (
        "Lengths don't match up after merging. Merged:%d. "
        "Insertion1:%d Insertion2:%d BaseLen:%d "
        "Mixed-insertion: %d" % (merged_len, in1, in2, tlen, mixed))

    insertion_count = len(list(extmerger.iter_insertion_columns()))
    # Fix: the message used to format in1 twice, reporting the wrong
    # value for Insertion2.
    assert (in1 + in2 - mixed) == insertion_count, (
        "Columns are not correctly labeled after merging. "
        "Merged insertion count:%d. Insertion1:%d Insertion2:%d "
        "Mixed-insertion: %d" % (insertion_count, in1, in2, mixed))

    # The base (non-fragment) part of the merge must equal the backbone.
    tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
    tmp.delete_all_gap()
    assert tmp.is_aligned(), "merged alignment should be aligned!"
    assert tmp.get_length() == tlen, "merged alignment has wrong length"
    assert all([alg[k] == s for (k, s) in tmp.items()]), \
        "merged alignment should match original alignment"

    print("======= finished testExtendedAlignment =========")
# Merge the hmmalign output found under every directory of `dirs` into the
# shared `extendedAlignment`, then write an unmasked and a masked version.
# `dirs`, `original_backbone` and `extendedAlignment` are defined outside
# this fragment.
# NOTE(review): the indentation below was reconstructed from a
# whitespace-collapsed source; confirm the loop boundaries, in particular
# that the write_to_path calls run once, after the loop.
for dir in dirs:
    print "Working on %s\n" % dir
    # hmmalign results and fragment files live in FC_* sub-directories.
    # NOTE(review): no separator is inserted between dir and "FC_*" here
    # (unlike the '*.fasta' pattern) -- presumably every entry of dirs ends
    # with a path separator; confirm.
    aligned_files = glob.glob('%sFC_*/hmmalign.results.*' % dir)
    sequence_files = glob.glob('%sFC_*/hmmalign.frag.*' % dir)
    base_alignment_file = glob.glob('%s/*.fasta' % dir)
    base_alignment = MutableAlignment()
    # Only the first matching fasta file is read as the base alignment.
    done = base_alignment.read_filepath(base_alignment_file[0])
    subbackbone = original_backbone.get_soft_sub_alignment(base_alignment.get_sequence_names())
    frags = MutableAlignment()
    sequence_names = []
    for file in sequence_files:
        seq = MutableAlignment()
        done = seq.read_filepath(file)
        # list.extend returns None, so `done` is None after this line.
        done = sequence_names.extend(seq.get_sequence_names())
        # `seq` is rebound inside this loop; the iterator was created from
        # the alignment before the first rebinding. Fragments are stored
        # upper-cased.
        for name, seq in seq.iteritems():
            frags[name] = seq.upper()
    problem = SeppProblem(sequence_names)
    problem.set_subalignment(subbackbone)
    mut_subalg = problem.subalignment.get_mutable_alignment()
    # The value returned by delete_all_gap() is recorded as an annotation.
    remaining_cols = mut_subalg.delete_all_gap()
    problem.annotations["ref.alignment.columns"] = remaining_cols
    problem.fragments = frags
    # NOTE(review): the whole glob result list (not element 0) is passed as
    # the first argument here -- confirm that this is intended.
    ap_alg = problem.read_extendend_alignment_and_relabel_columns\
        (base_alignment_file, aligned_files)
    extendedAlignment.merge_in(ap_alg,convert_to_string=False)
# Write the merged alignment, then again after removing insertion columns.
extendedAlignment.write_to_path("/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/upp.unmasked.fasta")
extendedAlignment.remove_insertion_columns()
extendedAlignment.write_to_path("/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/upp.masked.fasta")
def merge_results(self):
    '''
    Merge the per-chunk results of every placement subset.

    The "full_extended_alignment_object" attribute of every placement job
    is merged into one ExtendedAlignment, stored in ``self.results``.

    The placement results are then combined into the file named by
    ``self.get_output_filename("placement.json")``:

    * with exactly one placement subset, the json result files of its
      chunks are loaded and their "placements" lists concatenated onto
      the first chunk's document, which is written out directly;
    * otherwise a MergeJsonJob is set up with the main tree plus every
      subset tree and json result location, and run.

    Returns None.
    '''
    assert isinstance(self.root_problem, SeppProblem)

    # Generate a single extended alignment.
    combined = ExtendedAlignment(self.root_problem.fragments.keys())
    for placement in self.root_problem.get_children():
        for chunk_no in range(self.root_problem.fragment_chunks):
            placement_job = placement.jobs[get_placement_job_name(chunk_no)]
            chunk_alignment = placement_job.get_attribute(
                "full_extended_alignment_object")
            combined.merge_in(chunk_alignment, convert_to_string=True)
    self.results = combined

    if len(self.root_problem.get_children()) == 1:
        # Only one placement subset: the json files can be concatenated
        # here, without running the separate merge job.
        import json
        chunk_documents = []
        for placement in self.root_problem.get_children():
            assert isinstance(placement, SeppProblem)
            for chunk_no in range(self.root_problem.fragment_chunks):
                # Chunks without a placement result are skipped.
                if (placement.get_job_result_by_name(
                        get_placement_job_name(chunk_no)) is not None):
                    with open(
                            placement.get_job_result_by_name(
                                get_placement_job_name(chunk_no))) as handle:
                        chunk_documents.append(json.load(handle))
        _LOG.info(
            "There are %d fragment chunks on a single placement subset"
            % len(chunk_documents))

        # The first document is the base; later ones only contribute
        # their "placements".
        result = chunk_documents[0]
        for document in chunk_documents[1:]:
            result["placements"] = (
                result["placements"] + document["placements"])
        with open(self.get_output_filename("placement.json"), 'w') as handle:
            json.dump(result, handle, sort_keys=True, indent=4)
        return

    # Several placement subsets: build the merge-job input, starting with
    # the main tree.
    merge_lines = [
        "%s;" % (self.root_problem.subtree.compose_newick(labels=True))]
    for placement in self.root_problem.get_children():
        assert isinstance(placement, SeppProblem)
        for chunk_no in range(self.root_problem.fragment_chunks):
            if (placement.get_job_result_by_name(
                    get_placement_job_name(chunk_no)) is not None):
                # Subset tree followed by the location of its json result.
                merge_lines.append(
                    "%s;\n%s" % (
                        placement.subtree.compose_newick(labels=True),
                        placement.get_job_result_by_name(
                            get_placement_job_name(chunk_no))))
    # Two trailing empty entries end the input with blank lines.
    merge_lines.append("")
    merge_lines.append("")
    merge_text = "\n".join(merge_lines)
    _LOG.debug(merge_lines)

    merge_job = MergeJsonJob()
    merge_job.setup(merge_text, self.get_output_filename("placement.json"))
    merge_job.run()
def testExtendedAlignment(self):
    '''
    Check that two extended alignments merge into a consistent alignment.

    The backbone from data/simulated/test.fasta is split into two
    sub-problems whose sub-backbones are written to data/tmp. An
    ExtendedAlignment is built for each from an existing .sto file (the
    hmmalign step is assumed to have been run beforehand), both are merged,
    and the merged alignment is checked for: being aligned, the expected
    total and insertion column counts, and agreement with the original
    backbone once all-gap columns are removed.
    '''
    print("======= starting testExtendedAlignment =========")

    subset = [
        "SFIF", "SFII", "SCFC", "SGHD", "SDCC", "SBGE", "SFBB", "SDI",
        "SCGB", "SJGF", "SGBI", "SCJA", "SGAD", "SHEB", "SFHB", "SDJI",
        "SHED", "SJJJ", "SBBE", "SCCH", "SDJB", "SDAC", "SHEH", "SFDC",
        "SFEI", "SHHB", "SC", "SIAB", "SDDI", "SBCB", "SJB", "SEBD",
        "SFGD", "SHA", "SIDA", "SGHI", "SGIB", "SBFJ", "SFIE", "SCJF",
        "SJHJ", "SJBG", "SEJI", "SFFF", "SJ", "SIII", "SJHH", "SEIH",
        "SBDC", "SHDJ", "SJDD", "SGDB", "SIHA", "SIBB", "SECC", "SCAD",
        "SGBB", "SGIF", "SJHC", "SFCD", "SEAA", "SEFF", "SDFG", "SDJE",
        "SCFG", "SFH", "SCJ", "SDDD", "SEGD", "SCIH", "SDAG", "SCJE",
        "SFAJ", "SIDJ", "SE", "SHBC", "SJFF", "SCHD", "SBHA", "SEDF",
        "SFAF", "SEDD", "SDHD", "SGJD", "SIBH", "SGDF", "SIFA", "SJGA",
        "SIJB", "SFI", "SGA", "SBFC", "SBJA", "SFFC", "SFDH", "SFEE",
        "SBDF", "SGBJ", "SDHE", "SJIB", "SHHI", "SIDE", "SJII",
    ]

    # Backbone alignment; tlen is its length without all-gap columns.
    alg = MutableAlignment()
    alg.read_filepath("data/simulated/test.fasta")
    alg.delete_all_gap()
    tlen = alg.get_length()

    # Fragments to be distributed over the two sub-problems.
    frg = MutableAlignment()
    frg.read_filepath("data/simulated/test.fas")

    pp = SeppProblem(alg.keys())
    pp.fragments = frg
    pp.subalignment = alg

    # cp1 holds `subset`, cp2 holds the remaining backbone sequences.
    cp1 = SeppProblem(subset, pp)
    cp2 = SeppProblem(list(set(alg.keys()) - set(subset)), pp)

    # Fragments are assigned by the last character of their name.
    cp1.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) >= 9], frg)
    cp2.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) <= 1], frg)

    cp1labels = cp1.write_subalignment_without_allgap_columns(
        "data/tmp/cp1.fasta")
    cp2labels = cp2.write_subalignment_without_allgap_columns(
        "data/tmp/cp2.fasta")

    # Neither written sub-alignment may contain an all-gap column.
    # range() (instead of xrange) keeps this valid on Python 2 and 3.
    for written_path in ("data/tmp/cp1.fasta", "data/tmp/cp2.fasta"):
        tmp = MutableAlignment().read_filepath(written_path)
        assert all(
            [not tmp.is_all_gap(pos)
             for pos in range(0, tmp.get_length())])

    cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
    cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")

    # hmmalign has been run before; its .sto output is read from data/tmp.
    ext1 = ExtendedAlignment(cp1.fragments)
    ext1.build_extended_alignment("data/tmp/cp1.fasta",
                                  "data/tmp/cp1.extended.sto")
    ext1.relabel_original_columns(cp1labels)
    ext2 = ExtendedAlignment(cp2.fragments)
    ext2.build_extended_alignment("data/tmp/cp2.fasta",
                                  "data/tmp/cp2.extended.sto")
    ext2.relabel_original_columns(cp2labels)

    # Merge both into an initially empty extended alignment; the value
    # returned by the second merge is used below as the number of
    # insertion columns counted in both inputs.
    extmerger = ExtendedAlignment([])
    extmerger.merge_in(ext1)
    mixed = extmerger.merge_in(ext2)
    extmerger.write_to_path("data/tmp/extended.merged.fasta")

    assert extmerger.is_aligned(), "Merged alignment is not aligned"

    # Negative column labels are counted as insertion columns.
    in1 = len([x for x in ext1._col_labels if x < 0])
    in2 = len([x for x in ext2._col_labels if x < 0])
    merged_len = extmerger.get_length()
    print("Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % (
        merged_len, in1, in2, tlen))

    assert (in1 + in2 + tlen - mixed) == merged_len, (
        "Lengths don't match up after merging. Merged:%d. "
        "Insertion1:%d Insertion2:%d BaseLen:%d "
        "Mixed-insertion: %d" % (merged_len, in1, in2, tlen, mixed))

    insertion_count = len(list(extmerger.iter_insertion_columns()))
    # Fix: the message used to format in1 twice, reporting the wrong
    # value for Insertion2.
    assert (in1 + in2 - mixed) == insertion_count, (
        "Columns are not correctly labeled after merging. "
        "Merged insertion count:%d. Insertion1:%d Insertion2:%d "
        "Mixed-insertion: %d" % (insertion_count, in1, in2, mixed))

    # The base (non-fragment) part of the merge must equal the backbone.
    tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
    tmp.delete_all_gap()
    assert tmp.is_aligned(), "merged alignment should be aligned!"
    assert tmp.get_length() == tlen, "merged alignment has wrong length"
    assert all([alg[k] == s for (k, s) in tmp.items()]), \
        "merged alignment should match original alignment"

    print("======= finished testExtendedAlignment =========")