def align_two_alignments(aln1, aln2, moltype, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
        used to build them.
        - Mafft profile alignment only works with aligned sequences. Alignment
        object used to handle unaligned sequences.
    moltype: passed as MolType to every Alignment built here.
    params: dict of parameters to pass in to the Mafft app controller.

    The two temporary FASTA files written for the run are removed even when
    the run, or the parsing of its output, raises.
    """
    #Create Alignment objects and the mappings between abbreviated IDs and
    # full IDs
    aln1 = Alignment(aln1, MolType=moltype)
    aln1_int_map, aln1_int_keys = aln1.getIntMap()
    aln1_int_map = Alignment(aln1_int_map, MolType=moltype)

    aln2 = Alignment(aln2, MolType=moltype)
    #Second alignment gets its own prefix (presumably so that its abbreviated
    # IDs cannot clash with those of aln1 -- confirm against getIntMap)
    aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
    aln2_int_map = Alignment(aln2_int_map, MolType=moltype)

    #Single lookup table from abbreviated ID to full ID for both inputs
    aln1_int_keys.update(aln2_int_keys)

    #Create Mafft app, run through the mafft-profile command.
    app = Mafft(InputHandler='_input_as_paths',
                params=params, SuppressStderr=False)
    app._command = 'mafft-profile'

    filepaths = []
    res = None
    try:
        #Write both relabelled alignments to temporary files
        for int_map in (aln1_int_map, aln2_int_map):
            filepaths.append(
                app._tempfile_as_multiline_string(int_map.toFasta()))

        #Get results using the temporary files as input to app
        res = app(list(filepaths))

        #Get alignment as dict out of results
        alignment = dict(parse_fasta(res['StdOut']))

        #Make new dict mapping original IDs
        new_alignment = {}
        for k, v in alignment.items():
            key = k.replace('_seed_', '')
            new_alignment[aln1_int_keys[key]] = v

        #Create an Alignment object from alignment dict
        new_alignment = Alignment(new_alignment, MolType=moltype)
    finally:
        #Clean up whatever was created, on success and on failure alike
        if res is not None:
            res.cleanUp()
        for path in filepaths:
            remove(path)

    #Scratch files removed from the current directory after a successful run,
    # exactly as before.
    remove('pre')
    remove('trace')
    return new_alignment
def align_two_alignments(aln1, aln2, moltype, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
        used to build them.
    moltype: passed as MolType to every Alignment built here.
    params: dict of parameters to pass in to the Clustal app controller.

    The two temporary profile files written for the run are removed even when
    the run, or the parsing of its output, raises.
    """
    #Create Alignment objects and the mappings between abbreviated IDs and
    # full IDs
    aln1 = Alignment(aln1, MolType=moltype)
    aln1_int_map, aln1_int_keys = aln1.getIntMap()
    aln1_int_map = Alignment(aln1_int_map, MolType=moltype)

    aln2 = Alignment(aln2, MolType=moltype)
    #Second alignment gets its own prefix (presumably so that its abbreviated
    # IDs cannot clash with those of aln1 -- confirm against getIntMap)
    aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
    aln2_int_map = Alignment(aln2_int_map, MolType=moltype)

    #Single lookup table from abbreviated ID to full ID for both inputs
    aln1_int_keys.update(aln2_int_keys)

    #Create Clustalw app in profile mode: no -align, no -infile.
    app = Clustalw(InputHandler='_input_as_multiline_string',
                   params=params, SuppressStderr=True)
    app.Parameters['-align'].off()
    app.Parameters['-infile'].off()
    app.Parameters['-profile'].on()

    profile_paths = []
    res = None
    try:
        #Add aln1_int_map as profile1 and aln2_int_map as profile2.
        # The paths are tracked locally so they can be removed in finally.
        for flag, int_map in (('-profile1', aln1_int_map),
                              ('-profile2', aln2_int_map)):
            path = app._tempfile_as_multiline_string(int_map.toFasta())
            profile_paths.append(path)
            app.Parameters[flag].on(path)

        #Get results; all input is supplied through the profile parameters
        res = app()

        #Get alignment as dict out of results
        alignment = dict(ClustalParser(res['Align'].readlines()))

        #Make new dict mapping original IDs
        new_alignment = {}
        for k, v in alignment.items():
            new_alignment[aln1_int_keys[k]] = v

        #Create an Alignment object from alignment dict
        new_alignment = Alignment(new_alignment, MolType=moltype)
    finally:
        #Clean up whatever was created, on success and on failure alike
        if res is not None:
            res.cleanUp()
        for path in profile_paths:
            remove(path)
    return new_alignment
def align_two_alignments(aln1, aln2, moltype, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
        used to build them.
        - Mafft profile alignment only works with aligned sequences. Alignment
        object used to handle unaligned sequences.
    moltype: passed as MolType to every Alignment built here.
    params: dict of parameters to pass in to the Mafft app controller.

    The two temporary FASTA files written for the run are removed even when
    the run, or the parsing of its output, raises.
    """
    #Create Alignment objects and the mappings between abbreviated IDs and
    # full IDs
    aln1 = Alignment(aln1, MolType=moltype)
    aln1_int_map, aln1_int_keys = aln1.getIntMap()
    aln1_int_map = Alignment(aln1_int_map, MolType=moltype)

    aln2 = Alignment(aln2, MolType=moltype)
    #Second alignment gets its own prefix (presumably so that its abbreviated
    # IDs cannot clash with those of aln1 -- confirm against getIntMap)
    aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
    aln2_int_map = Alignment(aln2_int_map, MolType=moltype)

    #Single lookup table from abbreviated ID to full ID for both inputs
    aln1_int_keys.update(aln2_int_keys)

    #Create Mafft app, run through the mafft-profile command.
    app = Mafft(InputHandler='_input_as_paths',
                params=params, SuppressStderr=False)
    app._command = 'mafft-profile'

    filepaths = []
    res = None
    try:
        #Write both relabelled alignments to temporary files
        for int_map in (aln1_int_map, aln2_int_map):
            filepaths.append(
                app._tempfile_as_multiline_string(int_map.toFasta()))

        #Get results using the temporary files as input to app
        res = app(list(filepaths))

        #Get alignment as dict out of results
        alignment = dict(MinimalFastaParser(res['StdOut'].readlines()))

        #Make new dict mapping original IDs
        new_alignment = {}
        for k, v in alignment.items():
            key = k.replace('_seed_', '')
            new_alignment[aln1_int_keys[key]] = v

        #Create an Alignment object from alignment dict
        new_alignment = Alignment(new_alignment, MolType=moltype)
    finally:
        #Clean up whatever was created, on success and on failure alike
        if res is not None:
            res.cleanUp()
        for path in filepaths:
            remove(path)

    #Scratch files removed from the current directory after a successful run,
    # exactly as before.
    remove('pre')
    remove('trace')
    return new_alignment
def add_seqs_to_alignment(seqs, aln, moltype, params=None):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that can
        be used to build one.
    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one
    moltype: passed as MolType to every collection/alignment built here.
    params: dict of parameters to pass in to the Clustal app controller.

    The two temporary profile files written for the run are removed even when
    the run, or the parsing of its output, raises.
    """
    # Create SequenceCollection from seqs and the mapping between abbreviated
    # IDs and full IDs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    seq_int_map = SequenceCollection(seq_int_map, MolType=moltype)

    # Same for the existing alignment, under its own prefix (presumably so
    # that its abbreviated IDs cannot clash with those of seqs -- confirm
    # against getIntMap)
    aln = Alignment(aln, MolType=moltype)
    aln_int_map, aln_int_keys = aln.getIntMap(prefix="seqn_")
    aln_int_map = Alignment(aln_int_map, MolType=moltype)

    # Single lookup table from abbreviated ID to full ID for both inputs
    seq_int_keys.update(aln_int_keys)

    # Create Clustalw app in -sequences mode: no -align, no -infile.
    app = Clustalw(InputHandler="_input_as_multiline_string", params=params,
                   SuppressStderr=True)
    app.Parameters["-align"].off()
    app.Parameters["-infile"].off()
    app.Parameters["-sequences"].on()

    profile_paths = []
    res = None
    try:
        # Add aln_int_map as profile1 and seq_int_map as profile2.
        # The paths are tracked locally so they can be removed in finally.
        for flag, int_map in (("-profile1", aln_int_map),
                              ("-profile2", seq_int_map)):
            path = app._tempfile_as_multiline_string(int_map.toFasta())
            profile_paths.append(path)
            app.Parameters[flag].on(path)

        # Get results; all input is supplied through the profile parameters
        res = app()

        # Get alignment as dict out of results
        alignment = dict(ClustalParser(res["Align"].readlines()))

        # Make new dict mapping original IDs
        new_alignment = {}
        for k, v in alignment.items():
            new_alignment[seq_int_keys[k]] = v

        # Create an Alignment object from alignment dict
        new_alignment = Alignment(new_alignment, MolType=moltype)
    finally:
        # Clean up whatever was created, on success and on failure alike
        if res is not None:
            res.cleanUp()
        for path in profile_paths:
            remove(path)
    return new_alignment
def insert_sequences_into_tree(aln, moltype, params=None, write_log=True):
    """Returns a placement tree built by running pplacer on aln.

    aln: alignment data as a string; it is wrapped in StringIO and passed to
        get_align_for_phylip.
    moltype: cogent.core.moltype.MolType object (not used by this function).
    params: dict of parameters to pass in to the Pplacer app controller.
        Must contain '--out-dir'; it is read for the log file location and
        for the guppy output directory. None is treated as an empty dict.
    write_log: if True (default), pplacer's stdout is written to a
        'log_pplacer_*' file inside params['--out-dir'].

    The result is whatever build_tree_from_json_using_params returns for the
    json file produced by pplacer.
    """
    #Avoid a dict default shared between calls
    if params is None:
        params = {}

    # convert aln to phy since seq_names need fixed to run through pplacer
    new_aln = get_align_for_phylip(StringIO(aln))

    # convert aln to fasta in case it is not already a fasta file
    seqs = Alignment(new_aln).toFasta()

    pplacer_app = Pplacer(params=params,
                          InputHandler='_input_as_multiline_string',
                          WorkingDir=None,
                          SuppressStderr=False,
                          SuppressStdout=False)
    pplacer_result = pplacer_app(seqs)

    try:
        # write a log file
        if write_log:
            log_fp = join(params["--out-dir"],
                          'log_pplacer_' + split(get_tmp_filename())[-1])
            with open(log_fp, 'w') as log_file:
                log_file.write(pplacer_result['StdOut'].read())

        # use guppy to convert json file into a placement tree
        guppy_params = {'tog': None}
        new_tree = build_tree_from_json_using_params(
            pplacer_result['json'].name,
            output_dir=params['--out-dir'],
            params=guppy_params)
    finally:
        # release pplacer's result files even if logging or guppy failed
        pplacer_result.cleanUp()

    return new_tree
def insert_sequences_into_tree(aln, moltype, params=None):
    """Returns a tree from placement of sequences

    aln: alignment data as a string; it is wrapped in StringIO and passed to
        get_align_for_phylip.
    moltype: not used by this function.
    params: dict of parameters to pass in to the ParsInsert app controller.
        None is treated as an empty dict.

    The tree is parsed with DndParser using PhyloNode as constructor.
    """
    #Avoid a dict default shared between calls
    if params is None:
        params = {}

    # convert aln to phy since seq_names need fixed to run through parsinsert
    new_aln = get_align_for_phylip(StringIO(aln))

    # convert aln to fasta in case it is not already a fasta file
    seqs = Alignment(new_aln).toFasta()

    parsinsert_app = ParsInsert(params=params)
    result = parsinsert_app(seqs)

    try:
        # parse tree
        tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    finally:
        # cleanup files, also when the tree cannot be parsed
        result.cleanUp()

    return tree
def insert_sequences_into_tree(aln, moltype, params=None):
    """Returns a tree from placement of sequences

    aln: alignment data as a string; it is wrapped in StringIO and passed to
        get_align_for_phylip.
    moltype: not used by this function.
    params: dict of parameters to pass in to the ParsInsert app controller.
        None is treated as an empty dict.

    The tree is parsed with DndParser using PhyloNode as constructor.
    """
    #Avoid a dict default shared between calls
    if params is None:
        params = {}

    # convert aln to phy since seq_names need fixed to run through parsinsert
    new_aln = get_align_for_phylip(StringIO(aln))

    # convert aln to fasta in case it is not already a fasta file
    seqs = Alignment(new_aln).toFasta()

    parsinsert_app = ParsInsert(params=params)
    result = parsinsert_app(seqs)

    try:
        # parse tree
        tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    finally:
        # cleanup files, also when the tree cannot be parsed
        result.cleanUp()

    return tree
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None, cmbuild_params=None, cmalign_params=None):
    """Aligns the sequences in seq_path to the template with Infernal.

    seq_path: path to a fasta file of candidate (unaligned) sequences.
    result_path: if given, the alignment is written there as fasta and None
        is returned; otherwise the Alignment object is returned.
    log_path: if given, the cmbuild/cmalign parameters used are written there.
    failure_path: accepted but not used by this method.
    cmbuild_params: optional dict of extra cmbuild parameters. It is copied,
        then '--gapthresh' is forced to 1.0.
    cmalign_params: optional dict of extra cmalign parameters. It is copied,
        then '--sub' is forced to True and '--gapthresh' to 1.0.

    Reads self.Params['template_filepath'] and self.Params['moltype'].
    Raises ValueError when the template cannot be parsed (RecordError).
    """
    log_params = []

    # load candidate sequences
    # NOTE(review): the 'U' open mode is kept from the original; it was
    # removed in Python 3.11, so revisit when porting.
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = dict(MinimalFastaParser(seq_file))

    # load template sequences; only the first parsed record is used
    try:
        with open(self.Params['template_filepath'], 'U') as template_file:
            info, template_alignment, struct = list(MinimalRfamParser(
                template_file, seq_constructor=ChangedSequence))[0]
    except RecordError:
        raise ValueError("Template alignment must be in Stockholm format with corresponding secondary structure annotation when using InfernalAligner.")

    moltype = self.Params['moltype']

    #Need to make separate mapping for unaligned sequences
    unaligned = SequenceCollection(candidate_sequences, MolType=moltype)
    int_map, int_keys = unaligned.getIntMap(prefix='unaligned_')
    int_map = SequenceCollection(int_map, MolType=moltype)

    #Turn on --gapthresh option in cmbuild to force alignment to full model.
    # Work on a copy so the caller's dict is left untouched.
    cmbuild_params = dict(cmbuild_params or {})
    cmbuild_params.update({'--gapthresh': 1.0})

    #record cmbuild parameters
    log_params.append('cmbuild parameters:')
    log_params.append(str(cmbuild_params))

    #Turn on --sub option in Infernal, since we know the unaligned sequences
    # are fragments.
    #Also turn on --gapthresh to use same gapthresh as was used to build
    # model. Again a copy, so the caller's dict is left untouched.
    cmalign_params = dict(cmalign_params or {})
    cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

    #record cmalign parameters
    log_params.append('cmalign parameters:')
    log_params.append(str(cmalign_params))

    #Align sequences to alignment including alignment gaps.
    aligned, struct_string = cmalign_from_alignment(
        aln=template_alignment,
        structure_string=struct,
        seqs=int_map,
        moltype=moltype,
        include_aln=True,
        params=cmalign_params,
        cmbuild_params=cmbuild_params)

    #Pull out original sequences from full alignment.
    infernal_aligned = {}
    aligned_dict = aligned.NamedSeqs
    for key in int_map.Names:
        infernal_aligned[int_keys.get(key, key)] = aligned_dict[key]

    #Create an Alignment object from alignment dict
    infernal_aligned = Alignment(infernal_aligned, MolType=moltype)

    if log_path is not None:
        with open(log_path, 'w') as log_file:
            log_file.write('\n'.join(log_params))

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(infernal_aligned.toFasta())
        return None

    #A bare return cannot raise, so the former try/except ValueError around
    # it was dead code.
    return infernal_aligned
def add_seqs_to_alignment(seqs, aln, moltype, params=None):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that can
        be used to build one.
    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one
    moltype: passed as MolType to every collection/alignment built here.
    params: dict of parameters to pass in to the Clustal app controller.

    The two temporary profile files written for the run are removed even when
    the run, or the parsing of its output, raises.
    """
    #Create SequenceCollection from seqs and the mapping between abbreviated
    # IDs and full IDs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    seq_int_map = SequenceCollection(seq_int_map, MolType=moltype)

    #Same for the existing alignment, under its own prefix (presumably so
    # that its abbreviated IDs cannot clash with those of seqs -- confirm
    # against getIntMap)
    aln = Alignment(aln, MolType=moltype)
    aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
    aln_int_map = Alignment(aln_int_map, MolType=moltype)

    #Single lookup table from abbreviated ID to full ID for both inputs
    seq_int_keys.update(aln_int_keys)

    #Create Clustalw app in -sequences mode: no -align, no -infile.
    app = Clustalw(InputHandler='_input_as_multiline_string',
                   params=params, SuppressStderr=True)
    app.Parameters['-align'].off()
    app.Parameters['-infile'].off()
    app.Parameters['-sequences'].on()

    profile_paths = []
    res = None
    try:
        #Add aln_int_map as profile1 and seq_int_map as profile2.
        # The paths are tracked locally so they can be removed in finally.
        for flag, int_map in (('-profile1', aln_int_map),
                              ('-profile2', seq_int_map)):
            path = app._tempfile_as_multiline_string(int_map.toFasta())
            profile_paths.append(path)
            app.Parameters[flag].on(path)

        #Get results; all input is supplied through the profile parameters
        res = app()

        #Get alignment as dict out of results
        alignment = dict(ClustalParser(res['Align'].readlines()))

        #Make new dict mapping original IDs
        new_alignment = {}
        for k, v in alignment.items():
            new_alignment[seq_int_keys[k]] = v

        #Create an Alignment object from alignment dict
        new_alignment = Alignment(new_alignment, MolType=moltype)
    finally:
        #Clean up whatever was created, on success and on failure alike
        if res is not None:
            res.cleanUp()
        for path in profile_paths:
            remove(path)
    return new_alignment
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None,
                              working_dir='/tmp'):
    """Returns a tree from Alignment object aln.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
        to build one.
        - Clearcut only accepts aligned sequences. Alignment object used to
        handle unaligned sequences.

    moltype: a cogent.core.moltype object.
        - NOTE: If moltype = RNA, we must convert to DNA since Clearcut v1.0.8
        gives incorrect results if RNA is passed in. 'U' is treated as an
        incorrect character and is excluded from distance calculations.

    best_tree: if True (default:False), uses a slower but more accurate
        algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.
        The dict is copied before '--out' is added, so neither the caller's
        dict nor a shared default is modified.

    working_dir: directory used for the temporary output filename and as the
        app controller's WorkingDir.

    The result will be a tree parsed by DndParser with PhyloNode as
    constructor, with the original sequence names restored on its tips.
    """
    #Private copy: the original wrote '--out' into a shared default dict
    params = dict(params) if params else {}
    params['--out'] = get_tmp_filename(working_dir)

    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(InputHandler='_input_as_multiline_string', params=params,
                   WorkingDir=working_dir, SuppressStdout=True,
                   SuppressStderr=True)
    #Input is an alignment
    app.Parameters['-a'].on()
    #Turn off input as distance matrix
    app.Parameters['-d'].off()

    #If moltype = RNA, we must convert to DNA.
    if moltype == RNA:
        moltype = DNA

    if best_tree:
        app.Parameters['-N'].on()

    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    # Setup mapping. Clearcut clips identifiers. We will need to remap them.
    # Clearcut only accepts aligned sequences. Let Alignment object handle
    # unaligned sequences.
    seq_aln = Alignment(aln, MolType=moltype)
    #get int mapping
    int_map, int_keys = seq_aln.getIntMap()
    #create new Alignment object with int_map
    int_map = Alignment(int_map)

    # Collect result
    result = app(int_map.toFasta())

    try:
        # Build tree and restore the original names on its tips
        tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
        for node in tree.tips():
            node.Name = int_keys[node.Name]
    finally:
        # Clean up, also when the tree cannot be parsed or remapped
        result.cleanUp()

    return tree
def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.sequence.Sequence object, or data that can be used
        to build one.
    aln: an cogent.core.alignment.Alignment object, or data that can be used
        to build one
    moltype: passed as MolType to every collection/alignment built here; its
        upper-cased label selects the Mafft moltype flag through MOLTYPE_MAP.
    params: dict of parameters to pass in to the Mafft app controller.
    accurate: if True (default:False), turns on --globalpair and sets
        --maxiterate to 1000: more accurate alignment, sacrificing
        performance.

    The temporary seed file written for the run is removed even when the run,
    or the parsing of its output, raises.
    """
    #Create SequenceCollection from seqs and the mapping between abbreviated
    # IDs and full IDs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    seq_int_map = SequenceCollection(seq_int_map, MolType=moltype)

    #Same for the existing alignment, under its own prefix (presumably so
    # that its abbreviated IDs cannot clash with those of seqs -- confirm
    # against getIntMap)
    aln = Alignment(aln, MolType=moltype)
    aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
    aln_int_map = Alignment(aln_int_map, MolType=moltype)

    #Single lookup table from abbreviated ID to full ID for both inputs
    seq_int_keys.update(aln_int_keys)

    #Create Mafft app.
    app = Mafft(InputHandler='_input_as_multiline_string',
                params=params, SuppressStderr=True)

    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    #Do not report progress
    app.Parameters['--quiet'].on()

    #Write aln_int_map to a temporary file to be used as seed alignment
    seed_path = app._tempfile_as_multiline_string(aln_int_map.toFasta())
    res = None
    try:
        app.Parameters['--seed'].on(seed_path)

        #More accurate alignment, sacrificing performance.
        if accurate:
            app.Parameters['--globalpair'].on()
            app.Parameters['--maxiterate'].Value = 1000

        #Get results using int_map as input to app
        res = app(seq_int_map.toFasta())

        #Get alignment as dict out of results
        alignment = dict(MinimalFastaParser(res['StdOut'].readlines()))

        #Make new dict mapping original IDs; any '_seed_' marker is stripped
        # from the returned label before the lookup
        new_alignment = {}
        for k, v in alignment.items():
            key = k.replace('_seed_', '')
            new_alignment[seq_int_keys[key]] = v

        #Create an Alignment object from alignment dict
        new_alignment = Alignment(new_alignment, MolType=moltype)
    finally:
        #Clean up whatever was created, on success and on failure alike
        if res is not None:
            res.cleanUp()
        remove(seed_path)
    return new_alignment
def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.sequence.Sequence object, or data that can be used
        to build one.
    aln: an cogent.core.alignment.Alignment object, or data that can be used
        to build one
    moltype: passed as MolType to every collection/alignment built here; its
        upper-cased label selects the Mafft moltype flag through MOLTYPE_MAP.
    params: dict of parameters to pass in to the Mafft app controller.
    accurate: if True (default:False), turns on --globalpair and sets
        --maxiterate to 1000: more accurate alignment, sacrificing
        performance.

    The temporary seed file written for the run is removed even when the run,
    or the parsing of its output, raises.
    """
    #Create SequenceCollection from seqs and the mapping between abbreviated
    # IDs and full IDs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    seq_int_map = SequenceCollection(seq_int_map, MolType=moltype)

    #Same for the existing alignment, under its own prefix (presumably so
    # that its abbreviated IDs cannot clash with those of seqs -- confirm
    # against getIntMap)
    aln = Alignment(aln, MolType=moltype)
    aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
    aln_int_map = Alignment(aln_int_map, MolType=moltype)

    #Single lookup table from abbreviated ID to full ID for both inputs
    seq_int_keys.update(aln_int_keys)

    #Create Mafft app.
    app = Mafft(InputHandler='_input_as_multiline_string',
                params=params, SuppressStderr=True)

    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    #Do not report progress
    app.Parameters['--quiet'].on()

    #Write aln_int_map to a temporary file to be used as seed alignment
    seed_path = app._tempfile_as_multiline_string(aln_int_map.toFasta())
    res = None
    try:
        app.Parameters['--seed'].on(seed_path)

        #More accurate alignment, sacrificing performance.
        if accurate:
            app.Parameters['--globalpair'].on()
            app.Parameters['--maxiterate'].Value = 1000

        #Get results using int_map as input to app
        res = app(seq_int_map.toFasta())

        #Get alignment as dict out of results
        alignment = dict(parse_fasta(res['StdOut']))

        #Make new dict mapping original IDs; any '_seed_' marker is stripped
        # from the returned label before the lookup
        new_alignment = {}
        for k, v in alignment.items():
            key = k.replace('_seed_', '')
            new_alignment[seq_int_keys[key]] = v

        #Create an Alignment object from alignment dict
        new_alignment = Alignment(new_alignment, MolType=moltype)
    finally:
        #Clean up whatever was created, on success and on failure alike
        if res is not None:
            res.cleanUp()
        remove(seed_path)
    return new_alignment
#pycogent module
from cogent.core.alignment import Alignment

#Load the alignment; the with-block closes the file as soon as it is read
with open('H**o.fasta') as fasta_file:
    ali = Alignment(fasta_file.read())

print(ali.toFasta())

#Print the fraction of gap characters ('-') in every alignment column.
# The values go on one line separated by spaces; the bare print() after the
# loop then terminates that line. (The original `print(x, )` reads like a
# Python 2 trailing-comma print converted without `end=' '`.)
for column in ali.iterPositions():
    gap_fraction = float(column.count('-')) / len(column)
    print('%4.2f' % gap_fraction, end=' ')
print()
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None,
                              working_dir='/tmp'):
    """Returns a tree from Alignment object aln.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
        to build one.
        - Clearcut only accepts aligned sequences. Alignment object used to
        handle unaligned sequences.

    moltype: a cogent.core.moltype object.
        - NOTE: If moltype = RNA, we must convert to DNA since Clearcut v1.0.8
        gives incorrect results if RNA is passed in. 'U' is treated as an
        incorrect character and is excluded from distance calculations.

    best_tree: if True (default:False), uses a slower but more accurate
        algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.
        The dict is copied before '--out' is added, so neither the caller's
        dict nor a shared default is modified.

    working_dir: directory used for the temporary output filename and as the
        app controller's WorkingDir.

    The result will be a tree parsed by DndParser with PhyloNode as
    constructor, with the original sequence names restored on its tips.
    """
    #Private copy: the original wrote '--out' into a shared default dict
    params = dict(params) if params else {}
    params['--out'] = get_tmp_filename(working_dir)

    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(InputHandler='_input_as_multiline_string', params=params,
                   WorkingDir=working_dir, SuppressStdout=True,
                   SuppressStderr=True)
    #Input is an alignment
    app.Parameters['-a'].on()
    #Turn off input as distance matrix
    app.Parameters['-d'].off()

    #If moltype = RNA, we must convert to DNA.
    if moltype == RNA:
        moltype = DNA

    if best_tree:
        app.Parameters['-N'].on()

    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    # Setup mapping. Clearcut clips identifiers. We will need to remap them.
    # Clearcut only accepts aligned sequences. Let Alignment object handle
    # unaligned sequences.
    seq_aln = Alignment(aln, MolType=moltype)
    #get int mapping
    int_map, int_keys = seq_aln.getIntMap()
    #create new Alignment object with int_map
    int_map = Alignment(int_map)

    # Collect result
    result = app(int_map.toFasta())

    try:
        # Build tree and restore the original names on its tips
        tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
        for node in tree.tips():
            node.Name = int_keys[node.Name]
    finally:
        # Clean up, also when the tree cannot be parsed or remapped
        result.cleanUp()

    return tree