def test_split_fasta_diff_num_seqs_per_file_alt(self):
    """split_fasta funcs always catches all seqs """
    # 59 sequences: prime, so the per-file splits are deliberately awkward
    in_seqs = SequenceCollection.from_fasta_records(
        [('seq%s' % k, 'AACCTTAA') for k in range(59)], DNA)
    infile = in_seqs.to_fasta().split('\n')

    # exercise every seqs_per_file value from 1 through 999
    for seqs_per_file in range(1, 1000):
        fd, prefix = mkstemp(dir=get_qiime_temp_dir(),
                             prefix='split_fasta_tests',
                             suffix='')
        close(fd)

        out_paths = split_fasta(infile, seqs_per_file, prefix)
        collected = []
        for out_path in out_paths:
            collected.extend(open(out_path))

        # remove the files now, so if the test fails they still get
        # cleaned up
        remove_files(out_paths)

        # the split files, taken together, must reproduce the input seqs
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(collected),
                                                  DNA))
def test_split_fasta_diff_num_seqs_per_file(self):
    """split_fasta funcs as expected when diff num seqs go to each file """
    fd, prefix = mkstemp(dir=get_qiime_temp_dir(),
                         prefix='split_fasta_tests',
                         suffix='')
    close(fd)

    # three records, one of them spanning two sequence lines
    infile = ['>seq1', 'AACCTTAA',
              '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    out_paths = split_fasta(infile, 2, prefix)
    collected = []
    for out_path in out_paths:
        collected.extend(open(out_path))
    remove_files(out_paths)

    # list of file paths is as expected
    expected_paths = ['%s.%d.fasta' % (prefix, n) for n in range(2)]
    self.assertEqual(out_paths, expected_paths)

    # the split files, taken together, must reproduce the input seqs
    self.assertEqual(
        SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
        SequenceCollection.from_fasta_records(parse_fasta(collected), DNA))
def test_split_fasta_equal_num_seqs_per_file(self):
    """split_fasta funcs as expected when equal num seqs go to each file """
    fd, prefix = mkstemp(dir=get_qiime_temp_dir(),
                         prefix='split_fasta_tests',
                         suffix='')
    close(fd)

    # three records; one per output file when seqs_per_file == 1
    infile = ['>seq1', 'AACCTTAA',
              '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    out_paths = split_fasta(infile, 1, prefix)
    collected = []
    for out_path in out_paths:
        collected.extend(open(out_path))
    remove_files(out_paths)

    expected_paths = ['%s.%d.fasta' % (prefix, n) for n in range(3)]
    self.assertEqual(out_paths, expected_paths)

    # the split files, taken together, must reproduce the input seqs
    self.assertEqual(
        SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
        SequenceCollection.from_fasta_records(parse_fasta(collected), DNA))
def test_split_fasta_diff_num_seqs_per_file(self):
    """split_fasta funcs as expected when diff num seqs go to each file """
    fd, prefix = mkstemp(dir=get_qiime_temp_dir(),
                         prefix='split_fasta_tests', suffix='')
    close(fd)

    infile = ['>seq1', 'AACCTTAA',
              '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    result_paths = split_fasta(infile, 2, prefix)

    written_lines = []
    for path in result_paths:
        written_lines += list(open(path))
    remove_files(result_paths)

    # two files expected: 3 seqs at 2 seqs per file
    self.assertEqual(result_paths,
                     ['%s.%d.fasta' % (prefix, n) for n in range(2)])

    # building seq collections from infile and the split files result in
    # equivalent seq collections
    self.assertEqual(
        SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
        SequenceCollection.from_fasta_records(parse_fasta(written_lines),
                                              DNA))
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align candidate sequences against the template with PyNAST.

    seq_path : path to the fasta file of candidate sequences.
    result_path : if given, aligned seqs are written there and None is
        returned; otherwise the Alignment object is returned.
    log_path : passed to NastLogger for run logging.
    failure_path : if given, sequences that failed to align are written
        there as fasta.
    """
    # load candidate sequences ('U' = universal newlines; Python 2 idiom)
    seq_file = open(seq_path, 'U')
    candidate_sequences = parse_fasta(seq_file)

    # load template sequences
    template_alignment = []
    template_alignment_fp = self.Params['template_filepath']
    for seq_id, seq in parse_fasta(open(template_alignment_fp)):
        # replace '.' gap characters with '-' and uppercase so the
        # template validates as a DNA alignment
        template_alignment.append((seq_id, seq.replace('.', '-').upper()))
    template_alignment = Alignment.from_fasta_records(template_alignment,
                                                      DNASequence,
                                                      validate=True)

    # initialize_logger
    logger = NastLogger(log_path)

    # get function for pairwise alignment method
    pairwise_alignment_f = pairwise_alignment_methods[
        self.Params['pairwise_alignment_method']]

    # run PyNAST; returns (aligned seqs, seqs that failed to align)
    pynast_aligned, pynast_failed = pynast_seqs(
        candidate_sequences,
        template_alignment,
        min_pct=self.Params['min_pct'],
        min_len=self.Params['min_len'],
        align_unaligned_seqs_f=pairwise_alignment_f,
        logger=logger,
        temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # convert PyNAST result objects to scikit-bio DNASequence in place,
    # then wrap in the appropriate container types
    for i, seq in enumerate(pynast_failed):
        skb_seq = DNASequence(str(seq), id=seq.Name)
        pynast_failed[i] = skb_seq
    pynast_failed = SequenceCollection(pynast_failed)

    for i, seq in enumerate(pynast_aligned):
        skb_seq = DNASequence(str(seq), id=seq.Name)
        pynast_aligned[i] = skb_seq
    pynast_aligned = Alignment(pynast_aligned)

    if failure_path is not None:
        fail_file = open(failure_path, 'w')
        fail_file.write(pynast_failed.to_fasta())
        fail_file.close()

    # when a result path is given, write the alignment and return None;
    # otherwise return the Alignment object directly
    if result_path is not None:
        result_file = open(result_path, 'w')
        result_file.write(pynast_aligned.to_fasta())
        result_file.close()
        return None
    else:
        return pynast_aligned
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align candidate sequences against the template with PyNAST.

    seq_path : path to the fasta file of candidate sequences.
    result_path : if given, aligned seqs are written there and None is
        returned; otherwise the Alignment object is returned.
    log_path : passed to NastLogger for run logging.
    failure_path : if given, sequences that failed to align are written
        there as fasta.
    """
    # load candidate sequences ('U' = universal newlines; Python 2 idiom)
    seq_file = open(seq_path, 'U')
    candidate_sequences = parse_fasta(seq_file)

    # load template sequences
    template_alignment = []
    template_alignment_fp = self.Params['template_filepath']
    for seq_id, seq in parse_fasta(open(template_alignment_fp)):
        # replace '.' gap characters with '-' and uppercase so the
        # template validates as a DNA alignment
        template_alignment.append((seq_id, seq.replace('.', '-').upper()))
    template_alignment = Alignment.from_fasta_records(
        template_alignment, DNASequence, validate=True)

    # initialize_logger
    logger = NastLogger(log_path)

    # get function for pairwise alignment method
    pairwise_alignment_f = pairwise_alignment_methods[
        self.Params['pairwise_alignment_method']]

    # run PyNAST; returns (aligned seqs, seqs that failed to align)
    pynast_aligned, pynast_failed = pynast_seqs(
        candidate_sequences,
        template_alignment,
        min_pct=self.Params['min_pct'],
        min_len=self.Params['min_len'],
        align_unaligned_seqs_f=pairwise_alignment_f,
        logger=logger,
        temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # convert PyNAST result objects to scikit-bio DNASequence in place,
    # then wrap in the appropriate container types
    for i, seq in enumerate(pynast_failed):
        skb_seq = DNASequence(str(seq), id=seq.Name)
        pynast_failed[i] = skb_seq
    pynast_failed = SequenceCollection(pynast_failed)

    for i, seq in enumerate(pynast_aligned):
        skb_seq = DNASequence(str(seq), id=seq.Name)
        pynast_aligned[i] = skb_seq
    pynast_aligned = Alignment(pynast_aligned)

    if failure_path is not None:
        fail_file = open(failure_path, 'w')
        fail_file.write(pynast_failed.to_fasta())
        fail_file.close()

    # when a result path is given, write the alignment and return None;
    # otherwise return the Alignment object directly
    if result_path is not None:
        result_file = open(result_path, 'w')
        result_file.write(pynast_aligned.to_fasta())
        result_file.close()
        return None
    else:
        return pynast_aligned
def filter_samples(prefs, data, dir_path='', filename=None):
    """Filter the OTU map and representative seq set by sample.

    Writes three files under dir_path: the filtered OTU map
    (<filename>_sfiltered_otus.txt), the removed sequences
    (<filename>_sremoved.fasta), and the retained sequences
    (<filename>_sfiltered.fasta).

    Raises ValueError if no sequences were removed, or if no sequences
    remain after filtering.
    """
    aln = data['aln']
    otus = data['otus']

    # filter the otus file based on which samples to remove
    new_otus_list = filter_otus(otus, prefs)

    # FIX: the original rebound the output-path variable to the open file
    # handle and never used context managers (handle leaked on error);
    # use distinct names and 'with' blocks instead.
    filtered_otus_fp = '%s/%s_sfiltered_otus.txt' % (dir_path, filename)
    with open(filtered_otus_fp, 'w') as otus_out:
        # Write out a new otus file
        for key in new_otus_list:
            otus_out.write(key[0])
            for j in key[1]:
                otus_out.write('\t' + str(j))
            otus_out.write('\n')

    # filter seq set
    filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)

    # write a fasta containing list of sequences removed from
    # representative set
    if len(removed_seqs) > 0:
        removed_seqs = SequenceCollection.from_fasta_records(
            [(e[0], str(e[1])) for e in removed_seqs], DNA)
    else:
        raise ValueError(
            'No sequences were removed. Did you specify the correct Sample ID?'
        )
    removed_fp = '%s/%s_sremoved.fasta' % (dir_path, filename)
    with open(removed_fp, 'w') as removed_out:
        removed_out.write(removed_seqs.to_fasta())

    # write a fasta containing the filtered representative seqs
    if len(filtered_seqs) > 0:
        filtered_seqs = SequenceCollection.from_fasta_records(
            [(e[0], str(e[1])) for e in filtered_seqs], DNA)
    else:
        raise ValueError(
            'No sequences were remaining in the fasta file. Did you remove all Sample ID\'s?'
        )
    filtered_fp = '%s/%s_sfiltered.fasta' % (dir_path, filename)
    with open(filtered_fp, 'w') as filtered_out:
        filtered_out.write(filtered_seqs.to_fasta())
def test_call_write_to_file(self):
    """ReferenceRepSetPicker.__call__ otu map correctly written to file"""
    picker = ReferenceRepSetPicker(params={'Algorithm': 'first',
                                           'ChoiceF': first_id})
    picker(self.tmp_seq_filepath,
           self.tmp_otu_filepath,
           self.ref_seq_filepath,
           result_path=self.result_filepath)

    with open(self.result_filepath) as result_f:
        observed = SequenceCollection.from_fasta_records(
            parse_fasta(result_f), DNA)
    expected = SequenceCollection.from_fasta_records(
        parse_fasta(rep_seqs_reference_result_file_exp.split('\n')), DNA)

    # order is irrelevant, so compare as sets
    self.assertEqual(set(observed), set(expected))
def filter_samples(prefs, data, dir_path='', filename=None):
    """Filter the OTU map and representative seq set by sample.

    Writes three files under dir_path: the filtered OTU map
    (<filename>_sfiltered_otus.txt), the removed sequences
    (<filename>_sremoved.fasta), and the retained sequences
    (<filename>_sfiltered.fasta).

    Raises ValueError if no sequences were removed, or if no sequences
    remain after filtering.
    """
    aln = data['aln']
    otus = data['otus']

    # filter the otus file based on which samples to remove
    new_otus_list = filter_otus(otus, prefs)

    # FIX: the original rebound the output-path variable to the open file
    # handle and never used context managers (handle leaked on error);
    # use distinct names and 'with' blocks instead.
    filtered_otus_fp = '%s/%s_sfiltered_otus.txt' % (dir_path, filename)
    with open(filtered_otus_fp, 'w') as otus_out:
        # Write out a new otus file
        for key in new_otus_list:
            otus_out.write(key[0])
            for j in key[1]:
                otus_out.write('\t' + str(j))
            otus_out.write('\n')

    # filter seq set
    filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)

    # write a fasta containing list of sequences removed from
    # representative set
    if len(removed_seqs) > 0:
        removed_seqs = SequenceCollection.from_fasta_records(
            [(e[0], str(e[1])) for e in removed_seqs], DNA)
    else:
        raise ValueError(
            'No sequences were removed. Did you specify the correct Sample ID?')
    removed_fp = '%s/%s_sremoved.fasta' % (dir_path, filename)
    with open(removed_fp, 'w') as removed_out:
        removed_out.write(removed_seqs.to_fasta())

    # write a fasta containing the filtered representative seqs
    if len(filtered_seqs) > 0:
        filtered_seqs = SequenceCollection.from_fasta_records(
            [(e[0], str(e[1])) for e in filtered_seqs], DNA)
    else:
        raise ValueError(
            'No sequences were remaining in the fasta file. Did you remove all Sample ID\'s?')
    filtered_fp = '%s/%s_sfiltered.fasta' % (dir_path, filename)
    with open(filtered_fp, 'w') as filtered_out:
        filtered_out.write(filtered_seqs.to_fasta())
def _qseq_to_sequence_collection(fh, constructor=BiologicalSequence,
                                 filter=_will_filter,
                                 phred_offset=_default_phred_offset,
                                 variant=_default_variant):
    """Read a QSeq filehandle into a SequenceCollection."""
    records = _qseq_to_generator(fh, constructor=constructor, filter=filter,
                                 phred_offset=phred_offset, variant=variant)
    return SequenceCollection(list(records))
def test_call_write_to_file(self):
    """ReferenceRepSetPicker.__call__ otu map correctly written to file"""
    params = {'Algorithm': 'first', 'ChoiceF': first_id}
    app = ReferenceRepSetPicker(params=params)
    app(self.tmp_seq_filepath,
        self.tmp_otu_filepath,
        self.ref_seq_filepath,
        result_path=self.result_filepath)

    with open(self.result_filepath) as f:
        actual = SequenceCollection.from_fasta_records(parse_fasta(f), DNA)
    expected = SequenceCollection.from_fasta_records(
        parse_fasta(rep_seqs_reference_result_file_exp.split('\n')), DNA)

    # we don't care about order in the results
    self.assertEqual(set(actual), set(expected))
def _fastq_to_sequence_collection(fh, variant=None, phred_offset=None,
                                  constructor=BiologicalSequence):
    """Read a FASTQ filehandle into a SequenceCollection."""
    records = _fastq_to_generator(fh,
                                  variant=variant,
                                  phred_offset=phred_offset,
                                  constructor=constructor)
    return SequenceCollection(list(records))
def _fasta_to_sequence_collection(fh, qual=FileSentinel,
                                  constructor=Sequence, **kwargs):
    """Read a FASTA filehandle (with optional qual) into a SequenceCollection."""
    records = _fasta_to_generator(fh, qual=qual, constructor=constructor,
                                  **kwargs)
    return SequenceCollection(list(records))
def _fastq_to_sequence_collection(fh, variant=None, phred_offset=None,
                                  constructor=Sequence, **kwargs):
    """Read a FASTQ filehandle into a SequenceCollection."""
    records = _fastq_to_generator(fh,
                                  variant=variant,
                                  phred_offset=phred_offset,
                                  constructor=constructor,
                                  **kwargs)
    return SequenceCollection(list(records))
def test_split_fasta_equal_num_seqs_per_file(self):
    """split_fasta funcs as expected when equal num seqs go to each file """
    fd, prefix = mkstemp(dir=get_qiime_temp_dir(),
                         prefix='split_fasta_tests', suffix='')
    close(fd)

    infile = ['>seq1', 'AACCTTAA',
              '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    result_paths = split_fasta(infile, 1, prefix)

    written_lines = []
    for path in result_paths:
        written_lines += list(open(path))
    remove_files(result_paths)

    # three files expected: 3 seqs at 1 seq per file
    self.assertEqual(result_paths,
                     ['%s.%d.fasta' % (prefix, n) for n in range(3)])

    # building seq collections from infile and the split files result in
    # equivalent seq collections
    self.assertEqual(
        SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
        SequenceCollection.from_fasta_records(parse_fasta(written_lines),
                                              DNA))
def test_multiple_sequence_alignment(self):
    """multiple_sequence_alignment produces the expected alignment."""
    input_seqs = [
        DNA('caccggcggcccggtggtggccattattattgggtctaaag', id='seq_1'),
        DNA('caccggcggcccgagtggtggccattattattgggtcaagg', id='seq_2'),
        DNA('caccggcggcccgagtgatggccattattattgggtctaaag', id='seq_3'),
        DNA('aaccggcggcccaagtggtggccattattattgggtctaaag', id='seq_4'),
        DNA('caccgggcccgagtggtggccattattattgggtctaaag', id='seq_5')]

    # write the unaligned sequences to a fasta file for the aligner
    seqs_fp = join(self.working_dir, "seqs.fna")
    with open(seqs_fp, 'w') as out_f:
        out_f.write(SequenceCollection(input_seqs).to_fasta())

    observed = multiple_sequence_alignment(seqs_fp)

    expected = [
        DNA('caccggcggcccg-gtggtggccattattattgggtctaaag', id='seq_1'),
        DNA('caccggcggcccgagtggtggccattattattgggtcaagg-', id='seq_2'),
        DNA('caccggcggcccgagtgatggccattattattgggtctaaag', id='seq_3'),
        DNA('aaccggcggcccaagtggtggccattattattgggtctaaag', id='seq_4'),
        DNA('caccg--ggcccgagtggtggccattattattgggtctaaag', id='seq_5')]
    self.assertItemsEqual(observed, expected)
def test_filter_aln_by_otus(self):
    """filter_aln_by_otus splits an alignment into kept and removed seqs."""
    self.sample_to_extract = 'SampleA,SampleB'

    expected_kept = [('SampleA', 'AAAAAAAAAAAAAAA')]
    expected_removed = [('SampleB', 'CCCCCCC'),
                        ('SampleC', 'GGGGGGGGGGGGGG')]

    aln = SequenceCollection.from_fasta_records(self.aln, DNA)
    kept, removed = filter_aln_by_otus(aln, self.prefs)

    self.assertEqual(kept, expected_kept)
    self.assertEqual(removed, expected_removed)
def main():
    """opens files as necessary based on prefs"""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    data = {}

    fasta_file = opts.input_fasta_fp

    # load the input alignment
    data['aln'] = SequenceCollection.from_fasta_records(
        parse_fasta(open(fasta_file)), DNA)

    # Load the otu file
    otu_path = opts.otu_map_fp
    otu_f = open(otu_path, 'U')
    otus = fields_to_dict(otu_f)
    otu_f.close()
    data['otus'] = otus

    # Determine which samples to extract from representative seqs
    # and from otus file
    if opts.samples_to_extract:
        prefs = process_extract_samples(opts.samples_to_extract)

    # derive the output file basename from the input fasta path
    filepath = opts.input_fasta_fp
    filename = filepath.strip().split('/')[-1]
    filename = filename.split('.')[0]

    # FIX: the original swallowed OSError from os.mkdir and left dir_path
    # unassigned, causing a NameError at the action(...) call below;
    # assign dir_path up front so it is always bound.
    if opts.output_dir:
        dir_path = opts.output_dir
        if not os.path.exists(dir_path):
            try:
                os.mkdir(dir_path)
            except OSError:
                # dir may have been created concurrently; any real problem
                # will surface when the output files are written
                pass
    else:
        dir_path = './'

    try:
        action = filter_samples
    except NameError:
        action = None
    # Place this outside try/except so we don't mask NameError in action
    if action:
        action(prefs, data, dir_path, filename)
def test_call_pynast_test1_file_output_alt_params(self):
    """PyNastAligner writes correct output files when no seqs align """
    # min_len of 1000 guarantees every test sequence fails to align
    aligner = PyNastAligner({
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 1000})

    result = aligner(self.pynast_test1_input_fp,
                     result_path=self.result_fp,
                     log_path=self.log_fp,
                     failure_path=self.failure_fp)

    self.assertTrue(result is None,
                    "Result should be None when result path provided.")
    self.assertEqual(getsize(self.result_fp), 0,
                     "No alignable seqs should result in an empty file.")

    # all seqs reported to fail
    with open(self.failure_fp) as failure_f:
        failures = SequenceCollection.from_fasta_records(
            parse_fasta(failure_f), DNA)
    self.assertEqual(failures.sequence_count(), 3)
def test_call_pynast_test1_file_output(self):
    """PyNastAligner writes correct output files for pynast_test1 seqs """
    # do not collect results; check output files instead
    result = self.pynast_test1_aligner(self.pynast_test1_input_fp,
                                       result_path=self.result_fp,
                                       log_path=self.log_fp,
                                       failure_path=self.failure_fp)
    self.assertTrue(result is None,
                    "Result should be None when result path provided.")

    # alignment file matches the expected alignment
    with open(self.result_fp) as result_f:
        observed_aln = Alignment.from_fasta_records(
            parse_fasta(result_f), DNA)
    self.assertEqual(observed_aln, self.pynast_test1_expected_aln)

    # failure file matches the expected failures
    with open(self.failure_fp) as failure_f:
        observed_fail = SequenceCollection.from_fasta_records(
            parse_fasta(failure_f), DNA)
    self.assertEqual(observed_fail.to_fasta(),
                     self.pynast_test1_expected_fail.to_fasta())
def extensions_onto_foundation(otu_file_fh, extension_taxonomy_fh, extension_seq_fh, foundation_alignment_fh, ghost_tree_fp): """Combines two genetic databases into one phylogenetic tree. Some genetic databases provide finer taxonomic resolution, but high sequence variability causes poor multiple sequence alignments (these are the "extension trees"). Other databases provide high quality phylogenetic information (hence it is used as the "foundation"), but poor taxonomic resolution. This script combines two genetic databases into one phylogenetic tree in .nwk format, taking advantage of the benefits of both databases, but allowing sequencing to be performed using the "extension trees" primer set. Parameters __________ otu_file_fh : filehandle Tab-delimited text file containing OTU clusters in rows containing accession numbers only. Format can be 1) where the accession number is in the first column with only one column or 2) it can contain accession numbers clustered in tab-delimited rows containing more accession numbers, which are part of that OTU cluster (as in output of "ghost-tree group-extensions"). This file refers to the "extension trees". File references to sequence reads or sample numbers/names are not valid here. This is not an OTU .biom table. extension_taxonomy_fh : filehandle Tab-delimited text file related to "extension trees" wih the 1st column being an accession number (same accession numbers in otu_file_fh and extension_taxonomy_fh) and the 2nd column is the taxonomy ranking in the following format: k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales; f__Sebacinaceae;g__unidentified;s__Sebacina extension_seq_fh : filehandle The .fasta formated sequences for the "extension trees" genetic dataset. Sequence identifiers are the accession numbers. These accession numbers are the same as in the otu_file_fh and extension_taxonomy_fh. 
foundation_alignment_fh : filehandle File containing pre-aligned sequences from a genetic marker database in .fasta format. This file refers to the "foundation" of the ghost-tree. Contains accession numbers and taxonomy labels. ghost_tree_fp : folder Output folder contains files including: a) The Newick formatted ghost-tree, which is the final output of the ghost-tree tool. This is a phylogenetic tree designed for downstream diversity analyses. b) Accession IDs from the ghost-tree.nwk file that you can use for downstream analyses tools c) log error file (this is an optional file that you can have if you type '--stderr') """ global foundation_accession_genus_dic # needs global assignment for flake8 foundation_accession_genus_dic = {} std_output, std_error = "", "" process = subprocess.Popen("muscle", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) std_output, std_error = process.communicate() if re.search("command not found", std_error): print "muscle, multiple sequence aligner, is not found. Is it" \ " installed? Is it in your path?" process = subprocess.Popen("fasttree", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) std_output, std_error = process.communicate() std_output, std_error = "", "" if re.search("command not found", std_error): print "fasttree, phylogenetic tree builder, is not found. Is it" \ " installed? Is it in your path?" 
os.mkdir("tmp") os.mkdir(ghost_tree_fp) extension_genus_accession_list_dic = \ _extension_genus_accession_dic(otu_file_fh, extension_taxonomy_fh) skbio.write(_make_nr_foundation_alignment( foundation_alignment_fh, extension_genus_accession_list_dic), into=ghost_tree_fp + "/nr_foundation_alignment_gt.fasta", format="fasta") foundation_tree, all_std_error = _make_foundation_tree( ghost_tree_fp + "/nr_foundation_alignment_gt.fasta", std_error, ghost_tree_fp) seqs = SequenceCollection.read(extension_seq_fh) for node in foundation_tree.tips(): key_node, _ = str(node).split(":") key_node = foundation_accession_genus_dic[key_node] try: _make_mini_otu_files(key_node, extension_genus_accession_list_dic, seqs) process = subprocess.Popen( "muscle -in tmp/mini_seq_gt.fasta" + " -out" + " tmp/mini_alignment_gt.fasta -quiet" + " -maxiters 2 -diags1", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) std_output, std_error = process.communicate() process = subprocess.Popen("fasttree -nt -quiet" + " tmp/mini_alignment_gt.fasta >" + " tmp/mini_tree_gt.nwk", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) std_output, std_error = process.communicate() all_std_error += "FastTree warnings for genus " + key_node + " are:\n" + std_error + "\n" mini_tree = read("tmp/mini_tree_gt.nwk", format='newick', into=TreeNode) node.extend(mini_tree.root_at_midpoint().children[:]) except: continue shutil.rmtree("tmp") ghost_tree_nwk = open(ghost_tree_fp + "/ghost_tree.nwk", "w") ghost_tree_nwk.write(str(foundation_tree)) ghost_tree_nwk.close() _make_accession_id_file(ghost_tree_fp) return str(foundation_tree).strip(), all_std_error
def setUp(self):
    """Create the input, template, and output temp files the tests use."""
    # candidate sequences to be aligned
    fd, self.pynast_test1_input_fp = mkstemp(prefix='PyNastAlignerTests_',
                                             suffix='.fasta')
    close(fd)
    with open(self.pynast_test1_input_fp, 'w') as f:
        f.write(pynast_test1_input_fasta)

    # reference template alignment
    fd, self.pynast_test1_template_fp = mkstemp(
        prefix='PyNastAlignerTests_',
        suffix='template.fasta')
    close(fd)
    with open(self.pynast_test1_template_fp, 'w') as f:
        f.write(pynast_test1_template_fasta)

    # template variant: '.' gap characters instead of '-'
    fd, self.pynast_test_template_w_dots_fp = mkstemp(
        prefix='PyNastAlignerTests_',
        suffix='template.fasta')
    close(fd)
    with open(self.pynast_test_template_w_dots_fp, 'w') as f:
        f.write(pynast_test1_template_fasta.replace('-', '.'))

    # template variant: RNA-style 'U' characters instead of 'T'
    fd, self.pynast_test_template_w_u_fp = mkstemp(
        prefix='PyNastAlignerTests_',
        suffix='template.fasta')
    close(fd)
    with open(self.pynast_test_template_w_u_fp, 'w') as f:
        f.write(pynast_test1_template_fasta.replace('T', 'U'))

    # template variant: all lowercase
    fd, self.pynast_test_template_w_lower_fp = mkstemp(
        prefix='PyNastAlignerTests_',
        suffix='template.fasta')
    close(fd)
    with open(self.pynast_test_template_w_lower_fp, 'w') as f:
        f.write(pynast_test1_template_fasta.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    fd, self.result_fp = mkstemp(prefix='PyNastAlignerTests_',
                                 suffix='.fasta')
    close(fd)
    open(self.result_fp, 'w').close()

    fd, self.failure_fp = mkstemp(prefix='PyNastAlignerTests_',
                                  suffix='.fasta')
    close(fd)
    open(self.failure_fp, 'w').close()

    fd, self.log_fp = mkstemp(prefix='PyNastAlignerTests_',
                              suffix='.log')
    close(fd)
    open(self.log_fp, 'w').close()

    # everything created above is removed in tearDown
    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp
    ]

    # aligner under test, configured with the standard template
    self.pynast_test1_aligner = PyNastAligner({
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 15,
    })

    # expected aligned seqs and expected failures for pynast_test1
    self.pynast_test1_expected_aln = Alignment.from_fasta_records(
        parse_fasta(pynast_test1_expected_alignment), DNA)
    self.pynast_test1_expected_fail = SequenceCollection.from_fasta_records(
        parse_fasta(pynast_test1_expected_failure), DNA)
tggctcagattgaacgctggcggcaggcctaacacatgcaagtcgagcggaaacgantnntntgaaccttcggggnacgatnacggcgtcgagcggcggacgggtgagtaatgcctgggaaattgccctgatgtgggggataactattggaaacgatagctaataccgcataatgtctacggaccaaagagggggaccttcgggcctctcgcttcaggatatgcccaggtgggattagctagttggtgaggtaatggctcaccaaggcgacgatccctagctggtctgagaggatgatcagccacactggaactgag """ blast_id_to_taxonomy = \ """AY800210\tArchaea;Euryarchaeota;Halobacteriales;uncultured EU883771\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel. EF503699\tArchaea;Crenarchaeota;uncultured;uncultured DQ260310\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium EF503697\tArchaea;Crenarchaeota;uncultured;uncultured""" blast_test_seqs = SequenceCollection.from_fasta_records([ ('s1', 'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC'), ('s2', 
'TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAGGCGTCTAAGAGATACTGGGAATCTAGGGACCGGGAGAGGTAAGAGGTACTTCAGGGGTAGAAGTGAAATTCTGTAATCCTTGAGGGACCACCGATGGCGAAGGCATCTTACCAGAACGGCTTCGACAGTGAGGAACGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCCAGCCGTAAACTATGCGCGTTAGGTGTGCCTGTAACTACGAGTTACCGGGGTGCCGAAGTGAAAACGTGAAACGTGCCGCCTGGGAAGTACGGTCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGACTCAACGCCGGGCAGCTCACCGGATAGGACAGCGGAATGATAGCCGGGCTGAAGACCTTGCTTGACCAGCTGAGA'), ('s3', 'AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCCGCTTAACGGATGGGCTGCGGAGGATACTGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCTTTGATCTACTGAAGACCACCAGTGGTGAAGGCGGTTCGCCAGAACGCGCTCGAACGGTGAGGATGAAAGCTGGGGGAGCAAACCGGAATAGATACCCGAGTAATCCCAACTGTAAACGATGGCAACTCGGGGATGGGTTGGCCTCCAACCAACCCCATGGCCGCAGGGAAGCCGTTTAGCTCTCCCGCCTGGGGAATACGGTCCGCAGAATTGAACCTTAAAGGAATTTGGCGGGGAACCCCCACAAGGGGGAAAACCGTGCGGTTCAATTGGAATCCACCCCCCGGAAACTTTACCCGGGCGCG'), ('s4', 
'GATACCCCCGGAAACTGGGGATTATACCGGATATGTGGGGCTGCCTGGAATGGTACCTCATTGAAATGCTCCCGCGCCTAAAGATGGATCTGCCGCAGAATAAGTAGTTTGCGGGGTAAATGGCCACCCAGCCAGTAATCCGTACCGGTTGTGAAAACCAGAACCCCGAGATGGAAACTGAAACAAAGGTTCAAGGCCTACCGGGCACAACAAGCGCCAAAACTCCGCCATGCGAGCCATCGCGACGGGGGAAAACCAAGTACCACTCCTAACGGGGTGGTTTTTCCGAAGTGGAAAAAGCCTCCAGGAATAAGAACCTGGGCCAGAACCGTGGCCAGCCGCCGCCGTTACACCCGCCAGCTCGAGTTGTTGGCCGGTTTTATTGGGGCCTAAAGCCGGTCCGTAGCCCGTTTTGATAAGGTCTCTCTGGTGAAATTCTACAGCTTAACCTGTGGGAATTGCTGGAGGATACTATTCAAGCTTGAAGCCGGGAGAAGCCTGGAAGTACTCCCGGGGGTAAGGGGTGAAATTCTATTATCCCCGGAAGACCAACTGGTGCCGAAGCGGTCCAGCCTGGAACCGAACTTGACCGTGAGTTACGAAAAGCCAAGGGGCGCGGACCGGAATAAAATAACCAGGGTAGTCCTGGCCGTAAACGATGTGAACTTGGTGGTGGGAATGGCTTCGAACTGCCCAATTGCCGAAAGGAAGCTGTAAATTCACCCGCCTTGGAAGTACGGTCGCAAGACTGGAACCTAAAAGGAATTGGCGGGGGGACACCACAACGCGTGGAGCCTGGCGGTTTTATTGGGATTCCACGCAGACATCTCACTCAGGGGCGACAGCAGAAATGATGGGCAGGTTGATGACCTTGCTTGACAAGCTGAAAAGGAGGTGCAT'), ('s5', 'TAAAATGACTAGCCTGCGAGTCACGCCGTAAGGCGTGGCATACAGGCTCAGTAACACGTAGTCAACATGCCCAAAGGACGTGGATAACCTCGGGAAACTGAGGATAAACCGCGATAGGCCAAGGTTTCTGGAATGAGCTATGGCCGAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGTAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCGCGAAACCTCTGCAATAGGCGAAAGCCTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCTGCTCAACGGATGGGCTGCGGAGGATACCGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCATTGATCTACTGAAGACCACCAGTGGCGAAGGCGGTTTGCCAGAACGCGCTCGACGGTGAGGGATGAAAGCTGGGGGAGCAAACCGGATTAGATACCCGGGGTAGTCCCAGCTGTAAACGGATGCAGACTCGGGTGATGGGGTTGGCTTCCGGCCCAACCCCAATTGCCCCCAGGCGAAGCCCGTTAAGATCTTGCCGCCCTGTCAGATGTCAGGGCCGCCAATACTCGAAACCTTAAAAGGAAATTGGGCGCGGGAAAAGTCACCAAAAGGGGGTTGAAACCCTGCGGGTTATATATTGTAAACC'), ('s6', 
'ATAGTAGGTGATTGCGAAGACCGCGGAACCGGGACCTAGCACCCAGCCTGTACCGAGGGATGGGGAGCTGTGGCGGTCCACCGACGACCCTTTGTGACAGCCGATTCCTACAATCCCAGCAACTGCAATGATCCACTCTAGTCGGCATAACCGGGAATCGTTAACCTGGTAGGGTTCTCTACGTCTGAGTCTACAGCCCAGAGCAGTCAGGCTACTATACGGTTTGCTGCATTGCATAGGCATCGGTCGCGGGCACTCCTCGCGGTTTCAGCTAGGGTTTAAATGGAGGGTCGCTGCATGAGTATGCAAATAGTGCCACTGCTCTGATACAGAGAAGTGTTGATATGACACCTAAGACCTGGTCACAGTTTTAACCTGCCTACGCACACCAGTGTGCTATTGATTAACGATATCGGTAGACACGACCTTGGTAACCTGACTAACCTCATGGAAAGTGACTAGATAAATGGACCGGAGCCAACTTTCACCCGGAAAACGGACCGACGAATCGTCGTAGACTACCGATCTGACAAAATAAGCACGAGGGAGCATGTTTTGCGCAGGCTAGCCTATTCCCACCTCAAGCCTCGAGAACCAAGACGCCTGATCCGGTGCTGCACGAAGGGTCGCCTCTAGGTAAGGAGAGCTGGCATCTCCAGATCCGATATTTTACCCAACCTTTGCGCGCTCAGATTGTTATAGTGAAACGATTTAAGCCTGAACGGAGTTCCGCTCCATATGTGGGTTATATATGTGAGATGTATTAACTTCCGCAGTTGTCTCTTTCGGTGCAGTACGCTTGGTATGTGTCTCAAATAATCGGTATTATAGTGATCTGAGAGGTTTTAAG')], DNA) blast_reference_seqs = SequenceCollection.from_fasta_records([ ('AY800210', 'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC'), ('EU883771', 
'TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAGGCGTCTAAGAGATACTGGGAATCTAGGGACCGGGAGAGGTAAGAGGTACTTCAGGGGTAGAAGTGAAATTCTGTAATCCTTGAGGGACCACCGATGGCGAAGGCATCTTACCAGAACGGCTTCGACAGTGAGGAACGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCCAGCCGTAAACTATGCGCGTTAGGTGTGCCTGTAACTACGAGTTACCGGGGTGCCGAAGTGAAAACGTGAAACGTGCCGCCTGGGAAGTACGGTCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGACTCAACGCCGGGCAGCTCACCGGATAGGACAGCGGAATGATAGCCGGGCTGAAGACCTTGCTTGACCAGCTGAGA'), ('EF503699', 'AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCCGCTTAACGGATGGGCTGCGGAGGATACTGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCTTTGATCTACTGAAGACCACCAGTGGTGAAGGCGGTTCGCCAGAACGCGCTCGAACGGTGAGGATGAAAGCTGGGGGAGCAAACCGGAATAGATACCCGAGTAATCCCAACTGTAAACGATGGCAACTCGGGGATGGGTTGGCCTCCAACCAACCCCATGGCCGCAGGGAAGCCGTTTAGCTCTCCCGCCTGGGGAATACGGTCCGCAGAATTGAACCTTAAAGGAATTTGGCGGGGAACCCCCACAAGGGGGAAAACCGTGCGGTTCAATTGGAATCCACCCCCCGGAAACTTTACCCGGGCGCG'), ('DQ260310',
def _fasta_to_sequence_collection(fh, qual=FileSentinel,
                                  constructor=BiologicalSequence):
    """Read every FASTA record from *fh* into a SequenceCollection.

    Parameters mirror ``_fasta_to_generator``: *qual* optionally points at
    a quality file, and *constructor* builds each sequence object.
    """
    records = _fasta_to_generator(fh, qual=qual, constructor=constructor)
    return SequenceCollection(list(records))
def __call__(self, seq_path, result_path=None, log_path=None, failure_path=None, cmbuild_params=None, cmalign_params=None): log_params = [] # load candidate sequences candidate_sequences = dict(parse_fasta(open(seq_path, 'U'))) # load template sequences try: info, template_alignment, struct = list(MinimalRfamParser(open( self.Params['template_filepath'], 'U'), seq_constructor=ChangedSequence))[0] except RecordError: raise ValueError( "Template alignment must be in Stockholm format with corresponding secondary structure annotation when using InfernalAligner.") # Need to make separate mapping for unaligned sequences unaligned = SequenceCollection.from_fasta_records( candidate_sequences.iteritems(), DNASequence) mapped_seqs, new_to_old_ids = unaligned.int_map(prefix='unaligned_') mapped_seq_tuples = [(k, str(v)) for k,v in mapped_seqs.iteritems()] # Turn on --gapthresh option in cmbuild to force alignment to full # model if cmbuild_params is None: cmbuild_params = {} cmbuild_params.update({'--gapthresh': 1.0}) # record cmbuild parameters log_params.append('cmbuild parameters:') log_params.append(str(cmbuild_params)) # Turn on --sub option in Infernal, since we know the unaligned sequences # are fragments. # Also turn on --gapthresh to use same gapthresh as was used to build # model if cmalign_params is None: cmalign_params = {} cmalign_params.update({'--sub': True, '--gapthresh': 1.0}) # record cmalign parameters log_params.append('cmalign parameters:') log_params.append(str(cmalign_params)) # Align sequences to alignment including alignment gaps. aligned, struct_string = cmalign_from_alignment(aln=template_alignment, structure_string=struct, seqs=mapped_seq_tuples, include_aln=True, params=cmalign_params, cmbuild_params=cmbuild_params) # Pull out original sequences from full alignment. 
infernal_aligned = [] # Get a dict of the ids to sequences (note that this is a # cogent alignment object, hence the call to NamedSeqs) aligned_dict = aligned.NamedSeqs for n, o in new_to_old_ids.iteritems(): aligned_seq = aligned_dict[n] infernal_aligned.append((o, aligned_seq)) # Create an Alignment object from alignment dict infernal_aligned = Alignment.from_fasta_records(infernal_aligned, DNASequence) if log_path is not None: log_file = open(log_path, 'w') log_file.write('\n'.join(log_params)) log_file.close() if result_path is not None: result_file = open(result_path, 'w') result_file.write(infernal_aligned.to_fasta()) result_file.close() return None else: try: return infernal_aligned except ValueError: return {}
"""AY800210\tArchaea;Euryarchaeota;Halobacteriales;uncultured EU883771\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel. EF503699\tArchaea;Crenarchaeota;uncultured;uncultured DQ260310\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium EF503697\tArchaea;Crenarchaeota;uncultured;uncultured""" blast_test_seqs = SequenceCollection.from_fasta_records([ ('s1', 'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC' ), ('s2', 
'TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAGGCGTCTAAGAGATACTGGGAATCTAGGGACCGGGAGAGGTAAGAGGTACTTCAGGGGTAGAAGTGAAATTCTGTAATCCTTGAGGGACCACCGATGGCGAAGGCATCTTACCAGAACGGCTTCGACAGTGAGGAACGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCCAGCCGTAAACTATGCGCGTTAGGTGTGCCTGTAACTACGAGTTACCGGGGTGCCGAAGTGAAAACGTGAAACGTGCCGCCTGGGAAGTACGGTCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGACTCAACGCCGGGCAGCTCACCGGATAGGACAGCGGAATGATAGCCGGGCTGAAGACCTTGCTTGACCAGCTGAGA' ), ('s3', 'AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCCGCTTAACGGATGGGCTGCGGAGGATACTGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCTTTGATCTACTGAAGACCACCAGTGGTGAAGGCGGTTCGCCAGAACGCGCTCGAACGGTGAGGATGAAAGCTGGGGGAGCAAACCGGAATAGATACCCGAGTAATCCCAACTGTAAACGATGGCAACTCGGGGATGGGTTGGCCTCCAACCAACCCCATGGCCGCAGGGAAGCCGTTTAGCTCTCCCGCCTGGGGAATACGGTCCGCAGAATTGAACCTTAAAGGAATTTGGCGGGGAACCCCCACAAGGGGGAAAACCGTGCGGTTCAATTGGAATCCACCCCCCGGAAACTTTACCCGGGCGCG' ), ('s4', 
'GATACCCCCGGAAACTGGGGATTATACCGGATATGTGGGGCTGCCTGGAATGGTACCTCATTGAAATGCTCCCGCGCCTAAAGATGGATCTGCCGCAGAATAAGTAGTTTGCGGGGTAAATGGCCACCCAGCCAGTAATCCGTACCGGTTGTGAAAACCAGAACCCCGAGATGGAAACTGAAACAAAGGTTCAAGGCCTACCGGGCACAACAAGCGCCAAAACTCCGCCATGCGAGCCATCGCGACGGGGGAAAACCAAGTACCACTCCTAACGGGGTGGTTTTTCCGAAGTGGAAAAAGCCTCCAGGAATAAGAACCTGGGCCAGAACCGTGGCCAGCCGCCGCCGTTACACCCGCCAGCTCGAGTTGTTGGCCGGTTTTATTGGGGCCTAAAGCCGGTCCGTAGCCCGTTTTGATAAGGTCTCTCTGGTGAAATTCTACAGCTTAACCTGTGGGAATTGCTGGAGGATACTATTCAAGCTTGAAGCCGGGAGAAGCCTGGAAGTACTCCCGGGGGTAAGGGGTGAAATTCTATTATCCCCGGAAGACCAACTGGTGCCGAAGCGGTCCAGCCTGGAACCGAACTTGACCGTGAGTTACGAAAAGCCAAGGGGCGCGGACCGGAATAAAATAACCAGGGTAGTCCTGGCCGTAAACGATGTGAACTTGGTGGTGGGAATGGCTTCGAACTGCCCAATTGCCGAAAGGAAGCTGTAAATTCACCCGCCTTGGAAGTACGGTCGCAAGACTGGAACCTAAAAGGAATTGGCGGGGGGACACCACAACGCGTGGAGCCTGGCGGTTTTATTGGGATTCCACGCAGACATCTCACTCAGGGGCGACAGCAGAAATGATGGGCAGGTTGATGACCTTGCTTGACAAGCTGAAAAGGAGGTGCAT' ), ('s5', 'TAAAATGACTAGCCTGCGAGTCACGCCGTAAGGCGTGGCATACAGGCTCAGTAACACGTAGTCAACATGCCCAAAGGACGTGGATAACCTCGGGAAACTGAGGATAAACCGCGATAGGCCAAGGTTTCTGGAATGAGCTATGGCCGAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGTAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCGCGAAACCTCTGCAATAGGCGAAAGCCTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCTGCTCAACGGATGGGCTGCGGAGGATACCGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCATTGATCTACTGAAGACCACCAGTGGCGAAGGCGGTTTGCCAGAACGCGCTCGACGGTGAGGGATGAAAGCTGGGGGAGCAAACCGGATTAGATACCCGGGGTAGTCCCAGCTGTAAACGGATGCAGACTCGGGTGATGGGGTTGGCTTCCGGCCCAACCCCAATTGCCCCCAGGCGAAGCCCGTTAAGATCTTGCCGCCCTGTCAGATGTCAGGGCCGCCAATACTCGAAACCTTAAAAGGAAATTGGGCGCGGGAAAAGTCACCAAAAGGGGGTTGAAACCCTGCGGGTTATATATTGTAAACC' ), ('s6', 
'ATAGTAGGTGATTGCGAAGACCGCGGAACCGGGACCTAGCACCCAGCCTGTACCGAGGGATGGGGAGCTGTGGCGGTCCACCGACGACCCTTTGTGACAGCCGATTCCTACAATCCCAGCAACTGCAATGATCCACTCTAGTCGGCATAACCGGGAATCGTTAACCTGGTAGGGTTCTCTACGTCTGAGTCTACAGCCCAGAGCAGTCAGGCTACTATACGGTTTGCTGCATTGCATAGGCATCGGTCGCGGGCACTCCTCGCGGTTTCAGCTAGGGTTTAAATGGAGGGTCGCTGCATGAGTATGCAAATAGTGCCACTGCTCTGATACAGAGAAGTGTTGATATGACACCTAAGACCTGGTCACAGTTTTAACCTGCCTACGCACACCAGTGTGCTATTGATTAACGATATCGGTAGACACGACCTTGGTAACCTGACTAACCTCATGGAAAGTGACTAGATAAATGGACCGGAGCCAACTTTCACCCGGAAAACGGACCGACGAATCGTCGTAGACTACCGATCTGACAAAATAAGCACGAGGGAGCATGTTTTGCGCAGGCTAGCCTATTCCCACCTCAAGCCTCGAGAACCAAGACGCCTGATCCGGTGCTGCACGAAGGGTCGCCTCTAGGTAAGGAGAGCTGGCATCTCCAGATCCGATATTTTACCCAACCTTTGCGCGCTCAGATTGTTATAGTGAAACGATTTAAGCCTGAACGGAGTTCCGCTCCATATGTGGGTTATATATGTGAGATGTATTAACTTCCGCAGTTGTCTCTTTCGGTGCAGTACGCTTGGTATGTGTCTCAAATAATCGGTATTATAGTGATCTGAGAGGTTTTAAG' ) ], DNA) blast_reference_seqs = SequenceCollection.from_fasta_records([ ('AY800210', 'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC' ), ('EU883771',
def scaffold_extensions_into_foundation(otu_file_fh, extension_taxonomy_fh,
                                        extension_seq_fh,
                                        foundation_alignment_fh,
                                        ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees").
    Other databases provide high quality phylogenetic information
    (hence it is used as the "foundation"), but poor taxonomic resolution.
    This script combines two genetic databases into one phylogenetic tree
    in .nwk format, taking advantage of the benefits of both databases,
    but allowing sequencing to be performed using the "extension trees"
    primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output
        of "ghost-tree group-extensions"). This file refers to the
        "extension trees". File references to sequence reads or sample
        numbers/names are not valid here. This is not an OTU .biom table.
    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an accession number (same accession numbers in
        otu_file_fh and extension_taxonomy_fh) and the 2nd column is the
        taxonomy ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina
    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.
    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.
    ghost_tree_fp : filehandle
        The Newick formatted ghost-tree is the final output of the
        ghost-tree tool. This is a phylogenetic tree designed for
        downstream diversity analyses.
    """
    global foundation_accession_genus_dic  # needs global assignment for flake8
    foundation_accession_genus_dic = {}
    # Derive a label for the log file from the output path.
    # NOTE(review): the [16:-4] slice assumes a fixed prefix/extension on
    # ghost_tree_fp's string form -- confirm against the CLI that builds it.
    ghost_tree_output = str(ghost_tree_fp)
    ghost_tree_output = ghost_tree_output[16:-4]
    # Probe for the external "muscle" binary by running it through a shell
    # and grepping stderr. NOTE(review): assumes the shell reports a missing
    # binary with the literal text "command not found" -- shell dependent.
    process = subprocess.Popen("muscle", shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output, error = process.communicate()
    if re.search("command not found", error):
        print "muscle, multiple sequence aligner, is not found. Is it" \
            " installed? Is it in your path?"
    # Same availability probe for the "fasttree" binary.
    process = subprocess.Popen("fasttree", shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output, error = process.communicate()
    if re.search("command not found", error):
        print "fasttree, phylogenetic tree builder, is not found. Is it" \
            " installed? Is it in your path?"
    # Scratch directory for per-genus intermediate files; removed at the end.
    os.mkdir("tmp")
    logfile = open("ghost-tree_log_"+ghost_tree_output+".txt", "w")
    # Map each genus to the accession numbers in its OTU clusters.
    extension_genus_accession_list_dic = \
        _extension_genus_accession_dic(otu_file_fh,
                                       extension_taxonomy_fh)
    # Write the non-redundant foundation alignment, then build the
    # foundation tree from it.
    skbio.write(_make_nr_foundation_alignment(foundation_alignment_fh,
                extension_genus_accession_list_dic),
                into="nr_foundation_alignment_gt.fasta",
                format="fasta")
    foundation_tree = _make_foundation_tree("nr_foundation_alignment_gt.fasta",
                                            logfile)
    seqs = SequenceCollection.read(extension_seq_fh)
    # Graft a mini-tree of extension sequences onto each foundation tip.
    for node in foundation_tree.tips():
        # tip labels are "accession:branch_length"; map accession -> genus
        key_node, _ = str(node).split(":")
        key_node = foundation_accession_genus_dic[key_node]
        try:
            _make_mini_otu_files(key_node, extension_genus_accession_list_dic,
                                 seqs)
            # Align the genus' extension sequences with muscle...
            process = subprocess.Popen("muscle -in tmp/mini_seq_gt.fasta" +
                                       " -out" +
                                       " tmp/mini_alignment_gt.fasta -quiet" +
                                       " -maxiters 2 -diags1", shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            output, error = process.communicate()
            # ...then build the mini-tree with fasttree (output redirected
            # to a file by the shell).
            process = subprocess.Popen("fasttree -nt -quiet" +
                                       " tmp/mini_alignment_gt.fasta >" +
                                       " tmp/mini_tree_gt.nwk", shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            output, error = process.communicate()
            logfile.write("FastTree warnings for genus "+key_node+" are:\n" +
                          error + "\n")
            mini_tree = read("tmp/mini_tree_gt.nwk", format='newick',
                             into=TreeNode)
            # Attach the midpoint-rooted mini-tree's children to this tip.
            node.extend(mini_tree.root_at_midpoint().children[:])
        # NOTE(review): bare except silently skips any genus whose mini-tree
        # fails (missing files, parse errors, KeyboardInterrupt included) --
        # consider narrowing and logging the failure.
        except:
            continue
    shutil.rmtree("tmp")
    ghost_tree_fp.write(str(foundation_tree))
    logfile.close()
    return str(foundation_tree).strip()
def setUp(self):
    """Create the temp FASTA/template/output files and expected objects the tests use."""
    def _make_tmp_file(suffix, contents=None):
        # One named temp file per call: write *contents* when given,
        # otherwise just touch it so cleanup is reliable even if a
        # test fails before writing to it.
        fd, path = mkstemp(prefix='PyNastAlignerTests_', suffix=suffix)
        close(fd)
        with open(path, 'w') as fh:
            if contents is not None:
                fh.write(contents)
        return path

    # input sequences plus template variants (dashes->dots, T->U, lowercase)
    self.pynast_test1_input_fp = _make_tmp_file(
        '.fasta', pynast_test1_input_fasta)
    self.pynast_test1_template_fp = _make_tmp_file(
        'template.fasta', pynast_test1_template_fasta)
    self.pynast_test_template_w_dots_fp = _make_tmp_file(
        'template.fasta', pynast_test1_template_fasta.replace('-', '.'))
    self.pynast_test_template_w_u_fp = _make_tmp_file(
        'template.fasta', pynast_test1_template_fasta.replace('T', 'U'))
    self.pynast_test_template_w_lower_fp = _make_tmp_file(
        'template.fasta', pynast_test1_template_fasta.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = _make_tmp_file('.fasta')
    self.failure_fp = _make_tmp_file('.fasta')
    self.log_fp = _make_tmp_file('.log')

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp,
    ]

    self.pynast_test1_aligner = PyNastAligner({
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 15,
    })

    self.pynast_test1_expected_aln = Alignment.from_fasta_records(
        parse_fasta(pynast_test1_expected_alignment), DNA)
    self.pynast_test1_expected_fail = SequenceCollection.from_fasta_records(
        parse_fasta(pynast_test1_expected_failure), DNA)
def scaffold_extensions_into_foundation(otu_file_fh, extension_taxonomy_fh,
                                        extension_seq_fh,
                                        foundation_alignment_fh,
                                        ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees").
    Other databases provide high quality phylogenetic information
    (hence it is used as the "foundation"), but poor taxonomic resolution.
    This script combines two genetic databases into one phylogenetic tree
    in .nwk format, taking advantage of the benefits of both databases,
    but allowing sequencing to be performed using the "extension trees"
    primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output
        of "ghost-tree group-extensions"). This file refers to the
        "extension trees". File references to sequence reads or sample
        numbers/names are not valid here. This is not an OTU .biom table.
    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an accession number (same accession numbers in
        otu_file_fh and extension_taxonomy_fh) and the 2nd column is the
        taxonomy ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina
    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.
    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.
    ghost_tree_fp : filehandle
        The Newick formatted ghost-tree is the final output of the
        ghost-tree tool. This is a phylogenetic tree designed for
        downstream diversity analyses.
    """
    # Scratch directory for per-genus intermediate files.
    # NOTE(review): shell string via os.system; a later revision of this
    # function uses subprocess instead.
    os.system("mkdir tmp")
    global foundation_accession_genus_dic
    foundation_accession_genus_dic = {}
    global seqs
    # Map each genus to the accession numbers in its OTU clusters.
    extension_genus_accession_list_dic = \
        _extension_genus_accession_dic(otu_file_fh, extension_taxonomy_fh)
    # Write the non-redundant foundation alignment, then build the
    # foundation tree from it.
    skbio.write(_make_nr_foundation_alignment(
        foundation_alignment_fh,
        extension_genus_accession_list_dic),
        into="nr_foundation_alignment_gt.fasta", format="fasta")
    foundation_tree = _make_foundation_tree("nr_foundation_alignment_gt.fasta")
    seqs = SequenceCollection.read(extension_seq_fh)
    # Graft a mini-tree of extension sequences onto each foundation tip.
    for node in foundation_tree.tips():
        # tip labels are "accession:branch_length"; map accession -> genus
        key_node, _ = str(node).split(":")
        key_node = foundation_accession_genus_dic[key_node]
        try:
            _make_mini_otu_files(key_node,
                                 extension_genus_accession_list_dic, seqs)
            # Align the genus' extension sequences, then build the
            # mini-tree (fasttree output redirected to a file by the shell).
            os.system("muscle -in tmp/mini_seq_gt.fasta -out" +
                      " tmp/mini_alignment_gt.fasta -quiet" +
                      " -maxiters 2 -diags1")
            os.system("fasttree -nt -quiet tmp/mini_alignment_gt.fasta >" +
                      " tmp/mini_tree_gt.nwk")
            mini_tree = read("tmp/mini_tree_gt.nwk", format='newick',
                             into=TreeNode)
            # Attach the mini-tree's children directly to this tip.
            node.extend(mini_tree.children[:])
        # NOTE(review): bare except silently skips any genus whose
        # mini-tree fails (missing files, parse errors, even
        # KeyboardInterrupt) -- consider narrowing and logging.
        except:
            continue
    os.system("rm -r tmp")
    ghost_tree_fp.write(str(foundation_tree))
    return str(foundation_tree).strip()