def test_get_representatives(self):
    """get_representatives should yield the representative sequences."""
    # Expected FASTA text, one header line and one sequence line per record.
    expected_fasta = (
        ">1: 5\n"
        "ABABABA\n"
        ">3: 1\n"
        "BABA\n"
        ">4: 1\n"
        "ABABAA\n"
        ">8: 2\n"
        "BABBA\n"
    )
    representatives = list(
        get_representatives(self.mapping, self.data.iteritems()))
    fasta_text = "".join(rep.to_fasta() for rep in representatives)
    self.assertEqual(fasta_text, expected_fasta)

    # another example: the label 'a' is not a key of the mapping, so only
    # the records labelled '1' and '2' are expected back
    prefix_map = {'1': ('a', 'b', 'c'),
                  '2': ('d', 'e', 'f')}
    records = [('1', "ACGT"), ('2', "TAGC"), ('a', "TTTTT")]

    observed = list(get_representatives(prefix_map, records))
    expected = [
        BiologicalSequence("ACGT", id="1"),
        BiologicalSequence("TAGC", id='2'),
    ]
    self.assertEqual(observed, expected)
def _compute_substitution_score(aln1_chars, aln2_chars, substitution_matrix,
                                gap_substitution_score):
    """Return the average substitution score over all character pairs.

    Every character of ``aln1_chars`` is paired with every character of
    ``aln2_chars``. A pair in which either character is a gap (according to
    ``BiologicalSequence.is_gap``) contributes ``gap_substitution_score``;
    any other pair contributes ``substitution_matrix[char1][char2]``. The
    accumulated total is divided by ``len(aln1_chars) * len(aln2_chars)``.

    Raises
    ------
    ValueError
        If a non-gap pair cannot be looked up in ``substitution_matrix``.
    """
    is_gap = BiologicalSequence.is_gap
    total = 0
    for char1, char2 in product(aln1_chars, aln2_chars):
        if is_gap(char1) or is_gap(char2):
            total += gap_substitution_score
            continue
        try:
            total += substitution_matrix[char1][char2]
        except KeyError:
            # Report only the characters missing as top-level matrix keys.
            missing = [c for c in (char1, char2)
                       if c not in substitution_matrix]
            raise ValueError(
                "One of the sequences contains a character that is "
                "not contained in the substitution matrix. Are you "
                "using an appropriate substitution matrix for your "
                "sequence type (e.g., a nucleotide substitution "
                "matrix does not make sense for aligning protein "
                "sequences)? Does your sequence contain invalid "
                "characters? The offending character(s) is: "
                " %s." % ', '.join(missing))

    # NOTE(review): with integer scores this is floor division on Python 2
    # unless the module enables true division -- confirm at the file top.
    pair_count = len(aln1_chars) * len(aln2_chars)
    return total / pair_count
def MakeGeneraFastas(fin_taxonomy, fin_repset):
    """Split a representative-sequence FASTA file into one file per genus.

    Parameters
    ----------
    fin_taxonomy : str
        Path to a taxonomy file. Each line is split on tabs: the first field
        is the accession ID and the second a ``;``-delimited taxonomy string
        whose second-to-last entry is taken as the genus (a leading ``g__``
        is stripped).
    fin_repset : str
        Path to the FASTA file of representative sequences; labels must be
        accession IDs present in the taxonomy file.

    Returns
    -------
    tuple
        ``(repsetdic, taxdic, taxgendic, repsetIDlist, repgenlist,
        generaSeqIDdic)``: label -> sequence, accession -> taxonomy string,
        accession -> genus, list of FASTA labels, list of unique genera in
        first-seen order, and genus -> list of accession IDs.

    Raises
    ------
    KeyError
        If a FASTA label has no entry in the taxonomy file.
    IndexError
        If a taxonomy line has no tab, or fewer than two ``;`` fields.

    Notes
    -----
    Side effects: the six returned objects are also bound as module globals,
    one ``g__<genus>_seqs.fasta`` file per genus is written to the current
    working directory, and every empty regular file in the current working
    directory is deleted afterwards (not only files written here).
    """
    global repsetdic, taxdic, taxgendic
    global repsetIDlist, repgenlist, generaSeqIDdic
    from skbio.sequence import BiologicalSequence

    # Context managers close both inputs even if parsing raises.
    with open(fin_repset, "U") as repset_fh, \
            open(fin_taxonomy, "U") as taxonomy_fh:
        repsetdic = {}
        for label, seq in parse_fasta(repset_fh, ignore_comment=True):
            repsetdic[label] = seq

        taxdic = {}
        taxgendic = {}
        for line in taxonomy_fh:
            fields = line.split("\t")
            accession_id = fields[0]
            taxonomy = fields[1]
            genus = taxonomy.split(";")[-2]
            if genus.startswith("g__"):
                genus = genus[3:]
            taxgendic[accession_id] = genus
            taxdic[accession_id] = taxonomy

    repsetIDlist = list(repsetdic)

    # Unique genera in first-seen order; the dict doubles as the "seen" set
    # so membership tests are O(1) instead of scanning a list.
    repgenlist = []
    generaSeqIDdic = {}
    for seq_id in repsetIDlist:
        genus = taxgendic[seq_id]
        if genus not in generaSeqIDdic:
            generaSeqIDdic[genus] = []
            repgenlist.append(genus)

    # Group IDs in taxonomy-file iteration order. Every ID found in the
    # representative set has its genus registered above, so no lookup here
    # can fail and no exception needs to be swallowed.
    for accession_id in taxgendic:
        if accession_id in repsetdic:
            generaSeqIDdic[taxgendic[accession_id]].append(accession_id)

    for genus, seq_ids in generaSeqIDdic.items():
        with open("g__" + genus + "_seqs.fasta", "w") as fout:
            for seq_id in seq_ids:
                record = BiologicalSequence(repsetdic[seq_id], id=seq_id)
                fout.write(record.to_fasta(terminal_character=""))
                fout.write("\n")

    # NOTE(review): this removes *any* empty file in the working directory.
    # The isfile() guard keeps directories from being passed to os.remove.
    for entry in os.listdir(os.getcwd()):
        if os.path.isfile(entry) and os.path.getsize(entry) < 1:
            os.remove(entry)

    return (repsetdic, taxdic, taxgendic, repsetIDlist, repgenlist,
            generaSeqIDdic)
def test_filter_gap_high_entropy_low(self):
    """filter_positions with the gap-100 / entropy-10 fixture thresholds.

    The filtered alignment is expected to consist of four sequences
    (``seq1`` .. ``seq4``), each reading ``A-``.
    """
    observed = filter_positions(self.alignment_with_gaps,
                                self.maximum_gap_frequency_100,
                                self.maximum_position_entropy_10)
    expected = Alignment([
        BiologicalSequence('A-', id=seq_id)
        for seq_id in ("seq1", "seq2", "seq3", "seq4")
    ])
    self.assertEqual(observed, expected)
def get_representatives(mapping, seqs):
    """Yield the representative sequences named in ``mapping``.

    mapping: The prefix mapping dict; only labels that are keys of this
             dict are emitted

    seqs: An iterable of (label, sequence) pairs

    Each yielded item is the upper-cased BiologicalSequence whose id is
    "<label>: <n>", where n is len(mapping[label]) + 1.
    """
    for label, raw_seq in seqs:
        if label not in mapping:
            continue
        cluster_size = len(mapping[label]) + 1
        record = BiologicalSequence(
            raw_seq, id="%s: %d" % (label, cluster_size))
        yield record.upper()
def _clustal_to_alignment(fh, strict=True):
    r"""Build an ``Alignment`` from a Clustal multiple sequence alignment

    Parameters
    ----------
    fh : open file object
        An open Clustal file.
    strict : boolean
        Forwarded to ``_label_line_parser``: whether or not to raise a
        ``ClustalFormatError`` when no labels are found.

    Returns
    -------
    skbio.Alignment
        Alignment object containing the aligned biological sequences, in
        the order of the labels reported by ``_label_line_parser``.

    Raises
    ------
    skbio.util.exception.ClustalFormatError
        If the sequences in `fh` don't have the same sequence length or if
        the sequence ids don't properly match with the subsequences

    Notes
    -----
    Skips any line that starts with a blank.

    ``_clustal_to_alignment`` preserves the order of the sequences from the
    original file. However, it does use a dict as an intermediate, so two
    sequences can't have the same label. This is probably OK since Clustal
    will refuse to run on a FASTA file in which two sequences have the same
    label, but could potentially cause trouble with manually edited files
    (all the segments of the conflicting sequences would be interleaved,
    possibly in an unpredictable way).

    If the lines have trailing numbers (i.e. Clustal was run with
    `-LINENOS=ON`), silently deletes them. Does not check that the numbers
    actually correspond to the number of chars in the sequence printed so
    far.

    References
    ----------
    .. [1] Thompson JD, Higgins DG, Gibson TJ,  "CLUSTAL W: improving the
        sensitivity of progressive multiple sequence alignment through
        sequence weighting, position-specific gap penalties and weight
        matrix choice. Thompson", Nucleic Acids Res. 1994 Nov
        11;22(22):4673-80.

    """
    # Keep only sequence lines, then strip any trailing residue counts.
    seq_lines = filter(_is_clustal_seq_line, fh)
    records = map(_delete_trailing_number, seq_lines)
    data, labels = _label_line_parser(records, last_space, strict)

    if not _check_length(data, labels):
        raise ClustalFormatError("Sequences not aligned properly")

    # data[label] holds the per-block segments; join them per sequence.
    return Alignment([
        BiologicalSequence(id=label, sequence=''.join(data[label]))
        for label in labels
    ])
def write_Fasta_from_name_seq_pairs(name_seqs, fh):
    """Write (name, seq) pairs to a filehandle as FASTA records.

    name_seqs: iterable of (name, seq) pairs such as from parse_fasta
    fh: an open filehandle

    Raises ValueError if fh is None.
    """
    if fh is None:
        raise ValueError("Need open file handle to write to.")

    for name, seq in name_seqs:
        fasta_text = BiologicalSequence(seq, id=name).to_fasta()
        # NOTE(review): if to_fasta() already ends with a newline this
        # leaves a blank line after every record -- confirm it is intended.
        fh.write("%s\n" % fasta_text)
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    fasta_fh: a fasta file with labels as produced by Qiime's
              split_libraries.py, used to replace flowgram ids with the
              unique seq_sample_id

    mapping_fh: the cluster mapping from denoiser.py

    denoised_seqs_fh: the Fasta output file from denoiser.py

    otu_picker_otu_map_fh: cluster map from the otu picker run on
                           denoised_seqs_fh

    out_dir: output directory; receives "denoised_otu_map.txt" and
             "denoised_all.fasta"

    If an id from the denoiser output has no sample id, a message is
    printed and the interpreter exits via exit().
    """
    # read in mapping from split_library file
    labels = (label for label, _ in parse_fasta(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)
    get_sample_id = sample_id_mapping.get

    denoiser_mapping = read_denoiser_mapping(mapping_fh)

    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    # (the context manager closes the file on every exit path)
    with open(out_dir + "/denoised_otu_map.txt", "w") as otu_fh:
        for otu_line in otu_picker_otu_map_fh:
            otu_split = otu_line.split()
            if not otu_split:
                # tolerate blank lines instead of failing on otu_split[0]
                continue
            otu = otu_split[0]
            ids = otu_split[1:]

            # concat lists
            # make sure the biggest one is first for pick_repr
            all_ids = sort_ids(ids, denoiser_mapping)
            for seq_id in ids:
                all_ids.extend(denoiser_mapping[seq_id])

            try:
                sample_ids = "\t".join(map(get_sample_id, all_ids))
            except TypeError:
                # get returns None if denoiser_mapping id not present in
                # sample_id_mapping, and join() rejects None
                print("Found id in denoiser output, which was not found in "
                      "split_libraries " + "output FASTA file. Wrong file?")
                exit()
            otu_fh.write("%s\t" % otu + sample_ids + "\n")

    with open(out_dir + "/denoised_all.fasta", "w") as fasta_out_fh:
        for label, seq in parse_fasta(denoised_seqs_fh):
            seq_id = label.split()[0]
            newlabel = "%s %s" % (sample_id_mapping[seq_id], seq_id)
            fasta_out_fh.write(
                BiologicalSequence(seq, id=newlabel).to_fasta())
def _coerce_alignment_input_type(seq, disallow_alignment):
    """Convert a string, BiologicalSequence or Alignment to an Alignment.

    Strings and single sequences are wrapped in a one-sequence
    ``Alignment``. An ``Alignment`` is returned unchanged unless
    ``disallow_alignment`` is true.

    Raises
    ------
    TypeError
        If ``seq`` is an ``Alignment`` while ``disallow_alignment`` is
        true, or if ``seq`` is of any other unsupported type.
    """
    if isinstance(seq, string_types):
        return Alignment([BiologicalSequence(seq)])
    if isinstance(seq, BiologicalSequence):
        return Alignment([seq])
    if not isinstance(seq, Alignment):
        raise TypeError(
            "Unsupported type provided to aligner: %r." % type(seq))
    if disallow_alignment:
        # This will disallow aligning either a pair of alignments, or an
        # alignment and a sequence. We don't currently support this for
        # local alignment as there is not a clear usecase, and it's also
        # not exactly clear how this would work.
        raise TypeError("Aligning alignments is not currently supported "
                        "with the aligner function that you're calling.")
    return seq
def pick_otus(file_path):
    """Prepare OTU picking for the FASTA file at ``file_path``.

    The output directory name is derived as ``<dir of file_path>/uclust``.
    The FASTA re-formatting step below is deliberately disabled
    (``if False``); callers are expected to provide a QIIME-compatible
    file.
    """
    # NOTE(review): outdir is not used in the visible part of this function.
    outdir = os.path.join(os.path.dirname(file_path), 'uclust')

    if False:
        # Making fasta format compatible with qiime (for some reason not
        # working, assume user provides it)
        import skbio  # scikit-bio is used for fasta I/O (comes with qiime)
        from skbio.sequence import BiologicalSequence

        print("Preprocessing FASTA " + file_path)
        file_path1 = '%s_1%s' % tuple(os.path.splitext(file_path))
        # reformatting fasta; the handle is closed even if a record fails
        with open(file_path1, "w") as outfile:
            fastafile = skbio.read(file_path, format='fasta')
            print("Reading " + file_path)
            print("File handle: " + str(fastafile))
            for seqcount, rec in enumerate(fastafile):
                # seqcount is an int, so convert it before concatenating
                print(str(seqcount) + repr(rec))
                try:
                    # qiime's expected id format: <sample_id>_<seq_counter>
                    int(rec.id.split('_')[1])
                except (ValueError, IndexError):
                    # Tuple form is required: "except ValueError, IndexError"
                    # catches only ValueError on Python 2.
                    # Enforce an id compatible with qiime's otu picker.
                    rec = BiologicalSequence(rec.sequence,
                                             "User_%05d" % seqcount)
                skbio.write(rec, 'fasta', outfile)
        file_path = file_path1
def _traceback(traceback_matrix, score_matrix, aln1, aln2, start_row,
               start_col, gap_character='-'):
    """Rebuild both aligned inputs by walking the traceback matrix.

    Parameters
    ----------
    traceback_matrix
        Indexed as ``[row, col]``; holds the codes defined in
        ``_traceback_encoding``.
    score_matrix
        Indexed as ``[row, col]``; only the starting cell is read.
    aln1, aln2
        Alignment-like inputs providing ``sequence_count()``, iteration and
        integer indexing. Positions of ``aln1`` follow the column index,
        positions of ``aln2`` follow the row index.
    start_row, start_col : int
        Cell at which the walk begins.
    gap_character : str, optional
        Character inserted wherever a gap is opened in either alignment.

    Returns
    -------
    tuple
        ``(aligned_seqs1, aligned_seqs2, best_score, current_col,
        current_row)``: two lists of ``BiologicalSequence``, the score at
        the starting cell, and the column and row at which the walk ended.

    Raises
    ------
    ValueError
        If a cell holds a value that is not a known traceback code.
    """
    # cache some values for simpler reference
    aend = _traceback_encoding['alignment-end']
    match = _traceback_encoding['match']
    vgap = _traceback_encoding['vertical-gap']
    hgap = _traceback_encoding['horizontal-gap']

    # initialize the result alignments: one character list per sequence
    aln1_sequence_count = aln1.sequence_count()
    aligned_seqs1 = [[] for _ in range(aln1_sequence_count)]
    aln2_sequence_count = aln2.sequence_count()
    aligned_seqs2 = [[] for _ in range(aln2_sequence_count)]

    current_row = start_row
    current_col = start_col

    best_score = score_matrix[current_row, current_col]
    current_value = None

    while current_value != aend:
        current_value = traceback_matrix[current_row, current_col]

        if current_value == match:
            for aligned_seq, input_seq in zip(aligned_seqs1, aln1):
                aligned_seq.append(str(input_seq[current_col - 1]))
            for aligned_seq, input_seq in zip(aligned_seqs2, aln2):
                aligned_seq.append(str(input_seq[current_row - 1]))
            current_row -= 1
            current_col -= 1
        elif current_value == vgap:
            # consume a row only: gap in aln1, character from aln2
            for aligned_seq in aligned_seqs1:
                aligned_seq.append(gap_character)
            for aligned_seq, input_seq in zip(aligned_seqs2, aln2):
                aligned_seq.append(str(input_seq[current_row - 1]))
            current_row -= 1
        elif current_value == hgap:
            # consume a column only: character from aln1, gap in aln2
            for aligned_seq, input_seq in zip(aligned_seqs1, aln1):
                aligned_seq.append(str(input_seq[current_col - 1]))
            for aligned_seq in aligned_seqs2:
                aligned_seq.append(gap_character)
            current_col -= 1
        elif current_value == aend:
            continue
        else:
            raise ValueError(
                "Invalid value in traceback matrix: %s" % current_value)

    # Characters were collected end-to-start, so reverse before joining.
    # NOTE(review): the second argument of _get_seq_id looks like a
    # fallback id; aln2 ids are offset by the number of aln1 sequences.
    for i in range(aln1_sequence_count):
        aligned_seq = ''.join(aligned_seqs1[i][::-1])
        seq_id = _get_seq_id(aln1[i], str(i))
        aligned_seqs1[i] = BiologicalSequence(aligned_seq, id=seq_id)

    for i in range(aln2_sequence_count):
        aligned_seq = ''.join(aligned_seqs2[i][::-1])
        seq_id = _get_seq_id(aln2[i], str(i + aln1_sequence_count))
        aligned_seqs2[i] = BiologicalSequence(aligned_seq, id=seq_id)

    return (aligned_seqs1, aligned_seqs2, best_score,
            current_col, current_row)
def make_genera_fastas(fin_taxonomy, fin_repset):
    """Sort ITS representative sequences into one FASTA file per genus.

    The genus of every sequence is found by looking up its accession ID in
    the taxonomy file. Sequences of the same genus are written to the same
    file, which allows OTUs to be compared to other OTUs of that genus.

    Parameters
    ----------
    fin_taxonomy : str
        Path to the taxonomy file. Each line is split on tabs: the first
        field is the accession ID and the second a ``;``-delimited taxonomy
        string whose second-to-last entry is taken as the genus (a leading
        ``g__`` is stripped).
    fin_repset : str
        Path to the FASTA file of representative sequences; every label
        must be an accession ID present in the taxonomy file.

    Returns
    -------
    list
        The unique genera found in the FASTA file, in first-seen order.
        The same list is also bound to the module global ``repgenlist``.

    Raises
    ------
    KeyError
        If a FASTA label has no entry in the taxonomy file.
    IndexError
        If a taxonomy line has no tab, or fewer than two ``;`` fields.

    Notes
    -----
    One ``g__<genus>_seqs.fasta`` file per genus is written to the current
    working directory. Afterwards every empty regular file in the current
    working directory is deleted (not only files written here).

    Examples
    --------
    Example of one representative sequence from the input FASTA file::

        >AB015922 Some_comment_ie_sample_location
        CAGAGCCAAGAGATCCGTTGTTGAAAGTTTTTTCAATTCAAGAATAAAACTTAGACTGCAAAG
        CACGGGTGGGAGGGTCGACCCAGCACGC

    Example of a taxonomy line (ID and taxonomy are tab-separated; shown
    wrapped)::

        AY880934    k__Fungi;p__Basidiomycota;c__Agaricomycetes;
        o__Thelephorales;f__Thelephoraceae;g__Thelephora;
        s__Thelephora_terrestris

    """
    global repgenlist
    from skbio.sequence import BiologicalSequence

    # Context managers close both inputs even if parsing raises.
    with open(fin_repset, "U") as repset_fh, \
            open(fin_taxonomy, "U") as taxonomy_fh:
        repsetdic = {}
        for label, seq in parse_fasta(repset_fh, ignore_comment=True):
            repsetdic[label] = seq

        taxgendic = {}
        for line in taxonomy_fh:
            fields = line.split("\t")
            accession_id = fields[0]
            genus = fields[1].split(";")[-2]
            if genus.startswith("g__"):
                genus = genus[3:]
            taxgendic[accession_id] = genus

    # Unique genera in first-seen order; the dict doubles as the "seen" set
    # so membership tests are O(1) instead of scanning a list.
    repgenlist = []
    genera_seq_ids = {}
    for seq_id in repsetdic:
        genus = taxgendic[seq_id]
        if genus not in genera_seq_ids:
            genera_seq_ids[genus] = []
            repgenlist.append(genus)

    # Group IDs in taxonomy-file iteration order. Every ID found in the
    # representative set has its genus registered above, so no lookup here
    # can fail and no exception needs to be swallowed.
    for accession_id in taxgendic:
        if accession_id in repsetdic:
            genera_seq_ids[taxgendic[accession_id]].append(accession_id)

    for genus, seq_ids in genera_seq_ids.items():
        with open("g__" + genus + "_seqs.fasta", "w") as fout:
            for seq_id in seq_ids:
                record = BiologicalSequence(repsetdic[seq_id], id=seq_id)
                fout.write(record.to_fasta(terminal_character=""))
                fout.write("\n")

    # The original referenced an undefined name ``cwd`` here. Sizes are
    # queried by bare file name, so the directory listed must be the
    # process working directory.
    # NOTE(review): this removes *any* empty file in that directory.
    for entry in os.listdir(os.getcwd()):
        if os.path.isfile(entry) and os.path.getsize(entry) < 1:
            os.remove(entry)

    return repgenlist
def fasta_from_alignment(aln, make_seqlabel=None, line_wrap=None, sort=True):
    """Serialize an alignment (or id-to-sequence dict) as a FASTA string

    .. note:: Deprecated in scikit-bio 0.2.0-dev
       ``fasta_from_alignment`` will be removed in scikit-bio 0.3.0. It is
       replaced by ``write``, which is a more general method for
       serializing FASTA-formatted files. ``write`` supports multiple file
       formats by taking advantage of scikit-bio's I/O registry system. See
       :mod:`skbio.io` for more details.

    Parameters
    ----------
    aln : Alignment, dict
        alignment or dictionary where the keys are the sequence ids and
        the values are the sequences themselves.
    make_seqlabel : function, optional
        callback function that takes the seq object and returns a label
        ``str``. If ``None`` is passed, the following attributes will try to
        be retrieved in this order and the first to exist will be used:
        ``id``, ``Label`` or ``Name``. In any other case an integer
        with the position of the sequence object will be used.
    line_wrap : int, optional
        line_wrap: a integer for maximum line width, if ``None`` is passed
        the full sequence will be used.
    sort : bool, optional
        Whether or not the sequences should be sorted by their sequence
        id, default value is ``True``.

    Returns
    -------
    str
        FASTA formatted string composed of the objects passed in via `seqs`.

    See Also
    --------
    skbio.parse.sequences.parse_fasta
    skbio.alignment.Alignment

    Examples
    --------
    Formatting a sequence alignment object into a FASTA file.

    >>> from skbio.alignment import Alignment
    >>> from skbio.sequence import DNA
    >>> from skbio.format.sequences import fasta_from_alignment
    >>> seqs = [DNA("ACC--G-GGTA..", id="seq1"),
    ...     DNA("TCC--G-GGCA..", id="seqs2")]
    >>> a1 = Alignment(seqs)
    >>> print fasta_from_alignment(a1)
    >seq1
    ACC--G-GGTA..
    >seqs2
    TCC--G-GGCA..

    """
    warnings.warn(
        "`fasta_from_alignment` is deprecated and will be removed in "
        "scikit-bio 0.3.0. Please update your code to use `skbio.io.write` "
        "or `skbio.Alignment.write`.", DeprecationWarning)

    # an Alignment exposes its labels via ids(), a dictionary via keys()
    labels = aln.ids() if isinstance(aln, Alignment) else aln.keys()
    if sort:
        labels = sorted(labels)

    def _as_sequence(label):
        # plain strings are wrapped so they carry their label as id
        seq = aln[label]
        return BiologicalSequence(seq, label) if isinstance(seq, str) else seq

    ordered_seqs = [_as_sequence(label) for label in labels]
    return fasta_from_sequences(ordered_seqs, make_seqlabel=make_seqlabel,
                                line_wrap=line_wrap)
def make_genera_fastas(fin_taxonomy, fin_repset):
    """Sort ITS representative sequences into one FASTA file per genus.

    The genus of every sequence is found by looking up its accession ID in
    the taxonomy file. Sequences of the same genus are written to the same
    file, which allows OTUs to be compared to other OTUs of that genus.

    Parameters
    ----------
    fin_taxonomy : str
        Path to the taxonomy file. Each line is split on tabs: the first
        field is the accession ID and the second a ``;``-delimited taxonomy
        string whose second-to-last entry is taken as the genus (a leading
        ``g__`` is stripped).
    fin_repset : str
        Path to the FASTA file of representative sequences; every label
        must be an accession ID present in the taxonomy file.

    Returns
    -------
    list
        The unique genera found in the FASTA file, in first-seen order.
        The same list is also bound to the module global ``repgenlist``.

    Raises
    ------
    KeyError
        If a FASTA label has no entry in the taxonomy file.
    IndexError
        If a taxonomy line has no tab, or fewer than two ``;`` fields.

    Notes
    -----
    One ``g__<genus>_seqs.fasta`` file per genus is written to the current
    working directory. Afterwards every empty regular file in the current
    working directory is deleted (not only files written here).

    Examples
    --------
    Example of one representative sequence from the input FASTA file::

        >AB015922 Some_comment_ie_sample_location
        CAGAGCCAAGAGATCCGTTGTTGAAAGTTTTTTCAATTCAAGAATAAAACTTAGACTGCAAAG
        CACGGGTGGGAGGGTCGACCCAGCACGC

    Example of a taxonomy line (ID and taxonomy are tab-separated; shown
    wrapped)::

        AY880934    k__Fungi;p__Basidiomycota;c__Agaricomycetes;
        o__Thelephorales;f__Thelephoraceae;g__Thelephora;
        s__Thelephora_terrestris

    """
    global repgenlist
    from skbio.sequence import BiologicalSequence

    # Context managers close both inputs even if parsing raises.
    with open(fin_repset, "U") as repset_fh, \
            open(fin_taxonomy, "U") as taxonomy_fh:
        repsetdic = {}
        for label, seq in parse_fasta(repset_fh, ignore_comment=True):
            repsetdic[label] = seq

        taxgendic = {}
        for line in taxonomy_fh:
            fields = line.split("\t")
            accession_id = fields[0]
            genus = fields[1].split(";")[-2]
            if genus.startswith("g__"):
                genus = genus[3:]
            taxgendic[accession_id] = genus

    # Unique genera in first-seen order; the dict doubles as the "seen" set
    # so membership tests are O(1) instead of scanning a list.
    repgenlist = []
    genera_seq_ids = {}
    for seq_id in repsetdic:
        genus = taxgendic[seq_id]
        if genus not in genera_seq_ids:
            genera_seq_ids[genus] = []
            repgenlist.append(genus)

    # Group IDs in taxonomy-file iteration order. Every ID found in the
    # representative set has its genus registered above, so no lookup here
    # can fail and no exception needs to be swallowed.
    for accession_id in taxgendic:
        if accession_id in repsetdic:
            genera_seq_ids[taxgendic[accession_id]].append(accession_id)

    for genus, seq_ids in genera_seq_ids.items():
        with open("g__" + genus + "_seqs.fasta", "w") as fout:
            for seq_id in seq_ids:
                record = BiologicalSequence(repsetdic[seq_id], id=seq_id)
                fout.write(record.to_fasta(terminal_character=""))
                fout.write("\n")

    # The original referenced an undefined name ``cwd`` here. Sizes are
    # queried by bare file name, so the directory listed must be the
    # process working directory.
    # NOTE(review): this removes *any* empty file in that directory.
    for entry in os.listdir(os.getcwd()):
        if os.path.isfile(entry) and os.path.getsize(entry) < 1:
            os.remove(entry)

    return repgenlist
def fasta_from_alignment(aln, make_seqlabel=None, line_wrap=None, sort=True):
    """Serialize an alignment (or id-to-sequence dict) as a FASTA string

    Parameters
    ----------
    aln : Alignment, dict
        alignment or dictionary where the keys are the sequence ids and
        the values are the sequences themselves.
    make_seqlabel : function, optional
        callback function that takes the seq object and returns a label
        ``str``. If ``None`` is passed, the following attributes will try to
        be retrieved in this order and the first to exist will be used:
        ``id``, ``Label`` or ``Name``. In any other case an integer
        with the position of the sequence object will be used.
    line_wrap : int, optional
        line_wrap: a integer for maximum line width, if ``None`` is passed
        the full sequence will be used.
    sort : bool, optional
        Whether or not the sequences should be sorted by their sequence
        id, default value is ``True``.

    Returns
    -------
    str
        FASTA formatted string composed of the objects passed in via `seqs`.

    See Also
    --------
    skbio.parse.sequences.parse_fasta
    skbio.alignment.Alignment

    Examples
    --------
    Formatting a sequence alignment object into a FASTA file.

    >>> from skbio.alignment import Alignment
    >>> from skbio.sequence import DNA
    >>> from skbio.format.sequences import fasta_from_alignment
    >>> seqs = [DNA("ACC--G-GGTA..", id="seq1"),
    ...     DNA("TCC--G-GGCA..", id="seqs2")]
    >>> a1 = Alignment(seqs)
    >>> print fasta_from_alignment(a1)
    >seq1
    ACC--G-GGTA..
    >seqs2
    TCC--G-GGCA..

    """
    # an Alignment exposes its labels via ids(), a dictionary via keys()
    if isinstance(aln, Alignment):
        labels = aln.ids()
    else:
        labels = aln.keys()
    labels = sorted(labels) if sort else labels

    ordered_seqs = []
    for label in labels:
        entry = aln[label]
        # plain strings are wrapped so they carry their label as id
        ordered_seqs.append(
            BiologicalSequence(entry, label) if isinstance(entry, str)
            else entry)

    return fasta_from_sequences(ordered_seqs, make_seqlabel=make_seqlabel,
                                line_wrap=line_wrap)
def hamming_distance(s1, s2):
    """Return the distance between ``s1`` and ``s2``.

    Both inputs are wrapped in ``BiologicalSequence`` and compared with its
    ``distance`` method.
    """
    # NOTE(review): the metric is whatever BiologicalSequence.distance uses
    # by default -- presumably Hamming, going by this function's name.
    return BiologicalSequence(s1).distance(BiologicalSequence(s2))