def test_arg_matrix_overrides_match_and_mismatch(self):
    """A custom substitution matrix must change the optimal score.

    Every sequence in the pool is aligned against every sequence in the
    same pool twice: once with a default ``StripedSmithWaterman`` and once
    with an explicit ``substitution_matrix``. The two optimal alignment
    scores are asserted to differ for every pair.
    """
    sequence_pool = [
        "TTATAATTAATTCTTATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
        "AGTCGAAGGGTAAGGGGTATAGGCGTGTCACCTA",
        "AGTCGAAGGGTAATA",
        "CTGCCTCAGGGGCGAGGAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
        "AGGGTAATTAGCGCGTGTTCACCTA",
    ]
    # The values below carry no biological meaning; they only need to
    # differ from the default match/mismatch scoring.
    custom_matrix = {
        "A": {"A": 4, "T": -1, "C": -2, "G": -3, "N": 4},
        "T": {"A": -1, "T": 1, "C": -1, "G": -4, "N": 1},
        "C": {"A": -2, "T": -1, "C": 10, "G": 1, "N": 1},
        "G": {"A": -3, "T": -4, "C": 1, "G": 3, "N": 1},
        "N": {"A": 4, "T": 1, "C": 1, "G": 1, "N": 0},
    }
    all_pairs = [(q, t) for q in sequence_pool for t in sequence_pool]
    for query_seq, target_seq in all_pairs:
        default_alignment = StripedSmithWaterman(query_seq)(target_seq)
        matrix_alignment = StripedSmithWaterman(
            query_seq, substitution_matrix=custom_matrix)(target_seq)
        self.assertNotEqual(default_alignment.optimal_alignment_score,
                            matrix_alignment.optimal_alignment_score)
def getSimilarityScoreTwoProteinLocalAlignText(proteinA: str, proteinB: str):
    """
    | Get similarity score between two proteins with local alignment.

    The Smith-Waterman score of ``proteinA`` against ``proteinB`` is divided
    by the score of ``proteinA`` aligned against itself.

    :param proteinA: protein A (amino-acid sequence)
    :param proteinB: protein B to compare to protein A
    :type proteinA: str
    :type proteinB: str
    :return: score(A, B) / score(A, A)
    :rtype: float
    """
    # One aligner is enough: it is built from proteinA and can be called
    # against any number of targets with the same scoring parameters.
    aligner = StripedSmithWaterman(
        proteinA,
        protein=True,
        substitution_matrix=constants.MATRIX_BLOSUM62,
        gap_open_penalty=10,
        gap_extend_penalty=1,
        score_only=True)

    # Read the score from the alignment attribute rather than parsing the
    # textual representation of the result.
    score = aligner(proteinB).optimal_alignment_score
    score_max = aligner(proteinA).optimal_alignment_score
    return score / score_max
def _check_argument_with_inequality_on_optimal_align_score(
        self, query_sequences=None, target_sequences=None, arg=None,
        default=None, i_range=None, compare_lt=None, compare_gt=None):
    """Compare scores obtained with varying values of one keyword argument.

    For every query/target pair and every value ``i`` in ``i_range``, one
    alignment is computed with ``arg=i`` and one with ``arg=default``.
    The optimal scores must be equal when ``i == default``; otherwise
    ``compare_lt`` (``i < default``) or ``compare_gt`` (``i > default``)
    is called with ``(score_for_i, score_for_default)``.
    """
    baseline_kwargs = {arg: default}
    for query_seq in query_sequences:
        for target_seq in target_sequences:
            for value in i_range:
                varied = StripedSmithWaterman(query_seq, **{arg: value})
                varied_score = varied(target_seq).optimal_alignment_score
                baseline = StripedSmithWaterman(query_seq,
                                                **baseline_kwargs)
                baseline_score = baseline(
                    target_seq).optimal_alignment_score

                if value == default:
                    self.assertEqual(varied_score, baseline_score)
                elif value < default:
                    compare_lt(varied_score, baseline_score)
                elif value > default:
                    compare_gt(varied_score, baseline_score)
def genotype_bam(self, bam_file, strain, read_length=100, add=False,
                 troubleshooting=False):
    """Genotype this variant from the reads around its two breakpoints.

    Reads fetched near ``self.start`` and ``self.stop`` are aligned, in both
    orientations, against the reference and alternate allele sequences.
    A read supports the allele whose best score is more than 10 higher
    than the other's; otherwise it is counted as ambiguous.

    :param bam_file: object exposing ``fetch(chrom, start, stop)`` whose
        reads have ``pos``, ``seq``, ``qname``, ``is_secondary`` and
        ``mapq`` (presumably a pysam alignment file -- confirm).
    :param strain: key used to store the call in ``self.geno`` when
        ``add`` is true.
    :param read_length: forwarded to ``ret_ref_allele`` and
        ``ret_alt_allele``.
    :param add: when true, record the call in ``self.geno[strain]``.
    :param troubleshooting: ``False`` to disable debug output, otherwise
        the file object the allele sequences and per-read scores are
        printed to.
    :return: ``(genotype, (n_ref, n_alt, n_ambiguous))`` where genotype is
        one of ``'./.'``, ``'0/0'``, ``'1/1'`` or ``'0/1'``.
    """
    debug = troubleshooting is not False

    ref_seq = self.ret_ref_allele(read_length)
    alt_seq = self.ret_alt_allele(read_length)
    if debug:
        print(ref_seq, file=troubleshooting)
        print(alt_seq, file=troubleshooting)

    # Collect reads spanning either breakpoint, keyed by read name so a
    # read seen at both breakpoints is only scored once.
    unique_reads = {}
    for anchor in (self.start, self.stop):
        for read in bam_file.fetch(self.chrom, max(0, anchor - 10),
                                   anchor + 10):
            if read.pos - 5 < anchor and read.pos + len(read.seq) > anchor:
                unique_reads[read.qname] = read

    geno = [0, 0, 0]  # [reference, alternate, ambiguous] read counts
    ref_SSW = StripedSmithWaterman(ref_seq)
    alt_SSW = StripedSmithWaterman(alt_seq)
    for read in unique_reads.values():
        if read.is_secondary or read.mapq <= 10:
            continue

        forward = read.seq
        reverse = reverse_complement(forward)
        # Each alignment is computed once and reused for both the call
        # and the optional debug output.
        ref_fwd = ref_SSW(forward).optimal_alignment_score
        ref_rev = ref_SSW(reverse).optimal_alignment_score
        alt_fwd = alt_SSW(forward).optimal_alignment_score
        alt_rev = alt_SSW(reverse).optimal_alignment_score

        margin = max(ref_fwd, ref_rev) - max(alt_fwd, alt_rev)
        if margin > 10:
            geno[0] += 1
        elif margin < -10:
            geno[1] += 1
        else:
            geno[2] += 1

        if debug:
            print(forward + '\t' + str(ref_fwd) + '\t' + str(alt_fwd),
                  file=troubleshooting)
            print(reverse + '\t' + str(ref_rev) + '\t' + str(alt_rev),
                  file=troubleshooting)

    counts = tuple(geno)
    if geno[0] == 0 and geno[1] == 0:
        # No informative read (this also covers the "no reads" case).
        geno_o = ('./.', counts)
    elif geno[0] > 4 * geno[1]:
        geno_o = ('0/0', counts)
    elif 4 * geno[0] < geno[1]:
        # Written without a fraction so the comparison is also correct
        # under integer division (1/4 == 0).
        geno_o = ('1/1', counts)
        self.empty = False
    else:
        geno_o = ('0/1', counts)
        self.empty = False

    if add:
        self.geno[strain] = geno_o
    return geno_o
def test_arg_gap_extend_penalty(self):
    """Check the effect of ``gap_extend_penalty`` on the optimal score.

    First a monotonicity check over penalties 1..9 against the default of
    2, then one fixed regression alignment with ``gap_extend_penalty=10``.
    """
    sequence_pool = [
        "TTATAATTTTCTTATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
        "AGTCGAAGGGTAATACTAGGCGTGTCACCTA",
        "AGTCGAAGGGTAATA",
        "CTGCCTCAGGGGGAGGCAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
        "AGGGTAATTAGGCGTGTTCACCTA",
    ]
    # The comparators are deliberately swapped: a smaller penalty may
    # only give an equal or higher score, a larger one equal or lower.
    self._check_argument_with_inequality_on_optimal_align_score(
        query_sequences=sequence_pool,
        target_sequences=sequence_pool,
        arg='gap_extend_penalty',
        default=2,
        i_range=range(1, 10),
        compare_lt=self.assertGreaterEqual,
        compare_gt=self.assertLessEqual)

    # The bounds above are not strict, so they would also pass if every
    # score equalled the default; pin one concrete alignment as well.
    expected = dict(
        optimal_alignment_score=9,
        suboptimal_alignment_score=8,
        query_begin=6,
        query_end=12,
        target_begin=7,
        target_end_optimal=13,
        target_end_suboptimal=38,
        cigar='7M',
        query_sequence='TCTATAAGATTCCGCATGCGTTACTTATAAGATGTCTCAACGG',
        target_sequence='GCCCAGTAGCTTCCCAATATGAGAGCATCAATTGTAGATCGGGCC',
    )
    aligner = StripedSmithWaterman(expected['query_sequence'],
                                   gap_extend_penalty=10)
    self._check_alignment(aligner(expected['target_sequence']), expected)
def _check_bit_flag_sets_properties_falsy_or_negative(
        self, query_sequences=None, target_sequences=None,
        arg_settings=(), properties_to_null=()):
    """Assert that given settings blank out exactly the listed properties.

    Each query/target pair is aligned with the keyword arguments given by
    ``arg_settings`` (an iterable of ``(name, value)`` pairs). Every
    property in ``properties_to_null`` must then be "falsy or negative",
    and every other name in ``self.align_attributes`` must not be.
    """
    # Immutable defaults replace the former shared list defaults; the
    # keyword dict is the same for every pair, so build it once.
    kwarg = dict(arg_settings)

    def falsy_or_negative(alignment, prop):
        value = alignment[prop]
        # Exact type check on purpose: bools must take the falsy branch.
        if type(value) is int:
            return value < 0
        return not value

    for query_sequence in query_sequences:
        for target_sequence in target_sequences:
            query = StripedSmithWaterman(query_sequence, **kwarg)
            alignment = query(target_sequence)
            for prop in properties_to_null:
                self.assertTrue(falsy_or_negative(alignment, prop))
            # Every property not in our null list
            for prop in self.align_attributes:
                if prop in properties_to_null:
                    continue
                self.assertFalse(falsy_or_negative(alignment, prop))
def test_regression_on_instantiation_arguments(self):
    """Pin one alignment with every constructor keyword given explicitly."""
    expected = dict(
        optimal_alignment_score=23,
        suboptimal_alignment_score=10,
        query_begin=0,
        query_end=16,
        target_begin=0,
        target_end_optimal=20,
        target_end_suboptimal=4,
        cigar='6M4D11M',
        query_sequence='AAACGATAAATCCGCGTA',
        target_sequence='AAACGACTACTAAATCCGCGTGATAGGGGA',
    )
    constructor_kwargs = dict(
        gap_open_penalty=5,
        gap_extend_penalty=2,
        score_size=2,
        mask_length=15,
        mask_auto=True,
        score_only=False,
        score_filter=None,
        distance_filter=None,
        override_skip_babp=False,
        protein=False,
        match_score=2,
        mismatch_score=-3,
        substitution_matrix=None,
        suppress_sequences=False,
        zero_index=True,
    )
    aligner = StripedSmithWaterman(expected['query_sequence'],
                                   **constructor_kwargs)
    self._check_alignment(aligner(expected['target_sequence']), expected)
def test_protein_sequence_is_usable(self):
    """Align two protein sequences with ``blosum50`` and pin the result."""
    query_seq = ('VHLTGEEKSAVAALWGKVNVDEVGGEALGRXLLVVYPWTQRFFESF'
                 'SDLSTPDABVMSNPKVKAHGK')
    target_seq = ('VHLTPEEKSAVTALWBGKVNVDEVGGEALGRLLVVYPWTQRFFES'
                  'FGDLSTPD*')
    expected = dict(
        optimal_alignment_score=316,
        suboptimal_alignment_score=95,
        query_begin=0,
        query_end=52,
        target_begin=0,
        target_end_optimal=52,
        target_end_suboptimal=18,
        cigar='15M1D15M1I22M',
        query_sequence=query_seq,
        target_sequence=target_seq,
    )
    aligner = StripedSmithWaterman(query_seq, protein=True,
                                   substitution_matrix=blosum50)
    self._check_alignment(aligner(target_seq), expected)
def test_works_for_dot_and_square_bracket_access(self):
    """Attribute access and item access must return the same values."""
    aligner = StripedSmithWaterman("AGGGTAATTAGGCGTGTTCACCTA")
    alignment = aligner("TACTTATAAGATGTCTCAACGGCATGCGCAACTTGTGAAGTG")
    for name in self.align_attributes:
        via_attribute = getattr(alignment, name)
        via_item = alignment[name]
        self.assertEqual(via_attribute, via_item)
def test_aligned_query_target_sequence(self):
    """Pin the gapped target and query strings for one known alignment."""
    aligner = StripedSmithWaterman("AGGGTAATTAGGCGTGTTCACCTA")
    alignment = aligner("AGTCGAAGGGTAATATAGGCGTGTCACCTA")

    expected_target = "AGGGTAATATAGGCGT-GTCACCTA"
    expected_query = "AGGGTAAT-TAGGCGTGTTCACCTA"
    self.assertEqual(expected_target, alignment.aligned_target_sequence)
    self.assertEqual(expected_query, alignment.aligned_query_sequence)
def get_brute_force_mapping(A_kmer_list, B_kmer_list):
    """Map each k-mer of A to its best-scoring k-mer of B.

    Every k-mer in ``A_kmer_list`` is aligned against every k-mer in
    ``B_kmer_list``. The B k-mer with the highest optimal alignment score
    that is at least ``SW_SCORE_THRESHOLD`` is kept; on ties the first one
    encountered wins.

    Returns a list of ``(a_kmer, score, b_kmer)`` tuples; A k-mers with no
    qualifying partner are omitted.
    """
    mappings = []
    for a_kmer in A_kmer_list:
        aligner = StripedSmithWaterman(a_kmer,
                                       match_score=1,
                                       mismatch_score=-1,
                                       gap_open_penalty=1,
                                       gap_extend_penalty=1,
                                       mask_length=0)
        best_score, best_b_kmer = 0, ""
        for b_kmer in B_kmer_list:
            candidate = aligner(b_kmer)['optimal_alignment_score']
            if candidate < SW_SCORE_THRESHOLD or candidate <= best_score:
                continue
            best_score, best_b_kmer = candidate, b_kmer
        # A best score of 0 means no B k-mer qualified.
        if best_score == 0:
            continue
        mappings.append((a_kmer, best_score, best_b_kmer))
    return mappings
def test_same_as_using_StripedSmithWaterman_object(self):
    """``local_pairwise_align_ssw`` must agree with the aligner object."""
    query_seq = 'ATGGAAGCTATAAGCGCGGGTGAG'
    target_seq = 'AACTTATATAATAAAAATTATATATTCGTTGGGTTCTTTTGATATAAATC'

    from_object = StripedSmithWaterman(query_seq)(target_seq)
    from_function = local_pairwise_align_ssw(query_seq, target_seq)
    self._check_Alignment_to_AlignmentStructure(from_function, from_object)
def within_cluster_analysis(clus_id, labels, umis, maxiter=200):
    """Return the mean pairwise alignment score inside one cluster.

    :param clus_id: label of the cluster to analyse.
    :param labels: per-read cluster labels; ``labels[i]`` belongs to
        ``umis[i]``.
    :param umis: indexable collection of records with a ``seq`` attribute.
    :param maxiter: upper bound on how many cluster members are compared
        (the first ``maxiter`` members), to keep the run time bounded.
    :return: ``(mean_score, cluster_size)``. The mean is ``0.0`` when
        fewer than two sequences can be compared, i.e. the cluster has at
        most one member or ``maxiter`` is below 2.
    """
    # Indices of the reads that belong to the requested cluster.
    cluster_members = [idx for idx, label in enumerate(labels)
                       if label == clus_id]
    cluster_size = len(cluster_members)

    # No pair can be formed with fewer than two sequences. Returning here
    # also avoids a division by zero when maxiter < 2.
    limit = min(maxiter, cluster_size)
    if limit < 2:
        return 0.0, cluster_size

    sequences = [str(umis[member].seq) for member in cluster_members[:limit]]

    # All-vs-all within the first `limit` members. This stays serial: the
    # number of alignments is small compared with the clustering itself.
    total_score = 0
    count = 0
    for i, sequence in enumerate(sequences[:-1]):
        query = StripedSmithWaterman(sequence)
        for other in sequences[i + 1:]:
            total_score += query(other).optimal_alignment_score
            count += 1
    return total_score / count, cluster_size
def fasta_metric(s, S):
    """Score sequence ``s`` against every sequence in ``S``.

    Uses a protein aligner built from ``s`` with the PAM250 matrix and
    returns the optimal alignment scores as a list, in the order of ``S``.
    """
    pam250 = subs_mat(MatrixInfo.pam250)
    aligner = StripedSmithWaterman(s, protein=True,
                                   substitution_matrix=pam250)
    return [aligner(target)['optimal_alignment_score'] for target in S]
def _align(self):
    """Align ``self.query`` against ``self.target`` and return the result.

    The scoring parameters are taken from this instance's ``_match``,
    ``_mismatch``, ``_gap_open``, ``_gap_extend``, ``_matrix`` and ``_aa``.
    """
    scoring = dict(
        match_score=self._match,
        mismatch_score=self._mismatch,
        gap_open_penalty=self._gap_open,
        gap_extend_penalty=self._gap_extend,
        substitution_matrix=self._matrix,
        protein=self._aa,
    )
    ssw = StripedSmithWaterman(self.query.sequence, **scoring)
    return ssw(self.target.sequence)
def align_to_reference_sequence(self, reference_sequence, n_base_pairs=None):
    """Score the start of this read against the start of a reference.

    Only the first ``n_base_pairs`` characters of both sequences are used;
    by default that is the full length of ``reference_sequence``.

    Returns the optimal alignment score.
    """
    window = len(reference_sequence) if n_base_pairs is None else n_base_pairs
    reference_prefix = reference_sequence[0:window]
    read_prefix = self.read_sequence[0:window]
    aligner = StripedSmithWaterman(reference_prefix, score_only=True)
    return aligner(read_prefix)["optimal_alignment_score"]
def test_kwargs_are_usable(self):
    """Keyword arguments must be honoured by both alignment entry points."""
    scoring = {'mismatch_score': -2, 'match_score': 5}
    query_seq = 'AGGGTAATTAGGCGTGTTCACCTA'
    target_seq = 'TACTTATAAGATGTCTCAACGGCATGCGCAACTTGTGAAGTG'

    from_object = StripedSmithWaterman(query_seq, **scoring)(target_seq)
    from_function = local_pairwise_align_ssw(query_seq, target_seq,
                                             **scoring)
    self._check_Alignment_to_AlignmentStructure(from_function, from_object)
def test_same_as_using_StripedSmithWaterman_object_Protein(self):
    """Protein input: the function wrapper must agree with the object."""
    query_seq = 'HEAGAWGHEE'
    target_seq = 'PAWHEAE'

    aligner = StripedSmithWaterman(query_seq, protein=True,
                                   substitution_matrix=blosum50)
    from_object = aligner(target_seq)
    from_function = local_pairwise_align_ssw(Protein(query_seq),
                                             Protein(target_seq),
                                             substitution_matrix=blosum50)
    self._check_TabularMSA_to_AlignmentStructure(from_function, from_object,
                                                 Protein)
def _fasta_similarity_func_sym(seq):
    """Build a vectorized similarity function anchored on ``seq``.

    The returned callable scores a sequence against ``seq`` and divides the
    optimal alignment score by ``sqrt(seq_norm(seq) * seq_norm(other))``.
    It is wrapped in ``np.vectorize`` so it can be applied element-wise.
    """
    anchor_norm = seq_norm(seq)
    aligner = StripedSmithWaterman(seq, protein=True,
                                   substitution_matrix=substitution_data)

    def normalized_score(other):
        other_norm = seq_norm(other)
        raw_score = aligner(other).optimal_alignment_score
        return raw_score / np.sqrt(anchor_norm * other_norm)

    return np.vectorize(normalized_score)
def test_arg_zero_index_changes_base_of_index_to_0_or_1(self):
    """``zero_index`` must shift every reported coordinate by one.

    The same pair is aligned with ``zero_index=True`` and ``False``; the
    expected coordinates for the 1-based case are the 0-based ones plus 1.
    """
    query_seq = ('AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCG'
                 'CCCCGGGCGGGGC')
    target_seq = ('CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCC'
                  'GGGCGGGGC')

    def expectation(base):
        # `base` is 0 for zero-based and 1 for one-based coordinates.
        return {
            'optimal_alignment_score': 100,
            'suboptimal_alignment_score': 44,
            'query_begin': 5 + base,
            'query_end': 54 + base,
            'target_begin': 0 + base,
            'target_end_optimal': 49 + base,
            'target_end_suboptimal': 21 + base,
            'cigar': '50M',
            'query_sequence': query_seq,
            'target_sequence': target_seq,
        }

    cases = [(expectation(0), True), (expectation(1), False)]
    for expected, zero_based in cases:
        aligner = StripedSmithWaterman(expected['query_sequence'],
                                       zero_index=zero_based)
        self._check_alignment(aligner(expected['target_sequence']),
                              expected)
def ssw_search_contig(outfile, seq_identity, queries, targets):
    """Report query/target pairs with a long, high-scoring local alignment.

    :param outfile: writable text file object; one tab-separated line
        ``query_id, target_id, score, alignment_length`` is printed per hit.
    :param seq_identity: minimum normalised score (0-100) for a hit.
    :param queries: FASTA source of query records, passed to
        ``SeqIO.parse``.
    :param targets: FASTA source of target records, passed to
        ``SeqIO.parse``; read fully into memory once.

    The score is normalised as ``score / (2 * alignment_length) * 100``
    (NOTE(review): the factor 2 presumably is the default match score --
    confirm). Alignments shorter than 256 columns are never reported.
    """
    min_alignment_length = 256

    # Targets are iterated once per query, so materialise them.
    target_records = list(SeqIO.parse(targets, 'fasta'))
    for qrec in SeqIO.parse(queries, 'fasta'):
        aligner = StripedSmithWaterman(str(qrec.seq))
        for trec in target_records:
            alignment = aligner(str(trec.seq))
            aln_len = len(alignment.aligned_query_sequence)
            # Filter on length before normalising: short alignments are
            # never reported, and an empty one would divide by zero.
            if aln_len < min_alignment_length:
                continue
            aln_score = alignment.optimal_alignment_score
            aln_score /= (2 * aln_len)
            aln_score *= 100
            if seq_identity <= aln_score:
                out = f'{qrec.id}\t{trec.id}\t{aln_score}\t{aln_len}'
                print(out, file=outfile)
def pairwise_ssw(fasta, outfile):
    """Align every unordered pair of sequences and write the results as CSV.

    :param fasta: mapping of header -> sequence string.
    :param outfile: path of the CSV file to create (overwritten).

    Headers are processed in sorted order and each sequence is aligned
    against itself and every later one, so each pair appears once. The
    columns are the header row written below.
    """
    headers = tuple(sorted(fasta.keys()))
    with open(outfile, 'w') as out:
        out.write(
            'query_header,target_header,aln_score,qstart,qend,tstart,tend,cigar\n'
        )
        for i, query_header in enumerate(headers):
            # The aligner depends only on the query, so build it once per
            # query instead of once per pair.
            ssw = StripedSmithWaterman(fasta[query_header])
            for target_header in headers[i:]:
                res = ssw(fasta[target_header])
                out.write('{},{},{},{},{},{},{},{}\n'.format(
                    query_header, target_header,
                    res['optimal_alignment_score'],
                    res['query_begin'], res['query_end'],
                    res['target_begin'], res['target_end_optimal'],
                    res['cigar']))
def test_align_with_N_in_nucleotide_sequence(self):
    """Sequences containing ``N`` must align; pin the resulting alignment."""
    expected = dict(
        optimal_alignment_score=9,
        suboptimal_alignment_score=0,
        query_begin=0,
        query_end=8,
        target_begin=0,
        target_end_optimal=9,
        target_end_suboptimal=0,
        cigar='4M1D5M',
        query_sequence='ACTCANNATCGANCTAGC',
        target_sequence='ACTCGAAAATGTNNGCA',
    )
    aligner = StripedSmithWaterman(expected['query_sequence'])
    self._check_alignment(aligner(expected['target_sequence']), expected)
def test_lowercase_is_valid_sequence(self):
    """Lower-case nucleotide input must be accepted; pin the alignment."""
    expected = dict(
        optimal_alignment_score=23,
        suboptimal_alignment_score=10,
        query_begin=0,
        query_end=16,
        target_begin=0,
        target_end_optimal=20,
        target_end_suboptimal=4,
        cigar='6M4D11M',
        query_sequence='aaacgataaatccgcgta',
        target_sequence='aaacgactactaaatccgcgtgatagggga',
    )
    aligner = StripedSmithWaterman(expected['query_sequence'])
    self._check_alignment(aligner(expected['target_sequence']), expected)
def score(self, seq_a, seq_b):
    """Return the local alignment score of two sequences, divided by 100.

    Both inputs are passed through ``trim_seq`` and converted to ``str``
    before alignment. The same ``self.gap_penalty`` is used for opening
    and extending a gap, and ``self.aa.as_int_dict()`` supplies the
    substitution matrix.
    """
    # StripedSmithWaterman needs plain str rather than unicode.
    trimmed_a = str(trim_seq(seq_a))
    trimmed_b = str(trim_seq(seq_b))

    from skbio.alignment import StripedSmithWaterman

    # Per the SSW library README
    # (https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library/
    # blob/master/README.md): "Note: When SSW open a gap, the gap open
    # penalty alone is applied."
    aligner = StripedSmithWaterman(
        trimmed_a,
        protein=True,
        gap_open_penalty=self.gap_penalty,
        gap_extend_penalty=self.gap_penalty,
        substitution_matrix=self.aa.as_int_dict())
    raw_score = aligner(trimmed_b)["optimal_alignment_score"]

    # Scaled so the result is in line with SeqAlignScorer.
    return raw_score / 100.
def _align(self):
    """Align ``self.query`` against ``self.target`` and return the result.

    On Python 2, ``unicode`` sequences are encoded to ASCII byte strings
    first; on later versions the sequences are used unchanged.
    """
    query_seq = self.query.sequence
    target_seq = self.target.sequence
    if sys.version_info[0] == 2:
        if isinstance(query_seq, unicode):
            query_seq = query_seq.encode('ascii')
        if isinstance(target_seq, unicode):
            target_seq = target_seq.encode('ascii')

    scoring = dict(
        match_score=self._match,
        mismatch_score=self._mismatch,
        gap_open_penalty=self._gap_open,
        gap_extend_penalty=self._gap_extend,
        substitution_matrix=self._matrix,
        protein=self._aa,
    )
    ssw = StripedSmithWaterman(query_seq, **scoring)
    return ssw(target_seq)
def test_arg_suppress_sequences(self):
    """With ``suppress_sequences=True`` both sequences come back empty."""
    expected = dict(
        optimal_alignment_score=100,
        suboptimal_alignment_score=44,
        query_begin=5,
        query_end=54,
        target_begin=0,
        target_end_optimal=49,
        target_end_suboptimal=21,
        cigar='50M',
        query_sequence='',
        target_sequence='',
    )
    aligner = StripedSmithWaterman(
        "AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCCGGGCGGGGC",
        suppress_sequences=True)
    result = aligner("CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCCGGGCGGGGC")
    self._check_alignment(result, expected)
def test_is_zero_based_returns_true_if_index_base_is_zero(self):
    """``is_zero_based()`` must echo the ``zero_index`` constructor flag."""
    query_seq = ('AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCG'
                 'CCCCGGGCGGGGC')
    target_seq = ('CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCC'
                  'GGGCGGGGC')
    for zero_based in (True, False):
        aligner = StripedSmithWaterman(query_seq, zero_index=zero_based)
        alignment = aligner(target_seq)
        self.assertEqual(zero_based, alignment.is_zero_based())
def within_cluster_analysis(clus_id, labels, maxiter=100):
    """Return the mean pairwise alignment score inside one cluster.

    Sequences are looked up in the module-level ``reads`` collection by
    the index of their label in ``labels``.

    :param clus_id: label of the cluster to analyse.
    :param labels: per-read cluster labels; ``labels[i]`` belongs to
        ``reads[i]``.
    :param maxiter: upper bound on how many cluster members are compared
        (the first ``maxiter`` members), to keep the run time bounded.
    :return: ``(mean_score, cluster_size)``. The mean is ``0.0`` when
        fewer than two sequences can be compared, i.e. the cluster has at
        most one member or ``maxiter`` is below 2.
    """
    # Indices of the reads that belong to the requested cluster.
    cluster_members = [idx for idx, label in enumerate(labels)
                       if label == clus_id]
    cluster_size = len(cluster_members)

    # No pair can be formed with fewer than two sequences. Returning here
    # also avoids a division by zero when maxiter < 2.
    limit = min(maxiter, cluster_size)
    if limit < 2:
        return 0.0, cluster_size

    compared = cluster_members[:limit]
    total_score = 0
    count = 0
    for i, member in enumerate(compared[:-1]):
        # Only the score is needed, so skip the traceback work.
        query = StripedSmithWaterman(reads[member], score_only=True)
        for other in compared[i + 1:]:
            total_score += query(reads[other]).optimal_alignment_score
            count += 1
    return total_score / count, cluster_size
def align_to_reference_sequence(self, reference_sequence, n_base_pairs=None):
    """Align a window of this read against a window of the reference.

    Each window starts up to 5 characters before the current offset
    (``self._current_ref_offset`` / ``self._current_read_offset``) and ends
    ``n_base_pairs`` after it; ``n_base_pairs`` defaults to the length of
    ``reference_sequence``. The reference window is the aligner's query
    and the read window is its target.

    Returns ``(optimal_alignment_score, query_end, target_end_optimal)``.
    """
    span = len(reference_sequence) if n_base_pairs is None else n_base_pairs

    ref_start = max(0, self._current_ref_offset - 5)
    ref_window = reference_sequence[ref_start:self._current_ref_offset + span]

    read_start = max(0, self._current_read_offset - 5)
    read_window = self.read_sequence[
        read_start:self._current_read_offset + span]

    aligner = StripedSmithWaterman(ref_window, score_only=False)
    result = aligner(read_window)
    return (result["optimal_alignment_score"],
            result["query_end"],
            result["target_end_optimal"])