Пример #1
0
    def test_arg_matrix_overrides_match_and_mismatch(self):
        """A custom ``substitution_matrix`` must change the optimal score.

        Every sequence is aligned against every sequence (itself included),
        once with no extra arguments and once with ``substitution_matrix``
        set; the two optimal alignment scores are asserted to differ.
        """
        sequences = [
            "TTATAATTAATTCTTATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
            "AGTCGAAGGGTAAGGGGTATAGGCGTGTCACCTA",
            "AGTCGAAGGGTAATA",
            "CTGCCTCAGGGGCGAGGAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
            "AGGGTAATTAGCGCGTGTTCACCTA"
        ]
        # The scores below are arbitrary on purpose: the matrix carries no
        # biological meaning, it only has to differ from the defaults.
        arbitrary_matrix = {
            "A": {"A": 4,  "T": -1, "C": -2, "G": -3, "N": 4},
            "T": {"A": -1, "T": 1,  "C": -1, "G": -4, "N": 1},
            "C": {"A": -2, "T": -1, "C": 10, "G": 1,  "N": 1},
            "G": {"A": -3, "T": -4, "C": 1,  "G": 3,  "N": 1},
            "N": {"A": 4,  "T": 1,  "C": 1,  "G": 1,  "N": 0}
        }
        all_pairs = [(query_seq, target_seq)
                     for query_seq in sequences
                     for target_seq in sequences]
        for query_seq, target_seq in all_pairs:
            plain_alignment = StripedSmithWaterman(query_seq)(target_seq)

            matrix_aligner = StripedSmithWaterman(
                query_seq, substitution_matrix=arbitrary_matrix)
            matrix_alignment = matrix_aligner(target_seq)

            self.assertNotEqual(plain_alignment.optimal_alignment_score,
                                matrix_alignment.optimal_alignment_score)
Пример #2
0
def getSimilarityScoreTwoProteinLocalAlignText(proteinA: str, proteinB: str):
    """
    | Get similarity score between two proteins with local alignment.

    The score of aligning ``proteinA`` against ``proteinB`` is divided by the
    score of aligning ``proteinA`` against itself.

    :param proteinA: protein A (used as the alignment query)
    :param proteinB: protein B to compare to protein A

    :type proteinA: str
    :type proteinB: str

    :return: similarity score between the two proteins
    :rtype: float
    """

    # using optimized Smith-Waterman; a single query profile is enough
    # because both alignments use proteinA as the query with the same settings
    query = StripedSmithWaterman(proteinA,
                                 protein=True,
                                 substitution_matrix=constants.MATRIX_BLOSUM62,
                                 gap_open_penalty=10,
                                 gap_extend_penalty=1,
                                 score_only=True)

    # read the scores from the alignment objects directly instead of parsing
    # their string representation
    score = query(proteinB).optimal_alignment_score
    score_max = query(proteinA).optimal_alignment_score
    return score / score_max
Пример #3
0
    def _check_argument_with_inequality_on_optimal_align_score(
            self,
            query_sequences=None,
            target_sequences=None,
            arg=None,
            default=None,
            i_range=None,
            compare_lt=None,
            compare_gt=None):
        """Compare optimal scores while sweeping one keyword argument.

        For every query/target pair and every value in ``i_range``, one
        alignment is made with ``arg`` set to that value and one with ``arg``
        set to ``default``. The two optimal scores are then checked with
        ``assertEqual`` when the value equals ``default``, with ``compare_lt``
        when it is smaller and with ``compare_gt`` when it is larger.
        """
        baseline_kwargs = {arg: default}
        trial_kwargs = {}
        for query_seq in query_sequences:
            for target_seq in target_sequences:
                for value in i_range:
                    trial_kwargs[arg] = value
                    trial_aligner = StripedSmithWaterman(query_seq,
                                                         **trial_kwargs)
                    trial = trial_aligner(target_seq)

                    baseline_aligner = StripedSmithWaterman(
                        query_seq, **baseline_kwargs)
                    baseline = baseline_aligner(target_seq)

                    if value == default:
                        self.assertEqual(trial.optimal_alignment_score,
                                         baseline.optimal_alignment_score)
                    elif value < default:
                        compare_lt(trial.optimal_alignment_score,
                                   baseline.optimal_alignment_score)
                    elif value > default:
                        compare_gt(trial.optimal_alignment_score,
                                   baseline.optimal_alignment_score)
Пример #4
0
 def genotype_bam(self, bam_file, strain, read_length = 100, add = False, troubleshooting = False):
     """Genotype this variant in one BAM file by re-aligning nearby reads.

     Reads fetched around ``self.start`` and ``self.stop`` are aligned (both
     as-is and reverse-complemented) against the reference and alternate
     allele sequences. Each read votes for the reference allele, the
     alternate allele, or neither, depending on which score wins by more
     than 10.

     :param bam_file: object exposing ``fetch(chrom, start, stop)``
     :param strain: key under which the result is stored when ``add`` is set
     :param read_length: forwarded to ``ret_ref_allele``/``ret_alt_allele``
     :param add: when equal to ``True``, store the result in ``self.geno``
     :param troubleshooting: ``False`` to disable, otherwise a file-like
         object that receives the allele sequences and per-read scores
     :return: ``(genotype, (n_ref, n_alt, n_ambiguous))`` where genotype is
         one of ``'./.'``, ``'0/0'``, ``'0/1'``, ``'1/1'``
     """
     ref_seq = self.ret_ref_allele(read_length)
     alt_seq = self.ret_alt_allele(read_length)
     # Deliberately an equality test rather than truthiness, so every value
     # that used to enable the trace output still does.
     debug = troubleshooting != False
     if debug:
         print(ref_seq, file = troubleshooting)
         print(alt_seq, file = troubleshooting)

     # Gather reads overlapping either end of the variant. Keying by read
     # name means a later read with the same name replaces the earlier one.
     unique_reads = {}
     for edge in (self.start, self.stop):
         for read in bam_file.fetch(self.chrom, max(0, edge - 10), edge + 10):
             if read.pos - 5 < edge and read.pos + len(read.seq) > edge:
                 unique_reads[read.qname] = read

     geno = [0, 0, 0]  # [reference votes, alternate votes, undecided]
     ref_SSW = StripedSmithWaterman(ref_seq)
     alt_SSW = StripedSmithWaterman(alt_seq)

     for read in unique_reads.values():
         if not read.is_secondary and read.mapq > 10:
             # Align each orientation once and reuse the scores for both the
             # vote and the optional trace output.
             forward = read.seq
             reverse = reverse_complement(forward)
             ref_forward = ref_SSW(forward).optimal_alignment_score
             ref_reverse = ref_SSW(reverse).optimal_alignment_score
             alt_forward = alt_SSW(forward).optimal_alignment_score
             alt_reverse = alt_SSW(reverse).optimal_alignment_score

             margin = max(ref_forward, ref_reverse) - max(alt_forward, alt_reverse)
             if margin > 10:
                 geno[0] += 1
             elif margin < -10:
                 geno[1] += 1
             else:
                 geno[2] += 1
             if debug:
                 print(forward + '\t' + str(ref_forward) + '\t' + str(alt_forward), file = troubleshooting)
                 print(reverse + '\t' + str(ref_reverse) + '\t' + str(alt_reverse), file = troubleshooting)

     counts = tuple(geno)
     # With no reads at all both vote counters are zero, so the first branch
     # also covers the empty case.
     if geno[0] == 0 and geno[1] == 0:
         geno_o = ('./.', counts)
     elif geno[0] > 4 * geno[1]:
         geno_o = ('0/0', counts)
     elif 4 * geno[0] < geno[1]:
         # Integer form of geno[0] < geno[1] / 4; does not depend on the
         # division semantics of the running interpreter.
         geno_o = ('1/1', counts)
         self.empty = False
     else:
         geno_o = ('0/1', counts)
         self.empty = False
     if add == True:
         self.geno[strain] = geno_o
     return geno_o
Пример #5
0
 def test_arg_gap_extend_penalty(self):
     """Exercise ``gap_extend_penalty`` with a sweep and a fixed case.

     The sweep hands values 1..9 (default 2) to the shared inequality
     helper, with ``assertGreaterEqual`` for smaller values and
     ``assertLessEqual`` for larger ones. A fixed expected alignment with
     ``gap_extend_penalty=10`` is then checked as well.
     """
     sequences = [
         "TTATAATTTTCTTATTATTATCAATATTTATAATTTGATTTTGTTGTAAT",
         "AGTCGAAGGGTAATACTAGGCGTGTCACCTA",
         "AGTCGAAGGGTAATA",
         "CTGCCTCAGGGGGAGGCAAAGCGTCAGCGCGGCTGCCGTCGGCGCAGGGGC",
         "AGGGTAATTAGGCGTGTTCACCTA"
     ]
     self._check_argument_with_inequality_on_optimal_align_score(
         query_sequences=sequences,
         target_sequences=sequences,
         arg='gap_extend_penalty',
         default=2,
         i_range=range(1, 10),
         # Inverted on purpose: the comparison for smaller values is >=
         # and the one for larger values is <=.
         compare_lt=self.assertGreaterEqual,
         compare_gt=self.assertLessEqual)

     # The bounds above are not strict, so a run where every score equals
     # the default would pass; pin one concrete alignment to close that gap.
     query_text = 'TCTATAAGATTCCGCATGCGTTACTTATAAGATGTCTCAACGG'
     target_text = 'GCCCAGTAGCTTCCCAATATGAGAGCATCAATTGTAGATCGGGCC'
     expected = {
         'optimal_alignment_score': 9,
         'suboptimal_alignment_score': 8,
         'query_begin': 6,
         'query_end': 12,
         'target_begin': 7,
         'target_end_optimal': 13,
         'target_end_suboptimal': 38,
         'cigar': '7M',
         'query_sequence': query_text,
         'target_sequence': target_text
     }
     aligner = StripedSmithWaterman(query_text, gap_extend_penalty=10)
     self._check_alignment(aligner(target_text), expected)
Пример #6
0
    def _check_bit_flag_sets_properties_falsy_or_negative(
            self,
            query_sequences=None,
            target_sequences=None,
            arg_settings=(),
            properties_to_null=()):
        """Assert which alignment properties a set of arguments blanks out.

        Every query/target pair is aligned with the keyword arguments given
        as ``(name, value)`` pairs in ``arg_settings``. Each property listed
        in ``properties_to_null`` must then be "null" and every other name in
        ``self.align_attributes`` must not be. A property is "null" when it
        is an ``int`` below zero or, for any other type, falsy.
        """
        # Both depend only on the arguments, so build them once up front.
        kwarg = {arg: setting for arg, setting in arg_settings}
        properties_to_keep = [p for p in self.align_attributes
                              if p not in properties_to_null]

        def falsy_or_negative(alignment, prop):
            value = alignment[prop]
            # Exact type check on purpose: a bool must take the falsy branch.
            if type(value) is int:
                return value < 0
            return not value

        for query_sequence in query_sequences:
            for target_sequence in target_sequences:
                query = StripedSmithWaterman(query_sequence, **kwarg)
                alignment = query(target_sequence)
                for prop in properties_to_null:
                    self.assertTrue(falsy_or_negative(alignment, prop))
                # Every property not in our null list
                for prop in properties_to_keep:
                    self.assertFalse(falsy_or_negative(alignment, prop))
Пример #7
0
 def test_regression_on_instantiation_arguments(self):
     """Pin the alignment produced when every constructor argument is given."""
     query_text = 'AAACGATAAATCCGCGTA'
     target_text = 'AAACGACTACTAAATCCGCGTGATAGGGGA'
     expected = {
         'optimal_alignment_score': 23,
         'suboptimal_alignment_score': 10,
         'query_begin': 0,
         'query_end': 16,
         'target_begin': 0,
         'target_end_optimal': 20,
         'target_end_suboptimal': 4,
         'cigar': '6M4D11M',
         'query_sequence': query_text,
         'target_sequence': target_text
     }
     # Spell out every keyword so a change to any of them shows up here.
     constructor_options = dict(
         gap_open_penalty=5,
         gap_extend_penalty=2,
         score_size=2,
         mask_length=15,
         mask_auto=True,
         score_only=False,
         score_filter=None,
         distance_filter=None,
         override_skip_babp=False,
         protein=False,
         match_score=2,
         mismatch_score=-3,
         substitution_matrix=None,
         suppress_sequences=False,
         zero_index=True,
     )
     aligner = StripedSmithWaterman(query_text, **constructor_options)
     self._check_alignment(aligner(target_text), expected)
Пример #8
0
 def test_protein_sequence_is_usable(self):
     """Align two protein strings with ``blosum50`` and pin the result."""
     query_text = ('VHLTGEEKSAVAALWGKVNVDEVGGEALGRXLLVVYPWTQRFFESF'
                   'SDLSTPDABVMSNPKVKAHGK')
     target_text = ('VHLTPEEKSAVTALWBGKVNVDEVGGEALGRLLVVYPWTQRFFES'
                    'FGDLSTPD*')
     expected = {
         'optimal_alignment_score': 316,
         'suboptimal_alignment_score': 95,
         'query_begin': 0,
         'query_end': 52,
         'target_begin': 0,
         'target_end_optimal': 52,
         'target_end_suboptimal': 18,
         'cigar': '15M1D15M1I22M',
         'query_sequence': query_text,
         'target_sequence': target_text
     }
     aligner = StripedSmithWaterman(query_text,
                                    protein=True,
                                    substitution_matrix=blosum50)
     self._check_alignment(aligner(target_text), expected)
Пример #9
0
 def test_works_for_dot_and_square_bracket_access(self):
     """Attribute access and item access must return equal values.

     Checked for every name in ``self.align_attributes``.
     """
     aligner = StripedSmithWaterman("AGGGTAATTAGGCGTGTTCACCTA")
     result = aligner("TACTTATAAGATGTCTCAACGGCATGCGCAACTTGTGAAGTG")
     for name in self.align_attributes:
         via_attribute = getattr(result, name)
         via_item = result[name]
         self.assertEqual(via_attribute, via_item)
Пример #10
0
 def test_aligned_query_target_sequence(self):
     """Pin the gapped target and query strings for one known alignment."""
     aligner = StripedSmithWaterman("AGGGTAATTAGGCGTGTTCACCTA")
     result = aligner("AGTCGAAGGGTAATATAGGCGTGTCACCTA")

     expected_target = "AGGGTAATATAGGCGT-GTCACCTA"
     self.assertEqual(expected_target, result.aligned_target_sequence)

     expected_query = "AGGGTAAT-TAGGCGTGTTCACCTA"
     self.assertEqual(expected_query, result.aligned_query_sequence)
Пример #11
0
def get_brute_force_mapping(A_kmer_list, B_kmer_list):
    """Map each k-mer in A to its best-scoring k-mer in B.

    Every A k-mer is aligned against every B k-mer. The B k-mer with the
    highest optimal alignment score is kept, provided that score reaches
    ``SW_SCORE_THRESHOLD`` and is above zero; on ties the first one seen wins.

    Returns a list of ``(a_kmer, score, b_kmer)`` tuples, one per A k-mer
    that found a match, in the order of ``A_kmer_list``.
    """
    mappings = []
    for a_kmer in A_kmer_list:
        aligner = StripedSmithWaterman(a_kmer,
                                       match_score=1,
                                       mismatch_score=-1,
                                       gap_open_penalty=1,
                                       gap_extend_penalty=1,
                                       mask_length=0)
        best_score, best_b_kmer = 0, ""
        for candidate in B_kmer_list:
            candidate_score = aligner(candidate)['optimal_alignment_score']
            # Skip anything below the threshold or not strictly better than
            # the best score so far.
            if candidate_score < SW_SCORE_THRESHOLD or candidate_score <= best_score:
                continue
            best_score, best_b_kmer = candidate_score, candidate
        if best_score != 0:
            mappings.append((a_kmer, best_score, best_b_kmer))
    return mappings
Пример #12
0
 def test_same_as_using_StripedSmithWaterman_object(self):
     """``local_pairwise_align_ssw`` must agree with a direct aligner call."""
     query_text = 'ATGGAAGCTATAAGCGCGGGTGAG'
     target_text = 'AACTTATATAATAAAAATTATATATTCGTTGGGTTCTTTTGATATAAATC'
     direct_result = StripedSmithWaterman(query_text)(target_text)
     wrapper_result = local_pairwise_align_ssw(query_text, target_text)
     self._check_Alignment_to_AlignmentStructure(wrapper_result,
                                                 direct_result)
Пример #13
0
def within_cluster_analysis(clus_id, labels, umis, maxiter=200):
    """Return the mean pairwise alignment score inside one cluster.

    ``labels[i]`` is the cluster label of read ``i`` and ``umis[i].seq`` is
    its sequence (converted with ``str`` before aligning). Only the first
    ``maxiter`` members of the cluster are compared with each other, to keep
    the number of alignments bounded.

    Returns ``(mean_score, cluster_size)``. The score is ``0.0`` when fewer
    than two sequences can be compared: a cluster with at most one member,
    or ``maxiter`` below 2.
    """
    # Indices of the reads that belong to the requested cluster.
    cluster_members = [
        read_id for read_id, label in enumerate(labels) if label == clus_id
    ]
    n_members = len(cluster_members)

    # Without at least two sequences there is no pair to align; returning
    # here also avoids dividing by a zero pair count below.
    limit = min(maxiter, n_members)
    if limit < 2:
        return 0.0, n_members

    # Not multiprocessed: the number of alignments is capped by maxiter.
    sequences = [str(umis[member].seq) for member in cluster_members[:limit]]
    total_score = 0
    count = 0
    # The last sequence has no later partner, so it never serves as a query.
    for position, query_sequence in enumerate(sequences[:-1]):
        query = StripedSmithWaterman(query_sequence)
        for target_sequence in sequences[position + 1:]:
            total_score += query(target_sequence).optimal_alignment_score
            count += 1
    return total_score / count, n_members
Пример #14
0
def fasta_metric(s, S):
    """Return the optimal alignment score of ``s`` against each item of ``S``.

    The aligner is built in protein mode with the matrix obtained from
    ``subs_mat(MatrixInfo.pam250)``. Scores come back as a list in the
    iteration order of ``S``.
    """
    pam250_matrix = subs_mat(MatrixInfo.pam250)
    aligner = StripedSmithWaterman(s,
                                   protein=True,
                                   substitution_matrix=pam250_matrix)
    return [aligner(target)['optimal_alignment_score'] for target in S]
Пример #15
0
 def _align(self):
     """Align ``self.target.sequence`` to ``self.query.sequence``.

     Scoring options are taken from the instance's private settings and the
     alignment object returned by the aligner is passed back unchanged.
     """
     scoring_options = dict(
         match_score=self._match,
         mismatch_score=self._mismatch,
         gap_open_penalty=self._gap_open,
         gap_extend_penalty=self._gap_extend,
         substitution_matrix=self._matrix,
         protein=self._aa,
     )
     ssw = StripedSmithWaterman(self.query.sequence, **scoring_options)
     return ssw(self.target.sequence)
Пример #16
0
 def align_to_reference_sequence(self,
                                 reference_sequence,
                                 n_base_pairs=None):
     """Score the start of the read against the start of the reference.

     Only the first ``n_base_pairs`` characters of each sequence are used;
     when ``n_base_pairs`` is ``None`` the length of ``reference_sequence``
     is used instead. Returns the optimal alignment score.
     """
     prefix_length = (len(reference_sequence)
                      if n_base_pairs is None else n_base_pairs)
     scorer = StripedSmithWaterman(reference_sequence[0:prefix_length],
                                   score_only=True)
     result = scorer(self.read_sequence[0:prefix_length])
     return result["optimal_alignment_score"]
Пример #17
0
 def test_kwargs_are_usable(self):
     """Keyword arguments must reach the aligner through the wrapper.

     The same ``mismatch_score``/``match_score`` are given to a direct
     aligner and to ``local_pairwise_align_ssw``; the results must match.
     """
     scoring = {'mismatch_score': -2, 'match_score': 5}
     query_text = 'AGGGTAATTAGGCGTGTTCACCTA'
     target_text = 'TACTTATAAGATGTCTCAACGGCATGCGCAACTTGTGAAGTG'
     direct_result = StripedSmithWaterman(query_text, **scoring)(target_text)
     wrapper_result = local_pairwise_align_ssw(query_text, target_text,
                                               **scoring)
     self._check_Alignment_to_AlignmentStructure(wrapper_result,
                                                 direct_result)
Пример #18
0
 def test_same_as_using_StripedSmithWaterman_object_Protein(self):
     """Protein input: the wrapper must agree with a direct aligner call."""
     query_text = 'HEAGAWGHEE'
     target_text = 'PAWHEAE'
     direct_aligner = StripedSmithWaterman(query_text,
                                           protein=True,
                                           substitution_matrix=blosum50)
     direct_result = direct_aligner(target_text)
     wrapper_result = local_pairwise_align_ssw(
         Protein(query_text),
         Protein(target_text),
         substitution_matrix=blosum50)
     self._check_TabularMSA_to_AlignmentStructure(wrapper_result,
                                                  direct_result, Protein)
Пример #19
0
def _fasta_similarity_func_sym(seq):
    """Build a vectorized, normalized similarity function for ``seq``.

    The returned callable aligns its argument against ``seq`` (protein mode,
    ``substitution_data`` matrix) and divides the optimal alignment score by
    ``sqrt(seq_norm(seq) * seq_norm(argument))``. It is wrapped with
    ``np.vectorize``.
    """
    reference_norm = seq_norm(seq)
    aligner = StripedSmithWaterman(seq,
                                   protein=True,
                                   substitution_matrix=substitution_data)

    def similarity(_seq_):
        other_norm = seq_norm(_seq_)
        raw_score = aligner(_seq_).optimal_alignment_score
        return raw_score / np.sqrt(reference_norm * other_norm)

    return np.vectorize(similarity)
Пример #20
0
 def test_arg_zero_index_changes_base_of_index_to_0_or_1(self):
     """``zero_index`` must switch reported positions between 0- and 1-based.

     The same pair of sequences is aligned with ``zero_index=True`` and then
     ``zero_index=False``; each run is checked against its own expected
     positions.
     """
     query_text = ('AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCG'
                   'CCCCGGGCGGGGC')
     target_text = ('CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCC'
                    'GGGCGGGGC')
     zero_based_expected = {
         'optimal_alignment_score': 100,
         'suboptimal_alignment_score': 44,
         'query_begin': 5,
         'query_end': 54,
         'target_begin': 0,
         'target_end_optimal': 49,
         'target_end_suboptimal': 21,
         'cigar': '50M',
         'query_sequence': query_text,
         'target_sequence': target_text
     }
     one_based_expected = {
         'optimal_alignment_score': 100,
         'suboptimal_alignment_score': 44,
         'query_begin': 6,
         'query_end': 55,
         'target_begin': 1,
         'target_end_optimal': 50,
         'target_end_suboptimal': 22,
         'cigar': '50M',
         'query_sequence': query_text,
         'target_sequence': target_text
     }
     cases = [(zero_based_expected, True), (one_based_expected, False)]
     for expected, use_zero_index in cases:
         aligner = StripedSmithWaterman(expected['query_sequence'],
                                        zero_index=use_zero_index)
         result = aligner(expected['target_sequence'])
         self._check_alignment(result, expected)
Пример #21
0
def ssw_search_contig(outfile, seq_identity, queries, targets, min_aln_len=256):
    """Report query/target pairs whose local alignment is long and similar.

    Every FASTA record in ``queries`` is aligned against every FASTA record
    in ``targets``. A tab-separated line ``query_id, target_id, score,
    alignment_length`` is printed to ``outfile`` for each pair whose
    alignment is at least ``min_aln_len`` long and whose normalized score is
    at least ``seq_identity``.

    :param outfile: writable file-like object receiving the hits
    :param seq_identity: minimum normalized score (0-100 scale) to report
    :param queries: FASTA source handed to ``SeqIO.parse``
    :param targets: FASTA source handed to ``SeqIO.parse``
    :param min_aln_len: minimum alignment length to report (default 256)
    """
    # Read the targets once and stringify each sequence a single time
    # instead of once per query.
    target_records = [(record.id, str(record.seq))
                      for record in SeqIO.parse(targets, 'fasta')]
    # A zero-length alignment can never be a hit; skipping it also keeps the
    # normalization below from dividing by zero.
    shortest_reportable = max(min_aln_len, 1)

    for qrec in SeqIO.parse(queries, 'fasta'):
        aligner = StripedSmithWaterman(str(qrec.seq))
        for target_id, target_sequence in target_records:
            alignment = aligner(target_sequence)
            aln_len = len(alignment.aligned_query_sequence or "")
            if aln_len < shortest_reportable:
                continue
            # Score as a percentage of 2 * length.
            # NOTE(review): presumably 2 is the aligner's match score, which
            # makes this the share of the best possible score - confirm.
            aln_score = alignment.optimal_alignment_score
            aln_score /= (2 * aln_len)
            aln_score *= 100
            if seq_identity <= aln_score:
                out = f'{qrec.id}\t{target_id}\t{aln_score}\t{aln_len}'
                print(out, file=outfile)
Пример #22
0
def pairwise_ssw(fasta, outfile):
    """Align every pair of sequences in ``fasta`` and write a CSV report.

    ``fasta`` maps header to sequence. Headers are sorted and each one is
    aligned against itself and every header after it. ``outfile`` is
    overwritten with a header line followed by one row per pair: score,
    query/target begin and end, and the CIGAR string.
    """
    headers = tuple(sorted(fasta.keys()))
    row_template = '{},{},{},{},{},{},{},{}\n'
    with open(outfile, 'w') as out:
        out.write(
            'query_header,target_header,aln_score,qstart,qend,tstart,tend,cigar\n'
        )
        for i, query_header in enumerate(headers):
            # The query profile depends only on the query sequence, so build
            # it once per query rather than once per pair.
            ssw = StripedSmithWaterman(fasta[query_header])
            for target_header in headers[i:]:
                res = ssw(fasta[target_header])
                out.write(row_template.format(
                    query_header, target_header,
                    res['optimal_alignment_score'], res['query_begin'],
                    res['query_end'], res['target_begin'],
                    res['target_end_optimal'], res['cigar']))
Пример #23
0
 def test_align_with_N_in_nucleotide_sequence(self):
     """Sequences containing ``N`` must align; pin the resulting alignment."""
     query_text = 'ACTCANNATCGANCTAGC'
     target_text = 'ACTCGAAAATGTNNGCA'
     expected = {
         'optimal_alignment_score': 9,
         'suboptimal_alignment_score': 0,
         'query_begin': 0,
         'query_end': 8,
         'target_begin': 0,
         'target_end_optimal': 9,
         'target_end_suboptimal': 0,
         'cigar': '4M1D5M',
         'query_sequence': query_text,
         'target_sequence': target_text
     }
     result = StripedSmithWaterman(query_text)(target_text)
     self._check_alignment(result, expected)
Пример #24
0
 def test_lowercase_is_valid_sequence(self):
     """Lowercase nucleotide strings must align; pin the resulting alignment."""
     query_text = 'aaacgataaatccgcgta'
     target_text = 'aaacgactactaaatccgcgtgatagggga'
     expected = {
         'optimal_alignment_score': 23,
         'suboptimal_alignment_score': 10,
         'query_begin': 0,
         'query_end': 16,
         'target_begin': 0,
         'target_end_optimal': 20,
         'target_end_suboptimal': 4,
         'cigar': '6M4D11M',
         'query_sequence': query_text,
         'target_sequence': target_text
     }
     result = StripedSmithWaterman(query_text)(target_text)
     self._check_alignment(result, expected)
Пример #25
0
    def score(self, seq_a, seq_b):
        """Return the local alignment score of two sequences, divided by 100.

        Both inputs are passed through ``trim_seq`` and ``str`` first. The
        aligner runs in protein mode with ``self.aa.as_int_dict()`` as the
        substitution matrix and ``self.gap_penalty`` for both gap open and
        gap extend.
        """
        # StripedSmithWaterman expects str vs. unicode
        query_text = str(trim_seq(seq_a))
        target_text = str(trim_seq(seq_b))

        # From https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library/
        # blob/master/README.md:
        # "Note: When SSW open a gap, the gap open penalty alone is applied."
        from skbio.alignment import StripedSmithWaterman
        penalty = self.gap_penalty
        aligner = StripedSmithWaterman(
            query_text,
            protein=True,
            gap_open_penalty=penalty,
            gap_extend_penalty=penalty,
            substitution_matrix=self.aa.as_int_dict())
        raw_score = aligner(target_text)["optimal_alignment_score"]

        # Normalize to be in line with SeqAlignScorer
        return raw_score / 100.
Пример #26
0
 def _align(self):
     """Align ``self.target.sequence`` to ``self.query.sequence``.

     On Python 2, ``unicode`` sequences are encoded to ASCII before being
     handed to the aligner; on other versions they are passed unchanged.
     Scoring options come from the instance's private settings.
     """
     query_text = self.query.sequence
     target_text = self.target.sequence
     if sys.version_info[0] == 2:
         # ``unicode`` only exists on Python 2, so keep it inside this branch.
         if isinstance(query_text, unicode):
             query_text = query_text.encode('ascii')
         if isinstance(target_text, unicode):
             target_text = target_text.encode('ascii')
     scoring_options = dict(
         match_score=self._match,
         mismatch_score=self._mismatch,
         gap_open_penalty=self._gap_open,
         gap_extend_penalty=self._gap_extend,
         substitution_matrix=self._matrix,
         protein=self._aa,
     )
     ssw = StripedSmithWaterman(query_text, **scoring_options)
     return ssw(target_text)
Пример #27
0
 def test_arg_suppress_sequences(self):
     """With ``suppress_sequences=True`` both sequences are expected empty.

     All other expected alignment fields are pinned to fixed values.
     """
     expected = {
         'optimal_alignment_score': 100,
         'suboptimal_alignment_score': 44,
         'query_begin': 5,
         'query_end': 54,
         'target_begin': 0,
         'target_end_optimal': 49,
         'target_end_suboptimal': 21,
         'cigar': '50M',
         'query_sequence': '',
         'target_sequence': ''
     }
     query_text = "AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCCGGGCGGGGC"
     target_text = "CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCCGGGCGGGGC"
     aligner = StripedSmithWaterman(query_text, suppress_sequences=True)
     self._check_alignment(aligner(target_text), expected)
Пример #28
0
 def test_is_zero_based_returns_true_if_index_base_is_zero(self):
     """``is_zero_based()`` must echo the ``zero_index`` constructor flag.

     Checked first with ``True`` and then with ``False`` on the same pair of
     sequences.
     """
     query_text = ('AGTCACGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCG'
                   'CCCCGGGCGGGGC')
     target_text = ('CGCGCGCCGCCGGGGGGCCGGCCGGCGCCGGGGGGCGCCCC'
                    'GGGCGGGGC')
     for zero_index_flag in (True, False):
         aligner = StripedSmithWaterman(query_text,
                                        zero_index=zero_index_flag)
         result = aligner(target_text)
         self.assertEqual(zero_index_flag, result.is_zero_based())
Пример #29
0
def within_cluster_analysis(clus_id, labels, maxiter=100):
    """Return the mean pairwise alignment score inside one cluster.

    ``labels[i]`` is the cluster label of read ``i``. Sequences are looked up
    by the same index in ``reads``, which is not defined in this function and
    is resolved from the enclosing scope. Only the first ``maxiter`` members
    of the cluster are compared with each other, to keep the number of
    alignments bounded.

    Returns ``(mean_score, cluster_size)``. The score is ``0.0`` when fewer
    than two sequences can be compared: a cluster with at most one member,
    or ``maxiter`` below 2.
    """
    # Indices of the reads that belong to the requested cluster.
    cluster_members = [
        read_id for read_id, label in enumerate(labels) if label == clus_id
    ]
    n_members = len(cluster_members)

    # Without at least two sequences there is no pair to align; returning
    # here also avoids dividing by a zero pair count below.
    limit = min(maxiter, n_members)
    if limit < 2:
        return 0.0, n_members

    compared = cluster_members[:limit]
    total_score = 0
    count = 0
    # The last member has no later partner, so it never serves as a query.
    for position, query_member in enumerate(compared[:-1]):
        query = StripedSmithWaterman(reads[query_member], score_only=True)
        for target_member in compared[position + 1:]:
            total_score += query(reads[target_member]).optimal_alignment_score
            count += 1
    return total_score / count, n_members
Пример #30
0
    def align_to_reference_sequence(self,
                                    reference_sequence,
                                    n_base_pairs=None):
        """Align a window of the read against a window of the reference.

        Each window starts 5 positions before the current offset (clamped at
        0) and ends ``n_base_pairs`` positions after it. The offsets are
        ``self._current_ref_offset`` for the reference and
        ``self._current_read_offset`` for the read. When ``n_base_pairs`` is
        ``None`` the length of ``reference_sequence`` is used.

        Returns ``(optimal_alignment_score, query_end, target_end_optimal)``
        from the alignment, with the reference window as the query.
        """
        span = (len(reference_sequence)
                if n_base_pairs is None else n_base_pairs)

        ref_offset = self._current_ref_offset
        ref_window = reference_sequence[max(0, ref_offset - 5):
                                        ref_offset + span]

        read_offset = self._current_read_offset
        read_window = self.read_sequence[max(0, read_offset - 5):
                                         read_offset + span]

        aligner = StripedSmithWaterman(ref_window, score_only=False)
        result = aligner(read_window)
        return (result["optimal_alignment_score"],
                result["query_end"],
                result["target_end_optimal"])