def test_repeat_ambiguous():
    """Compare ``Repeat`` objects built from ``TEST_MSA_O`` and ``TEST_MSA_K``.

    The standard-amino-acid MSA of the O repeat must equal both MSA views of
    the K repeat, and score, divergence and p-value for ``TEST_SCORE`` must
    agree between the two repeats. Two pinned reference values are checked
    at the end.
    """
    repeat_o = repeat.Repeat(msa=TEST_MSA_O)
    repeat_k = repeat.Repeat(msa=TEST_MSA_K)

    standard_o = repeat_o.msaTD_standard_aa
    assert standard_o == repeat_k.msaTD
    assert repeat_o.msaTD_standard_aa == repeat_k.msaTD_standard_aa

    # Every statistic has to be identical for both repeats.
    for statistic in ("score", "divergence", "pvalue"):
        value_o = getattr(repeat_o, statistic)(TEST_SCORE)
        value_k = getattr(repeat_k, statistic)(TEST_SCORE)
        assert value_o == value_k

    # Pinned reference values (exact float comparison, as in the fixtures).
    assert repeat_o.divergence(TEST_SCORE) == 2.095947265625
    assert repeat_k.pvalue(TEST_SCORE) == 0.3507
def test_too_big_hmms():
    """Check that detection with the LONG / SUPER_LONG HMMs finds nothing.

    For each (MSA, sequence) fixture pair an HMM is built from the MSA and
    run on the sequence; the result must be a ``RepeatList`` that contains
    no repeats.
    """
    cases = [
        (TEST_RESULT_REPEAT_MSA_LONG, TEST_SEQUENCE_A),
        (TEST_RESULT_REPEAT_MSA_SUPER_LONG, TEST_SEQUENCE_SUPER_LONG_A),
    ]
    for msa, raw_sequence in cases:
        test_repeat = repeat.Repeat(msa=msa)
        test_hmm = HMM.create(input_format='repeat', repeat=test_repeat)
        test_seq = sequence.Sequence(raw_sequence)

        test_optimized_repeat = test_seq.detect([test_hmm])

        # isinstance is the idiomatic type check (also accepts subclasses).
        assert isinstance(test_optimized_repeat, repeat_list.RepeatList)
        assert len(test_optimized_repeat.repeats) == 0
def test_create_HMM_from_Repeat():
    """Build HMMs from the DOUBLE and SINGLE repeat fixtures and check them.

    For each fixture the effective length, the states and the ``p_0``
    attribute of the resulting HMM are compared against the expected
    constants.
    """
    # DOUBLE fixture: effective length 2; states are compared as sets.
    hmm_double = HMM.create(
        input_format='repeat',
        repeat=repeat.Repeat(msa=TEST_REPEAT_MSA_DOUBLE))
    assert hmm_double.l_effective == 2
    assert set(hmm_double.states) == set(TEST_HMM_STATES_DOUBLE)
    assert hmm_double.p_0 == TEST_HMM_P0_DOUBLE
    # NOTE(review): ``p_t`` is not verified here; an assertion comparing it
    # with TEST_HMM_P0_DOUBLE had been commented out.

    # SINGLE fixture: effective length 1; states are compared directly.
    hmm_single = HMM.create(
        input_format='repeat',
        repeat=repeat.Repeat(msa=TEST_REPEAT_MSA_SINGLE))
    assert hmm_single.l_effective == 1
    assert hmm_single.states == TEST_HMM_STATES_SINGLE
    assert hmm_single.p_0 == TEST_HMM_P0_SINGLE
def test_sequence_pickle():
    """Round-trip a ``Sequence`` through a pickle file, with and without repeats.

    First the bare sequence is written and re-read and ``seq`` is compared.
    Then repeats are detected with an HMM built from
    ``TEST_REPEAT_MSA_DOUBLE``, stored under ``TEST_SEQUENCE_TAG``, and the
    sequence is pickled again; the reloaded object must carry the same
    repeat-list keys and the same MSA for the first repeat.

    The pickle file is removed in a ``finally`` block so that a failing
    assertion does not leave ``test.pickle`` behind.
    """
    test_seq = sequence.Sequence(TEST_SEQUENCE)
    test_pickle = os.path.join(path(), "test.pickle")

    try:
        # Round trip of the plain sequence.
        test_seq.write(test_pickle, 'pickle')
        test_seq_new = sequence.Sequence.create(test_pickle, 'pickle')
        assert test_seq.seq == test_seq_new.seq

        # Attach a detected repeat list to the sequence.
        test_repeat = repeat.Repeat(msa=TEST_REPEAT_MSA_DOUBLE)
        test_hmm = HMM.create(input_format='repeat', repeat=test_repeat)
        test_optimized_repeat = test_seq.detect([test_hmm])
        test_seq.set_repeatlist(test_optimized_repeat, TEST_SEQUENCE_TAG)

        assert isinstance(test_optimized_repeat, repeat_list.RepeatList)
        assert list(test_seq.d_repeatlist.keys()) == [TEST_SEQUENCE_TAG]
        assert isinstance(
            test_seq.d_repeatlist[TEST_SEQUENCE_TAG], repeat_list.RepeatList)
        assert test_seq.d_repeatlist[TEST_SEQUENCE_TAG].repeats

        test_retrieved_repeatlist = test_seq.get_repeatlist(TEST_SEQUENCE_TAG)
        assert test_retrieved_repeatlist == test_optimized_repeat

        # Round trip of the sequence including its repeat list.
        test_seq.write(test_pickle, 'pickle')
        test_seq_new = sequence.Sequence.create(test_pickle, 'pickle')

        assert test_seq.d_repeatlist.keys() == test_seq_new.d_repeatlist.keys()
        original_msa = test_seq.d_repeatlist[TEST_SEQUENCE_TAG].repeats[0].msa
        reloaded_msa = (
            test_seq_new.d_repeatlist[TEST_SEQUENCE_TAG].repeats[0].msa)
        assert original_msa == reloaded_msa
    finally:
        # Always clean up, even when an assertion above fails.
        if os.path.exists(test_pickle):
            os.remove(test_pickle)
def test_viterbi():
    """Run Viterbi decoding on small hand-made cases and check each stage.

    For every case an HMM is built from the original MSA, the most likely
    state path on the sequence is compared with the expected path, the path
    is converted to unaligned repeat units, and the units are realigned.
    Each assertion carries the case name so a failure identifies the case.
    """
    # {Test_name: [Original_TR_MSA, Sequence, Viterbi_path, Refined_TR_MSA], ... }
    TEST = {
        "Single": [["A", "A", "A"],
                   "AAAAAA",
                   ["M1", "M1", "M1", "M1", "M1", "M1"],
                   ["A", "A", "A", "A", "A", "A"]],
        "Double": [["AA", "AA"],
                   "AAAAAA",
                   ["M1", "M2", "M1", "M2", "M1", "M2"],
                   ["AA", "AA", "AA"]],
        "Long": [["ADKL", "ADKL"],
                 "GYRADKLADKLADKL",
                 ["N", "N", "N", "M1", "M2", "M3", "M4", "M1", "M2", "M3",
                  "M4", "M1", "M2", "M3", "M4"],
                 ["ADKL", "ADKL", "ADKL"]],
    }

    for name, (msa, seq, expected_path, expected_msa) in TEST.items():
        test_repeat = repeat.Repeat(msa=msa)
        test_hmm = HMM.create(input_format="repeat", repeat=test_repeat)

        # Detect TRs on the sequence with the HMM using the Viterbi algorithm.
        most_likely_path = test_hmm.viterbi(seq)
        assert isinstance(most_likely_path, list), name
        assert most_likely_path == expected_path, name

        unaligned_msa = hmm_path_to_non_aligned_tandem_repeat_units(
            seq, most_likely_path, test_hmm.l_effective)
        assert unaligned_msa == expected_msa, name

        aligned_msa = repeat_align.realign_repeat(unaligned_msa)
        assert aligned_msa == expected_msa, name
def test_create_repeat_list_from_repeats():
    """Build a ``RepeatList`` from ``TEST_REPEATS`` and check its contents.

    The list must hold four repeats and each stored repeat must expose the
    MSA it was created from, in the original order.
    """
    source_repeats = [repeat.Repeat(msa=msa) for msa in TEST_REPEATS]
    built_list = rl.RepeatList(repeats=source_repeats)

    assert len(built_list.repeats) == 4

    for expected_msa, stored_repeat in zip(TEST_REPEATS, built_list.repeats):
        assert expected_msa == stored_repeat.msa
def test_detect_repeats_with_repeat():
    """Detect repeats on ``TEST_SEQUENCE`` with HMMs built from repeat MSAs.

    For both the DOUBLE and the SINGLE fixture, detection must return a
    ``RepeatList`` holding exactly one repeat whose MSA equals the expected
    result MSA. The same ``Sequence`` object is used for both runs.
    """
    test_seq = sequence.Sequence(TEST_SEQUENCE)
    cases = [
        (TEST_REPEAT_MSA_DOUBLE, TEST_RESULT_REPEAT_MSA_DOUBLE),
        (TEST_REPEAT_MSA_SINGLE, TEST_RESULT_REPEAT_MSA_SINGLE),
    ]

    for seed_msa, expected_msa in cases:
        test_repeat = repeat.Repeat(msa=seed_msa)
        test_hmm = HMM.create(input_format='repeat', repeat=test_repeat)

        test_optimized_repeat = test_seq.detect([test_hmm])

        # isinstance is the idiomatic type check (also accepts subclasses).
        assert isinstance(test_optimized_repeat, repeat_list.RepeatList)
        assert len(test_optimized_repeat.repeats) == 1
        assert test_optimized_repeat.repeats[0].msa == expected_msa
def test_serialize_repeat_list_tsv():
    """Serialize a ``RepeatList`` to TSV via ``rl_io`` and check for a string.

    All ``TEST_REPEATS`` are located in ``TEST_SEQUENCE`` first, then the
    list is passed to ``rl_io.serialize_repeat_list_tsv``.

    NOTE(review): a later function in this module is defined with exactly
    the same name, so this definition is overwritten when the module is
    imported and is never collected by the test runner. Rename one of the
    two if this test is meant to run.
    """
    test_repeats = [repeat.Repeat(msa=i) for i in TEST_REPEATS]
    test_seq = sequence.Sequence(TEST_SEQUENCE)
    for i in test_repeats:
        test_seq.repeat_in_sequence(i)
    test_repeat_list = rl.RepeatList(repeats=test_repeats)

    tsv = rl_io.serialize_repeat_list_tsv(test_repeat_list)

    assert isinstance(tsv, str)
def test_serialize_repeat_list_tsv():
    """Write a ``RepeatList`` as a TSV string and check the returned type.

    Only the first two ``TEST_REPEATS`` are used. Each repeat is located in
    ``TEST_SEQUENCE`` before the list is written with
    ``write("tsv", return_string=True)``.

    NOTE(review): an earlier function in this module carries the same name;
    this definition replaces it at import time, so only this one runs.
    """
    test_repeats = [repeat.Repeat(msa=i) for i in TEST_REPEATS[:2]]
    test_seq = sequence.Sequence(TEST_SEQUENCE)
    for i in test_repeats:
        test_seq.repeat_in_sequence(i)
    test_repeat_list = rl.RepeatList(repeats=test_repeats)

    tsv = test_repeat_list.write("tsv", return_string=True)

    assert isinstance(tsv, str)
def test_filter_pvalue():
    """Filter a ``RepeatList`` by p-value and expect one surviving repeat.

    P-values are not calculated; they are injected from
    ``TEST_SCORE_VALUE_LIST`` under the ``TEST_SCORE`` key before filtering
    with a threshold of 0.1.
    """
    candidates = [repeat.Repeat(msa=msa) for msa in TEST_REPEATS]

    # Inject the p-values directly instead of computing scores.
    for candidate, pvalue in zip(candidates, TEST_SCORE_VALUE_LIST):
        candidate.d_pvalue = {TEST_SCORE: pvalue}

    unfiltered = rl.RepeatList(repeats=candidates)
    filtered = unfiltered.filter("pvalue", TEST_SCORE, 0.1)

    assert len(filtered.repeats) == 1
def test_repeat_pickle():
    """Round-trip a ``Repeat`` through a pickle file.

    The repeat built from ``TEST_MSA_O`` is written, re-created from the
    file, and ``msa``, ``sequence_type`` and ``text`` are compared.

    The pickle file is removed in a ``finally`` block so that a failing
    assertion does not leave ``test.pickle`` behind.
    """
    myTR_O = repeat.Repeat(msa=TEST_MSA_O)
    test_pickle = os.path.join(path(), "test.pickle")

    try:
        myTR_O.write(test_pickle, 'pickle')
        myTR_O_new = repeat.Repeat.create(test_pickle, 'pickle')

        assert myTR_O.msa == myTR_O_new.msa
        assert myTR_O.sequence_type == myTR_O_new.sequence_type
        assert myTR_O.text == myTR_O_new.text
    finally:
        # Always clean up, even when an assertion above fails.
        if os.path.exists(test_pickle):
            os.remove(test_pickle)
def test_hmm_pickle():
    """Round-trip an ``HMM`` through a pickle file.

    An HMM built from ``TEST_REPEAT_MSA_DOUBLE`` is written, re-created from
    the file, and its ``hmmer`` and ``alphabet`` attributes are compared.

    The pickle file is removed in a ``finally`` block so that a failing
    assertion does not leave ``test.pickle`` behind.
    """
    test_repeat = repeat.Repeat(msa=TEST_REPEAT_MSA_DOUBLE)
    test_hmm = HMM.create(input_format='repeat', repeat=test_repeat)
    test_pickle = os.path.join(path(), "test.pickle")

    try:
        test_hmm.write(test_pickle, 'pickle')
        test_hmm_new = HMM.create(input_format='pickle', file=test_pickle)

        assert test_hmm.hmmer == test_hmm_new.hmmer
        assert test_hmm.alphabet == test_hmm_new.alphabet
    finally:
        # Always clean up, even when an assertion above fails.
        if os.path.exists(test_pickle):
            os.remove(test_pickle)
def test_repeat_score():
    """Placeholder: print three likelihood results, then fail on purpose.

    TODO: MAKE THIS TEST A TEST! No computed value is checked yet; the
    final assertion is always false.
    """
    Q, eqFreq, alphabet = repeat_io.loadModel()
    indel_rate_per_site = 0.001

    tandem_repeat = repeat.Repeat(msa=TEST_MSA, begin=TEST_BEGIN)
    tandem_repeat.deleteInsertionColumns()

    gap_result = loglikelihood_gaps_starphylogeny_zipfian(
        t=1, tandem_repeat=tandem_repeat)
    print(gap_result)

    substitution_args = [Q, eqFreq, alphabet, tandem_repeat]
    substitution_result = optimisation(
        function=loglikelihood_substitution, args=substitution_args)
    print(substitution_result)

    combined_args = [
        [Q, eqFreq, alphabet, tandem_repeat],
        [tandem_repeat, indel_rate_per_site],
    ]
    combined_result = optimisation(
        function=loglikelihood_substitutions_gaps, args=combined_args)
    print(combined_result)

    # Deliberate failure until real expectations are written.
    assert 1 == 2
def test_pairwise_overlap():
    """Check ``rl.two_repeats_overlap`` on neighbouring pairs of repeats.

    Begin coordinates are taken from ``TEST_BEGIN_LIST``. Each table row
    names the overlap type, the slice of repeats passed to the function,
    and the expected boolean result.
    """
    test_repeats = [repeat.Repeat(msa=msa) for msa in TEST_REPEATS]
    for current, begin in zip(test_repeats, TEST_BEGIN_LIST):
        current.begin = begin

    expectations = [
        ("common_ancestry", slice(None, 2), False),
        ("common_ancestry", slice(1, 3), False),
        ("common_ancestry", slice(2, None), False),
        ("shared_char", slice(None, 2), False),
        ("shared_char", slice(1, 3), True),
    ]
    for overlap_type, window, expected in expectations:
        selected = test_repeats[window]
        assert rl.two_repeats_overlap(overlap_type, *selected) == expected
def test_repeat_list_pickle():
    """Round-trip a ``RepeatList`` through a pickle file.

    The list built from ``TEST_REPEATS`` is written, read back, and the
    number of repeats and the MSA of the first repeat are compared.

    The pickle file is removed in a ``finally`` block so that a failing
    assertion does not leave ``test.pickle`` behind.
    """
    test_repeats = [repeat.Repeat(msa=i) for i in TEST_REPEATS]
    test_repeat_list = rl.RepeatList(repeats=test_repeats)
    test_pickle = os.path.join(path(), "test.pickle")

    try:
        test_repeat_list.write('pickle', test_pickle)
        # NOTE(review): the list is read back through ``repeat.Repeat.create``
        # rather than a RepeatList loader -- presumably the pickle reader
        # returns whatever object was stored; confirm this is intended.
        test_repeat_list_new = repeat.Repeat.create(test_pickle, 'pickle')

        assert len(test_repeat_list.repeats) == len(
            test_repeat_list_new.repeats)
        assert (test_repeat_list.repeats[0].msa
                == test_repeat_list_new.repeats[0].msa)
    finally:
        # Always clean up, even when an assertion above fails.
        if os.path.exists(test_pickle):
            os.remove(test_pickle)
def test_filter_cluster_based():
    """Apply the ``none_overlapping`` filter and expect the first three repeats.

    P-values (from ``TEST_SCORE_VALUE_LIST``) and begin coordinates (from
    ``TEST_BEGIN_LIST``) are injected into the repeats before filtering.
    """
    test_repeats = [repeat.Repeat(msa=msa) for msa in TEST_REPEATS]

    for current, pvalue in zip(test_repeats, TEST_SCORE_VALUE_LIST):
        current.d_pvalue = {TEST_SCORE: pvalue}
    for current, begin in zip(test_repeats, TEST_BEGIN_LIST):
        current.begin = begin

    full_list = rl.RepeatList(repeats=test_repeats)

    # NOTE(review): the result of this p-value filter is discarded, so the
    # overlap filter below runs on the unfiltered list -- confirm intent.
    full_list.filter("pvalue", TEST_SCORE, 0.1)

    overlap_spec = ("common_ancestry", None)
    ranking = [("pvalue", TEST_SCORE), ("divergence", TEST_SCORE)]
    survivors = full_list.filter("none_overlapping", overlap_spec, ranking)

    assert len(survivors.repeats) == 3
    for expected in test_repeats[:3]:
        assert expected in survivors.repeats
def test_cluster():
    """Cluster repeats by two overlap types and compare with expected sets.

    Begin coordinates come from ``TEST_BEGIN_LIST``. After each call to
    ``cluster`` the entry in ``d_cluster`` must contain every expected index
    set and have exactly as many clusters as expected.
    """
    test_repeats = [repeat.Repeat(msa=msa) for msa in TEST_REPEATS]
    for current, begin in zip(test_repeats, TEST_BEGIN_LIST):
        current.begin = begin
    test_repeat_list = rl.RepeatList(repeats=test_repeats)

    expected_by_type = [
        ("common_ancestry", [{0}, {1, 3}, {2}]),
        ("shared_char", [{0}, {1, 2, 3}]),
    ]
    for overlap_type, expected_clusters in expected_by_type:
        test_repeat_list.cluster(overlap_type)

        # Membership plus equal length: both collections hold exactly the
        # same elements.
        for cluster in expected_clusters:
            assert cluster in test_repeat_list.d_cluster[overlap_type]
        assert (len(test_repeat_list.d_cluster[overlap_type])
                == len(expected_clusters))
def detect(self, lHMM=None, denovo=None, **kwargs):
    """Detect tandem repeats on ``self.seq`` from one of two sources.

    A list of ``Repeat`` instances is created for tandem repeat detections
    on the sequence from one of two possible sources. A non-empty ``lHMM``
    takes precedence over ``denovo``:

    * Sequence profile hidden Markov models (``HMM``)
    * de novo detection algorithms.

    Args:
        lHMM (list of HMM): A list of ``HMM`` instances.
        denovo (bool): If truthy (and ``lHMM`` is not supplied), run the
            de novo detectors.
        **kwargs: Optional settings. ``kwargs['repeat']`` is a dict of
            keyword arguments passed to every ``Repeat`` instantiation,
            e.g. ``repeat={"calc_score": True}``. ``kwargs['detection']``
            is a dict of keyword arguments passed to
            ``repeat_detection_run.run_detector``.

    Returns:
        A ``RepeatList`` instance, or ``None`` if ``lHMM`` is an empty list.

    Raises:
        Exception: If ``lHMM`` is truthy but not a list, if one of its
            elements is not an ``HMM`` instance, or if neither ``lHMM``
            nor ``denovo`` is supplied.
    """
    if lHMM:
        # Validate the whole input before running any detection.
        if not isinstance(lHMM, list):
            raise Exception('The lHMM value is not a list.')
        for iHMM in lHMM:
            if not isinstance(iHMM, hmm.HMM):
                raise Exception('At least one list element in the lHMM '
                                'value is not a valid instance of the HMM '
                                'class.')

        repeat_kwargs = kwargs.get('repeat', {})
        repeats = []
        for iHMM in lHMM:
            # Detect TRs on self.seq with hmm using the Viterbi algorithm.
            most_likely_path = iHMM.viterbi(self.seq)
            LOG.debug("most_likely_path: %s", most_likely_path)
            if not most_likely_path:
                continue

            unaligned_msa = (
                hmm_viterbi.hmm_path_to_non_aligned_tandem_repeat_units(
                    self.seq, most_likely_path, iHMM.l_effective))
            # Fewer than two units is not kept as a repeat.
            if len(unaligned_msa) <= 1:
                continue

            # Align the msa
            aligned_msa = repeat_align.realign_repeat(unaligned_msa)
            if len(aligned_msa) > 1:
                # Create a Repeat() class with the new msa
                repeats.append(repeat.Repeat(aligned_msa, **repeat_kwargs))

        # Set begin coordinate for all repeats
        for i_repeat in repeats:
            self.repeat_in_sequence(i_repeat)

        return repeat_list.RepeatList(repeats)

    if lHMM == []:
        # An explicitly empty HMM list yields no result object at all.
        LOG.debug("lHMM == []")
        return None

    if not denovo:
        raise Exception("Either require denovo detection, or provide an HMM")

    detection_kwargs = kwargs.get('detection', {})
    predicted_repeats = repeat_detection_run.run_detector(
        [self], **detection_kwargs)[0]
    LOG.debug("predicted_repeats: %s", predicted_repeats)

    repeat_kwargs = kwargs.get('repeat', {})
    repeats = []
    for jTRD, jlTR in predicted_repeats.items():
        for iTR in jlTR:
            # Re-instantiate so the caller's Repeat options are applied.
            iTR = repeat.Repeat(iTR.msa, begin=iTR.begin, **repeat_kwargs)

            # Consider only tandem repeats that have a repeat unit
            # predicted to be at least one character long.
            if iTR.l_effective > 0:
                # Remember which detector produced this repeat.
                iTR.TRD = jTRD

                # Sanity check repeat and set begin coordinate for
                # all repeats
                if not self.repeat_in_sequence(iTR):
                    LOG.debug("The tandem repeat is not part of "
                              "the sequence. Detector: %s", iTR.TRD)
                    continue

                repeats.append(iTR)

    return repeat_list.RepeatList(repeats)