def test_too_big_hmms():

    test_repeat = repeat.Repeat(msa=TEST_RESULT_REPEAT_MSA_LONG)
    test_hmm = HMM.create(input_format='repeat', repeat=test_repeat)
    test_seq = sequence.Sequence(TEST_SEQUENCE_A)
    test_optimized_repeat = test_seq.detect([test_hmm])

    assert type(test_optimized_repeat) == repeat_list.RepeatList
    assert len(test_optimized_repeat.repeats) == 0

    test_repeat = repeat.Repeat(msa=TEST_RESULT_REPEAT_MSA_SUPER_LONG)
    test_hmm = HMM.create(input_format='repeat', repeat=test_repeat)
    test_seq = sequence.Sequence(TEST_SEQUENCE_SUPER_LONG_A)
    test_optimized_repeat = test_seq.detect([test_hmm])

    assert type(test_optimized_repeat) == repeat_list.RepeatList
    assert len(test_optimized_repeat.repeats) == 0
def test_sequence_pickle():

    test_seq = sequence.Sequence(TEST_SEQUENCE)

    test_pickle = os.path.join(path(), "test.pickle")
    test_seq.write(test_pickle, 'pickle')
    test_seq_new = sequence.Sequence.create(test_pickle, 'pickle')

    assert test_seq.seq == test_seq_new.seq

    test_repeat = repeat.Repeat(msa=TEST_REPEAT_MSA_DOUBLE)
    test_hmm = HMM.create(input_format='repeat', repeat=test_repeat)
    test_optimized_repeat = test_seq.detect([test_hmm])
    test_seq.set_repeatlist(test_optimized_repeat, TEST_SEQUENCE_TAG)

    assert type(test_optimized_repeat) == repeat_list.RepeatList
    assert list(test_seq.d_repeatlist.keys()) == [TEST_SEQUENCE_TAG]
    assert type(test_seq.d_repeatlist[TEST_SEQUENCE_TAG]) == repeat_list.RepeatList
    assert test_seq.d_repeatlist[TEST_SEQUENCE_TAG].repeats

    test_retrieved_repeatlist = test_seq.get_repeatlist(TEST_SEQUENCE_TAG)
    assert test_retrieved_repeatlist == test_optimized_repeat

    test_seq.write(test_pickle, 'pickle')
    test_seq_new = sequence.Sequence.create(test_pickle, 'pickle')

    assert test_seq.d_repeatlist.keys() == test_seq_new.d_repeatlist.keys()
    assert test_seq.d_repeatlist[TEST_SEQUENCE_TAG].repeats[0].msa == \
        test_seq_new.d_repeatlist[TEST_SEQUENCE_TAG].repeats[0].msa

    if os.path.exists(test_pickle):
        os.remove(test_pickle)
def test_detect_TRUST():

    test_seq = sequence.Sequence(TEST_SEQUENCE_Q9BRR0)
    predicted_repeats = repeat_detection_run.run_detector(
        seq_records=[test_seq],
        detectors=["TRUST"])[0]['TRUST']

    # Warning: TRUST finds these results ONLY with the BLOSUM50 substitution matrix.
    assert len(predicted_repeats) == 3
    assert predicted_repeats[0].msa == ['HLREDIAQIP---TCAEAGE---QEGRLQR',
                                        'KQKNATGGRR--HICHECGKSFAQSSGLSK',
                                        'HRRIHTGEKP--YECEECGKAFIGSSALVI',
                                        'HQRVHTGEKP--YECEECGKAFSHSSDLIK',
                                        'HQRTHTGEKP--YECDDCGKTFSQSCSLLE',
                                        'HHRIHTGEKP--YQCSMCGKAFRRSSHLLR',
                                        'HQRIHTGDKN--VQEPEQGEAW--KSRM--',
                                        'ESQLENVETPmsYKCNECERSFTQNTGLIE',
                                        'HQKIHTGEKP--YQCNACGKGFTRISYLVQ']
def test_detect_repeats_denovo():

    test_parameters = {"detection": {"detectors": ["TRUST"]}}

    test_seq = sequence.Sequence(TEST_SEQUENCE_Q9BRR0)
    test_optimized_repeat = test_seq.detect(denovo=True, **test_parameters)

    assert type(test_optimized_repeat) == repeat_list.RepeatList
    assert len(test_optimized_repeat.repeats) == 3
def test_serialize_repeat_list_tsv():

    test_repeats = [repeat.Repeat(msa=i) for i in TEST_REPEATS[:2]]
    test_seq = sequence.Sequence(TEST_SEQUENCE)
    for i in test_repeats:
        test_seq.repeat_in_sequence(i)
    test_repeat_list = rl.RepeatList(repeats=test_repeats)

    tsv = test_repeat_list.write("tsv", return_string=True)

    assert type(tsv) == str
def test_serialize_repeat_list_tsv():

    test_repeats = [repeat.Repeat(msa=i) for i in TEST_REPEATS]
    test_seq = sequence.Sequence(TEST_SEQUENCE)
    for i in test_repeats:
        test_seq.repeat_in_sequence(i)
    test_repeat_list = rl.RepeatList(repeats=test_repeats)

    tsv = rl_io.serialize_repeat_list_tsv(test_repeat_list)

    assert type(tsv) == str
def test_detect_repeats_with_repeat():

    test_repeat = repeat.Repeat(msa=TEST_REPEAT_MSA_DOUBLE)
    test_hmm = HMM.create(input_format='repeat', repeat=test_repeat)
    test_seq = sequence.Sequence(TEST_SEQUENCE)
    test_optimized_repeat = test_seq.detect([test_hmm])

    assert type(test_optimized_repeat) == repeat_list.RepeatList
    assert len(test_optimized_repeat.repeats) == 1
    assert test_optimized_repeat.repeats[0].msa == TEST_RESULT_REPEAT_MSA_DOUBLE

    test_repeat = repeat.Repeat(msa=TEST_REPEAT_MSA_SINGLE)
    test_hmm = HMM.create(input_format='repeat', repeat=test_repeat)
    test_optimized_repeat = test_seq.detect([test_hmm])

    assert type(test_optimized_repeat) == repeat_list.RepeatList
    assert len(test_optimized_repeat.repeats) == 1
    assert test_optimized_repeat.repeats[0].msa == TEST_RESULT_REPEAT_MSA_SINGLE
def workflow(sequences_file, hmm_annotation_file, hmm_dir, result_file,
             result_file_serialized, format, max_time, time_interval=3600,
             next_time=3600, **kwargs):
    ''' Annotate sequences with TRs from multiple sources, test and refine the
    annotations.

    Save the annotations in a pickle.

    Args:
        sequences_file (str): Path to the FASTA file containing the sequences
            to annotate (read with ``pyfaidx.Fasta``).
        hmm_annotation_file (str): Path to a pickled dict mapping sequence
            names to lists of HMM IDs.
        hmm_dir (str): Path to the directory where all HMMs are stored as
            .pickle files.
        result_file (str): Path to the result pickle file.
        result_file_serialized (str): Path to the serialized result file.
        format (str): Serialization format of ``result_file_serialized``
            (currently only "tsv" is handled).
        max_time (str): Maximum run time in seconds.
        time_interval (int): Interval between intermediate saves in seconds.
        next_time (int): Time of the first intermediate save in seconds.

    Raises:
        Exception: If ``sequences_file`` cannot be loaded.
        Exception: If ``hmm_dir`` does not exist and cannot be created.
        Exception: If ``hmm_annotation_file`` cannot be loaded.
    '''

    start = datetime.datetime.now()
    max_time, time_interval, next_time = int(max_time), int(time_interval), int(next_time)

    try:
        l_sequence = Fasta(sequences_file)
    except Exception:
        raise Exception(
            "Cannot load sequences_file as a FASTA file: {}".format(sequences_file))

    if not os.path.isdir(hmm_dir):
        try:
            os.makedirs(hmm_dir)
        except Exception:
            raise Exception(
                "hmm_dir does not exist and could not be created: {}".format(hmm_dir))

    try:
        with open(hmm_annotation_file, 'rb') as fh:
            dHMM_annotation = pickle.load(fh)
    except Exception:
        raise Exception(
            "Cannot load hmm_annotation_file: {}".format(hmm_annotation_file))

    basic_filter = CONFIG['filter']['basic']['dict']
    basic_filter_tag = CONFIG['filter']['basic']['tag']

    # Load previous results.
    try:
        if not os.path.isdir(os.path.dirname(result_file)):
            os.makedirs(os.path.dirname(result_file))
    except Exception:
        raise Exception(
            "Could not create path to result_file directory: {}".format(
                os.path.dirname(result_file)))

    try:
        with open(result_file, 'rb') as fh:
            dResults = pickle.load(fh)
    except Exception:
        LOG.debug(
            "Could not load previous results file - perhaps it does not exist yet: {}".format(
                result_file))
        dResults = {}

    dHMM = {}
    for iS_pyfaidx in l_sequence:

        # If the sequence is already included in the results: continue.
        if iS_pyfaidx.name in dResults:
            continue

        # Save intermediate results if the run is about to time out or the
        # next checkpoint has been reached.
        elapsed_time = (datetime.datetime.now() - start).seconds
        if elapsed_time > max_time or elapsed_time > next_time:
            with open(result_file, 'wb') as fh:
                pickle.dump(dResults, fh)
            next_time = next_time + time_interval

        iS = sequence.Sequence(seq=str(iS_pyfaidx), name=iS_pyfaidx.name)
        LOG.debug("Work on sequence {}".format(iS))

        # 1. annotate_de_novo()
        denovo_repeat_list = iS.detect(denovo=True, repeat={"calc_pvalue": True})
        LOG.debug(denovo_repeat_list.repeats)
        for iTR in denovo_repeat_list.repeats:
            iTR.model = None

        # 2. annotate_TRs_from_hmmer()
        if iS.name in dHMM_annotation:
            lHMM = dHMM_annotation[iS.name]
            infoNRuns = len(lHMM)
            LOG.debug("{} Viterbi runs need to be performed.".format(infoNRuns))
            lHMM = set(lHMM)
            infoNHMM = len(lHMM)
            LOG.debug("These derive from {} independent HMMs.".format(infoNHMM))

            # Load all HMM pickles needed for the particular sequence.
            for hmm_ID in lHMM:
                if hmm_ID not in dHMM:
                    dHMM[hmm_ID] = hmm.HMM.create(
                        file_format="pickle",
                        file=os.path.join(hmm_dir, hmm_ID + ".pickle"))

            pfam_repeat_list = iS.detect([dHMM[hmm_ID] for hmm_ID in lHMM],
                                         repeat={"calc_pvalue": True})
            for iTR, hmm_ID in zip(pfam_repeat_list.repeats, lHMM):
                iTR.model = hmm_ID
                iTR.TRD = "PFAM"
        else:
            pfam_repeat_list = None

        # 3. merge_and_basic_filter()
        all_repeat_list = denovo_repeat_list + pfam_repeat_list
        iS.set_repeatlist(all_repeat_list, REPEAT_LIST_TAG)
        iS.set_repeatlist(denovo_repeat_list, DE_NOVO_ALL_TAG)
        iS.set_repeatlist(pfam_repeat_list, PFAM_ALL_TAG)

        rl_tmp = iS.get_repeatlist(REPEAT_LIST_TAG)
        if iS.get_repeatlist(REPEAT_LIST_TAG):
            for iB in basic_filter.values():
                rl_tmp = rl_tmp.filter(**iB)
        else:
            rl_tmp = iS.get_repeatlist(REPEAT_LIST_TAG)
        iS.set_repeatlist(rl_tmp, basic_filter_tag)
        iS.set_repeatlist(rl_tmp.intersection(iS.get_repeatlist(PFAM_ALL_TAG)), PFAM_TAG)
        iS.set_repeatlist(rl_tmp.intersection(iS.get_repeatlist(DE_NOVO_ALL_TAG)), DE_NOVO_TAG)

        # 4. calculate_overlap()

        # Perform the common-ancestry overlap filter and keep the PFAM TRs fixed.
        criterion_pfam_fixed = {
            "func_name": "none_overlapping_fixed_repeats",
            "rl_fixed": iS.get_repeatlist(PFAM_TAG),
            "overlap_type": "common_ancestry"}
        iS.d_repeatlist[DE_NOVO_TAG] = iS.get_repeatlist(DE_NOVO_TAG).filter(
            **criterion_pfam_fixed)

        # Keep only the most convincing de novo TRs.
        criterion_filter_order = {
            "func_name": "none_overlapping",
            "overlap": ("common_ancestry", None),
            "l_criterion": [("pvalue", "phylo_gap01"), ("divergence", "phylo_gap01")]}
        iS.d_repeatlist[DE_NOVO_TAG] = iS.get_repeatlist(DE_NOVO_TAG).filter(
            **criterion_filter_order)

        # 5. refine_denovo()
        denovo_final = []
        denovo_refined = [None] * len(iS.get_repeatlist(DE_NOVO_ALL_TAG).repeats)
        for i, iTR in enumerate(iS.get_repeatlist(DE_NOVO_ALL_TAG).repeats):
            if iTR not in iS.get_repeatlist(DE_NOVO_TAG).repeats:
                continue

            # Create a HMM from the TR.
            denovo_hmm = hmm.HMM.create(file_format='repeat', repeat=iTR)
            # Run the HMM on the sequence.
            denovo_refined_rl = iS.detect(lHMM=[denovo_hmm])
            append_refined = False
            if denovo_refined_rl and denovo_refined_rl.repeats:
                iTR_refined = denovo_refined_rl.repeats[0]
                iTR_refined.TRD = iTR.TRD
                iTR_refined.model = "cpHMM"
                denovo_refined[i] = iTR_refined
                # Check whether the new and old TR overlap and whether the new
                # TR is significant. If not both, keep the unrefined TR.
                if repeat_list.two_repeats_overlap("shared_char", iTR, iTR_refined):
                    rl_tmp = repeat_list.RepeatList([iTR_refined])
                    LOG.debug(iTR_refined.msa)
                    for iB in basic_filter.values():
                        rl_tmp = rl_tmp.filter(**iB)
                    if rl_tmp.repeats:
                        append_refined = True
            else:
                denovo_refined[i] = False
            if append_refined:
                denovo_final.append(iTR_refined)
            else:
                denovo_final.append(iTR)

        iS.set_repeatlist(repeat_list.RepeatList(denovo_refined), DE_NOVO_REFINED_TAG)
        iS.set_repeatlist(repeat_list.RepeatList(denovo_final), DE_NOVO_FINAL_TAG)
        iS.set_repeatlist(
            iS.get_repeatlist(DE_NOVO_FINAL_TAG) + iS.get_repeatlist(PFAM_TAG), FINAL_TAG)

        dResults[iS.name] = iS

    # 6.a Save the results as a pickle.
    with open(result_file, 'wb') as fh:
        pickle.dump(dResults, fh)

    # 6.b Save the serialized results.
    with open(result_file_serialized, 'w') as fh_o:

        if format == 'tsv':
            header = ["ID", "MSA", "begin", "pvalue", "l_effective", "n",
                      "n_effective", "TRD", "model"]
            fh_o.write("\t".join(header))

        for iS in dResults.values():
            for iTR in iS.get_repeatlist(FINAL_TAG).repeats:
                if format == 'tsv':
                    try:
                        data = [str(i) for i in [
                            iS.name, " ".join(iTR.msa), iTR.begin,
                            iTR.pvalue("phylo_gap01"), iTR.l_effective, iTR.n,
                            iTR.n_effective, iTR.TRD, iTR.model]]
                    except Exception:
                        print(iTR)
                        raise Exception("Could not save data for the above TR.")
                    fh_o.write("\n" + "\t".join(data))

    print("DONE")
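
# A minimal sketch of how workflow() might be invoked. All file paths below are
# hypothetical placeholders; max_time is passed as a string because the
# function casts it to int itself, and "tsv" is the only serialization format
# handled in step 6.b.
if __name__ == "__main__":
    workflow(
        sequences_file="data/sequences.fasta",            # read with pyfaidx.Fasta
        hmm_annotation_file="data/hmm_annotation.pickle",  # pickled {sequence name: [HMM IDs]}
        hmm_dir="data/hmm_pickles",                        # one <HMM ID>.pickle per model
        result_file="results/annotations.pickle",
        result_file_serialized="results/annotations.tsv",
        format="tsv",
        max_time="86400")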
def test_detect_repeats_with_hmm():

    test_hmm = HMM.create(input_format='hmmer',
                          file=os.path.join(path(), TEST_FILE_WITH_ID))
    test_seq = sequence.Sequence(TEST_SEQUENCE)
    test_optimized_repeat = test_seq.detect([test_hmm])
def test_initialise_sequence():

    test_seq = sequence.Sequence(TEST_SEQUENCE)
    assert test_seq.seq == TEST_SEQUENCE
def find_protein_repeats(sequences_file, result_dir, pvalue_threshold=0.05,
                         divergence_threshold=0.1, n_threshold=2.5, l_threshold=3):
    """ Finds tandem repeats (TRs) in the protein sequences provided in
    'sequences_file'.

    Filters the TRs according to the thresholds and saves them in 'result_dir',
    one file per protein, with the protein identifier as the filename.

    Args:
        sequences_file (str): Path to a file with >= 1 protein fasta sequence.
        result_dir (str): Path where the individual TRAL results are stored.
        pvalue_threshold (float): p-value threshold for filtering.
        divergence_threshold (float): Divergence threshold for filtering.
        n_threshold (float): Minimum repeat unit count for filtering.
        l_threshold (int): Maximum repeat unit length for filtering.

    Output:
        For each protein sequence, a single file with its TRs is produced and
        stored in 'result_dir'.
    """
    logging.config.fileConfig(config_file("logging.ini"))
    log = logging.getLogger('root')

    # Define this as in the config.
    CONFIG_GENERAL = configuration.Configuration.instance().config
    CONFIG = CONFIG_GENERAL["repeat_list"]
    score = CONFIG["model"]

    ##########################################################################
    # From .fasta to Class Sequence sequences
    proteins = Fasta(sequences_file)

    all_denovo_repeats = 0
    all_filtered_repeats = 0

    for pyfaidx in proteins:
        seq_name = pyfaidx.name.split("|")[1]  # The name is the protein identifier.
        seq = sequence.Sequence(seq=str(pyfaidx), name=seq_name)
        log.debug("Work on sequence {}".format(seq_name))

        ##########################################################################
        # Getting TRs
        denovo_list = seq.detect(denovo=True)
        for TR in denovo_list.repeats:
            TR.calculate_pvalues()

        ##########################################################################
        # Filtering TRs

        # Add the number of de novo found repeats.
        all_denovo_repeats += len(denovo_list.repeats)

        # Filter by p-value.
        denovo_list = denovo_list.filter(
            "pvalue", score, pvalue_threshold)

        # Filter by divergence.
        denovo_list = denovo_list.filter(
            "divergence", score, divergence_threshold)

        # Filter by the number of repeat units.
        denovo_list = denovo_list.filter(
            "attribute", "n_effective", "min", n_threshold)

        # Filter by the length of the repeat units.
        denovo_list = denovo_list.filter(
            "attribute", "l_effective", "max", l_threshold)

        ##########################################################################
        # Building HMM with hmmbuild

        # De novo TRs are remastered with an HMM (only possible with hmmbuild).
        denovo_hmm = [hmm.HMM.create(input_format='repeat', repeat=iTR)
                      for iTR in denovo_list.repeats]
        denovo_list_remastered = seq.detect(lHMM=denovo_hmm)

        ##########################################################################
        # Clustering

        # De novo TRs are clustered for overlap (common ancestry). Only the best
        # = lowest p-value and lowest divergence are retained.
        denovo_list_remastered = denovo_list_remastered.filter(
            "none_overlapping", ["common_ancestry"],
            {"pvalue": score, "divergence": score})

        ##########################################################################
        # Save Tandem Repeats

        # Create the output directory if it does not exist yet.
        try:
            if not os.path.isdir(result_dir):
                os.makedirs(result_dir)
        except Exception:
            raise Exception(
                "Could not create path to result directory: {}".format(result_dir))

        # Create the filenames.
        output_pickle_file = os.path.join(result_dir, seq_name + ".pkl")
        output_tsv_file = os.path.join(result_dir, seq_name + ".tsv")

        # Save the TR files.
        denovo_list_remastered.write(output_format="pickle", file=output_pickle_file)
        denovo_list_remastered.write(output_format="tsv", file=output_tsv_file)
        # TODO save as fasta

        all_filtered_repeats += len(denovo_list_remastered.repeats)

        print("\n***", seq_name, "***")
        print("denovo repeats:", len(denovo_list.repeats))
        print("repeats after filtering and clustering:",
              len(denovo_list_remastered.repeats))

        for i in range(len(denovo_list_remastered.repeats)):
            print(denovo_list_remastered.repeats[i])

    print("\nThere were {} repeats found de novo.".format(all_denovo_repeats),
          "After filtering and clustering there were only {} repeats left.\n".format(
              all_filtered_repeats))
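
# A minimal sketch of calling find_protein_repeats(). The FASTA path and the
# output directory are hypothetical placeholders; the thresholds mirror the
# function defaults, and the FASTA headers are assumed to be UniProt-style
# (containing "|"-separated fields) because the identifier is parsed that way.
if __name__ == "__main__":
    find_protein_repeats(
        sequences_file="data/test_protein.fasta",
        result_dir="results/tral",
        pvalue_threshold=0.05,
        divergence_threshold=0.1,
        n_threshold=2.5,
        l_threshold=3)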
from pyfaidx import Fasta

# Download the protein sequence from: https://www.uniprot.org/uniprot/P98179

##########################################################################
# From .fasta to Class Sequence sequences
proteins = Fasta(
    "/home/matteo/polybox/MSc_ACLS/master_thesis/data/test_protein.fasta")

all_denovo_repeats = 0
all_filtered_repeats = 0

for pyfaidx in proteins:
    seq_name = pyfaidx.name.split("|")[1]  # The name is the protein identifier.
    seq = sequence.Sequence(seq=str(pyfaidx), name=seq_name)

    # Saving this sequence as a binary file:
    # with open(sequence_pkl, 'wb') as f:
    #     pickle.dump(seq, f)

    ##########################################################################
    # Getting TRs
    denovo_list = seq.detect(denovo=True)  # TODO: reinstall tral with all phylo files!
    for TR in denovo_list.repeats:
        TR.calculate_pvalues()
        print(TR)

    # Saving these sequences as binary files:
def test_detect_HHrepID():

    test_seq = sequence.Sequence(TEST_SEQUENCE_Q9BRR0)
    predicted_repeats = repeat_detection_run.run_detector(
        seq_records=[test_seq],
        detectors=["HHrepID"])[0]['HHrepID']

    assert len(predicted_repeats) == 1
    assert predicted_repeats[0].msa == ['------------------IPTCAEAGEQ----',
                                        'EGRLQRKQKNATGGRRHICHECGKSFAQ----',
                                        'SSGLSKHRRIHTGEKPYECEECGKAFIG----',
                                        'SSALVIHQRVHTGEKPYECEECGKAFSH----',
                                        'SSDLIKHQRTHTGEKPYECDDCGKTFSQ----',
                                        'SCSLLEHHRIHTGEKPYQCSMCGKAFRR----',
                                        'SSHLLRHQRIHTGDKNVQEPEQGEAWKSRMES',
                                        '------QLENVETPMSYKCNECERSFTQ----',
                                        'NTGLIEHQKIHTGEKPYQCNACGKGFTR----',
                                        'ISYLVQHQRSHVG-------------------']
def test_detect_TREKS():

    test_seq = sequence.Sequence(TEST_SEQUENCE_Q9BRR0)
    predicted_repeats = repeat_detection_run.run_detector(
        seq_records=[test_seq],
        detectors=["T-REKS"])[0]['T-REKS']

    assert len(predicted_repeats) == 1
    assert predicted_repeats[0].msa == ['C---G---KSFAQSSGLSKHRRIHTGEKPYECE-E',
                                        'C---G---KAFIGSSALVIHQRVHTGEKPYECE-E',
                                        'C---G---KAFSHSSDLIKHQRTHTGEKPYECD-D',
                                        'C---G---KTFSQSCSLLEHHRIHTGEKPYQCS-M',
                                        'C---G---KAFRRSSHLLRHQRIHTGDKNVQ-EPE',
                                        'Q---G---EAW--KSRME-SQ-LENVETPMSYK--',
                                        'C---NECERSFTQNTGLIEHQKIHTGEKPYQ----',
                                        'CNACG---KGFTRISYLVQHQRSHVG-KNI-LS--']
def test_detect_XSTREAM():

    test_seq = sequence.Sequence(TEST_SEQUENCE_Q9BRR0)
    predicted_repeats = repeat_detection_run.run_detector(
        seq_records=[test_seq],
        detectors=["XSTREAM"])[0]['XSTREAM']

    assert len(predicted_repeats) == 1
    assert predicted_repeats[0].msa == ['ECGKSFAQS-SGLSK-HRRIHTGEKPYECE',
                                        'ECGKAFIGS-SALVI-HQRVHTGEKPYECE',
                                        'ECGKAFSHS-SDL-IKHQRTHTGEKPYECD',
                                        'DCGKTFSQSCSLLEH-H-RIHTGEKPY']