def match_seq_to_best_template_seq(tgt_seq: Sequence, templates: List[Tuple[str, Sequence]], n_top_scores: int = 20) -> List[Tuple[str, Sequence]]:
    """
    Score every template against the target sequence and keep the best ones.

    For each template, the target sequence is expanded to follow that
    template's dash pattern (import_dash_pattern) and the pair is scored with
    simple_match. Every template whose score is at least the
    n_top_scores-th highest *distinct* score is kept; if fewer distinct
    scores exist, all templates are kept.

    Args:
        tgt_seq: the target; only its ``sequence`` string is read here.
        templates: (identifier, Sequence) pairs. The identifier is printed
            under "PDB:" and handed back unchanged.
        n_top_scores: how many distinct score levels to keep (default 20,
            the value that used to be hard-coded).

    Returns:
        The kept (identifier, Sequence) pairs, in their original order.
        An empty list when ``templates`` is empty.

    Raises:
        ValueError: if ``n_top_scores`` is smaller than 1.
    """
    if n_top_scores < 1:
        raise ValueError("n_top_scores must be at least 1")

    print("\nMatching desired target sequence to our template library...")

    if not templates:
        # Nothing to score; callers iterate over the result, so an empty
        # list simply means "no work to do".
        return []

    candidates: List[Tuple[str, Sequence]] = list(templates)

    # Scores are keyed by the template's sequence string, so two templates
    # with the same sequence share one entry.
    seq_matching_to_pdb: Dict[str, int] = {}
    for _, template_seq in candidates:
        # The target, expanded based on this template's dash pattern.
        dashed_target: str = import_dash_pattern(already_dashed_seq=template_seq, dest_seq_str=tgt_seq.sequence)
        seq_matching_to_pdb[template_seq.sequence] = simple_match(Sequence(dashed_target), template_seq, quiet=False)

    # We don't want only the single best score but the top few score levels.
    # Clamp the rank so a small library cannot index past the list start.
    distinct_scores = sorted(set(seq_matching_to_pdb.values()))
    cutoff: int = distinct_scores[-min(n_top_scores, len(distinct_scores))]
    selected = [(pdb, seq) for (pdb, seq) in candidates if seq_matching_to_pdb[seq.sequence] >= cutoff]

    print("There are {n} sequences that match your template sequence to a score of {score}:".format(n=len(selected), score=cutoff))
    for serial, (pdb, seq) in enumerate(selected):
        print("\tSequence {serial} -- score {score}\n\t\tPDB: {pdb}\n\t\tSEQ: {seq}\n"
              .format(serial=serial, score=seq_matching_to_pdb[seq.sequence], pdb=pdb, seq=seq.sequence))
    return selected
def get_seqs(fn: str) -> List[Tuple[str, Sequence]]:
    """
    Read (name, Sequence) pairs from the lines of *fn* that contain "PDB_SEQ".

    Each such line is split on whitespace: the first field becomes the
    name and the second field is wrapped in a Sequence. Lines without
    "PDB_SEQ" are ignored.
    """
    with open(fn) as handle:
        records = handle.readlines()

    sequences = []  # type: List[Tuple[str, Sequence]]
    for record in records:
        if "PDB_SEQ" not in record:
            continue
        # split() with no argument already discards surrounding whitespace.
        fields = record.split()
        sequences.append((fields[0], Sequence(fields[1])))
    return sequences
def get_seqs_mfa(fn: str) -> List[Tuple[str, Sequence]]:
    """
    Get sequences out of the tRNAdb mfa.

    Every second line of *fn* (the 2nd, 4th, ...) is stripped and wrapped in
    a Sequence; the other lines are skipped. The name slot of each returned
    pair is the empty string.
    """
    with open(fn) as handle:
        return [
            ('', Sequence(line.strip()))
            for line_no, line in enumerate(handle)
            if line_no % 2 == 1
        ]
def test_simple_match_identity():
    """
    A sequence matched against an identical copy gives the expected score:
    10 for the two-column inputs and 35 for the seven-column input.
    """
    expected_by_seq = (
        ('A-', 10),
        ('G-', 10),
        ('GGAA---', 35),
    )
    for seq_str, expected in expected_by_seq:
        assert simple_match(Sequence(seq_str), Sequence(seq_str)) == expected
def modomics_from_pdb(pdb: str) -> Sequence:
    """
    Build a Sequence from the residues found in the PDB file *pdb*.

    Every line containing " C4'" contributes one entry: characters 17-19
    of that line (presumably the three-letter residue name) are translated
    by mod_from_tlc. The secondary structure is all dots, one per
    character of the resulting sequence string.

    Doesn't pair with secstruct (yet). dssr?
    """
    with open(pdb) as handle:
        records = handle.readlines()

    # One C4' atom line per residue is used as the residue marker.
    letters = [mod_from_tlc(record[17:20]) for record in records if " C4'" in record]
    sequence_str = "".join(letters)

    # Replace with Rosetta-native SS determination?
    return Sequence(sequence_str, '.' * len(sequence_str))
def import_dash_pattern(already_dashed_seq: Sequence, dest_seq_str: str) -> str:
    """
    Transfer the dash pattern of already_dashed_seq onto dest_seq_str.

    Rather than blindly inserting every dash (there may be an insertion),
    all candidate dashed versions produced by seqs_with_dashes are scored
    against already_dashed_seq with simple_match, and the first
    highest-scoring candidate wins. The winner is padded on the right with
    dashes up to len(already_dashed_seq), which is the maximum length we
    have to contend with. With no candidates the result is dashes only.
    """
    dash_pos = dash_positions(already_dashed_seq.sequence)
    candidates = seqs_with_dashes(dash_pos, dest_seq_str, len(already_dashed_seq))

    def alignment_score(candidate: str) -> int:
        return simple_match(Sequence(candidate), already_dashed_seq, quiet=True)

    # max() keeps the first of several equally good candidates, which is
    # what a strict "greater than" comparison would also do.
    best_candidate = max(candidates, key=alignment_score, default="")

    padding = len(already_dashed_seq) - len(best_candidate)
    return best_candidate + '-' * padding
def main(args):
    """
    Thread the target sequence onto its best-matching templates.

    Steps, in order:
      1. Regenerate the aligned template library if its data file is missing.
      2. Load the templates with get_seqs.
      3. Read the target sequence from the first line of args.seq_file[0].
      4. Select templates with match_seq_to_best_template_seq.
      5. Call remodel_new_sequence once per selected template.

    Reads args.pdb, args.seq_file, args.nstruct, args.defer and
    args.aggressive.
    """
    if args.pdb is not None:
        # Placeholder: a supplied PDB is not acted on yet.
        pass

    import os

    library_is_missing = not os.path.exists(my_loc() + "/data/all_trna_structure_seqs.dat")
    if library_is_missing:
        print("Regenerating aligned template library")
        align_template_library(my_loc() + "/data/all_trna.mfa")

    templates = get_seqs(my_loc() + "/data/all_trna_structure_seqs.dat")

    # Fixed secondary structure; the sequence itself is filled in from the file.
    tgt_seq = Sequence("", ".(((((((..((((...........)))).(((((.......)))))........................(((((.......))))))))))))....")
    with open(args.seq_file[0]) as seq_handle:
        first_line = seq_handle.readlines()[0]
    tgt_seq.sequence = first_line.strip()

    pdb_seq_list = match_seq_to_best_template_seq(tgt_seq, templates)

    # Maybe there are many returned! That's cool; do them all
    for template_pdb, template_seq in pdb_seq_list:
        looks_annotated = all(base in tgt_seq.sequence for base in ('a', 'g', 'c', 'u'))
        if looks_annotated:
            # annotated seq format, must translate first.
            tgt_seq = ann_to_mod(tgt_seq)
        remodel_new_sequence(template_seq, tgt_seq, template_pdb, args.nstruct, '', args.defer, args.aggressive)
def add_dash_recursive(template: Sequence, trial: str, dashes, current_best):
    """
    Exhaustively search for the dash placement that best aligns trial to template.

    Not in current use -- this is a very expensive function that is good for
    aligning very difficult sequences. At the moment we have been using a few
    manual tweaks after automated alignment and that has been good enough.

    Args:
        template: the Sequence to align against; its length fixes the
            length of every candidate string.
        trial: the undashed string that needs dashes inserted.
        dashes: positions (indices into the final string) already chosen
            to hold a dash. Not modified; each recursion works on a copy.
        current_best: None, or a (candidate_string, score) tuple holding
            the best candidate found so far.

    Returns:
        The best (candidate_string, score) tuple seen so far, or
        current_best unchanged if no candidate improved on it.

    Note: positions are added in every order, so the same set of dash
    positions is scored repeatedly; the cost grows very quickly with the
    number of dashes to place.
    """
    n_left_to_add = len(template) - len(trial)

    if len(dashes) == n_left_to_add:
        # We have enough dashes: lay trial's characters into the non-dash
        # columns. Every column is visited, including the last one, so
        # trial's final character is kept when the last column is not a dash.
        dash_columns = set(dashes)
        remaining = iter(trial)
        complete_trial_string = "".join(
            '-' if column in dash_columns else next(remaining, '-')
            for column in range(len(template))
        )
        score = simple_match(template, Sequence(complete_trial_string))
        if current_best is None or score > current_best[1]:
            current_best = (complete_trial_string, score)
    else:
        # Try each still-free column as the next dash position.
        for ii in range(len(template)):
            if ii in dashes:
                continue
            current_best = add_dash_recursive(template, trial, list(dashes) + [ii], current_best)

    return current_best
def test_import_dash_pattern_dashes():
    """
    A destination string that already contains a dash ('G-G'), given the
    template 'AA---', is expected to come back as 'G---G'.
    (Open question from the original author: does this make sense?)
    """
    template = Sequence('AA---', '....')
    result = import_dash_pattern(template, 'G-G')
    assert result == 'G---G'
def test_import_dash_pattern_end():
    """
    Trailing dashes in the template ('AA---') end up after 'GG'.
    """
    template = Sequence('AA---', '....')
    result = import_dash_pattern(template, 'GG')
    assert result == 'GG---'
def test_import_dash_pattern_beginning():
    """
    Leading dashes in the template ('---AA') end up before 'GG'.
    """
    template = Sequence('---AA', '....')
    result = import_dash_pattern(template, 'GG')
    assert result == '---GG'
def test_import_dash_pattern_middle():
    """
    Dashes in the middle of the template ('A--A') end up between the two Gs.
    """
    template = Sequence('A--A', '....')
    result = import_dash_pattern(template, 'GG')
    assert result == 'G--G'
def test_simple_match_mod_U_eq():
    """
    'U-' against 'T-' is expected to score 8 (going by the test name,
    T is treated as a modified U).
    """
    score = simple_match(Sequence('U-'), Sequence('T-'))
    assert score == 8
def test_simple_match_pyrimidine_eq():
    """
    'U-' against 'C-' (two pyrimidines) is expected to score 7.
    """
    score = simple_match(Sequence('U-'), Sequence('C-'))
    assert score == 7
def test_simple_match_purine_eq():
    """
    'A-' against 'G-' (two purines) is expected to score 7.
    """
    score = simple_match(Sequence('A-'), Sequence('G-'))
    assert score == 7
def test_simple_match_gap_penalty():
    """
    'A-' against '-A' (a base facing a gap in both columns) is expected
    to score -20.
    """
    score = simple_match(Sequence('A-'), Sequence('-A'))
    assert score == -20