def borodovsky_blosum_50_2():
    """Build the pair HMM for the Borodovsky example pair.

    Reads the two Borodovsky sequences, wraps each one in a
    single-sequence alignment profile, and loads the Borodovsky 4.7
    parameter set with BLOSUM62 probabilities (probability space, no
    log transform).
    """
    records = sequence.readFastaFile("./files/simple_seqs/borodovsky.fasta")
    first_profile = aln_profile.AlignmentProfile([records[0]])
    second_profile = aln_profile.AlignmentProfile([records[1]])
    return align.load_params(params.borodovsky_4_7,
                             [first_profile, second_profile],
                             sub_matrix.blosum62LatestProbs,
                             log_transform=False)
def durbin_blosum_50_2():
    """Build the pair HMM for the Durbin example pair.

    Uses the basic parameter set with the BLOSUM50 matrix and
    log-transformed scores.
    """
    records = sequence.readFastaFile("./files/simple_seqs/durbin_2.fasta")
    profiles = [aln_profile.AlignmentProfile([rec]) for rec in records[:2]]
    return align.load_params(params.basic_params,
                             profiles,
                             sub_matrix.blosum50,
                             log_transform=True)
def probabilities_blosum_62_2():
    """Build the pair HMM for the simple two-sequence example.

    Uses the basic parameter set with BLOSUM62 probabilities and
    log-transformed scores.
    """
    records = sequence.readFastaFile("./files/simple_seqs/simple_2.fasta")
    profiles = [aln_profile.AlignmentProfile([rec]) for rec in records[:2]]
    return align.load_params(params.basic_params,
                             profiles,
                             sub_matrix.blosum62LatestProbs,
                             log_transform=True)
def two_col_62_2():
    """Build the pair HMM for the two-column custom example.

    Uses the basic parameter set with the BLOSUM62 matrix and
    log-transformed scores.
    """
    records = sequence.readFastaFile("./files/custom_seqs/2_col.fasta")
    left = aln_profile.AlignmentProfile([records[0]])
    right = aln_profile.AlignmentProfile([records[1]])
    return align.load_params(params.basic_params,
                             [left, right],
                             sub_matrix.blosum62,
                             log_transform=True)
def ox_104t17_1():
    """Build the pair HMM for the ox_104t17_1 qscore-correction case.

    Uses the qscore parameter set with the BLOSUM62 matrix (estimated,
    with 'X' support) and log-transformed scores.
    """
    fasta_path = "./files/qscore_corrections/ox_104t17_1.fasta"
    records = sequence.readFastaFile(fasta_path)
    profiles = [aln_profile.AlignmentProfile([rec]) for rec in records[:2]]
    return align.load_params(params.qscore_params,
                             profiles,
                             sub_matrix.blosum62EstimatedWithX,
                             log_transform=True)
def test_profile_with_two_sequences():
    """Column 1 of a two-sequence profile counts the 'T' present in both rows."""
    two_seq_profile = aln_profile.AlignmentProfile(['RTAG', '-TA-'])
    assert two_seq_profile.profile[1]['T'] == 2
def test_profile_with_one_sequence():
    """Column 0 of a single-sequence profile counts its one 'R'."""
    single_profile = aln_profile.AlignmentProfile(['RTAG'])
    assert single_profile.profile[0]['R'] == 1
def align_seqs(inpath, outpath, aln_type, params=parameters.basic_params,
               subsmat=sub_matrix.blosum62EstimatedWithX_dict,
               log_transform=True):
    """Progressively align the sequences in a FASTA file and write the result.

    Reads sequences from `inpath`, determines a pairwise merge order (a
    trivial one for exactly two sequences, otherwise via a guide tree),
    then repeatedly aligns pairs of profiles with a pair HMM until one
    alignment remains, which is written as text to `outpath`.

    Parameters
    ----------
    inpath : str
        Path to the input FASTA file.
    outpath : str
        Path the finished alignment is written to (as ``str(profile)``).
    aln_type : str
        One of 'viterbi', 'poviterbi', 'mea', 'pomea' — selects the
        decoding method and whether partial-order mode (``po=True``) is used.
        NOTE(review): any other value leaves `aligned_profile` unassigned
        and raises NameError further down — confirm callers only pass
        these four values.
    params : dict
        Pair-HMM transition/emission parameters (defaults to
        ``parameters.basic_params``).
    subsmat
        Substitution matrix passed through to `load_params`.
    log_transform : bool
        Passed through to `load_params`; presumably selects log-space
        scoring — confirm in `load_params`.

    Returns
    -------
    The final aligned profile object.
    """
    print("params are")
    print(params)
    # Read sequences in
    seqs = sequence.readFastaFile(inpath, alphabet=Protein_Alphabet_wB_X_Z)
    print(len(seqs))
    if len(seqs) == 2:
        # Two sequences: no guide tree needed, a single merge under "N0".
        aln_order = [("N0", [seqs[0].name, seqs[1].name])]
    else:
        # Calculate guide tree
        guide_tree = gt.get_guide_tree(seqs, random=False)
        print(guide_tree.ascii_art())
        # Get the alignment order
        aln_order = gt.get_aln_order(guide_tree)
        # print (aln_order)
    print(aln_order)
    # Map from name -> sequence; replaced by profiles as nodes are merged.
    seq_dict = {x.name: x for x in seqs}
    # Predecessors start off blank
    predecessors = [{}, {}]
    # Create alignment in order from guide tree
    for node in aln_order:
        # Get the current node name and list of sequences under that node
        curr_node = node[0]
        curr_seqs = node[1]
        # List to store the aligned sequences in
        aligned = []
        # While the node has sequences underneath yet to be aligned
        while curr_seqs:
            # Get a sequence
            seq = curr_seqs.pop()
            # Make it into a profile if it isn't one already
            if type(seq_dict[seq]) != aln_profile.AlignmentProfile:
                profile = aln_profile.AlignmentProfile([seq_dict[seq]])
            else:
                profile = seq_dict[seq]
            # Add sequence to the aligned list
            aligned.append(profile)
            # if len(alns) > 1:
            #     new_align = "-align-".join(alns)
            #     alns = []
            #     alns.append(new_align)
            # If we have two profiles it is time to align
            if len(aligned) > 1:
                pair_hmm = load_params(params, aligned, subsmat,
                                       log_transform, predecessors)
                # Select decoding: (po)viterbi vs (po)mea; 'po' variants
                # run the partial-order version of the same decoder.
                if aln_type == 'viterbi':
                    pair_hmm.performViterbiAlignment(po=False)
                    aligned_profile = pair_hmm.get_alignment(
                        type_to_get='viterbi')
                elif aln_type == 'poviterbi':
                    pair_hmm.performViterbiAlignment(po=True)
                    aligned_profile = pair_hmm.get_alignment(
                        type_to_get='viterbi')
                elif aln_type == 'mea':
                    pair_hmm.performMEAAlignment(po=False)
                    aligned_profile = pair_hmm.get_alignment(type_to_get='mea')
                elif aln_type == 'pomea':
                    pair_hmm.performMEAAlignment(po=True)
                    aligned_profile = pair_hmm.get_alignment(type_to_get='mea')
                # Clear the previous unaligned sequences
                aligned = []
                # Add the aligned sequences
                aligned.append(aligned_profile)
                # print ('wowza')
                # print (aligned[0])
                # print(aligned[0].predecessors)
        # Store the merged profile under the internal node's name so later
        # merges can pick it up by name.
        seq_dict[curr_node] = aligned[0]
        # print('alignment is ')
        # print(aligned_profile)
    with open(outpath, 'w') as outfile:
        outfile.write(str(aligned_profile))
    return aligned_profile
def run_qscore(name, aln_type, parameters, specific_files=None, save=False,
               outpath=""):
    """Align every file in a benchmark set and score it with `qscore`.

    For each FASTA file in ``./bench1.0/<name>/in/`` (skipping files whose
    sequences contain characters in `aa_skip`), runs `align.align_seqs`,
    invokes the external ``qscore`` tool against the reference alignment in
    ``./bench1.0/<name>/ref/``, records the Q/TC/M/C scores to a CSV, and
    optionally pickles per-parameter results, best-so-far scores, and
    timing information under `outpath`.

    Parameters
    ----------
    name : str
        Benchmark set name (subdirectory of ``./bench1.0/``).
    aln_type : str
        Alignment method name passed to `align.align_seqs`.
    parameters : dict
        Pair-HMM parameters; must contain 'tau', 'epsilon', 'delta',
        'emissionX', 'emissionY'.
    specific_files : collection or None
        If given, only these file names are aligned/scored.
    save : bool
        When True, pickle results/best/time dictionaries to `outpath`.
    outpath : str
        Directory for pickles; a trailing slash is added if missing.
        NOTE(review): ``outpath[-1]`` raises IndexError on the default
        empty string — confirm callers always pass a directory.

    Returns
    -------
    defaultdict
        Mapping ``file -> {score_type: score_value}``.
    """
    base_dir = "./bench1.0/" + name
    in_dir = base_dir + "/in/"
    ref_dir = base_dir + "/ref/"
    out_dir = "./qscore_alignments/" + aln_type + "_" + name
    qscore_dict = defaultdict(dict)
    files = os.listdir(in_dir)
    file_count = 0
    start_time = timeit.default_timer()
    now = datetime.now()
    dt_string = now.strftime("%Y/%m/%d_%H:%M")
    # Add trailing slash to output directory if it isn't there
    outpath = outpath + "/" if outpath[-1] != "/" else outpath
    # Parameter signature used to key the result dictionaries and filenames.
    param_name = f"t={parameters['tau']}e={parameters['epsilon']}d={parameters['delta']}x={parameters['emissionX']}y={parameters['emissionY']}"
    output_file = "./qscore_alignments/" + aln_type + "_" + name + param_name + ".csv"
    # Load any previously saved results so reruns accumulate.
    if os.path.exists(outpath + name + ".p"):
        curr_dict = pickle.load(open(outpath + name + ".p", "rb"))
    else:
        curr_dict = {param_name: {}}
    if os.path.exists(outpath + name + "_best.p"):
        best_dict = pickle.load(open(outpath + name + "_best.p", "rb"))
    else:
        best_dict = {}
    if os.path.exists(outpath + "time.p"):
        time_dict = pickle.load(open(outpath + "time.p", "rb"))
    else:
        time_dict = {}
    failures = []
    with open(output_file, 'w+') as output:
        writer = csv.writer(output, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['Tool', 'Dataset', 'Name', 'Q', 'TC', 'M', 'C'])
        # If we don't already have a directory created to save the
        # alignments, lets make one
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        for file in files:
            failed = False
            if file != ".DS_Store":
                seqs = sequence.readFastaFile(in_dir + file,
                                              alphabet=Protein_Alphabet_wB_X_Z)
                # Skip files containing residues we can't handle.
                for seq in seqs:
                    if any(skip in seq.sequence for skip in aa_skip):
                        print("failed on " + seq.name)
                        failures.append(file)
                        failed = True
                if not failed:
                    qscore_dict[file] = defaultdict(dict)
                    if not specific_files or file in specific_files:
                        if param_name not in curr_dict:
                            curr_dict[param_name] = {}
                        file_count += 1
                        single_time = timeit.default_timer()
                        print(file)
                        # Vestigial Baum-Welch hook: profiles are built per
                        # sequence pair but the runBaumWelch call is disabled.
                        for seq_order in list(itertools.combinations(seqs, 2)):
                            profiles = [
                                aln_profile.AlignmentProfile([x])
                                for x in seq_order
                            ]
                            # change_params = bw.runBaumWelch(parameters, profiles, aln_type)
                        print(parameters)
                        # NOTE(review): `log_transform` is not defined in
                        # this function — presumably a module-level global;
                        # confirm it exists at import time.
                        aligned_profile = align.align_seqs(
                            in_dir + file,
                            out_dir + "/" + file + ".aln",
                            aln_type=aln_type,
                            params=parameters,
                            subsmat=sub_matrix.blosum62EstimatedWithX_dict,
                            log_transform=log_transform)
                        # Score our alignment against the reference with the
                        # external qscore tool.
                        process = subprocess.Popen(
                            "qscore -test %s -ref %s -cline -modeler" %
                            (out_dir + "/" + file + ".aln", ref_dir + file),
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            shell=True)
                        out = process.communicate()[0]
                        errcode = process.returncode
                        print('running')
                        print(errcode)
                        # qscore emits ';'-separated "Name=Value" fields; the
                        # first two fields are test/ref names, so drop them.
                        scores = [
                            x.strip()
                            for x in out.decode('utf-8').split(";")[2:]
                        ]
                        print(file)
                        print('\nScores be')
                        print(scores)
                        for score in scores:
                            score_type = score.split("=")[0].strip()
                            score_value = score.split("=")[1].strip()
                            qscore_dict[file][score_type] = score_value
                        curr_dict[param_name][file] = (scores, aligned_profile)
                        update_best_dict(best_dict, file, scores, param_name)
                        if scores and "=" in scores[0]:
                            writer.writerow([
                                aln_type + "_" + param_name + "_log=" +
                                str(log_transform), name, file,
                                scores[0].split("=")[1],
                                scores[1].split("=")[1],
                                scores[2].split("=")[1],
                                scores[3].split("=")[1]
                            ])
                        else:
                            failures.append(file)
                        # Fix: total_seconds was commented out but is used in
                        # the timing block below, which raised NameError when
                        # save=True. Track cumulative elapsed time here.
                        total_seconds = timeit.default_timer() - start_time
                        # Per-file time; currently informational only.
                        single_seconds = timeit.default_timer() - single_time
                        if save:
                            pickle.dump(
                                curr_dict,
                                open(outpath + aln_type + "_" + name + ".p",
                                     "wb"))
                            pickle.dump(
                                best_dict,
                                open(
                                    outpath + aln_type + "_" + name +
                                    "_best.p", "wb"))
    if save:
        # Record the best (fastest) total run time for this benchmark set.
        total_seconds = timeit.default_timer() - start_time
        if name in time_dict:
            if total_seconds < time_dict[name][0]:
                time_dict[name] = (total_seconds, dt_string)
                print("New best time - " +
                      utilities.format_time(total_seconds))
        else:
            time_dict[name] = (total_seconds, dt_string)
            print("New best time - " + utilities.format_time(total_seconds))
        pickle.dump(time_dict,
                    open(outpath + aln_type + "_" + "time.p", "wb"))
    print('These files failed ')
    print(failures)
    return qscore_dict
'epsilon': 0.05, 'delta': 0.02, 'emissionX': 0.92, 'emissionY': 0.2 } change_params = { 'tau': 0.002, 'epsilon': 0.05, 'delta': 0.02, 'emissionX': 0.5, 'emissionY': 0.5 } for seq_order in list(itertools.combinations(seqs, 2)): profiles = [aln_profile.AlignmentProfile([x]) for x in seq_order] # change_params = bw.runBaumWelch(change_params, profiles, "viterbi") alignment = align.align_seqs(seq, "../../tests/files/custom_seqs/3_col.aln", aln_type='mea', params=change_params, subsmat=sub_matrix.blosum62EstimatedWithX_dict, log_transform=True) po_alignment = align.align_seqs(seq, "../../tests/files/custom_seqs/3_col.aln", aln_type='pomea', params=change_params, subsmat=sub_matrix.blosum62EstimatedWithX_dict,