def main(filename='../alignments/Q92481', position=72, changed_pattern='R'):
    """Print the result of ``do`` for one mutation in one alignment file.

    The defaults reproduce the values that used to be hard-coded here, so
    calling ``main()`` with no arguments behaves as before.

    Args:
        filename: path of the alignment file to load.
        position: position of the mutation in the query sequence. It is
            passed to ``correct_alignment_position.do`` unchanged (no
            1-based to 0-based shift is applied in this function).
        changed_pattern: the substituted residue handed to ``do``.
    """
    # Function-scope imports, matching the existing style of this function.
    # itemgetter is imported here as well so the function does not depend on
    # a module-level import being present.
    from operator import itemgetter

    import load_alignments
    import uniquify_alignments
    import check_mutation_position
    import correct_alignment_position

    proteins = load_alignments.do(filename)

    # Highest match percentage first.  Sort ascending and reverse (rather
    # than sorted(..., reverse=True)) so that records with equal percentages
    # keep exactly the relative order they had before this change.
    proteins = sorted(proteins, key=itemgetter('match_percentage'))
    proteins.reverse()
    proteins = uniquify_alignments.do(proteins)

    # Only the query record is needed; the second return value is ignored.
    query, _ = util.fetch_query_protein_in_alignments(proteins)

    # Translate the sequence position into an index of the aligned query.
    mod_position = correct_alignment_position.do(query['alignment'], position)
    pattern = query['alignment'][mod_position]
    patterns = check_mutation_position.do(
        [prot['alignment'] for prot in proteins], mod_position)

    print(do(patterns, pattern, changed_pattern))
def do(file1, file2, winsize=1):
    """Debug driver: print the score list for the first usable mutation.

    Args:
        file1: CSV file with one mutation per line; the first five
            comma-separated fields are kept (protein, 1-based position,
            original residue, mutant residue, and a fifth field).
        file2: intended output path. Currently unused because the code
            that writes the output file is disabled.
        winsize: window size; the inner loop runs ``winsize + 1`` times.

    Mutations are skipped when the alignment file is missing, when no
    aligned sequences remain besides the query, or when the position cannot
    be mapped onto the aligned query sequence.
    """
    # fetch all mutations (a row with fewer than five fields raises IndexError)
    mutations = []
    with open(file1) as fp:
        for line in fp:
            parts = line.strip().split(',')
            mutations.append(
                [parts[0], parts[1], parts[2], parts[3], parts[4]])

    for mut in mutations:
        # fetch reqd info
        protein = mut[0]
        position = int(mut[1]) - 1  # 1-based input -> 0-based index
        orig_aa = mut[2]
        mut_aa = mut[3]

        # if protein not aligned - pass this
        alignment_file = '../alignments/%s' % protein
        if not os.path.isfile(alignment_file):
            continue
        print(mut)

        # load the aligned sequences and remove duplicates
        alignments = load_alignments.do(alignment_file)
        alignments = uniquify_alignments.do(alignments)

        # fetch query sequence
        query, alignments = util.fetch_query_protein_in_alignments(alignments)
        if len(alignments) == 0:
            continue

        # fetch actual position of mutation in aligned query sequence
        try:
            actual_pos = correct_alignment_position.do(query['alignment'],
                                                       position)
        except Exception:
            # best effort: show the offending alignment and move on
            print(query['alignment'])
            continue

        o_win_score = 0
        for w in range(winsize + 1):
            # NOTE(review): none of the eight score names below is assigned
            # anywhere in this function as visible here, so this line raises
            # NameError unless they exist at module level - the windowed
            # score computation appears to be missing. Confirm and restore.
            scores = [
                o_score, m_score, o_ps_score, m_ps_score, o_sw_score,
                m_sw_score, o_gf_score, m_gf_score
            ]
        print(scores)
        # stop after the first mutation that reaches this point
        break


def main():
    """Run ``do`` with command-line arguments, or with the test defaults.

    Usage: ``<script> <mutations.csv> <scores.csv> <winsize>``. With no
    arguments the bundled test dataset paths and a window size of 3 are
    used.
    """
    args = sys.argv
    if len(args) > 1:
        if len(args) < 4:
            # Previously this fell through to an IndexError on args[2]/args[3].
            sys.exit('usage: %s <mutations.csv> <scores.csv> <winsize>'
                     % args[0])
        f1 = args[1]  # where mutation info is present
        f2 = args[2]  # where we store the scores
        # argv entries are strings; do() needs an int for range(winsize + 1)
        winsize = int(args[3])
    else:
        f1 = 'datasets/test/unseen_proteins.csv'
        f2 = 'datasets/test/unseen_proteins_freq_scores.csv'
        winsize = 3
    do(f1, f2, winsize)


if __name__ == '__main__':
    main()
def do(file1, file2, nbsize):
    """Write neighbourhood Hamming-distance scores for every mutation.

    Args:
        file1: CSV file with one mutation per line (protein, 1-based
            position, original residue, mutant residue, ...). The rows are
            processed in reverse file order.
        file2: output CSV path. A heading row is always written; one row
            per successfully scored mutation follows.
        nbsize: largest neighbourhood width. Widths 1..nbsize are scored,
            each at every start offset 0..width, with Hamming distances
            0..maxhd where maxhd is 0 for widths up to 2, 1 for width 3 and
            2 for larger widths.

    A mutation is dropped silently when its alignment file is missing, when
    a position cannot be mapped onto the aligned query, when no aligned
    sequence covers a neighbourhood, or when the query itself does not
    cover it.
    """
    # 'B' and 'Z' residue codes are rewritten to 'N' and 'Q' before scoring.
    corrections = {'B': 'N', 'Z': 'Q'}

    def hd_limit(width):
        # largest Hamming distance scored for a neighbourhood width
        if width > 3:
            return 2
        if width > 2:
            return 1
        return 0

    def pick(residues, indices):
        # corrected residues at the given indices; None if an index is
        # outside the sequence
        chosen = []
        for idx in indices:
            try:
                residue = residues[idx]
            except IndexError:
                return None
            chosen.append(corrections.get(residue, residue))
        return chosen

    def neighbourhood_scores(query_seq, sequences, position, mut_aa):
        # all orig/mut scores for one mutation, or None when it must be
        # skipped
        collected = []
        for width in range(1, nbsize + 1):
            limit = hd_limit(width)
            for offset in range(width + 1):
                # offset is where the mutated residue sits inside the window
                first = position - offset
                try:
                    columns = [
                        correct_alignment_position.do(query_seq, first + step)
                        for step in range(width + 1)
                    ]
                except Exception:
                    return None

                # windows of the aligned sequences that cover every column
                windows = []
                for seq in sequences:
                    window = pick(list(seq), columns)
                    if window is not None:
                        windows.append(window)
                if len(windows) == 0:
                    return None

                # the same window of the query, as-is and mutated
                query_window = pick(list(query_seq), columns)
                if query_window is None:
                    return None
                mutant_window = list(query_window)
                mutant_window[offset] = mut_aa

                for dist in range(limit + 1):
                    collected.append(
                        calc_hd_score(windows, query_window, offset, dist))
                    collected.append(
                        calc_hd_score(windows, mutant_window, offset, dist))
        return collected

    with open(file1) as src:
        mutations = [raw.strip().split(',') for raw in src]
    mutations.reverse()

    with open(file2, 'w') as ofp:
        # heading: fixed columns, then one orig/mut pair per combination of
        # width, start offset and Hamming distance
        heading = ['Protein', 'Position', 'Orig_aa', 'Mut_aa', 'Label']
        for width in range(1, nbsize + 1):
            limit = hd_limit(width)
            for offset in range(width + 1):
                for dist in range(limit + 1):
                    heading.append('nbwidth:%s srtpos:%s hd:%s orig'
                                   % (width, -offset, dist))
                    heading.append('nbwidth:%s srtpos:%s hd:%s mut'
                                   % (width, -offset, dist))
        ofp.write(','.join(heading) + '\n')

        for mut in mutations:
            protein = mut[0]
            position = int(mut[1]) - 1  # 1-based input -> 0-based index
            orig_aa = mut[2]
            mut_aa = mut[3]

            alignment_path = '%s/%s' % (aligndir, protein)
            if not os.path.exists(alignment_path):
                continue

            # split off the query record, then prune the remaining records
            loaded = load_alignments.do(alignment_path)
            query, others = util.fetch_query_protein_in_alignments(loaded)
            others = util.prune_proteins_list(others)
            sequences = [record['alignment'] for record in others]

            scores = neighbourhood_scores(query['alignment'], sequences,
                                          position, mut_aa)
            if scores is None:
                continue

            ofp.write(','.join(str(item) for item in chain(mut, scores))
                      + '\n')
orig_aa = mut[2] mut_aa = mut[3] # if protein not aligned - pass this alignment_file = '../alignments/%s' % protein if not os.path.isfile(alignment_file): continue # load the aligned sequences alignments = load_alignments.do(alignment_file) # prune the proteins alignments = util.prune_proteins_list(alignments) # fetch query sequence query, alignments = util.fetch_query_protein_in_alignments(alignments) if len(alignments) == 0: continue # fetch actual position of mutation in aligned query sequence try: actual_pos = correct_alignment_position.do(query['alignment'], position) except Exception as e: print(query['alignment']) continue # fetch the corresponding sequences sequences = [a['alignment'] for a in alignments] # compute frequency of amino acid at desired position
def work(line):
    '''Compute all scores for one mutation and return them with its record.

    Args:
        line: comma-separated mutation info,
              <PROTEIN_ID, POSITION, ORIGINAL_AMINO_ACID, MUTANT_AMINO_ACID>.
              POSITION is 1-based. The original amino acid used for scoring
              is read from the protein's FASTA sequence at that position,
              not from the third field of the line.

    Return Values:
        [] as soon as any directory in align_dirs has no alignment file for
        the protein (scores already gathered from earlier directories are
        discarded). Otherwise a flat list of strings: the fields of the
        line followed, for each alignment directory, by the number of
        loaded proteins plus one, the two shannon entropy scores, the
        von-neumann entropy score and the two sum-of-pairs scores.
    '''
    parts = line.strip().split(',')
    protein = parts[0]
    position = int(parts[1]) - 1  # correction for indexing python lists
    mut_aa = parts[3]
    result = []
    print(protein)

    for alignment_dir in align_dirs:
        alignment_file = os.path.join(alignment_dir, protein)
        if not os.path.isfile(alignment_file):
            return []

        fasta = util.read_sequence('../fasta/%s.fasta' % protein)

        # load, then sort & prune the list of proteins
        proteins = prune_proteins_list(load_alignments.do(alignment_file))

        # fetch the record pertaining to the query protein (the second
        # return value, the list without the query, is not needed here)
        query, _ = util.fetch_query_protein_in_alignments(proteins)
        query_alignment = query['alignment']
        alignments = [prot["alignment"] for prot in proteins]

        result.append(len(proteins) + 1)
        orig_aa = fasta[position]

        # correct the position of query protein wrt to alignment
        mod_pos = correct_alignment_position.do(query_alignment, position)
        aa = check_mutation_position.do(alignments, mod_pos)

        # Every scorer gets its own copy of the list, as before, so one
        # call cannot affect the input of the next.
        # shannon entropy score w/o, then with, sequence weights
        result.append(
            shannon_entropy_score.do(list(alignments), query_alignment,
                                     mod_pos, orig_aa, mut_aa))
        result.append(
            shannon_entropy_score.do(list(alignments), query_alignment,
                                     mod_pos, orig_aa, mut_aa, 1))

        # von-neumann entropy score (the relative-entropy and
        # jensen-shannon divergence scores are currently disabled)
        result.append(von_neumann_entropy_score.do(list(aa), orig_aa, mut_aa))

        # sum-of-pairs scores w/o, then with, sequence weights
        result.append(
            sum_of_pairs_score.do(list(alignments), query_alignment, mod_pos,
                                  orig_aa, mut_aa, 0))
        result.append(
            sum_of_pairs_score.do(list(alignments), query_alignment, mod_pos,
                                  orig_aa, mut_aa, 1))

    # return mutation information and scores for recording in a file
    return [str(item) for item in chain(parts, result)]
def do(file1, file2):
    """Score every mutation in ``file1`` and write one CSV row each to ``file2``.

    Args:
        file1: CSV file with one mutation per line; the first five fields
            are kept (protein, 1-based position, original residue, mutant
            residue, and a fifth field).
        file2: output CSV path, truncated on open. Each row holds the five
            mutation fields followed by thirteen scores: simple frequency
            (orig, mut), pseudo-count (orig, mut), sequence-weighted
            frequency (orig, mut), gapped frequency (orig, mut), the two
            shannon entropy scores, the von-neumann entropy score and the
            two sum-of-pairs scores.

    Mutations are skipped when the alignment file is missing, when no
    aligned sequences remain besides the query, or when the position cannot
    be mapped onto the aligned query sequence.
    """
    # fetch all mutations (a row with fewer than five fields raises IndexError)
    mutations = []
    with open(file1) as fp:
        for line in fp:
            parts = line.strip().split(',')
            mutations.append(
                [parts[0], parts[1], parts[2], parts[3], parts[4]])

    # The context manager closes the output file even if a scorer raises;
    # the handle used to be leaked in that case.
    with open(file2, 'w') as ofp:
        for mut in mutations:
            # fetch reqd info
            protein = mut[0]
            position = int(mut[1]) - 1  # 1-based input -> 0-based index
            orig_aa = mut[2]
            mut_aa = mut[3]

            # if protein not aligned - pass this
            alignment_file = '../alignments/%s' % protein
            if not os.path.isfile(alignment_file):
                continue
            print(mut)

            # load the aligned sequences and prune the proteins
            alignments = load_alignments.do(alignment_file)
            alignments = util.prune_proteins_list(alignments)

            # fetch query sequence
            query, alignments = util.fetch_query_protein_in_alignments(
                alignments)
            if len(alignments) == 0:
                continue
            query_alignment = query['alignment']

            # fetch actual position of mutation in aligned query sequence
            try:
                actual_pos = correct_alignment_position.do(query_alignment,
                                                           position)
            except Exception:
                # best effort: show the offending alignment and move on
                print(query_alignment)
                continue

            # Aligned sequences, built once. Every callee still receives
            # its own copy, as before.
            sequences = [a['alignment'] for a in alignments]

            # residues found at the mutation position
            aa = check_mutation_position.do(list(sequences), actual_pos)

            # simple frequency of original & mutant amino acid
            o_score, m_score = simple_frequency_score.do(
                list(aa), orig_aa, mut_aa)

            # pseudo-counts, in order to account for missing aa
            o_ps_score, m_ps_score = pseudo_count_score.do(
                list(aa), orig_aa, mut_aa)

            # sequence-weighted frequency score
            sequence_weights = landgraf_sequence_weights.do(list(sequences))
            o_sw_score, m_sw_score = simple_frequency_score.do(
                list(aa), orig_aa, mut_aa, sequence_weights)

            # using gap frequencies
            o_gf_score, m_gf_score = gapped_frequency_score.do(
                list(aa), orig_aa, mut_aa, sequence_weights)

            # shannon entropy score w/o, then with, sequence weights
            shannon = shannon_entropy_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa, mut_aa)
            shannon_weighted = shannon_entropy_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa, mut_aa,
                1)

            # von-neumann entropy score
            von_neumann_score = von_neumann_entropy_score.do(
                list(aa), orig_aa, mut_aa)

            # sum-of-pairs scores w/o, then with, sequence weights
            sop = sum_of_pairs_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa, mut_aa,
                0)
            sop_wg = sum_of_pairs_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa, mut_aa,
                1)

            # append all scores together and write to file
            scores = [
                o_score, m_score, o_ps_score, m_ps_score, o_sw_score,
                m_sw_score, o_gf_score, m_gf_score, shannon, shannon_weighted,
                von_neumann_score, sop, sop_wg
            ]
            ofp.write(','.join(str(item) for item in chain(mut, scores))
                      + '\n')
def do(file1, file2, winsize):
    """Write window-based gapped-frequency scores for every mutation.

    Args:
        file1: CSV file with one mutation per line (protein, 1-based
            position, original residue, mutant residue, ...).
        file2: output CSV path. Each row holds the mutation fields followed
            by the mutant score at the mutation position and then, for each
            distance w in 1..winsize, the original-residue score of the
            left neighbour and of the right neighbour.
        winsize: number of neighbours scored on each side.

    Mutations are skipped when the alignment file is missing, when the
    sequence weights cannot be computed (the error is printed), or when any
    position of the window cannot be scored.
    """
    with open(file1) as fp:
        mutations = [line.strip().split(',') for line in fp]

    with open(file2, 'w') as ofp:
        for mut in mutations:
            # single-argument print(...) behaves the same on Python 2 and 3
            print(mut)
            protein = mut[0]
            position = int(mut[1]) - 1  # 1-based input -> 0-based index
            orig_aa = mut[2]
            mut_aa = mut[3]

            # Check for the alignment before touching the FASTA file, so an
            # unaligned protein costs no further I/O.
            alfile = '../alignments/%s' % protein
            if not os.path.isfile(alfile):
                continue
            fasta_seq = util.read_sequence('../fasta/%s.fasta' % protein)

            proteins = load_alignments.do(alfile)
            proteins = util.prune_proteins_list(proteins)
            query, proteins = util.fetch_query_protein_in_alignments(proteins)
            query_seq = query['alignment']
            alignments = [a['alignment'] for a in proteins]

            try:
                sequence_weights = landgraf_sequence_weights.do(alignments)
            except Exception as e:
                print(str(e))
                continue

            scores = []
            try:
                for w in range(winsize + 1):  # 0..winsize, hence the +1
                    if w == 0:
                        # the mutation position itself: keep the mutant score
                        mod_pos = correct_alignment_position.do(
                            query_seq, position)
                        aa = check_mutation_position.do(alignments, mod_pos)
                        o, m = gapped_frequency_score.do(
                            list(aa), orig_aa, mut_aa, sequence_weights)
                        scores.append(m)
                        continue

                    left = position - w
                    if left < 0:
                        # A negative index would silently wrap round to the
                        # end of the FASTA sequence; treat it as out of
                        # range, like a right neighbour past the end.
                        raise IndexError(
                            'left neighbour lies before the sequence start')

                    # left neighbour first, then right neighbour, each
                    # scored against its own residue from the FASTA sequence
                    for neighbour in (left, position + w):
                        mod_pos = correct_alignment_position.do(
                            query_seq, neighbour)
                        aa = check_mutation_position.do(alignments, mod_pos)
                        aa_in_fasta = fasta_seq[neighbour]
                        o, m = gapped_frequency_score.do(
                            list(aa), aa_in_fasta, mut_aa, sequence_weights)
                        scores.append(o)
            except Exception:
                # best effort: a window that cannot be scored completely
                # drops the whole mutation
                continue

            ofp.write(','.join(str(item) for item in chain(mut, scores))
                      + '\n')
def do(file1, file2):
    """Debug driver: print the frequency scores of the first usable mutation.

    Args:
        file1: CSV file with one mutation per line; the first five fields
            are kept (protein, 1-based position, original residue, mutant
            residue, and a fifth field).
        file2: intended output path. It is not used: nothing is written to
            a file, the scores are only printed.

    Mutations are skipped when the alignment file is missing, when no
    aligned sequences remain besides the query, or when the position cannot
    be mapped onto the aligned query sequence. Processing stops after the
    first mutation whose scores were printed.
    """
    # read the mutation records (fewer than five fields raises IndexError)
    records = []
    with open(file1) as handle:
        for raw in handle:
            fields = raw.strip().split(',')
            records.append([fields[idx] for idx in range(5)])

    for record in records:
        protein = record[0]
        position = int(record[1]) - 1  # 1-based input -> 0-based index
        orig_aa, mut_aa = record[2], record[3]

        # nothing to do for proteins without an alignment file
        alignment_file = '../alignments/%s' % protein
        if not os.path.isfile(alignment_file):
            continue
        print(record)

        # load the alignment and drop duplicate sequences
        alignments = uniquify_alignments.do(load_alignments.do(alignment_file))

        # separate the query record from the other aligned records
        query, alignments = util.fetch_query_protein_in_alignments(alignments)
        if len(alignments) < 1:
            continue

        # map the sequence position onto the aligned query sequence
        try:
            actual_pos = correct_alignment_position.do(query['alignment'],
                                                       position)
        except Exception:
            print(query['alignment'])
            continue

        # residues found at the mutation position
        aa = check_mutation_position.do(
            [entry['alignment'] for entry in alignments], actual_pos)

        # plain and pseudo-count frequencies of original & mutant residue
        o_score, m_score = simple_frequency_score.do(aa, orig_aa, mut_aa)
        o_ps_score, m_ps_score = pseudo_count_score.do(aa, orig_aa, mut_aa)

        # sequence-weighted variants: simple frequency, then gapped frequency
        sequence_weights = landgraf_sequence_weights.do(
            [entry['alignment'] for entry in alignments])
        o_sw_score, m_sw_score = simple_frequency_score.do(
            aa, orig_aa, mut_aa, sequence_weights)
        o_gf_score, m_gf_score = gapped_frequency_score.do(
            aa, orig_aa, mut_aa, sequence_weights)

        print([
            o_score, m_score, o_ps_score, m_ps_score, o_sw_score, m_sw_score,
            o_gf_score, m_gf_score
        ])
        # only the first fully scored mutation is reported
        break