def _sop_with_query(sequences, query, position, SW):
    """Score the column at `position` after adding `query` to the sequences.

    Works on a private copy so the caller's `sequences` list is untouched.
    """
    augmented = list(sequences)
    augmented.append(query)
    patterns = check_mutation_position.do(augmented, position)
    if SW:
        weights = landgraf_sequence_weights.do(augmented)
    else:
        # No sequence weighting requested: every sequence counts as 1.
        weights = list(np.ones(len(augmented)))
    return calculate_sop(patterns, weights)


def do(sequences, query_seq, position, pattern, changed_pattern, SW=0):
    """Return the change in `calculate_sop` score caused by a substitution.

    The score is computed twice: once with `query_seq` added to `sequences`,
    and once with a copy of `query_seq` in which `util.strsub` has placed
    `changed_pattern` at `position`.

    Args:
        sequences: aligned sequences, excluding the query. Not modified.
        query_seq: the aligned query sequence.
        position: index of the column being scored.
        pattern: unused; kept so existing callers keep working.
        changed_pattern: replacement handed to `util.strsub`.
        SW: if truthy, weights come from `landgraf_sequence_weights.do`;
            otherwise every sequence gets weight 1.

    Returns:
        Score with the substituted query minus score with the original query.
    """
    # case 1: original query sequence
    original_score = _sop_with_query(sequences, query_seq, position, SW)

    # case 2: query sequence carrying the substitution
    mutant_seq = util.strsub(query_seq, position, changed_pattern)
    mutant_score = _sop_with_query(sequences, mutant_seq, position, SW)

    return mutant_score - original_score
def _column_scores(query_seq, alignments, position, reference_aa, mut_aa,
                   sequence_weights):
    """Return the (o, m) pair from `gapped_frequency_score.do` for one position."""
    mod_pos = correct_alignment_position.do(query_seq, position)
    aa = check_mutation_position.do(alignments, mod_pos)
    return gapped_frequency_score.do(list(aa), reference_aa, mut_aa,
                                     sequence_weights)


def _window_scores(query_seq, alignments, fasta_seq, position, orig_aa, mut_aa,
                   sequence_weights, winsize):
    """Collect the scores for `position` and its neighbours up to `winsize` away.

    Raises whatever the scoring helpers raise, and IndexError when the window
    reaches outside `fasta_seq` on either side.
    """
    scores = []
    for w in range(winsize + 1):  # 0,1,2,3 for winsize=3, hence the +1
        if w == 0:
            # The mutated position itself contributes the mutant score.
            _, m = _column_scores(query_seq, alignments, position, orig_aa,
                                  mut_aa, sequence_weights)
            scores.append(m)
            continue

        left = position - w
        if left < 0:
            # A negative index would silently wrap around to the end of the
            # sequence; treat it as out of range like the right-hand side.
            raise IndexError(
                'window position %d is before the start of the sequence' % left)
        # left neighbour at distance w from the mutation position
        o, _ = _column_scores(query_seq, alignments, left, fasta_seq[left],
                              mut_aa, sequence_weights)
        scores.append(o)

        right = position + w
        # right neighbour at distance w from the mutation position
        o, _ = _column_scores(query_seq, alignments, right, fasta_seq[right],
                              mut_aa, sequence_weights)
        scores.append(o)
    return scores


def do(file1, file2, winsize):
    """Write windowed gapped-frequency scores for the mutations in `file1`.

    Each line of `file1` is split on commas; fields 0-3 are read as protein
    name, position, original residue and mutant residue. The position is
    reduced by one before being used as an index (presumably 1-based in the
    file -- confirm against the data).

    For every mutation that can be scored, one comma-separated line is written
    to `file2`: all input fields, then the mutant score at the mutation
    position, then for each distance 1..winsize the original-residue score of
    the left and of the right neighbour.

    A mutation is skipped (nothing written) when its alignment file is
    missing, when the sequence weights cannot be computed, or when any
    position in the window cannot be scored. The reason is printed for the
    latter two.

    Args:
        file1: path of the input mutation list.
        file2: path of the output file (overwritten).
        winsize: number of neighbours scored on each side of the mutation.
    """
    with open(file1) as fp:
        mutations = [line.strip().split(',') for line in fp]

    with open(file2, 'w') as ofp:
        for mut in mutations:
            print(mut)
            protein = mut[0]
            position = int(mut[1]) - 1
            orig_aa = mut[2]
            mut_aa = mut[3]
            fasta_seq = util.read_sequence('../fasta/%s.fasta' % protein)
            alfile = '../alignments/%s' % protein
            if not os.path.isfile(alfile):
                continue

            proteins = load_alignments.do(alfile)
            proteins = util.prune_proteins_list(proteins)
            query, proteins = util.fetch_query_protein_in_alignments(proteins)
            query_seq = query['alignment']
            alignments = [a['alignment'] for a in proteins]

            try:
                sequence_weights = landgraf_sequence_weights.do(alignments)
            except Exception as e:
                print(str(e))
                continue

            try:
                scores = _window_scores(query_seq, alignments, fasta_seq,
                                        position, orig_aa, mut_aa,
                                        sequence_weights, winsize)
            except Exception as e:
                # Best effort: an unscorable window drops the whole mutation,
                # but say why instead of skipping silently.
                print(str(e))
                continue

            ofp.write(','.join(str(item) for item in chain(mut, scores)) + '\n')
def do(file1, file2):
    """Score every mutation listed in `file1` and write the results to `file2`.

    Each line of `file1` must have at least five comma-separated fields; the
    first four are read as protein name, position, original residue and mutant
    residue. The position is reduced by one before use (presumably 1-based in
    the file -- confirm against the data).

    For each mutation that can be scored, one comma-separated line is written:
    the five input fields followed by thirteen scores, in this order: simple
    frequency (original, mutant), pseudo-count (original, mutant),
    sequence-weighted frequency (original, mutant), gapped frequency
    (original, mutant), Shannon entropy, weighted Shannon entropy, von Neumann
    entropy, sum-of-pairs, weighted sum-of-pairs.

    A mutation is skipped when its alignment file is missing, when no aligned
    sequences remain after the query is removed, or when the position cannot
    be mapped onto the aligned query.

    Args:
        file1: path of the input mutation list.
        file2: path of the output file (overwritten).
    """
    # fetch all mutations
    mutations = []
    with open(file1) as fp:
        for line in fp:
            parts = line.strip().split(',')
            mutations.append(
                [parts[0], parts[1], parts[2], parts[3], parts[4]])

    # The with-block guarantees the output is flushed and closed even if one
    # of the scoring helpers raises part-way through.
    with open(file2, 'w') as ofp:
        for mut in mutations:
            # fetch reqd info
            protein = mut[0]
            position = int(mut[1]) - 1
            orig_aa = mut[2]
            mut_aa = mut[3]

            # if protein not aligned - pass this
            alignment_file = '../alignments/%s' % protein
            if not os.path.isfile(alignment_file):
                continue
            print(mut)

            # load the aligned sequences and prune them
            alignments = load_alignments.do(alignment_file)
            alignments = util.prune_proteins_list(alignments)

            # separate the query sequence from the rest
            query, alignments = util.fetch_query_protein_in_alignments(
                alignments)
            if len(alignments) == 0:
                continue
            query_seq = query['alignment']

            # fetch actual position of mutation in aligned query sequence
            try:
                actual_pos = correct_alignment_position.do(query_seq, position)
            except Exception:
                print(query_seq)
                continue

            # fetch the corresponding sequences; each helper below receives
            # its own copy in case it modifies the list it is given
            sequences = [a['alignment'] for a in alignments]

            # residues found at the desired position
            aa = check_mutation_position.do(list(sequences), actual_pos)

            # simple frequency of original & mutant amino acid
            o_score, m_score = simple_frequency_score.do(
                list(aa), orig_aa, mut_aa)

            # pseudo-counts, in order to account for missing aa
            o_ps_score, m_ps_score = pseudo_count_score.do(
                list(aa), orig_aa, mut_aa)

            # simple sequence-weighted frequency score
            sequence_weights = landgraf_sequence_weights.do(list(sequences))
            o_sw_score, m_sw_score = simple_frequency_score.do(
                list(aa), orig_aa, mut_aa, sequence_weights)

            # using gap frequencies
            o_gf_score, m_gf_score = gapped_frequency_score.do(
                list(aa), orig_aa, mut_aa, sequence_weights)

            # shannon entropy score w/o sequence weights
            shannon = shannon_entropy_score.do(
                list(sequences), query_seq, actual_pos, orig_aa, mut_aa)

            # shannon entropy score with sequence weights
            shannon_weighted = shannon_entropy_score.do(
                list(sequences), query_seq, actual_pos, orig_aa, mut_aa, 1)

            # von-neumann entropy score
            von_neumann_score = von_neumann_entropy_score.do(
                list(aa), orig_aa, mut_aa)

            # sum-of-pairs scores, without and with sequence weights
            sop = sum_of_pairs_score.do(
                list(sequences), query_seq, actual_pos, orig_aa, mut_aa, 0)
            sop_wg = sum_of_pairs_score.do(
                list(sequences), query_seq, actual_pos, orig_aa, mut_aa, 1)

            # append all scores together and write to file
            scores = [
                o_score, m_score, o_ps_score, m_ps_score, o_sw_score,
                m_sw_score, o_gf_score, m_gf_score, shannon, shannon_weighted,
                von_neumann_score, sop, sop_wg
            ]
            ofp.write(','.join(str(item) for item in chain(mut, scores)) + '\n')
def do(file1, file2):
    """Debug run: print the frequency scores of the first scorable mutation.

    Reads the mutation list from `file1` (at least five comma-separated fields
    per line), then walks it until one mutation has been scored. That
    mutation's eight scores are printed and the loop stops.

    Mutations whose alignment file is missing, whose alignment has no
    sequences besides the query, or whose position cannot be mapped onto the
    aligned query are passed over.

    Args:
        file1: path of the input mutation list.
        file2: accepted but unused -- file output is switched off here.
    """
    records = []
    with open(file1) as handle:
        for raw in handle:
            fields = raw.strip().split(',')
            records.append([fields[idx] for idx in range(5)])

    for record in records:
        # unpack the fields that are needed
        protein = record[0]
        position = int(record[1]) - 1
        orig_aa = record[2]
        mut_aa = record[3]

        # proteins without an alignment file are passed over
        aligned_path = '../alignments/%s' % protein
        if not os.path.isfile(aligned_path):
            continue
        print(record)

        # load the alignment and drop duplicate sequences
        entries = uniquify_alignments.do(load_alignments.do(aligned_path))

        # split the query off from the remaining aligned sequences
        query, entries = util.fetch_query_protein_in_alignments(entries)
        if len(entries) == 0:
            continue

        # map the mutation position onto the aligned query sequence
        try:
            column_index = correct_alignment_position.do(query['alignment'],
                                                         position)
        except Exception:
            print(query['alignment'])
            continue

        # residues observed at that position
        column = check_mutation_position.do(
            [entry['alignment'] for entry in entries], column_index)

        # plain frequency of the original & mutant residue
        o_score, m_score = simple_frequency_score.do(column, orig_aa, mut_aa)

        # pseudo-count variant, to account for residues never observed
        o_ps_score, m_ps_score = pseudo_count_score.do(column, orig_aa, mut_aa)

        # sequence-weighted frequency
        weights = landgraf_sequence_weights.do(
            [entry['alignment'] for entry in entries])
        o_sw_score, m_sw_score = simple_frequency_score.do(
            column, orig_aa, mut_aa, weights)

        # gap-aware frequency
        o_gf_score, m_gf_score = gapped_frequency_score.do(
            column, orig_aa, mut_aa, weights)

        # show everything for this mutation, then stop
        scores = [
            o_score, m_score, o_ps_score, m_ps_score, o_sw_score, m_sw_score,
            o_gf_score, m_gf_score
        ]
        print(scores)
        break