예제 #1
0
def main():
    """Score one hard-coded substitution against a single alignment file.

    Loads the alignment stored at ``../alignments/Q92481``, orders the
    records by descending ``match_percentage``, removes duplicates, and
    prints the result of ``do`` for the residue found at query position 72
    versus the replacement residue ``'R'``.
    """
    import load_alignments
    import uniquify_alignments
    import check_mutation_position
    import correct_alignment_position

    filename = '../alignments/Q92481'
    position = 72
    changed_pattern = 'R'

    # Ascending sort followed by a full reversal (not ``reverse=True``) so
    # that records with equal percentages keep the same relative order as
    # before.
    by_match = sorted(load_alignments.do(filename),
                      key=itemgetter('match_percentage'))
    proteins = uniquify_alignments.do(by_match[::-1])

    # Only the query record is needed here; the second return value is
    # ignored.
    query, _ = util.fetch_query_protein_in_alignments(proteins)
    query_alignment = query['alignment']

    mod_position = correct_alignment_position.do(query_alignment, position)
    pattern = query_alignment[mod_position]

    aligned = [record['alignment'] for record in proteins]
    patterns = check_mutation_position.do(aligned, mod_position)

    print(do(patterns, pattern, changed_pattern))
예제 #2
0
def do(file1, file2, winsize=1):
    """Print the frequency scores of the first mutation that can be scored.

    Each line of ``file1`` is a comma-separated record whose first five
    fields are: protein id, 1-based position, original residue, mutant
    residue and a fifth field that is carried along unchanged. For every
    record the alignment file ``../alignments/<protein id>`` is looked up;
    records without an alignment file, without any aligned sequence besides
    the query, or whose position cannot be mapped are skipped. The first
    remaining record is printed together with its eight scores, then the
    function stops.

    Args:
        file1: path of the CSV file holding the mutation records.
        file2: intended output path. Writing is disabled, so it is unused.
        winsize: accepted for interface compatibility only.
            NOTE(review): the windowed score this parameter was meant to
            drive was never implemented (the original loop had no body);
            confirm the intended computation before relying on it.

    Returns:
        None. Results are printed to standard output.

    Raises:
        IndexError: if a line of ``file1`` has fewer than five fields.
        ValueError: if the position field is not an integer.
    """
    # fetch all mutations
    mutations = []
    with open(file1) as fp:
        for line in fp:
            parts = line.strip().split(',')
            mutations.append(
                [parts[0], parts[1], parts[2], parts[3], parts[4]])

    for mut in mutations:
        # fetch reqd info
        protein = mut[0]
        position = int(mut[1]) - 1  # 1-based in the file, 0-based here
        orig_aa = mut[2]
        mut_aa = mut[3]

        # if protein not aligned - pass this
        alignment_file = '../alignments/%s' % protein
        if not os.path.isfile(alignment_file):
            continue

        print(mut)

        # load the aligned sequences
        alignments = load_alignments.do(alignment_file)

        # remove duplicate sequences in aligned sequences
        alignments = uniquify_alignments.do(alignments)

        # fetch query sequence
        query, alignments = util.fetch_query_protein_in_alignments(alignments)
        if len(alignments) == 0:
            continue

        # fetch actual position of mutation in aligned query sequence
        try:
            actual_pos = correct_alignment_position.do(query['alignment'],
                                                       position)
        except Exception:
            # Show the offending query alignment and move on to the next
            # record instead of aborting the whole run.
            print(query['alignment'])
            continue

        # residues found at the mutation position across the alignments
        aa = check_mutation_position.do([k['alignment'] for k in alignments],
                                        actual_pos)

        # simple frequency of original & mutant amino acid
        o_score, m_score = simple_frequency_score.do(aa, orig_aa, mut_aa)

        # score using pseudo-counts in order to account for missing aa
        o_ps_score, m_ps_score = pseudo_count_score.do(aa, orig_aa, mut_aa)

        # sequence-weighted frequency score
        sequence_weights = landgraf_sequence_weights.do(
            [a['alignment'] for a in alignments])
        o_sw_score, m_sw_score = simple_frequency_score.do(
            aa, orig_aa, mut_aa, sequence_weights)

        # using gap frequencies
        o_gf_score, m_gf_score = gapped_frequency_score.do(
            aa, orig_aa, mut_aa, sequence_weights)

        scores = [
            o_score, m_score, o_ps_score, m_ps_score, o_sw_score, m_sw_score,
            o_gf_score, m_gf_score
        ]
        print(scores)

        # Debug behaviour kept from the original: stop after the first
        # record that was scored.
        break

def main():
    """Command-line entry point: run ``do`` on the given or default files.

    Usage: ``<script> MUTATIONS_CSV SCORES_CSV WINSIZE``. With no arguments
    the bundled test dataset paths and a window size of 3 are used.

    Raises:
        SystemExit: if some, but not all three, arguments are supplied.
        ValueError: if WINSIZE is not an integer.
    """
    args = sys.argv

    if len(args) == 1:
        f1 = 'datasets/test/unseen_proteins.csv'
        f2 = 'datasets/test/unseen_proteins_freq_scores.csv'
        winsize = 3
    elif len(args) > 3:
        f1 = args[1]  # where mutation info is present
        f2 = args[2]  # where we store the scores
        # argv entries are strings; the window size is used arithmetically,
        # so convert it here.
        winsize = int(args[3])
    else:
        # A partial argument list used to fail with a bare IndexError.
        raise SystemExit(
            'usage: %s MUTATIONS_CSV SCORES_CSV WINSIZE' % args[0])

    do(f1, f2, winsize)


if __name__ == '__main__':
    main()
예제 #3
0
def _nb_max_hamming_distance(n):
    """Return the largest Hamming distance scored for neighbourhood width n."""
    if n > 3:
        return 2
    if n > 2:
        return 1
    return 0


def _nb_normalise_residue(residue):
    """Replace the residue codes 'B' and 'Z' by 'N' and 'Q'; keep the rest."""
    if residue == 'B':
        return 'N'
    if residue == 'Z':
        return 'Q'
    return residue


def _nb_heading(nbsize):
    """Build the CSV header: five mutation columns, then one orig/mut column
    pair per (width, start offset, Hamming distance) combination."""
    heading = ['Protein', 'Position', 'Orig_aa', 'Mut_aa', 'Label']
    for n in range(1, nbsize + 1):
        maxhd = _nb_max_hamming_distance(n)
        for i in range(n + 1):
            for k in range(maxhd + 1):
                heading.append('nbwidth:%s srtpos:%s hd:%s orig' % (n, -i, k))
                heading.append('nbwidth:%s srtpos:%s hd:%s mut' % (n, -i, k))
    return heading


def _nb_scores(query_seq, sequences, position, mut_aa, nbsize):
    """Return the flat score list for one mutation, or None if any window
    cannot be built (position not mappable, query too short, or no aligned
    sequence covers the window)."""
    scores = []
    for n in range(1, nbsize + 1):
        maxhd = _nb_max_hamming_distance(n)

        for i in range(n + 1):
            # i is the index of the mutation position inside the window
            start_pos = position - i

            # window positions, each passed through
            # correct_alignment_position.do
            try:
                nb_pos = [
                    correct_alignment_position.do(query_seq, start_pos + j)
                    for j in range(n + 1)
                ]
            except Exception:
                return None

            # window contents of every aligned sequence that covers it
            seq_parts = []
            for seq in sequences:
                try:
                    row = [_nb_normalise_residue(seq[pos]) for pos in nb_pos]
                except IndexError:
                    continue  # this sequence is too short for the window
                seq_parts.append(row)

            # This guard used to sit inside the loop above, right after the
            # append, where it could never be true.
            if not seq_parts:
                return None

            # window contents of the query sequence
            try:
                query_seq_part = [
                    _nb_normalise_residue(query_seq[pos]) for pos in nb_pos
                ]
            except IndexError:
                return None

            # same window with the mutant residue substituted
            mut_query_seq_part = list(query_seq_part)
            mut_query_seq_part[i] = mut_aa

            for k in range(maxhd + 1):
                scores.append(
                    calc_hd_score(seq_parts, query_seq_part, i, k))
                scores.append(
                    calc_hd_score(seq_parts, mut_query_seq_part, i, k))

    return scores


def do(file1, file2, nbsize):
    """Write neighbourhood Hamming-distance scores for a list of mutations.

    ``file1`` holds comma-separated mutation records (protein id, 1-based
    position, original residue, mutant residue, ...). Records are processed
    in reverse file order. For each record whose alignment file exists under
    the module-level ``aligndir``, every window of width 1..``nbsize`` that
    contains the mutation position is scored with ``calc_hd_score`` for the
    original and the mutated query window. A record is dropped entirely when
    any of its windows cannot be built.

    Args:
        file1: path of the input CSV file; blank lines are ignored.
        file2: path of the output CSV file (overwritten). The first line is
            a header; each following line is the input record followed by
            its scores.
        nbsize: largest neighbourhood width to evaluate.

    Returns:
        None.
    """
    mutations = []
    with open(file1) as fp:
        for line in fp:
            line = line.strip()
            if line:
                mutations.append(line.split(','))

    mutations.reverse()

    with open(file2, 'w') as ofp:
        ofp.write(','.join(_nb_heading(nbsize)) + '\n')

        for mut in mutations:
            protein = mut[0]
            position = int(mut[1]) - 1  # 1-based in the file, 0-based here
            mut_aa = mut[3]

            alignment_path = '%s/%s' % (aligndir, protein)
            if not os.path.exists(alignment_path):
                continue

            # fetch alignments, and query protein
            alignments = load_alignments.do(alignment_path)
            query, alignments = util.fetch_query_protein_in_alignments(
                alignments)
            alignments = util.prune_proteins_list(alignments)

            sequences = [a['alignment'] for a in alignments]

            scores = _nb_scores(query['alignment'], sequences, position,
                                mut_aa, nbsize)
            if scores is None:
                continue

            ofp.write(','.join(
                [str(item) for item in chain.from_iterable([mut, scores])]
            ) + '\n')
예제 #4
0
        orig_aa = mut[2]
        mut_aa = mut[3]

        # if protein not aligned - pass this
        alignment_file = '../alignments/%s' % protein
        if not os.path.isfile(alignment_file):
            continue

        # load the aligned sequences
        alignments = load_alignments.do(alignment_file)

        # prune the proteins
        alignments = util.prune_proteins_list(alignments)

        # fetch query sequence
        query, alignments = util.fetch_query_protein_in_alignments(alignments)
        if len(alignments) == 0:
            continue

        # fetch actual position of mutation in aligned query sequence
        try:
            actual_pos = correct_alignment_position.do(query['alignment'],
                                                       position)
        except Exception as e:
            print(query['alignment'])
            continue

        # fetch the corresponding sequences
        sequences = [a['alignment'] for a in alignments]

        # compute frequency of amino acid at desired position
예제 #5
0
def work(line):
    """Compute the entropy and sum-of-pairs scores for one mutation record.

    Args:
        line: comma-separated mutation record of the form
            ``PROTEIN_ID,POSITION,ORIGINAL_AMINO_ACID,MUTANT_AMINO_ACID``
            where POSITION is 1-based.

    Returns:
        A flat list of strings: the fields of ``line`` followed by, for each
        directory in the module-level ``align_dirs``, the number of aligned
        proteins plus one and five scores (Shannon entropy without and with
        sequence weights, von Neumann entropy, sum-of-pairs without and with
        sequence weights). An empty list is returned as soon as the
        protein's alignment file is missing from one of the directories.
    """
    parts = line.strip().split(',')

    protein = parts[0]
    position = int(parts[1]) - 1  # correction for indexing python lists
    mut_aa = parts[3]

    result = []

    print(protein)

    for alignment_dir in align_dirs:

        alignment_file = os.path.join(alignment_dir, protein)
        if not os.path.isfile(alignment_file):
            # One missing alignment discards the scores gathered so far too.
            return []

        fasta = util.read_sequence('../fasta/%s.fasta' % protein)

        # load, then sort & prune the list of proteins
        proteins = prune_proteins_list(load_alignments.do(alignment_file))

        # Only the query record is used; the second return value (the list
        # without the query) is not needed.
        query, _ = util.fetch_query_protein_in_alignments(proteins)
        query_alignment = query['alignment']

        alignments = [prot["alignment"] for prot in proteins]

        result.append(len(proteins) + 1)

        # The original residue is read from the FASTA sequence, not from the
        # third field of the record.
        orig_aa = fasta[position]
        # correct the position of query protein wrt to alignment
        mod_pos = correct_alignment_position.do(query_alignment, position)
        aa = check_mutation_position.do(alignments, mod_pos)

        # Each scorer receives its own copy of the list it is handed.
        # shannon entropy score w/o sequence weights
        result.append(
            shannon_entropy_score.do(list(alignments), query_alignment,
                                     mod_pos, orig_aa, mut_aa))
        # shannon entropy score with sequence weights
        result.append(
            shannon_entropy_score.do(list(alignments), query_alignment,
                                     mod_pos, orig_aa, mut_aa, 1))

        # von-neumann entropy score
        result.append(von_neumann_entropy_score.do(list(aa), orig_aa, mut_aa))

        # sum-of-pairs scores
        result.append(
            sum_of_pairs_score.do(list(alignments), query_alignment,
                                  mod_pos, orig_aa, mut_aa, 0))  # wo seq wg
        result.append(
            sum_of_pairs_score.do(list(alignments), query_alignment,
                                  mod_pos, orig_aa, mut_aa, 1))  # w seq wg

    # return mutation information and scores for recording in a file
    return [str(item) for item in chain.from_iterable([parts, result])]
def do(file1, file2):
    """Score every mutation in ``file1`` and write the results to ``file2``.

    Each non-blank line of ``file1`` is a comma-separated record whose first
    five fields are: protein id, 1-based position, original residue, mutant
    residue and a fifth field that is copied through. Records are skipped
    when ``../alignments/<protein id>`` does not exist, when no aligned
    sequence remains besides the query, or when the position cannot be
    mapped onto the query alignment.

    Every written line holds the five input fields followed by thirteen
    scores: simple frequency (orig, mut), pseudo-count (orig, mut),
    sequence-weighted frequency (orig, mut), gapped frequency (orig, mut),
    Shannon entropy without and with sequence weights, von Neumann entropy,
    and sum-of-pairs without and with sequence weights.

    Args:
        file1: path of the input CSV file.
        file2: path of the output CSV file (overwritten).

    Returns:
        None.

    Raises:
        IndexError: if a non-blank line has fewer than five fields.
        ValueError: if the position field is not an integer.
    """
    # fetch all mutations
    mutations = []
    with open(file1) as fp:
        for line in fp:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a record.
                continue
            parts = line.split(',')
            mutations.append([parts[idx] for idx in range(5)])

    # The context manager closes the output even if a scorer raises.
    with open(file2, 'w') as ofp:
        for mut in mutations:
            # fetch reqd info
            protein = mut[0]
            position = int(mut[1]) - 1  # 1-based in the file, 0-based here
            orig_aa = mut[2]
            mut_aa = mut[3]

            # if protein not aligned - pass this
            alignment_file = '../alignments/%s' % protein
            if not os.path.isfile(alignment_file):
                continue

            print(mut)

            # load the aligned sequences
            alignments = load_alignments.do(alignment_file)

            # prune the proteins
            alignments = util.prune_proteins_list(alignments)

            # fetch query sequence
            query, alignments = util.fetch_query_protein_in_alignments(
                alignments)
            if len(alignments) == 0:
                continue

            query_alignment = query['alignment']

            # fetch actual position of mutation in aligned query sequence
            try:
                actual_pos = correct_alignment_position.do(query_alignment,
                                                           position)
            except Exception:
                # Show the offending query alignment and keep going with the
                # remaining records.
                print(query_alignment)
                continue

            # fetch the corresponding sequences
            sequences = [a['alignment'] for a in alignments]

            # Every scorer below is handed its own copy of the list it
            # receives, as before.
            aa = check_mutation_position.do(list(sequences), actual_pos)

            # simple frequency of original & mutant amino acid
            o_score, m_score = simple_frequency_score.do(
                list(aa), orig_aa, mut_aa)

            # pseudo-counts account for amino acids missing in the column
            o_ps_score, m_ps_score = pseudo_count_score.do(
                list(aa), orig_aa, mut_aa)

            # sequence-weighted frequency score
            sequence_weights = landgraf_sequence_weights.do(list(sequences))
            o_sw_score, m_sw_score = simple_frequency_score.do(
                list(aa), orig_aa, mut_aa, sequence_weights)
            # using gap frequencies
            o_gf_score, m_gf_score = gapped_frequency_score.do(
                list(aa), orig_aa, mut_aa, sequence_weights)

            # shannon entropy score w/o sequence weights
            shannon = shannon_entropy_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa, mut_aa)
            # shannon entropy score with sequence weights
            shannon_weighted = shannon_entropy_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa,
                mut_aa, 1)

            # von-neumann entropy score
            von_neumann_score = von_neumann_entropy_score.do(
                list(aa), orig_aa, mut_aa)

            # sum-of-pairs scores
            sop = sum_of_pairs_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa,
                mut_aa, 0)  # wo seq wg
            sop_wg = sum_of_pairs_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa,
                mut_aa, 1)  # w seq wg

            # append all scores together and write to file
            scores = [
                o_score, m_score, o_ps_score, m_ps_score, o_sw_score,
                m_sw_score, o_gf_score, m_gf_score, shannon,
                shannon_weighted, von_neumann_score, sop, sop_wg
            ]

            ofp.write(','.join(
                [str(item) for item in chain.from_iterable([mut, scores])]
            ) + '\n')
예제 #7
0
def _window_site_score(query_seq, alignments, index, residue, mut_aa,
                       sequence_weights):
    """Return the (orig, mut) gapped-frequency pair for one query index.

    ``index`` is passed through ``correct_alignment_position.do`` and the
    result is used to pull the alignment column that gets scored.
    """
    mod_pos = correct_alignment_position.do(query_seq, index)
    aa = check_mutation_position.do(alignments, mod_pos)
    return gapped_frequency_score.do(list(aa), residue, mut_aa,
                                     sequence_weights)


def do(file1, file2, winsize):
    """Write gapped-frequency scores for each mutation and its neighbours.

    ``file1`` holds comma-separated mutation records (protein id, 1-based
    position, original residue, mutant residue, ...). For every record the
    output line consists of the input fields followed by: the mutant score
    at the mutation position, then, for each distance w = 1..``winsize``,
    the original-residue score of the left and of the right neighbour.
    Neighbour residues are read from ``../fasta/<protein id>.fasta``.

    A record is skipped when its alignment file is missing, when the
    sequence weights cannot be computed, or when any position in the window
    cannot be scored (including neighbours that fall off either end of the
    sequence).

    Args:
        file1: path of the input CSV file; blank lines are ignored.
        file2: path of the output CSV file (overwritten).
        winsize: number of neighbours to score on each side (int).

    Returns:
        None.
    """
    mutations = []
    with open(file1) as fp:
        for line in fp:
            line = line.strip()
            if line:
                mutations.append(line.split(','))

    with open(file2, 'w') as ofp:
        for mut in mutations:

            print(mut)

            protein = mut[0]
            position = int(mut[1]) - 1  # 1-based in the file, 0-based here
            orig_aa = mut[2]
            mut_aa = mut[3]

            fasta_seq = util.read_sequence('../fasta/%s.fasta' % protein)

            alfile = '../alignments/%s' % protein
            if not os.path.isfile(alfile):
                continue

            proteins = load_alignments.do(alfile)
            proteins = util.prune_proteins_list(proteins)
            query, proteins = util.fetch_query_protein_in_alignments(proteins)
            query_seq = query['alignment']

            alignments = [a['alignment'] for a in proteins]
            try:
                sequence_weights = landgraf_sequence_weights.do(alignments)
            except Exception as e:
                print(str(e))
                continue

            scores = []
            try:
                # 0,1,2,3 for winsize=3, hence the +1
                for w in range(winsize + 1):
                    if w == 0:
                        # mutation position itself: keep the mutant score
                        _, m = _window_site_score(
                            query_seq, alignments, position, orig_aa,
                            mut_aa, sequence_weights)
                        scores.append(m)
                        continue

                    # left neighbour first, then right neighbour, each at
                    # distance w: keep the original-residue score
                    for index in (position - w, position + w):
                        if index < 0:
                            # A negative index would silently wrap around to
                            # the end of the sequence; treat it like running
                            # off the right end instead.
                            raise IndexError(
                                'neighbour before start of sequence')
                        o, _ = _window_site_score(
                            query_seq, alignments, index, fasta_seq[index],
                            mut_aa, sequence_weights)
                        scores.append(o)
            except Exception:
                # Best effort: any failure inside the window drops the
                # whole record rather than writing a partial row.
                continue

            ofp.write(','.join([
                str(item) for item in chain.from_iterable([mut, scores])
            ]) + '\n')
def do(file1, file2):
    """Print the eight frequency scores of the first scorable mutation.

    Reads comma-separated records from ``file1`` (protein id, 1-based
    position, original residue, mutant residue, fifth field), then walks
    them in order. Records are skipped when ``../alignments/<protein id>``
    is missing, when only the query remains after de-duplication, or when
    the position cannot be mapped. The first record that survives is
    printed with its scores and the function returns.

    ``file2`` is accepted but never opened: output to file is switched off.
    """
    # Load every record up front; a line with fewer than five fields raises
    # IndexError here.
    mutations = []
    with open(file1) as handle:
        for raw in handle:
            fields = raw.strip().split(',')
            mutations.append([fields[idx] for idx in range(5)])

    for record in mutations:
        protein, orig_aa, mut_aa = record[0], record[2], record[3]
        position = int(record[1]) - 1

        # Proteins without an alignment file are passed over.
        alignment_file = '../alignments/%s' % protein
        if not os.path.isfile(alignment_file):
            continue

        print(record)

        # Load the alignment and drop duplicate sequences.
        unique = uniquify_alignments.do(load_alignments.do(alignment_file))

        # Separate the query from the remaining aligned proteins.
        query, others = util.fetch_query_protein_in_alignments(unique)
        if len(others) == 0:
            continue

        # Map the mutation position onto the aligned query sequence.
        try:
            actual_pos = correct_alignment_position.do(query['alignment'],
                                                       position)
        except Exception:
            print(query['alignment'])
            continue

        # Residues observed at that position across the alignments.
        column = check_mutation_position.do(
            [entry['alignment'] for entry in others], actual_pos)

        # Plain frequency, then pseudo-count frequency.
        o_score, m_score = simple_frequency_score.do(column, orig_aa, mut_aa)
        o_ps_score, m_ps_score = pseudo_count_score.do(column, orig_aa,
                                                       mut_aa)

        # Sequence-weighted frequency, then the gap-aware variant.
        weights = landgraf_sequence_weights.do(
            [entry['alignment'] for entry in others])
        o_sw_score, m_sw_score = simple_frequency_score.do(
            column, orig_aa, mut_aa, weights)
        o_gf_score, m_gf_score = gapped_frequency_score.do(
            column, orig_aa, mut_aa, weights)

        print([
            o_score, m_score, o_ps_score, m_ps_score, o_sw_score, m_sw_score,
            o_gf_score, m_gf_score
        ])

        # Only the first scored record is processed.
        return