def needle_diff(left_struc, right_struc, key, minimal=True):
    '''Build a diff that turns ``left_struc`` into ``right_struc``.

    When ``left_struc`` is a ``list`` or ``tuple``, the two structures are
    aligned with :func:`needle.needle` and :func:`needle.backtrack`, and the
    diff is read off the first alignment whose left-hand side begins with
    ``left_struc`` itself.  If ``left_struc`` is not a list/tuple, or no
    alignment qualifies, the result of :func:`keyset_diff` is returned
    instead.

    This function is not really meant to be called directly; use
    :func:`udiff`, which falls back to :func:`keyset_diff` where
    appropriate.
    '''
    array_types = (list, tuple)
    if type(left_struc) not in array_types:
        return keyset_diff(left_struc, right_struc, key, minimal)
    assert type(right_struc) in array_types

    # Only the first element of the needle result is fed to backtrack.
    a, _, _ = needle.needle(left_struc, right_struc, needle_penalty)
    candidates = needle.backtrack(left_struc, right_struc, a, needle_penalty)

    left_len = len(left_struc)
    for aligned_left, aligned_right in candidates:
        if aligned_left[:left_len] == left_struc:
            break
    else:
        # No candidate alignment preserves the left structure as a prefix.
        return keyset_diff(left_struc, right_struc, key, minimal)

    out = []
    for k, left_item in enumerate(aligned_left):
        sub_key = key + [k]
        right_item = aligned_right[k]
        if isinstance(left_item, needle.Gap):
            # Gap on the left: emit the key together with the right value.
            out.append([sub_key, right_item])
        elif isinstance(right_item, needle.Gap):
            # Gap on the right: emit the key alone.
            out.append([sub_key])
        else:
            out.extend(diff(left_item, right_item, key=sub_key,
                            minimal=minimal, verbose=False))
    return out
def fullSeq(path, params):
    """Append full-sequence identity/similarity columns to a protein-pair table.

    Reads the tab-separated file at ``path`` (a header line followed by rows
    whose first two columns are protein accessions), fetches both sequences
    from ``target_dictionary`` through ``queryDevice.queryDevice``, aligns
    them with ``needle.needle`` and rewrites ``path`` in place with two extra
    columns, ``seq_id`` and ``seq_sim``.  Rows for which both sequences cannot
    be found get ``None`` in both columns.  Results are cached per accession
    pair so each pair is queried and aligned at most once.

    The output is assembled in memory and written only after every row has
    been processed, so a failing query or alignment leaves the input file
    intact instead of truncated.

    :param path: path of the tab-separated file to annotate in place.
    :param params: settings handed to ``queryDevice.queryDevice``; must also
        contain ``'needlepath'``, which is passed to ``needle.needle``.
    :returns: ``None``.
    :raises IndexError: if the file is empty or a row has fewer than two
        tab-separated columns.
    """
    import queryDevice
    import needle

    with open(path, 'r') as inFile:
        lines = inFile.readlines()

    out_rows = ["%s\tseq_id\tseq_sim\n" % lines[0].rstrip('\n')]
    # pairName -> (seqSim, seqId); failed lookups are stored as (None, None).
    seqIdDict = {}
    for line in lines[1:]:
        # Strip the newline before splitting so that an accession sitting in
        # the last column does not carry a trailing '\n' into the lookup.
        row = line.rstrip('\n')
        elements = row.split('\t')
        proteinAcc_1 = elements[0]
        proteinAcc_2 = elements[1]
        pairName = '_'.join([proteinAcc_1, proteinAcc_2])

        if pairName not in seqIdDict:
            # NOTE(review): the accessions are interpolated straight into the
            # SQL text; switch to a parameterized query if queryDevice
            # supports one.
            data = queryDevice.queryDevice(
                ("SELECT td.protein_sequence, td.protein_accession "
                 "FROM target_dictionary td "
                 "WHERE td.protein_accession IN ('%s')")
                % "','".join([proteinAcc_1, proteinAcc_2]), params)
            lkp = {}
            for entry in data:
                lkp[entry[1]] = entry[0]
            try:
                seq_1 = lkp[proteinAcc_1]
                seq_2 = lkp[proteinAcc_2]
            except KeyError:
                # At least one accession has no sequence in the database.
                seqIdDict[pairName] = (None, None)
            else:
                # Align the sequences using needle from EMBOSS, then parse
                # the output of the alignment.
                needleReport = needle.needle(params['needlepath'], seq_1, seq_2)
                (seqSim, seqId) = needle.parseNeedle(needleReport)
                seqIdDict[pairName] = (seqSim, seqId)

        (seqSim, seqId) = seqIdDict[pairName]
        out_rows.append("%s\t%s\t%s\n" % (row, seqId, seqSim))

    # Only now touch the file: everything above succeeded.
    with open(path, 'w') as out:
        out.writelines(out_rows)
def needle_diff(left_struc, right_struc, key, minimal=True):
    '''Build a diff that turns ``left_struc`` into ``right_struc``.

    When ``left_struc`` is a ``list`` or ``tuple``, the two structures are
    aligned with :func:`needle.needle` and :func:`needle.backtrack`, and the
    diff is read off the first alignment whose left-hand side begins with
    ``left_struc`` itself.  If ``left_struc`` is not a list/tuple, or no
    alignment qualifies, the result of :func:`keyset_diff` is returned
    instead.

    This function is not really meant to be called directly; use
    :func:`udiff`, which falls back to :func:`keyset_diff` where
    appropriate.
    '''
    array_types = (list, tuple)
    if type(left_struc) not in array_types:
        return keyset_diff(left_struc, right_struc, key, minimal)
    assert type(right_struc) in array_types

    # Only the first element of the needle result is fed to backtrack.
    a, _, _ = needle.needle(left_struc, right_struc, needle_penalty)
    candidates = needle.backtrack(left_struc, right_struc, a, needle_penalty)

    left_len = len(left_struc)
    for aligned_left, aligned_right in candidates:
        if aligned_left[:left_len] == left_struc:
            break
    else:
        # No candidate alignment preserves the left structure as a prefix.
        return keyset_diff(left_struc, right_struc, key, minimal)

    out = []
    for k, left_item in enumerate(aligned_left):
        sub_key = key + [k]
        right_item = aligned_right[k]
        if isinstance(left_item, needle.Gap):
            # Gap on the left: emit the key together with the right value.
            out.append([sub_key, right_item])
        elif isinstance(right_item, needle.Gap):
            # Gap on the right: emit the key alone.
            out.append([sub_key])
        else:
            out.extend(diff(left_item, right_item, key=sub_key,
                            minimal=minimal, verbose=False))
    return out
def pfam_a(path, params):
    """Append Pfam-domain alignment columns to a protein-pair table.

    Reads the tab-separated file at ``path`` (a header line followed by rows
    whose first two columns are protein accessions).  For each accession it
    queries ``map_pfam`` / ``pfam_domains`` / ``target_dictionary`` through
    ``queryDevice.queryDevice``, slices the domain region out of the protein
    sequence, aligns the two domain sequences with ``needle.needle`` and
    rewrites ``path`` in place with four extra columns: ``dom_seq_id``,
    ``dom_seq_sim``, ``pfam_1`` and ``pfam_2``.  Rows for which either
    accession yields no domain row get ``None`` in all four columns.  When an
    accession has several domain rows, the last one returned is the one used.

    Results, including failed lookups, are cached per accession pair so each
    pair is queried and aligned at most once.  The output is assembled in
    memory and written only after every row has been processed, so a failing
    query or alignment leaves the input file intact instead of truncated.

    :param path: path of the tab-separated file to annotate in place.
    :param params: settings handed to ``queryDevice.queryDevice``; must also
        contain ``'needlepath'``, which is passed to ``needle.needle``.
    :returns: ``None``.
    :raises IndexError: if the file is empty or a row has fewer than two
        tab-separated columns.
    """
    import queryDevice
    import needle

    # The same statement is issued once per accession; both placeholders
    # receive that accession.
    # NOTE(review): the accession is interpolated straight into the SQL text;
    # switch to a parameterized query if queryDevice supports one.
    domain_query = (
        "SELECT DISTINCT mp.pfam_a, pd.start, pd.end, td.protein_sequence, "
        "td.protein_accession FROM map_pfam mp JOIN pfam_domains pd "
        "ON mp.pfam_a = pd.pfam_a JOIN target_dictionary td "
        "ON td.protein_accession = mp.protein_accession "
        "WHERE mp.protein_accession = '%s' AND pd.protein_accession = '%s' ")

    with open(path, 'r') as inFile:
        lines = inFile.readlines()

    out_rows = ["%s\tdom_seq_id\tdom_seq_sim\tpfam_1\tpfam_2\n"
                % lines[0].rstrip('\n')]
    # pairName -> (seqSim, seqId, pfam_1, pfam_2); failed lookups are stored
    # as four Nones so repeated pairs do not hit the database again.
    seqIdDict = {}
    for line in lines[1:]:
        # Strip the newline before splitting so that an accession sitting in
        # the last column does not carry a trailing '\n' into the lookup.
        row = line.rstrip('\n')
        elements = row.split('\t')
        proteinAcc_1 = elements[0]
        proteinAcc_2 = elements[1]
        pairName = '_'.join([proteinAcc_1, proteinAcc_2])

        if pairName not in seqIdDict:
            data_1 = queryDevice.queryDevice(
                domain_query % (proteinAcc_1, proteinAcc_1), params)
            data_2 = queryDevice.queryDevice(
                domain_query % (proteinAcc_2, proteinAcc_2), params)

            lkp = {}
            for entry in data_1 + data_2:
                pfam, start, end = entry[0], entry[1], entry[2]
                full_sequence, acc = entry[3], entry[4]
                # NOTE(review): start/end are used directly as slice bounds;
                # confirm this matches the pfam_domains coordinate convention.
                lkp[acc] = (full_sequence[start:end], pfam)

            try:
                seq_1, pfam_1 = lkp[proteinAcc_1]
                seq_2, pfam_2 = lkp[proteinAcc_2]
            except KeyError:
                # At least one accession has no domain row in the database.
                seqIdDict[pairName] = (None, None, None, None)
            else:
                # Align the sequences using needle from EMBOSS, then parse
                # the output of the alignment.
                needleReport = needle.needle(params['needlepath'], seq_1, seq_2)
                (seqSim, seqId) = needle.parseNeedle(needleReport)
                seqIdDict[pairName] = (seqSim, seqId, pfam_1, pfam_2)

        (seqSim, seqId, pfam_1, pfam_2) = seqIdDict[pairName]
        out_rows.append("%s\t%s\t%s\t%s\t%s\n"
                        % (row, seqId, seqSim, pfam_1, pfam_2))

    # Only now touch the file: everything above succeeded.
    with open(path, 'w') as out:
        out.writelines(out_rows)