def check(text):
    """Return the one fixed error entry that this check always reports.

    ``reverse(text)`` is still invoked (its result is discarded), after which
    a single-element list holding ``(1, 1, code, message)`` is returned
    regardless of what ``text`` contains.
    """
    reverse(text)
    return [(1, 1, "example.first", "First line always has an error.")]
def find_pair_kmer(input_filename, output_filename):
    """Pair up k-mers that differ only at their middle position.

    Every line of ``input_filename`` is split on whitespace; field 0 is the
    k-mer and field 1 its coverage (kept as text).  K-mers are bucketed by the
    string obtained after dropping the middle character.  Buckets that hold
    exactly two k-mers are written to ``output_filename`` as
    ``"<smaller> <larger> <cov_smaller> <cov_larger>"``, sorted.

    Raises:
        AssertionError: if a k-mer compares greater than ``tools.reverse`` of
            itself (the input is expected to hold only the smaller form).
    """
    m = {}
    count = 0
    # Never incremented in this function; kept only so the report line below
    # stays in the output.
    count2 = 0
    with open(input_filename, "r") as f:
        for line in f:
            words = line.strip().split()
            kmer = words[0]
            # Floor division: a plain "/" yields a float on Python 3, which
            # cannot be used as a slice index.
            mid = len(kmer) // 2
            newtemp = tools.reverse(kmer)
            assert newtemp >= kmer
            key = kmer[:mid] + kmer[mid + 1:]
            m.setdefault(key, []).append((kmer, words[1]))
            count += 1

    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("unique kmer number %s" % count)
    print("not small kmer (compare reverse kmer) %s" % count2)
    print("total number possible pair kmer %s" % len(m))

    pairKmers = []
    count1, countLarge2 = 0, 0
    for key in m:
        bucket = m[key]
        if len(bucket) == 1:
            count1 += 1
        elif len(bucket) == 2:
            (k1, c1), (k2, c2) = bucket
            newk1 = tools.reverse(k1)
            newk2 = tools.reverse(k2)
            assert k1 <= newk1 and k2 <= newk2
            if k1 < k2:
                pairKmers.append((k1, k2, c1, c2))
            else:
                pairKmers.append((k2, k1, c2, c1))
        else:
            countLarge2 += 1

    print("kmer cannot find pair number %s" % count1)
    print("more than one mutation in middle %s" % countLarge2)

    # "with" guarantees the output is flushed and closed even if a write fails.
    with open(output_filename, "w") as fout:
        for (k1, k2, c1, c2) in sorted(pairKmers):
            fout.write("%s %s %s %s\n" % (k1, k2, c1, c2))
def extend_to_right(h1, left_index, right_index, k):
    """Extend ``h1`` at its right end using two lookup indexes.

    At most ``int(k/2)`` rounds are run.  In each round the last ``k-1``
    characters of the current string (``key``) are looked up in ``left_index``
    and ``tools.reverse(key)`` is looked up in ``right_index``; element ``[0]``
    of the matching entry is what gets attached.  The walk continues only
    while exactly one of the two lookups hits.

    Returns:
        ``(temp, add)``: the extended string and the characters recorded as
        added during the rounds in which exactly one lookup hit.
    """
    #print (type(k))
    mid = int(k/2)
    key = h1[-(k-1):]
    Rkey = tools.reverse(key)
    # temp / Rtemp track the growing string and its tools.reverse counterpart.
    temp, Rtemp = h1, tools.reverse(h1)
    add, Radd = "", ""
    for i in range(0, mid):
        flag, flagR = False, False
        if key in left_index:
            # Forward hit: append on the right of temp.
            temp = temp + left_index[key][0]
            Rtemp = tools.reverse(temp)
            flag = True
        if Rkey in right_index:
            # Reverse hit: prepend on the left of Rtemp, then rebuild temp.
            Rtemp = right_index[Rkey][0] + Rtemp
            temp = tools.reverse(Rtemp)
            flagR = True
        if flag == True and flagR == False:
            add = add + left_index[key][0]
            Radd = tools.reverse(add)
            key = temp[-(k-1):]
            Rkey = tools.reverse(key)
        elif flag == False and flagR == True:
            Radd = right_index[Rkey][0] + Radd
            add = tools.reverse(Radd)
            Rkey = Rtemp[:(k-1)]
            key = tools.reverse(Rkey)
        elif flag == flagR:
            # Both lookups hit or both missed: stop extending.
            if flag == True:
                # NOTE(review): if tools.reverse preserves length, temp has
                # grown twice in this round but only one character is
                # trimmed here -- confirm this is intended.
                temp = temp[:-1]
                Rtemp = Rtemp[1:]
            break
    return temp, add
def find_non_pair_kmer(uniqKmer, k):
    """Index k-mers by their leading half and merge them into non-pair records.

    For every ``(kmer, cov)`` in ``uniqKmer`` both the k-mer and
    ``tools.reverse(kmer)`` are filed under their first ``int(k/2)``
    characters.  The resulting index is handed to ``build_map_merge`` and its
    output to ``merge_pair``, whose result is returned unchanged.
    """
    half = int(k / 2)
    prefix_index = {}
    for kmer, cov in uniqKmer:
        for strand in (kmer, tools.reverse(kmer)):
            prefix_index.setdefault(strand[:half], []).append((strand, cov))

    # build map: overlap is key
    # (A disabled debug dump of the result to a "non_pair" file used to sit
    # between these two calls.)
    return merge_pair(build_map_merge(prefix_index, k))
def get_FP_position(k1, k2,
                    refFilename="/media/yanbo/Data/reference/hg37/chr22.fa"):
    """Print where a k-mer pair occurs in a single-record FASTA reference.

    The reference sequence is upper-cased, then ``k1``, ``tools.reverse(k1)``,
    ``k2`` and ``tools.reverse(k2)`` are searched in that order.  For each
    query found at least once a line is printed with: the query, its partner
    from the pair, the ``str.count`` occurrence count and the index of the
    first occurrence.

    Args:
        k1, k2: the two k-mers of the pair.
        refFilename: FASTA file to search; defaults to the path that was
            previously hard-coded, so existing callers are unaffected.
    """
    # Close the reference file deterministically instead of leaking the handle.
    with open(refFilename) as handle:
        record = SeqIO.read(handle, "fasta")
    print(record.id)
    seq = str(record.seq).upper()

    Rk1 = tools.reverse(k1)
    Rk2 = tools.reverse(k2)
    for query, partner in ((k1, k2), (Rk1, Rk2), (k2, k1), (Rk2, Rk1)):
        hits = seq.count(query)
        if hits > 0:
            print(query, partner, hits, seq.index(query))
def update_map_merge(left_i, left_j, key1, key2, mapMerge):
    """File a k-mer pair under its ordered key pair in ``mapMerge``.

    ``left_i`` and ``left_j`` are ``(kmer, coverage)`` tuples.  The key pair
    comes from ``tools.get_smaller_pair_kmer(key1, key2)``.  Four orientations
    of the two k-mers are tried in a fixed order (as given, swapped, both
    passed through ``tools.reverse``, reversed and swapped); the first one in
    which the first k-mer contains the first key exactly once and the second
    k-mer contains the second key exactly once is appended, with coverages
    following the k-mers.  If none matches, a message is printed and the
    process exits via ``sys.exit()``.
    """
    k1, cov1 = left_i
    k2, cov2 = left_j
    minkey1, minkey2 = tools.get_smaller_pair_kmer(key1, key2)
    bucket = mapMerge.setdefault((minkey1, minkey2), [])
    Rk1, Rk2 = tools.reverse(k1), tools.reverse(k2)

    orientations = (
        (k1, k2, cov1, cov2),
        (k2, k1, cov2, cov1),
        (Rk1, Rk2, cov1, cov2),
        (Rk2, Rk1, cov2, cov1),
    )
    for first, second, cov_first, cov_second in orientations:
        if first.count(minkey1) == 1 and second.count(minkey2) == 1:
            bucket.append((first, second, cov_first, cov_second))
            return

    print("something wrong 1")
    sys.exit()
def Reverse(self, request, context):
    """Serve the Reverse call declared in the proto file.

    Reads ``request.request_string`` and answers with a ``pb.ResponseString``
    carrying both that string and the result of ``reverse`` applied to it.
    ``context`` is not used.
    """
    source = request.request_string
    return pb.ResponseString(
        original_string=source,
        reversed_string=reverse(source),
    )
def pick_smaller_unique_kmer(input_filename, low, high, output_filename):
    """Filter k-mers by coverage and write the smaller orientation of each.

    Each input line is split on whitespace: field 0 is the k-mer, field 1 its
    integer coverage.  Lines whose coverage lies outside ``[low, high]`` are
    skipped.  For the rest, whichever of the k-mer and ``tools.reverse(kmer)``
    compares smaller is written as ``"<kmer> <coverage>"``, with the coverage
    text copied verbatim from the input.

    Raises:
        ValueError: if a coverage field is not an integer.
        IndexError: if a line has fewer than two fields.
    """
    # The output is opened first (as before) but now inside "with", so it is
    # closed even when a malformed line raises part-way through.
    with open(output_filename, "w") as fout, open(input_filename, "r") as f:
        for line in f:
            words = line.strip().split()
            coverage = int(words[1])
            if coverage < low or coverage > high:
                continue
            kmer = words[0]
            newkmer = tools.reverse(kmer)
            if kmer > newkmer:
                kmer = newkmer
            fout.write("%s %s\n" % (kmer, words[1]))
def pick_smaller_unique_kmer(input_filename, low, high):
    """Collect ``(kmer, coverage)`` tuples for k-mers within a coverage window.

    Lines of ``input_filename`` are whitespace-split into a k-mer (field 0)
    and an integer coverage (field 1).  Entries with coverage in
    ``[low, high]`` are kept, each stored as the smaller of the k-mer and
    ``tools.reverse(kmer)`` together with the coverage as an ``int``.
    """
    selected = []
    with open(input_filename, "r") as handle:
        for raw in handle:
            fields = raw.strip().split()
            depth = int(fields[1])
            if low <= depth <= high:
                forward = fields[0]
                canonical = min(forward, tools.reverse(forward))
                selected.append((canonical, depth))
    return selected
def problem004c():
    """Count down from 999*999 to the first palindrome with two 3-digit factors.

    The search runs while the candidate is above 100001.  A candidate
    qualifies when ``reverse(candidate)`` equals it and some ``x`` in
    ``range(100, 1000)`` divides it exactly with a cofactor strictly between
    100 and 999.  The qualifying value is printed and returned; if none is
    found, the value at which the loop stopped (100001) is returned.
    """
    output = 999 * 999
    while output > 100001:
        if output == reverse(output):
            # Pure integer arithmetic.  The earlier float comparison
            # (float(output)/x == output/x) only detected divisibility when
            # "/" was integer division; on Python 3 it is always true.
            has_factors = any(
                output % x == 0 and 100 < output // x < 999
                for x in range(100, 1000)
            )
            if has_factors:
                print(output)
                break
        # The former hard-coded "stop at 906609" guard was unreachable: that
        # value already passes the factor test above.
        output -= 1
    return output
def find_snp_pair_kmer(uniqKmers, k):
    """Pair up k-mers that differ only at position ``int(k/2)``.

    ``uniqKmers`` is an iterable of ``(kmer, coverage)`` tuples (its length is
    also reported, so it must support ``len``).  K-mers are bucketed by the
    string left after removing the character at index ``int(k/2)``; every
    bucket holding exactly two k-mers yields one tuple
    ``(smaller, larger, cov_smaller, cov_larger)``.

    Returns:
        The list of pair tuples.  Previously the list was built and then
        dropped, so callers received ``None``.

    Raises:
        AssertionError: if a k-mer compares greater than ``tools.reverse`` of
            itself.
    """
    m = {}
    mid = int(k/2)
    print("before build mapK")
    for (kmer, cov) in uniqKmers:
        Rkmer = tools.reverse(kmer)
        assert Rkmer >= kmer
        key = kmer[:mid] + kmer[mid+1:]
        m.setdefault(key, []).append((kmer, cov))
    print("after build mapK")
    print("unique kmer number", len(uniqKmers))
    print("total number possible pair kmer", len(m))

    pairKmers = []
    for key in m:
        bucket = m[key]
        # Buckets of size 1 (no partner) or > 2 (ambiguous) are skipped.
        if len(bucket) != 2:
            continue
        (k1, c1), (k2, c2) = bucket
        if k1 < k2:
            pairKmers.append((k1, k2, c1, c2))
        else:
            pairKmers.append((k2, k1, c2, c1))
    return pairKmers
def run(self):
    """Match the 21-mers of each read against ``NGS`` and write one row per read.

    ``self.temp_list`` is consumed as alternating lines: a header starting
    with ``"@"`` (the read ID is the text after it) followed by the sequence
    line.  Any other line seen while waiting for a header is ignored.  For
    each sequence, every window of 21 characters is reduced to the smaller of
    itself and ``tools.reverse`` of itself and looked up with
    ``binarySearch(NGS, key)``; hits are collected with their read position
    and passed to ``decide_kmer``.  Reads for which ``decide_kmer`` returns
    more than one entry are written to ``part_matrix_<thread_id>``.
    """
    kmer_len = 21
    state = 0
    count = 0
    # "with" closes the output even if a lookup or decide_kmer raises.
    with open("part_matrix_" + str(self.thread_id), "w") as fout:
        for text in self.temp_list:
            if state == 0 and text.startswith("@"):
                readID = text.strip()[1:]
                state = 1
                count += 1
                if count % 1000 == 0:
                    print("thread ", self.thread_id, "deal reads ", count)
            elif state == 1:
                state = 0
                seq = text.strip()
                intersection = []
                # NOTE(review): this range stops one window short of the last
                # full 21-mer (len - 21 rather than len - 21 + 1); kept as-is,
                # confirm whether that is intended.
                for i in range(len(seq) - kmer_len):
                    key = str(seq[i:i + kmer_len])
                    Rkey = tools.reverse(key)
                    Rflag = key > Rkey
                    if Rflag:
                        key = Rkey
                    # Renamed from "re" to avoid reading like the regex module.
                    hit = binarySearch(NGS, key)
                    if hit == -1:
                        continue
                    if Rflag:
                        # Rkmer direction is opposite with reads: pass the
                        # second-to-last character through reverse_ward.
                        label = hit[1]
                        temp = (label[:-2] + tools.reverse_ward(label[-2])
                                + label[-1])
                        intersection.append((temp, i))  # i is position in reads
                    else:
                        intersection.append((hit[1], i))  # i is position in reads
                if intersection:
                    # Was a Python 2 print statement; the single-argument
                    # call prints the same text on Python 2 and 3.
                    print(intersection)
                PosList = decide_kmer(intersection)
                if len(PosList) <= 1:
                    continue
                fout.write("%s %s " % (len(PosList), readID))
                for (p, binary, pos) in PosList:
                    fout.write("%s %s %s " % (p, binary, pos))
                score = len(PosList) * '4'
                fout.write("%s\n" % score)
# File Name: fastq2Reversefasta.py
# Author: Yanbo Li
# mail: [email protected]
# Created Time: Mon 29 Jul 2019 17:02:56 AEST
#########################################################################
#!/bin/bash
# Usage: fastq2Reversefasta.py <in.fastq> <out.fasta>
#
# For every FASTQ record without an 'N' in its sequence, write two FASTA
# entries: "<id>_1" with the sequence as read and "<id>_2" with
# tools.reverse() applied to it.  Each record id is also echoed to stdout.
import os
import sys
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import tools

if len(sys.argv) < 3:
    sys.exit("usage: %s <in.fastq> <out.fasta>" % sys.argv[0])

inFile = sys.argv[1]
outFile = sys.argv[2]

# "with" guarantees the FASTA output is closed even if parsing fails midway.
with open(outFile, "w") as fout:
    for record in SeqIO.parse(inFile, "fastq"):
        print(record.id)
        if record.seq.count('N') > 0:
            continue
        # The intermediate SeqRecord wrappers only carried id and sequence to
        # the write calls, so the values are written directly.
        fout.write(">%s\n" % (record.id + "_1"))
        fout.write("%s\n" % record.seq)
        fout.write(">%s\n" % (record.id + "_2"))
        fout.write("%s\n" % tools.reverse(record.seq))
def find_pair_indel_kmer(kmerFile, k_1merFile, output_filename):
    """Pair each k-mer with the (k-1)-mer obtained by deleting its middle base.

    Both input files hold whitespace-separated ``<sequence> <coverage>``
    lines.  A k-mer is skipped when shifting its left or right half by one
    position changes at most one character (``tools.hamming_distance`` <= 1).
    Remaining k-mers are keyed by the string with the middle character
    removed.  A (k-1)-mer from ``k_1merFile`` is paired with a k-mer when its
    key has exactly one k-mer and it is not simply that k-mer minus its first
    or last character.  Pairs are written sorted as
    ``"<kmer> <k_1mer> <kmer_cov> <k_1mer_cov>"``.

    Raises:
        AssertionError: if a sequence or derived key compares greater than
            ``tools.reverse`` of itself, or a (k-1)-mer appears twice.
    """
    kmers = {}
    count = 0
    with open(kmerFile, "r") as f:
        for line in f:
            words = line.strip().split()
            kmer = words[0]
            newtemp = tools.reverse(kmer)
            assert newtemp >= kmer
            mid = int(len(kmer) / 2)
            leftHalf = kmer[:mid]
            rightHalf = kmer[mid + 1:]
            if (tools.hamming_distance(rightHalf, kmer[mid:-1]) <= 1 or  # mutation => delete
                    tools.hamming_distance(leftHalf, kmer[1:mid + 1]) <= 1):
                continue
            key = leftHalf + rightHalf
            Rkey = tools.reverse(key)
            assert key <= Rkey
            kmers.setdefault(key, []).append((kmer, words[1]))
            count += 1
    print("unique kmer number", count)
    print("unique k_1mer in kmers", len(kmers))

    k_1mers = {}
    count = 0
    with open(k_1merFile, "r") as f:
        for line in f:
            words = line.strip().split()
            key = words[0]
            newtemp = tools.reverse(key)
            assert newtemp >= key
            assert key not in k_1mers
            k_1mers[key] = words[1]  # words[1] is coverage
            count += 1
    print("unique k_1mer number", count)

    pairKmers = []
    for key in k_1mers:
        if key not in kmers:
            continue
        if len(kmers[key]) == 1:
            kmer, c = kmers[key][0]
            # remove head or tail, they are same
            if kmer[:-1] == key or kmer[1:] == key:
                continue
            pairKmers.append((kmer, key, c, k_1mers[key]))
        else:
            print("more than 1", key, k_1mers[key], kmers[key])

    countU, countD, count = 0, 0, 0
    # "with" closes the output even if a write fails part-way through.
    with open(output_filename, "w") as fout:
        for (k1, k2, c1, c2) in sorted(pairKmers):
            # Derive the middle index from this k-mer rather than reusing the
            # value left over from the last line of the first file loop.
            mid = int(len(k1) / 2)
            if k1[mid] == k1[mid + 1] or k1[mid] == k1[mid - 1]:
                countD += 1
            else:
                countU += 1
            count += 1
            fout.write("%s %s %s %s\n" % (k1, k2, c1, c2))
    print(countU, countD, count)
def shift_kmer(h, left_unique_index, right_unique_index, label):
    """Slide ``h`` one position at a time while the next k-mer is unambiguous.

    Args:
        h: starting k-mer.
        left_unique_index, right_unique_index: mappings from an overlap
            string to a list of extensions.  An entry is used only when its
            list has exactly one element; element ``[0]`` is appended (left
            index) or prepended (right index) to the overlap.
        label: ``"l"`` drops the first character of the current k-mer and
            extends on the right; ``"r"`` drops the last character and
            extends on the left.  Any other value leaves the result empty.

    Each step looks up both the current overlap and ``tools.reverse`` of it.
    The walk continues only while exactly one of the two lookups succeeds and
    runs for at most ``len(h) // 2`` steps.

    Returns:
        A list of ``(kmer, direction)`` tuples, ``'f'`` when the step came
        from the forward lookup and ``'b'`` when it came from the reverse one.
    """
    # Note kept from the original author: this walk assumes the unique k-mer
    # indexes store only the smaller orientation of each k-mer.
    group = []
    temp = h
    # Floor division: range() rejects the float that "/" yields on Python 3.
    length = len(h) // 2

    if label == "l":
        key = temp[1:]
        Rkey = tools.reverse(key)
        for i in range(0, length):
            flag, flagR = False, False
            if key in left_unique_index and len(left_unique_index[key]) == 1:
                temp = key + left_unique_index[key][0]
                flag = True
            if Rkey in right_unique_index and len(right_unique_index[Rkey]) == 1:
                Rtemp = right_unique_index[Rkey][0] + Rkey
                flagR = True
            if flag and flagR:
                # Both orientations can be extended: ambiguous, stop here.
                # Prints use one pre-formatted argument so the output is the
                # same on Python 2 and 3.
                if len(group) >= 5:
                    print("shift stop %s %s orginal %s" % (key, temp, h))
                    print("forward and backward both have next")
                    print("%s %s" % (Rkey, Rtemp))
                break
            elif flag:
                key = temp[1:]
                Rkey = tools.reverse(key)
                group.append((temp, 'f'))
            elif flagR:
                Rkey = Rtemp[:-1]
                key = tools.reverse(Rkey)
                group.append((Rtemp, 'b'))
            else:
                # Neither orientation can be extended.
                if len(group) >= 5:
                    print("shift stop %s %s orginal %s" % (key, Rkey, h))
                    print("%s %s" % (key in left_unique_index,
                                     Rkey in right_unique_index))
                break

    if label == "r":
        key = temp[:-1]
        Rkey = tools.reverse(key)
        for i in range(0, length):
            flag, flagR = False, False
            if key in right_unique_index and len(right_unique_index[key]) == 1:
                temp = right_unique_index[key][0] + key
                flag = True
            if Rkey in left_unique_index and len(left_unique_index[Rkey]) == 1:
                Rtemp = Rkey + left_unique_index[Rkey][0]
                flagR = True
            if flag and flagR:
                break
            elif flag:
                key = temp[:-1]
                Rkey = tools.reverse(key)
                group.append((temp, 'f'))
            elif flagR:
                Rkey = Rtemp[1:]
                key = tools.reverse(Rkey)
                group.append((Rtemp, 'b'))
            else:
                break

    assert len(group) <= len(h)
    return group
def group_shift_kmer(pair_filename, unique_filename, output_filename, NGS_kmer):
    """Expand each k-mer pair into two groups of shifted k-mers and write them.

    Each line of ``pair_filename`` holds ``<h1> <h2> <cov1> <cov2>``.  Both
    k-mers are walked with ``shift_kmer`` in the "l" and "r" directions.  A
    pair is dropped when, at any shared step, the two shifted k-mers are not
    at Hamming distance 1 either directly or after ``tools.reverse`` of the
    second, or when either combined group has ``kmerSize - 1`` or fewer
    members.

    Three files are produced:
      * ``output_filename``: per kept pair, a "group ..." header line followed
        by one line per group listing ``<kmer> <direction>`` entries.
      * ``NGS_kmer``: one line per k-mer that occurs in exactly one group
        entry, followed by its tag (group number + direction + 'A'/'B').
      * ``filter_filename``: a copy of ``output_filename`` without the groups
        that share a k-mer with another entry.

    NOTE(review): ``kmerSize`` and ``filter_filename`` are not defined in this
    function; presumably module-level globals -- verify.
    NOTE(review): ``foutFilter`` is not closed in the code visible here.
    """
    left_unique_index, right_unique_index = read_unique_kmer(unique_filename)
    fout = open(output_filename, "w")
    count = 0
    # kmers maps a k-mer string to the list of group tags it appears under.
    kmers = {}
    with open(pair_filename, "r") as f:
        for line in f:
            words = line.strip().split()
            h1 = words[0]
            h2 = words[1]
            leftGroup1 = shift_kmer(h1, left_unique_index, right_unique_index, "l")
            leftGroup2 = shift_kmer(h2, left_unique_index, right_unique_index, "l")
            cov1 = int(words[2])
            cov2 = int(words[3])
            # Only referenced by the disabled coverage filter in the string below.
            sum_coverage = cov1 + cov2
            Flag = True
            i = 0
            # Compare the two walks only over the steps they both reached.
            lenG = min(len(leftGroup1), len(leftGroup2))
            while i < lenG:
                if (tools.hamming_distance(leftGroup1[i][0], leftGroup2[i][0]) != 1 and tools.hamming_distance( leftGroup1[i][0], tools.reverse( leftGroup2[i][0])) != 1):
                    Flag = False
                    break
                i += 1
            # The string literal below is a disabled coverage filter; it has
            # no runtime effect.
            ''' # 60 equal to lowest coverage if cov1 - cov2 >= 60 or cov2 - cov1 >= 60: # or float(cov2/cov1) >=1.7 or float(cov1/cov2) >=1.7: continue if sum_coverage <= 180 or sum_coverage >= 280: continue '''
            if Flag == False:
                continue
            rightGroup1 = shift_kmer(h1, left_unique_index, right_unique_index, "r")
            rightGroup2 = shift_kmer(h2, left_unique_index, right_unique_index, "r")
            i = 0
            lenG = min(len(rightGroup1), len(rightGroup2))
            while i < lenG:
                if (tools.hamming_distance(rightGroup1[i][0], rightGroup2[i][0]) != 1 and tools.hamming_distance( rightGroup1[i][0], tools.reverse( rightGroup2[i][0])) != 1):
                    Flag = False
                    break
                i += 1
            if Flag == False:
                continue
            # Each group starts with the original k-mer, tagged forward.
            group1, group2 = [(h1, 'f')], [(h2, 'f')]
            group1.extend(leftGroup1)
            group2.extend(leftGroup2)
            group1.extend(rightGroup1)
            group2.extend(rightGroup2)
            gSize1 = len(group1)
            gSize2 = len(group2)
            if gSize1 <= kmerSize - 1 or gSize2 <= kmerSize - 1:
                #if gSize1 <=kmerSize/2 or gSize2 <= kmerSize/2:
                continue
            count += 1
            fout.write("group %s %s %s %s %s\n" % (count, h1, h2, words[2], words[3]))
            print "group", count, len(leftGroup1), len(rightGroup1), len( leftGroup2), len(rightGroup2)
            for ele in group1:
                fout.write("%s %s " % (ele[0], ele[1]))
                if ele[0] not in kmers:
                    kmers[ele[0]] = []
                kmers[ele[0]].append(str(count) + ele[1] + 'A') # A is zero
            fout.write("\n")
            for ele in group2:
                fout.write("%s %s " % (ele[0], ele[1]))
                if ele[0] not in kmers:
                    kmers[ele[0]] = []
                kmers[ele[0]].append(str(count) + ele[1] + 'B') # B is one
            fout.write("\n")
    fout.close()
    fout = open(NGS_kmer, "w")
    print "total group number", count
    sortedKmers = sorted(kmers.items())
    # Group numbers (as strings) of every group that shares a k-mer with
    # another group entry.
    filterGroup = set()
    for ele in sortedKmers:
        if len(ele[1]) >= 2:
            print ele
            for ID in ele[1]:
                # Strip the trailing direction flag and 'A'/'B' label, leaving
                # the group number.
                filterGroup.add(ID[:-2])
            continue
        fout.write("%s" % ele[0])
        l = ele[1]
        for e in l:
            fout.write(" %s" % e)
        fout.write("\n")
    fout.close()
    print "filter group size", len(filterGroup)
    foutFilter = open(filter_filename, "w")
    with open(output_filename, "r") as f:
        # state: 0 = expecting a "group" header, 1 / 2 = copying the first /
        # second member line of a kept group, -1 = skipping a filtered group.
        state = 0
        for line in f:
            if state == 0 and line.startswith("group"):
                words = line.split()
                if words[1] not in filterGroup:
                    foutFilter.write(line)
                    state = 1
                else:
                    state = -1
            elif state == 1:
                foutFilter.write(line)
                state = 2
            elif state == 2:
                foutFilter.write(line)
                state = 0
            elif state == -1:
                # Consumes the first member line of a filtered group; the
                # second one then matches no branch and is dropped as well.
                state = 0