def output(pas_array,scan_file,out,window,max_shift,species,prob,number_pas,rst):
    """Randomly sample negative sites from a scan file and write their windows.

    The scan file is read as tab-separated lines ``pas_id<TAB>rpm<TAB>base``
    where ``pas_id`` has the form ``chromosome:pos:strand``. Every line is
    parsed, so a malformed line raises ``ValueError`` even when it lies
    outside the usable range.

    Phase 1 keeps a line as a negative candidate when all of these hold:
    it has room for a window on both sides, it wins a random draw against
    ``prob``, and its position is at least ``Threshold`` (module-level
    constant) away from the previously accepted candidate and from every
    position in ``pas_array``.

    Phase 2 visits the candidates in random order, writes the centred window
    through ``collpase`` and, when that succeeds, also writes the windows
    shifted by ``-max_shift..max_shift``. It stops once ``number_pas``
    candidates were written.

    Args:
        pas_array: Iterable of integer positions of known (true) sites.
        scan_file: Path of the tab-separated scan file to read.
        out: Path of the output file; it is truncated on open.
        window: Window size; ``int(window/2)`` lines are taken on each side.
        max_shift: Largest shift, in lines, applied around each candidate.
        species: Passed through to ``collpase`` unchanged.
        prob: Probability of considering an eligible line as a candidate.
        number_pas: Number of negative sites to produce.
        rst: Passed through to ``collpase`` unchanged.

    Raises:
        Warning: If fewer than ``number_pas`` candidates could be written.
    """
    extend = int(window/2)
    with open(scan_file,'r') as f:
        lines = f.readlines()
    n_lines = len(lines)

    # Both files are managed by ``with`` so they are closed (and the output
    # flushed) even when parsing fails or the Warning below is raised.
    with open(out,'w') as ww:
        # ---- Phase 1: pick candidate lines -------------------------------
        pre_pos = 0
        negative_candidate = dict()
        for i,line in enumerate(lines):
            pas_id,_rpm,_base = line.rstrip('\n').split('\t')
            _chromosome,pos,_strand = pas_id.split(':')
            pos = int(pos)
            if not (i-extend>0 and i+extend+1+max_shift<n_lines):
                continue
            # The random draw happens only for in-range lines, which keeps
            # the sequence of random numbers consumed unchanged.
            if not random.random()<prob:
                continue
            if abs(pos-pre_pos)<Threshold:
                continue
            if any(abs(pos-true_pos)<Threshold for true_pos in pas_array):
                continue
            pre_pos = pos
            negative_candidate[pas_id] = i

        # ---- Phase 2: write windows for shuffled candidates --------------
        count = 0
        items = list(negative_candidate.items())
        random.shuffle(items)
        for pas_id,i in items:
            start = i-extend
            end = i+extend
            # Phase 1 only guarantees start > 0; with a large max_shift the
            # index start-max_shift would turn negative and silently wrap
            # around to the end of the file, so such candidates are skipped.
            if start-max_shift<0:
                continue
            if not check(lines[start-max_shift],lines[end+max_shift],window+2*max_shift):
                continue
            # collpase's return value is used as a 0/1-style success count
            # (presumably 1 on success - confirm against its definition).
            success = collpase(pas_id,'unknown','unknown',lines[start:end+1],ww,species,0,rst)
            count += success
            if success==0:
                continue
            for j in range(-max_shift,max_shift+1):
                if j==0:
                    continue  # the unshifted window was written above
                k = i+j
                start = k-extend
                end = k+extend
                if start>0 and end+1<n_lines and check(lines[start],lines[end],window):
                    collpase(pas_id,'unknown','unknown',lines[start:end+1],ww,species,j,rst)
            if count>=number_pas:
                break

        if count<number_pas:
            raise Warning("not engough negative candidates, please incerase the probability for selecting!")
        print("successfully randomly get same number of negative pas as ground truth")
def output(pas_dict,scan_file,out,window,max_shift,species):
    """Write the windows around every known site found in a scan file.

    The scan file is read as tab-separated lines ``pas_id<TAB>rpm<TAB>base``
    where ``pas_id`` has the form ``chromosome:pos:strand``. Every line is
    parsed, so a malformed line raises ``ValueError``.

    For each line whose integer position is a key of ``pas_dict`` and which
    has room for a full window on both sides, the windows shifted by
    ``-max_shift..max_shift`` lines are passed to ``collpase`` (together with
    the open output file) whenever ``check`` accepts the window's first and
    last line.

    Args:
        pas_dict: Mapping from integer position to the site's type.
        scan_file: Path of the tab-separated scan file to read.
        out: Path of the output file; it is truncated on open.
        window: Window size; ``int(window/2)`` lines are taken on each side.
        max_shift: Largest shift, in lines, applied around each site.
        species: Passed through to ``collpase`` unchanged.
    """
    extend = int(window/2)
    with open(scan_file,'r') as f:
        lines = f.readlines()
    n_lines = len(lines)
    symbol = 'unknown'

    # ``with`` guarantees the output is flushed and closed even when a
    # malformed line or one of the helpers raises part-way through.
    with open(out,'w') as ww:
        for i,line in enumerate(lines):
            pas_id,_rpm,_base = line.rstrip('\n').split('\t')
            _chromosome,pos,_strand = pas_id.split(':')
            pos = int(pos)
            if pos not in pas_dict:
                continue
            if not (i-extend>0 and i+extend+1<n_lines):
                continue
            pas_type = pas_dict[pos]
            for j in range(-max_shift,max_shift+1):
                k = i+j
                start = k-extend
                end = k+extend
                # Shifted windows may fall off either end of the file.
                if start>0 and end+1<n_lines and check(lines[start],lines[end],window):
                    collpase(pas_id,pas_type,symbol,lines[start:end+1],ww,species,j)
def dataProcessing(scan_file, window, rst):
    """Build one-hot sequence and coverage arrays for every usable window.

    The scan file is read as tab-separated lines ``pas_id<TAB>value<TAB>base``.
    Lines whose base is ``'N'`` are skipped. For every other line that has
    room for ``int(window / 2)`` lines on both sides and whose window passes
    ``check``, ``collpase`` is asked for the window's sequence and coverage;
    a sequence equal to ``0`` marks the window as unusable.

    Args:
        scan_file: Path of the tab-separated scan file to read.
        window: Window size used for the bounds check and the final reshape.
        rst: Passed through to ``collpase`` unchanged.

    Returns:
        A tuple ``(data1, data2, PASID)``:
        ``data1`` is a float32 array of shape ``(n, window, 4)`` with the
        one-hot encoding over the alphabet ``A, T, C, G``;
        ``data2`` is a float32 array of shape ``(n, window, 1)`` with the
        coverage; ``PASID`` is an array with the ``n`` site identifiers.
        When no window qualifies, ``n`` is 0 and empty arrays are returned.
    """
    extend = int(window / 2)
    alphabet = np.array(['A', 'T', 'C', 'G'])

    # Read everything up front; ``with`` closes the file even on errors.
    with open(scan_file, 'r') as f:
        lines = f.readlines()
    n_lines = len(lines)

    data1 = []
    data2 = []
    PASID = []
    for i, line in enumerate(lines):
        pas_id, _, base = line.rstrip('\n').split('\t')
        if base == 'N':
            continue
        start = i - extend
        end = i + extend
        if not (start > 0 and end + 1 < n_lines):
            continue
        if not check(lines[start], lines[end], window):
            continue
        sequence, coverage = collpase(pas_id, lines[start:end + 1], rst)
        if sequence == 0:
            continue
        # The parts are unused, but the unpack rejects identifiers that are
        # not of the form chromosome:pos:strand.
        _chromosome, _pos, _strand = pas_id.split(':')
        # Comparing an (L, 1) column of bases against the (4,) alphabet
        # broadcasts to an (L, 4) one-hot matrix.
        seq = np.array(list(sequence), dtype='|U1').reshape(-1, 1)
        data1.append((seq == alphabet).astype(np.float32))
        data2.append(np.array(coverage).astype(np.float32))
        PASID.append(pas_id)

    if not PASID:
        # np.stack refuses an empty list; hand back correctly shaped empties.
        return (np.zeros((0, window, 4), dtype=np.float32),
                np.zeros((0, window, 1), dtype=np.float32),
                np.array([], dtype=str))

    data1 = np.stack(data1).reshape([-1, window, 4])
    data2 = np.stack(data2).reshape([-1, window, 1])
    PASID = np.array(PASID)
    return data1, data2, PASID