def extract_feature_from_seq(seq, stt, stp, c_tab, g_tab):
    '''Extract features of a sequence from a FASTA entry.

    Parameters:
        seq: nucleotide sequence string; it is upper-cased before the ORF
            search.
        stt: comma-separated start codons, e.g. "ATG".
        stp: comma-separated stop codons, e.g. "TAG,TAA,TGA".
        c_tab, g_tab: tables handed unchanged to FrameKmer.kmer_ratio
            (presumably coding / non-coding hexamer tables -- confirm
            against FrameKmer).

    Returns:
        Tuple (mRNA_size, CDS_size, fickett_score, hexamer), where
        mRNA_size is len(seq) and the remaining values are derived from the
        longest ORF reported by orf.ORFFinder for direction "+".
    '''
    start_codons = stt.strip().split(',')
    stop_codons = stp.strip().split(',')

    # NOTE: the previous version also built an (unused) complement table via
    # a bare ``maketrans`` call, which is not a builtin on Python 3 and would
    # raise NameError. Only the "+" direction is searched, so no complement
    # table is needed here.
    mRNA_seq = seq.upper()
    mRNA_size = len(seq)

    finder = orf.ORFFinder(mRNA_seq)
    # The reading frame is returned by longest_orf but is not a feature.
    CDS_size, _CDS_frame, CDS_seq = finder.longest_orf(
        direction="+", start_coden=start_codons, stop_coden=stop_codons)

    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (mRNA_size, CDS_size, fickett_score, hexamer)
def extract_feature_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab):
    '''Extract features of a transcript described by one BED line.

    Parameters:
        inbed: one whitespace-separated line with at least 12 fields, laid
            out like a BED12 record: field 0 chromosome, 1 start, 2 end,
            3 name, 5 strand, 9 block count, 10 comma-separated block
            sizes, 11 comma-separated block starts relative to field 1.
        refgenome: reference FASTA path passed to pysam.faidx.
        stt: comma-separated start codons.
        stp: comma-separated stop codons.
        c_tab, g_tab: tables handed unchanged to FrameKmer.kmer_ratio.

    Returns:
        Tuple (geneName, mRNA_size, CDS_size, fickett_score, hexamer), or
        None when the line is blank or cannot be parsed (in the latter case
        "Wrong format!" plus the line is written to stderr).
    '''
    stt_coden = stt.strip().split(',')
    stp_coden = stp.strip().split(',')
    transtab = str.maketrans("ACGTNX", "TGCANX")

    # Blank lines carry no record.
    if not inbed.strip():
        return None

    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        # Fields 2 and 9 are not used below, but they are still parsed so
        # that lines with non-numeric values there keep being rejected.
        int(fields[2])
        geneName = fields[3]
        # split() already removed all whitespace, so the strand needs no
        # further normalisation.
        strand = fields[5]
        int(fields[9])
        exon_sizes = [int(x) for x in fields[10].rstrip(',').split(',')]
        exon_starts = [int(x) + tx_start
                       for x in fields[11].rstrip(',').split(',')]
    except (IndexError, ValueError):
        # Too few columns or a non-integer coordinate.
        print("Wrong format!" + inbed, file=sys.stderr)
        return None

    # zip() stops at the shorter list, like the two-argument map() it
    # replaces.
    exon_ends = [start + size for start, size in zip(exon_starts, exon_sizes)]
    mRNA_size = sum(exon_sizes)

    exon_chunks = []
    for st, end in zip(exon_starts, exon_ends):
        # start + 1: presumably converts the 0-based BED start into the
        # 1-based inclusive region syntax expected by samtools faidx.
        exon_coord = chrom + ':' + str(st + 1) + '-' + str(end)
        fetched = pysam.faidx(refgenome, exon_coord)
        # Newer pysam releases appear to return the FASTA record as a single
        # string, older ones as a list of lines; accept both. Slicing a raw
        # string with [1:] would drop one character instead of the header.
        if isinstance(fetched, str):
            fetched = fetched.splitlines()
        # First line is the FASTA header; the rest is sequence.
        exon_chunks.extend(line.rstrip('\n\r') for line in fetched[1:])

    # Upper-case on both strands (previously only on '-') so soft-masked
    # lower-case bases are treated the same way as in
    # extract_feature_from_seq, and so the complement table applies.
    mRNA_seq = ''.join(exon_chunks).upper()
    if strand == '-':
        # Reverse complement.
        mRNA_seq = mRNA_seq.translate(transtab)[::-1]

    finder = orf.ORFFinder(mRNA_seq)
    CDS_size, _CDS_frame, CDS_seq = finder.longest_orf(
        direction="+", start_coden=stt_coden, stop_coden=stp_coden)
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
def mainProcess(input, output, number, c_tab, g_tab, codonArr, hash_matrix, classifier):
    '''Compute per-transcript features, classify them and write the scores.

    Parameters:
        input: alternating header / sequence records. When number == 1 it is
            an indexable sequence of lines; when number > 1 it is a single
            newline-joined string.
        output: output file path when number == 1. When number > 1 the
            scores go to "<output>_Tmp_Dir/<output><number>" (the directory
            must already exist).
        number: 1 for direct output, > 1 for a numbered temporary chunk.
        c_tab, g_tab: passed unchanged to mcssProcess.
        codonArr: not used by this function; kept for interface
            compatibility.
        hash_matrix: passed unchanged to HexamerFeatures.
        classifier: object providing predict_proba() and predict() for an
            (n_records, 11) array.

    Each header line is "<id> <len1> <len2> ...": the first token is the
    record id and the remaining tokens are consecutive segment lengths
    (presumably exon lengths) used to slice the sequence.

    Raises:
        ValueError: if number is smaller than 1, or (from max()) if a record
            yields no non-empty segment.
        IndexError: if a header line has no matching sequence line.
    '''
    if number > 1:
        score_path = output + '_Tmp_Dir' + '/' + output + str(number)
        sequence_Arr = input.split('\n')
        # A trailing newline leaves one empty element behind. Only drop
        # that; unconditionally deleting the last element would discard a
        # real sequence line when the text does not end with a newline.
        if sequence_Arr and sequence_Arr[-1] == '':
            del sequence_Arr[-1]
    elif number == 1:
        score_path = output
        sequence_Arr = input
    else:
        raise ValueError('number must be >= 1, got %r' % (number,))

    # Even positions hold headers, odd positions hold sequences.
    label_Arr_tmp = sequence_Arr[0::2]
    FastA_seq_Arr_tmp = sequence_Arr[1::2]

    # Anything that is not a, g, c, t or n becomes 'n'. Compiled once
    # instead of once per record.
    non_base = re.compile('[^agctn]')

    # The context manager closes the file even if feature extraction or
    # classification raises.
    with open(score_path, 'w') as SCORE:
        data = []
        ids = []
        for i, header in enumerate(label_Arr_tmp):
            # Normalise: lower-case, RNA 'u' -> 't', unknown symbols -> 'n'.
            tran_sec_seq = non_base.sub(
                'n', FastA_seq_Arr_tmp[i].lower().replace('u', 't'))
            tran_sec_seq2 = tran_sec_seq.upper()

            finder = orf.ORFFinder(tran_sec_seq2)
            (_CDS_start, _CDS_stop, CDS_size, _CDS_frame,
             CDS_seq) = finder.longest_orf(direction="+")
            (MCS, CSL, CP) = mcssProcess(tran_sec_seq2, c_tab, g_tab)
            fickett_score = fickett.fickett_value(CDS_seq)
            (orfscore, orfdistance) = HexamerFeatures(CDS_seq.lower(),
                                                      hash_matrix)

            labels_Arr = header.split()
            ids.append(labels_Arr[0])

            Exons_mscore = []
            Exons_distance = []
            Exons_GC = []
            Site_start = 0
            for length_token in labels_Arr[1:]:
                segment_len = int(length_token)
                segment = tran_sec_seq[Site_start:Site_start + segment_len]
                if not segment:
                    # The offset only advances past non-empty segments.
                    continue
                GCnum = segment.count('c') + segment.count('g')
                Exons_GC.append(GCnum / float(len(segment)))
                (mscore, distance) = HexamerFeatures(segment, hash_matrix)
                Exons_mscore.append(mscore)
                Exons_distance.append(distance)
                Site_start += segment_len

            Max_Mscore_exon = max(Exons_mscore)
            Max_distance = max(Exons_distance)
            Max_GCcontent = max(Exons_GC)
            orf_ratio = CDS_size / float(len(tran_sec_seq))

            data.append([
                CDS_size, orf_ratio, fickett_score, orfscore, orfdistance,
                Max_Mscore_exon, Max_distance, Max_GCcontent, MCS, CSL, CP
            ])

        testing_data = np.array(data)
        del data
        # One row per record, 11 features per row.
        testing_data = testing_data.reshape(len(label_Arr_tmp), 11)
        prob = classifier.predict_proba(testing_data)
        labels = classifier.predict(testing_data)
        # Column 1 of predict_proba is the probability reported per record.
        PrintResult(ids, labels, prob[:, 1], SCORE)
def mainProcess(input, output, number, c_tab, g_tab, codonArr, hash_matrix, mRNA_num):
    '''Compute per-transcript features and write labelled training rows.

    Parameters:
        input: alternating header / sequence records. When number == 1 it is
            an indexable sequence of lines; when number > 1 it is a single
            newline-joined string.
        output: output file path when number == 1. When number > 1 the rows
            go to "<output>_Tmp_Dir/<output><number>" (the directory must
            already exist).
        number: 1 for direct output, > 1 for a numbered temporary chunk.
        c_tab, g_tab: passed unchanged to mcssProcess.
        codonArr: not used by this function; kept for interface
            compatibility.
        hash_matrix: passed unchanged to HexamerFeatures.
        mRNA_num: records whose index within this input is below
            mRNA_num / 2 are labelled "+1", all others "-1" (presumably
            mRNA_num counts the lines of the positive set, two lines per
            record -- confirm against the caller).

    Each output row is the label followed by 11 space-separated features:
    CDS size, ORF ratio, Fickett score, ORF hexamer score and distance,
    maximum per-segment hexamer score, distance and GC ratio, then the three
    values returned by mcssProcess.

    Raises:
        ValueError: if number is smaller than 1, or (from max()) if a record
            yields no non-empty segment.
        IndexError: if a header line has no matching sequence line.
    '''
    positive_count = mRNA_num / 2

    if number > 1:
        feature_path = output + '_Tmp_Dir' + '/' + output + str(number)
        sequence_Arr = input.split('\n')
        # A trailing newline leaves one empty element behind. Only drop
        # that; unconditionally deleting the last element would discard a
        # real sequence line when the text does not end with a newline.
        if sequence_Arr and sequence_Arr[-1] == '':
            del sequence_Arr[-1]
    elif number == 1:
        feature_path = output
        sequence_Arr = input
    else:
        raise ValueError('number must be >= 1, got %r' % (number,))

    # Even positions hold headers, odd positions hold sequences.
    label_Arr_tmp = sequence_Arr[0::2]
    FastA_seq_Arr_tmp = sequence_Arr[1::2]

    # Anything that is not a, g, c, t or n becomes 'n'. Compiled once
    # instead of once per record.
    non_base = re.compile('[^agctn]')

    # The context manager closes the file even if feature extraction raises.
    with open(feature_path, 'w') as DATA:
        for i, header in enumerate(label_Arr_tmp):
            # Normalise: lower-case, RNA 'u' -> 't', unknown symbols -> 'n'.
            tran_sec_seq = non_base.sub(
                'n', FastA_seq_Arr_tmp[i].lower().replace('u', 't'))
            tran_sec_seq2 = tran_sec_seq.upper()

            finder = orf.ORFFinder(tran_sec_seq2)
            (_CDS_start, _CDS_stop, CDS_size, _CDS_frame,
             CDS_seq) = finder.longest_orf(direction="+")
            (MCS, CSL, CP) = mcssProcess(tran_sec_seq2, c_tab, g_tab)
            fickett_score = fickett.fickett_value(CDS_seq)
            (orfscore, orfdistance) = HexamerFeatures(CDS_seq.lower(),
                                                      hash_matrix)

            labels_Arr = header.split()
            Exons_mscore = []
            Exons_distance = []
            Exons_GC = []
            Site_start = 0
            for length_token in labels_Arr[1:]:
                segment_len = int(length_token)
                segment = tran_sec_seq[Site_start:Site_start + segment_len]
                if not segment:
                    # The offset only advances past non-empty segments.
                    continue
                GCnum = segment.count('c') + segment.count('g')
                Exons_GC.append(GCnum / float(len(segment)))
                (mscore, distance) = HexamerFeatures(segment, hash_matrix)
                Exons_mscore.append(mscore)
                Exons_distance.append(distance)
                Site_start += segment_len

            Max_Mscore_exon = max(Exons_mscore)
            Max_distance = max(Exons_distance)
            Max_GCcontent = max(Exons_GC)
            orf_ratio = CDS_size / float(len(tran_sec_seq))

            class_label = '+1' if i < positive_count else '-1'
            row = (class_label, CDS_size, orf_ratio, fickett_score, orfscore,
                   orfdistance, Max_Mscore_exon, Max_distance, Max_GCcontent,
                   MCS, CSL, CP)
            DATA.write(' '.join(str(value) for value in row) + '\n')