def Extract(infa, results, outfa):
    '''Extract lncRNA sequence

    Copy to ``outfa`` every record of the fasta file ``infa`` whose ID is
    labelled "Noncoding" in the tab-separated ``results`` file.

    infa    -- input fasta, read through GetFasta
    results -- table whose first line is skipped as a header; on the
               remaining lines column 1 is the sequence ID and column 3
               is the label
    outfa   -- output fasta path (overwritten)

    Lines of ``results`` with fewer than three columns (for instance a
    trailing blank line) are ignored rather than raising IndexError.
    If ``results`` or ``outfa`` cannot be opened, the error is written to
    stderr and the process exits with status 1.
    '''
    SeqID, SeqList = GetFasta(infa)

    try:
        fr = open(results, 'rU')  # results file
    except (IOError, ValueError) as e:
        sys.stderr.write(str(e) + "\n")
        sys.exit(1)

    idlist = set()
    with fr:
        # [1:] drops the header line.
        for line in fr.readlines()[1:]:
            fields = line.strip().split("\t")
            if len(fields) > 2 and fields[2] == "Noncoding":
                idlist.add(fields[0])

    try:
        fo = open(outfa, 'w')  # output file
    except (IOError, ValueError) as e:
        sys.stderr.write(str(e) + "\n")
        sys.exit(1)

    with fo:
        for seqid, seq in zip(SeqID, SeqList):
            if seqid in idlist:
                fo.write(">" + seqid + "\n")
                fo.write(seq + "\n")
def Protein_StructureScore(protein_file, out_prefix=None):
    '''Compute structure scores for the proteins in a fasta file.

    protein_file -- fasta file, read through GetFasta
    out_prefix   -- directory used for temporary files and for the result

    Steps, all relative to ``out_prefix``:
      1. ``../src/stride.dat`` (next to this module) is copied into the
         current working directory (presumably required by RNAScore2 --
         TODO confirm).
      2. Each record is written to its own file ``tmp.protein.file.<n>``.
      3. ``ls | grep`` lists those files into ``tmp.filelist``.
      4. ``../src/RNAScore2 -i <file> -o <file>.pro_score -p`` is run on
         each listed file; its output is appended to ``protein_score``
         and both per-record files are deleted.
      5. ``tmp.filelist`` and the copied ``stride.dat`` are deleted.

    Returns the path of the combined ``protein_score`` file.
    '''
    module_dir = os.path.dirname(os.path.abspath(__file__))
    work_dir = os.path.abspath('.')

    score_path = os.path.join(out_prefix, "protein_score")
    part_prefix = os.path.join(out_prefix, "tmp.protein.file.")
    listing_path = os.path.join(out_prefix, "tmp.filelist")

    # Put a copy of stride.dat into the working directory.
    stride_source = os.path.join(module_dir, "../src/stride.dat")
    stride_copy = os.path.join(work_dir, "stride.dat")
    subprocess.call("cp " + stride_source + " " + work_dir, shell=True)

    # One temporary fasta file per protein record.
    ids, seqs = GetFasta(protein_file)
    for index, (pid, pseq) in enumerate(zip(ids, seqs)):
        with open(part_prefix + str(index), "w") as part:
            part.write(">" + pid + "\n")
            part.write(pseq + "\n")

    # Collect the names of the temporary files via the shell.
    subprocess.call(
        "ls " + out_prefix + " |grep tmp.protein.file > " + listing_path,
        shell=True)

    scorer = os.path.join(module_dir, "../src/RNAScore2")
    with open(listing_path, "rU") as listing:
        names = [entry.strip() for entry in listing.readlines()]

    for name in names:
        part_path = os.path.join(out_prefix, name)
        part_score = os.path.join(out_prefix, name + ".pro_score")
        subprocess.call(
            scorer + " -i " + part_path + " -o " + part_score + " -p",
            shell=True)
        # Append this record's scores to the combined output.
        subprocess.call("cat " + part_score + " >> " + score_path, shell=True)
        os.remove(part_path)
        os.remove(part_score)

    os.remove(listing_path)
    os.remove(stride_copy)
    return score_path
def GeneratePairs(rna_file, pro_file, out_prefix):
    '''Write one pair file per RNA listing every RNA/protein combination.

    rna_file, pro_file -- fasta files; only the IDs returned by GetFasta
                          are used
    out_prefix         -- path prefix of the pair files

    For each RNA ID a file named ``<out_prefix>.<first "|" field of the
    ID>`` is written, containing one ``<rna id> <protein id>`` line per
    protein ID.

    Returns ``[rna_id_list, pair_file_list]``: the RNA IDs and the
    matching pair-file paths, in the same order.
    '''
    rna_ids = GetFasta(rna_file)[0]
    protein_ids = GetFasta(pro_file)[0]

    seen_rna_ids = []
    pair_paths = []
    for rna_id in rna_ids:
        pair_path = out_prefix + "." + rna_id.split("|")[0]
        with open(pair_path, "w") as handle:
            handle.writelines(
                rna_id + " " + protein_id + "\n"
                for protein_id in protein_ids)
        pair_paths.append(pair_path)
        seen_rna_ids.append(rna_id)

    return [seen_rna_ids, pair_paths]
def GenEDPfeature(rna_file, pro_file, logscore_dict=None):
    '''Compute per-sequence features for the RNAs and proteins.

    rna_file      -- RNA fasta, read through GetFasta
    pro_file      -- protein fasta, read through GetFasta
    logscore_dict -- passed unchanged to GetRNAfea

    Returns ``[rna_fea_1, rna_fea_2, pro_fea]``: three dicts keyed by
    sequence ID.  The first two hold the first and second value returned
    by ``GetRNAfea`` for each RNA; the third holds the value returned by
    ``GetPROfea`` for each protein.
    '''
    rna_ids, rna_seqs = GetFasta(rna_file)
    protein_ids, protein_seqs = GetFasta(pro_file)

    first_rna_features = {}
    second_rna_features = {}
    for rid, rseq in zip(rna_ids, rna_seqs):
        first_rna_features[rid], second_rna_features[rid] = GetRNAfea(
            rseq, logscore_dict)

    protein_features = dict(
        (pid, GetPROfea(pseq))
        for pid, pseq in zip(protein_ids, protein_seqs))

    return [first_rna_features, second_rna_features, protein_features]
def ReadProtein(pro_fa):
    '''Read a protein fasta into a dict mapping sequence ID to sequence.

    If an ID occurs more than once, the last sequence wins.
    '''
    ids, seqs = GetFasta(pro_fa)
    return dict(zip(ids, seqs))
def ReadLncRNA(lncRNA_fa):
    '''Read a lncRNA fasta into a dict mapping sequence ID to sequence.

    Every upper-case "U" in a sequence is replaced by "T"; all other
    characters (including lower-case "u") are left unchanged.
    '''
    ids, seqs = GetFasta(lncRNA_fa)
    u_to_t = maketrans("U", "T")
    return dict(
        (seq_id, sequence.translate(u_to_t))
        for seq_id, sequence in zip(ids, seqs))
def RNA_StructureScore(rna_file, out_prefix=None):
    '''Compute structure scores for the RNAs in a fasta file.

    rna_file   -- fasta file, read through GetFasta
    out_prefix -- directory used for temporary files and for the result

    Steps, all relative to ``out_prefix``:
      1. Each record is written to its own file ``tmp.rna.file.<n>``.
      2. ``ls | grep`` lists those files into ``tmp.filelist``.
      3. ``../src/RNAScore2 -i <file> -o <file>.r_score -l 250 -r``
         (located next to this module) is run on each listed file; its
         output is appended to ``rna_score`` and both per-record files
         are deleted.
      4. ``tmp.filelist`` is deleted.

    Returns the path of the combined ``rna_score`` file.
    '''
    score_path = os.path.join(out_prefix, "rna_score")
    part_prefix = os.path.join(out_prefix, "tmp.rna.file.")
    listing_path = os.path.join(out_prefix, "tmp.filelist")

    # One temporary fasta file per RNA record.
    ids, seqs = GetFasta(rna_file)
    for index, (rid, rseq) in enumerate(zip(ids, seqs)):
        with open(part_prefix + str(index), "w") as part:
            part.write(">" + rid + "\n")
            part.write(rseq + "\n")

    # Collect the names of the temporary files via the shell.
    subprocess.call(
        "ls " + out_prefix + " |grep tmp.rna.file > " + listing_path,
        shell=True)

    module_dir = os.path.dirname(os.path.abspath(__file__))
    scorer = os.path.join(module_dir, "../src/RNAScore2")
    with open(listing_path, "rU") as listing:
        names = [entry.strip() for entry in listing.readlines()]

    for name in names:
        part_path = os.path.join(out_prefix, name)
        part_score = os.path.join(out_prefix, name + ".r_score")
        subprocess.call(
            scorer + " -i " + part_path + " -o " + part_score + " -l 250 -r",
            shell=True)
        # Append this record's scores to the combined output.
        subprocess.call("cat " + part_score + " >> " + score_path, shell=True)
        os.remove(part_path)
        os.remove(part_score)

    os.remove(listing_path)
    return score_path
def GenerateTrans(fasta, outfile):
    '''generate translated fasta file

    fasta   -- input fasta, read through GetFasta
    outfile -- output fasta path (overwritten)

    For every input sequence, each translation returned by
    ``SixFrame(seq, direction=1)`` is written as its own record.  All
    translations of one sequence share the same header, ``>`` followed
    by the original sequence ID.

    If ``outfile`` cannot be opened, the error is written to stderr and
    the process exits with status 1.
    '''
    try:
        f = open(outfile, "w")
    except (IOError, ValueError) as e:
        sys.stderr.write(str(e) + "\n")
        sys.exit(1)

    # "with" guarantees the handle is closed even if reading or
    # translating the input raises.
    with f:
        SeqID, SeqList = GetFasta(fasta)
        for seqid, seq in zip(SeqID, SeqList):
            for tmp_protein in SixFrame(seq, direction=1):
                f.write(">" + seqid + "\n")
                f.write(tmp_protein + "\n")
def GenAnnoEDPfeature(rna_file, pro_file, logscore_dict=None):
    '''generate rna, protein features

    rna_file      -- RNA fasta, read through GetFasta
    pro_file      -- not read by this function; the parameter mirrors
                     the signature of GenEDPfeature
    logscore_dict -- passed unchanged to GetRNAfea

    RNA features are computed with ``GetRNAfea``.  Protein features are
    loaded from the whitespace-separated table
    ``../src/Swiss-Uniprot.human.protein.seq_fea`` located relative to
    this module: the first field of each line is the protein ID and the
    remaining fields are stored re-joined with tabs.  Blank lines in the
    table are skipped rather than raising IndexError.

    Returns ``[rna_fea_1, rna_fea_2, pro_fea]``, three dicts keyed by
    sequence ID.
    '''
    rna_fea_1 = {}
    rna_fea_2 = {}
    pro_fea = {}

    rna_ID, rna_Seq = GetFasta(rna_file)
    for rna_id, rna_seq in zip(rna_ID, rna_Seq):
        nn_edp_fea, rna_lncfea = GetRNAfea(rna_seq, logscore_dict)
        rna_fea_1[rna_id] = nn_edp_fea
        rna_fea_2[rna_id] = rna_lncfea

    exedir = os.path.dirname(os.path.abspath(__file__))
    pro_fea_file = os.path.join(
        exedir, "../src/Swiss-Uniprot.human.protein.seq_fea")
    with open(pro_fea_file, "rU") as fp1:
        for line in fp1:
            fields = line.split()
            if not fields:
                continue
            pro_fea[fields[0]] = "\t".join(fields[1:])

    return [rna_fea_1, rna_fea_2, pro_fea]
def HexamerFrequency(fasta):
    '''count the hexamer usage as features

    fasta -- fasta file, read through GetFasta

    For every sequence the ORF returned by ``GetORF`` is scanned in
    steps of three bases; each window covers two consecutive codons
    (six bases).  Every window adds to the total, but only windows that
    appear in ``_6mer_list`` are counted per hexamer.

    Returns a dict mapping each hexamer in ``_6mer_list`` to its count
    divided by the total number of windows.  If no window was seen at
    all (no sequences, or no ORF with at least two codons), every
    frequency is 0.0 instead of raising ZeroDivisionError.
    '''
    HexamerCount = dict.fromkeys(_6mer_list, 0.0)

    SeqID, SeqList = GetFasta(fasta)
    totalcount = 0.0
    for seq in SeqList:
        ORF = GetORF(seq)
        # Floor division keeps the codon count an integer on both
        # Python 2 and Python 3.
        num = len(ORF) // 3
        # ORFs with fewer than two codons give an empty range.
        for i in range(num - 1):
            totalcount += 1.0
            tmp = ORF[i * 3:(i + 2) * 3]
            if tmp in HexamerCount:
                HexamerCount[tmp] += 1.0

    if totalcount:
        for k in list(HexamerCount):
            HexamerCount[k] = HexamerCount[k] / totalcount
    return HexamerCount