示例#1
0
def Extract(infa, results, outfa):
    '''Extract lncRNA sequence

    Copy every record of the FASTA file ``infa`` whose ID is labelled
    "Noncoding" in the ``results`` table into the FASTA file ``outfa``.

    infa    -- input FASTA file, read through GetFasta()
    results -- tab-separated table; the first line is skipped as a header,
               column 1 holds the sequence ID and column 3 the label
    outfa   -- output FASTA file (overwritten)

    If the results file or the output file cannot be opened, the error is
    written to stderr and the process exits with status 1.
    '''

    SeqID, SeqList = GetFasta(infa)

    try:
        fr = open(results, 'rU')    # results file
    except (IOError, ValueError) as e:
        # sys.stderr.write() emits the same text as "print >>sys.stderr"
        sys.stderr.write(str(e) + "\n")
        sys.exit(1)

    idlist = set()
    with fr:
        next(fr, None)    # skip the header line (if any)
        for line in fr:
            fields = line.strip().split("\t")
            # Blank or truncated lines have no label column; skip them
            # instead of failing with IndexError.
            if len(fields) > 2 and fields[2] == "Noncoding":
                idlist.add(fields[0])

    try:
        fo = open(outfa, 'w')    # output file
    except (IOError, ValueError) as e:
        sys.stderr.write(str(e) + "\n")
        sys.exit(1)

    # "with" guarantees the output is flushed and closed even if a write fails
    with fo:
        for seqid, seq in zip(SeqID, SeqList):
            if seqid in idlist:
                fo.write(">" + seqid + "\n")
                fo.write(seq + "\n")
def Protein_StructureScore(protein_file, out_prefix=None):
    '''Score every protein of a FASTA file with the bundled RNAScore2 tool.

    Each record of ``protein_file`` is written to its own temporary FASTA
    file inside ``out_prefix``, RNAScore2 is run on it with ``-p`` and the
    per-record output is appended to ``<out_prefix>/protein_score``.  The
    temporary files are removed as they are processed.

    protein_file -- FASTA file, read through GetFasta()
    out_prefix   -- existing directory for the temporary files and the
                    result; None means the current directory

    Returns the path of the combined score file.  That file is opened in
    append mode, so content from an earlier run is kept, and it is not
    created when the FASTA file holds no records.

    IOError/OSError propagate when a file cannot be copied, written, read
    or removed, or when RNAScore2 cannot be started.  The exit status of
    RNAScore2 itself is not checked.
    '''
    import shutil    # only needed by this function

    if out_prefix is None:
        out_prefix = os.curdir

    protein_out = os.path.join(out_prefix, "protein_score")
    protein_file_part = os.path.join(out_prefix, "tmp.protein.file.")

    exedir = os.path.dirname(os.path.abspath(__file__))
    RNAScore2 = os.path.join(exedir, "../src/RNAScore2")

    # A copy of stride.dat is placed in the working directory while
    # RNAScore2 runs (presumably the tool looks for it there -- TODO confirm).
    stride_dat = os.path.join(exedir, "../src/stride.dat")
    tmp_stride_dat = os.path.join(os.path.abspath('.'), "stride.dat")
    # When running from the src directory itself the "copy" would be the
    # bundled file: never copy it onto itself and never delete it.
    stride_is_copy = (os.path.realpath(stride_dat) !=
                      os.path.realpath(tmp_stride_dat))
    if stride_is_copy:
        shutil.copy(stride_dat, tmp_stride_dat)

    try:
        # One temporary FASTA file per protein.  Remembering the names we
        # wrote replaces the former "ls | grep" listing, which also matched
        # stale files left behind by earlier runs.
        proID, proSeq = GetFasta(protein_file)
        part_files = []
        for i, (proid, proseq) in enumerate(zip(proID, proSeq)):
            part_name = protein_file_part + str(i)
            with open(part_name, "w") as f_tmp:
                f_tmp.write(">" + proid + "\n")
                f_tmp.write(proseq + "\n")
            part_files.append(part_name)

        # Sorted by name (0, 1, 10, 11, ..., 2) to keep the order in which
        # the "ls" based listing used to process the files.
        for tmpfile in sorted(part_files):
            tmpout = tmpfile + ".pro_score"
            # Argument list instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through untouched.
            subprocess.call([RNAScore2, "-i", tmpfile, "-o", tmpout, "-p"])

            # Append this record's scores to the combined output file
            with open(tmpout, "rb") as score_part:
                with open(protein_out, "ab") as combined:
                    shutil.copyfileobj(score_part, combined)

            os.remove(tmpfile)
            os.remove(tmpout)
    finally:
        # Remove the working copy even when scoring failed half-way
        if stride_is_copy and os.path.exists(tmp_stride_dat):
            os.remove(tmp_stride_dat)

    return protein_out
def GeneratePairs(rna_file, pro_file, out_prefix):
    '''Write, for every RNA, a file pairing it with every protein.

    The IDs of both FASTA files are read through GetFasta().  For each RNA
    ID a file named ``<out_prefix>.<part of the ID before the first "|">``
    is written with one "<rna id> <protein id>" line per protein ID.

    Returns [list of RNA IDs, list of pair file names], both in the order
    in which the RNA IDs were read.
    '''

    rna_ids = GetFasta(rna_file)[0]
    protein_ids = GetFasta(pro_file)[0]

    handled_ids = []
    pair_paths = []

    for rna_id in rna_ids:
        # Only the leading "|"-separated field of the ID goes into the name
        pair_path = ".".join((out_prefix, rna_id.split("|", 1)[0]))
        with open(pair_path, "w") as pair_handle:
            pair_handle.writelines(
                rna_id + " " + protein_id + "\n"
                for protein_id in protein_ids
            )
            pair_paths.append(pair_path)
            handled_ids.append(rna_id)

    return [handled_ids, pair_paths]
def GenEDPfeature(rna_file, pro_file, logscore_dict=None):
    '''Build per-sequence feature tables for an RNA and a protein FASTA file.

    rna_file      -- RNA FASTA file, read through GetFasta()
    pro_file      -- protein FASTA file, read through GetFasta()
    logscore_dict -- passed unchanged to GetRNAfea()

    Returns a list of three dicts keyed by sequence ID: the first and the
    second value returned by GetRNAfea() for each RNA, and the value
    returned by GetPROfea() for each protein.
    '''

    rna_ids, rna_seqs = GetFasta(rna_file)
    pro_ids, pro_seqs = GetFasta(pro_file)

    # GetRNAfea() yields two values per RNA; each goes into its own table
    first_rna_table = {}
    second_rna_table = {}
    for rid, rseq in zip(rna_ids, rna_seqs):
        first_rna_table[rid], second_rna_table[rid] = GetRNAfea(
            rseq, logscore_dict)

    protein_table = dict(
        (pid, GetPROfea(pseq)) for pid, pseq in zip(pro_ids, pro_seqs)
    )

    return [first_rna_table, second_rna_table, protein_table]
示例#5
0
def ReadProtein(pro_fa):
    '''Load a protein FASTA file into a dict.

    pro_fa -- FASTA file, read through GetFasta()

    Returns a dict mapping each sequence ID to its sequence; when an ID
    occurs more than once, the last sequence wins.
    '''

    ids, sequences = GetFasta(pro_fa)

    # zip() pairs IDs with sequences; dict() keeps the last value per key
    return dict(zip(ids, sequences))
示例#6
0
def ReadLncRNA(lncRNA_fa):
    '''Load an lncRNA FASTA file into a dict, rewriting "U" as "T".

    lncRNA_fa -- FASTA file, read through GetFasta()

    Returns a dict mapping each sequence ID to its sequence with every
    upper-case "U" replaced by "T"; when an ID occurs more than once, the
    last sequence wins.
    '''

    ids, sequences = GetFasta(lncRNA_fa)

    # Translation table that turns "U" into "T" and leaves the rest alone
    u_to_t = maketrans("U", "T")

    return dict(
        (sid, sequence.translate(u_to_t))
        for sid, sequence in zip(ids, sequences)
    )
def RNA_StructureScore(rna_file, out_prefix=None):
    '''Score every RNA of a FASTA file with the bundled RNAScore2 tool.

    Each record of ``rna_file`` is written to its own temporary FASTA file
    inside ``out_prefix``, RNAScore2 is run on it with ``-l 250 -r`` and
    the per-record output is appended to ``<out_prefix>/rna_score``.  The
    temporary files are removed as they are processed.

    rna_file   -- FASTA file, read through GetFasta()
    out_prefix -- existing directory for the temporary files and the
                  result; None means the current directory

    Returns the path of the combined score file.  That file is opened in
    append mode, so content from an earlier run is kept, and it is not
    created when the FASTA file holds no records.

    IOError/OSError propagate when a file cannot be written, read or
    removed, or when RNAScore2 cannot be started.  The exit status of
    RNAScore2 itself is not checked.
    '''
    import shutil    # only needed by this function

    if out_prefix is None:
        out_prefix = os.curdir

    rna_out = os.path.join(out_prefix, "rna_score")
    rna_file_part = os.path.join(out_prefix, "tmp.rna.file.")

    exedir = os.path.dirname(os.path.abspath(__file__))
    RNAScore2 = os.path.join(exedir, "../src/RNAScore2")

    # One temporary FASTA file per RNA.  Remembering the names we wrote
    # replaces the former "ls | grep" listing, which also matched stale
    # files left behind by earlier runs.
    rnaID, rnaSeq = GetFasta(rna_file)
    part_files = []
    for i, (rnaid, rnaseq) in enumerate(zip(rnaID, rnaSeq)):
        part_name = rna_file_part + str(i)
        with open(part_name, "w") as f_tmp:
            f_tmp.write(">" + rnaid + "\n")
            f_tmp.write(rnaseq + "\n")
        part_files.append(part_name)

    # Sorted by name (0, 1, 10, 11, ..., 2) to keep the order in which the
    # "ls" based listing used to process the files.
    for tmpfile in sorted(part_files):
        tmpout = tmpfile + ".r_score"
        # Argument list instead of a shell string: paths containing spaces
        # or shell metacharacters are passed through untouched.
        subprocess.call(
            [RNAScore2, "-i", tmpfile, "-o", tmpout, "-l", "250", "-r"])

        # Append this record's scores to the combined output file
        with open(tmpout, "rb") as score_part:
            with open(rna_out, "ab") as combined:
                shutil.copyfileobj(score_part, combined)

        os.remove(tmpfile)
        os.remove(tmpout)

    return rna_out
示例#8
0
def GenerateTrans(fasta, outfile):
    '''generate translated fasta file

    For every record of ``fasta`` (read through GetFasta()) each string
    returned by ``SixFrame(seq, direction=1)`` is written to ``outfile``
    as its own FASTA record, all under the ID of the source sequence.

    fasta   -- input FASTA file
    outfile -- output FASTA file (overwritten)

    If the output file cannot be opened, the error is written to stderr
    and the process exits with status 1.
    '''

    try:
        f = open(outfile, "w")
    except (IOError, ValueError) as e:
        # sys.stderr.write() emits the same text as "print >> sys.stderr"
        sys.stderr.write(str(e) + "\n")
        sys.exit(1)

    # "with" closes the output even when reading or translating fails
    with f:
        SeqID, SeqList = GetFasta(fasta)

        for seqid, seq in zip(SeqID, SeqList):
            header = ">" + seqid + "\n"
            for tmp_protein in SixFrame(seq, direction=1):
                f.write(header)
                f.write(tmp_protein + "\n")
def GenAnnoEDPfeature(rna_file, pro_file, logscore_dict=None):
    '''generate rna features and load the bundled protein feature table

    rna_file      -- RNA FASTA file, read through GetFasta()
    pro_file      -- not read by this function; kept so the signature
                     matches existing callers
    logscore_dict -- passed unchanged to GetRNAfea()

    Returns a list of three dicts: the first and the second value
    returned by GetRNAfea() for each RNA ID, and the protein table read
    from ../src/Swiss-Uniprot.human.protein.seq_fea (relative to this
    file).  In that table the first whitespace-separated field of a line
    is the key and the remaining fields, joined by tabs, are the value.
    '''

    rna_fea_1 = {}
    rna_fea_2 = {}
    pro_fea = {}

    rna_ID, rna_Seq = GetFasta(rna_file)

    for rna_id, rna_seq in zip(rna_ID, rna_Seq):
        nn_edp_fea, rna_lncfea = GetRNAfea(rna_seq, logscore_dict)
        rna_fea_1[rna_id] = nn_edp_fea
        rna_fea_2[rna_id] = rna_lncfea

    exedir = os.path.dirname(os.path.abspath(__file__))
    pro_fea_file = os.path.join(exedir,
                                "../src/Swiss-Uniprot.human.protein.seq_fea")
    # "with" closes the table even if a line cannot be processed
    with open(pro_fea_file, "rU") as fp1:
        for line in fp1:
            fields = line.split()
            if not fields:
                # blank line: nothing to index (used to raise IndexError)
                continue
            pro_fea[fields[0]] = "\t".join(fields[1:])

    return [rna_fea_1, rna_fea_2, pro_fea]
示例#10
0
def HexamerFrequency(fasta):
    '''count the hexamer usage as features

    For every sequence of ``fasta`` (read through GetFasta()) the string
    returned by GetORF() is scanned in steps of three characters, and each
    six-character window that is a key of ``_6mer_list`` is counted.

    Returns a dict mapping every entry of ``_6mer_list`` to its count
    divided by the total number of windows scanned; windows that are not
    in ``_6mer_list`` still add to that total.  When no window was scanned
    at all, every frequency is 0.0.
    '''

    HexamerCount = dict.fromkeys(_6mer_list, 0.0)

    _seq_ids, SeqList = GetFasta(fasta)

    totalcount = 0.0
    for seq in SeqList:
        ORF = GetORF(seq)
        if len(ORF) > 3:
            # Floor division keeps the count an integer on Python 3 as well
            num = len(ORF) // 3
            for i in range(num - 1):
                totalcount += 1.0
                # Six characters starting at position 3*i
                tmp = ORF[i * 3:(i + 2) * 3]
                if tmp in HexamerCount:
                    HexamerCount[tmp] += 1.0

    # Without any window all counts are already 0.0; dividing would raise
    # ZeroDivisionError.
    if totalcount:
        for k in list(HexamerCount):
            HexamerCount[k] /= totalcount

    return HexamerCount