def measure_rbp(entry):
    """Train, evaluate and save a pysster model for one data set entry.

    The function predicts annotated secondary structures for four FASTA
    files, trains a model on the first two, predicts the last two, prints
    the elapsed time and writes ROC / precision-recall plots, kernel
    visualizations and the pickled model into ``<entry[4]>_pysster/``.

    Args:
        entry: Indexable with five items. ``entry[0]`` and ``entry[1]`` are
            the FASTA paths used for training, ``entry[2]`` and ``entry[3]``
            the FASTA paths used for testing, and ``entry[4]`` is the name
            used as output-folder prefix and in the timing message.

    Side effects:
        Writes ``<fasta>.struct`` next to each of the four input files and
        creates the output folder with ``roc.pdf``, ``prec.pdf``, the kernel
        visualizations and ``model.pkl``.
    """
    import os
    from time import time

    from pysster import utils

    output_folder = entry[4] + "_pysster/"
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    start = time()

    # predict secondary structures
    fasta_files = [entry[idx] for idx in range(4)]
    struct_files = [fasta + ".struct" for fasta in fasta_files]
    for fasta, struct in zip(fasta_files, struct_files):
        utils.predict_structures(fasta, struct, annotate=True)

    # Imported only after structure prediction, exactly as before, so the
    # import cost stays inside the timed section.
    from pysster.Data import Data
    from pysster.Model import Model

    # load data
    data = Data(struct_files[:2], ("ACGU", "HIMS"))
    # we need to have at least one test sequence, even though we have a
    # separate test object
    data.train_val_test_split(0.8, 0.1999)

    # training
    params = {"kernel_len": 8}
    model = Model(params, data)
    model.train(data)

    # load and predict test data
    data_test = Data(struct_files[2:], ("ACGU", "HIMS"))
    predictions = model.predict(data_test, "all")

    stop = time()
    print("{}, time in seconds: {}".format(entry[4], stop - start))

    # performance evaluation
    labels = data_test.get_labels("all")
    utils.plot_roc(labels, predictions, output_folder + "roc.pdf")
    utils.plot_prec_recall(labels, predictions, output_folder + "prec.pdf")

    # get motifs
    activations = model.get_max_activations(data_test, "all")
    _ = model.visualize_all_kernels(activations, data_test, output_folder)

    # save model to drive
    utils.save_model(model, "{}model.pkl".format(output_folder))
def test_utils_predict_structures(self):
    """Compare utils.predict_structures output with the reference files.

    If neither the RNA python bindings can be imported nor an ``RNAfold``
    executable is found, the call is expected to fail; the test then only
    verifies that an error is raised and stops. Otherwise the prediction is
    run with ``annotate=False`` and ``annotate=True`` and each output is
    compared with its reference file under ``self.folder + "/data"``.

    Raises:
        RuntimeError: if no folding backend is available but
            ``predict_structures`` still returns without an error.
    """
    fasta_in = self.folder + "/data/rna_pred.fasta"
    fasta_out = gettempdir() + "/test2.fasta"

    # Probe for the RNAlib bindings; the imported name itself is not used.
    # Any import-time failure is treated as "bindings not available".
    try:
        from RNA import fold  # noqa: F401
        bindings_available = True
    except Exception:
        bindings_available = False

    # RNAfold and RNAlib bindings not available
    if not bindings_available and which("RNAfold") is None:
        try:
            utils.predict_structures(fasta_in, fasta_out, 2, False)
        except Exception:
            return  # we got an error, as expected
        # Raised outside the try block on purpose: inside it, the handler
        # above would swallow this error and the check could never fail.
        raise RuntimeError(
            'predict_structures should have raised an error at this point, but did not'
        )

    references = (
        (False, "/data/rna_pred_ref.fasta"),
        (True, "/data/rna_pred_ref_annot.fasta"),
    )
    for annotate, ref_name in references:
        utils.predict_structures(fasta_in, fasta_out, 2, annotate)
        # NOTE(review): a missing output file ends the test silently instead
        # of failing it -- confirm that this is intended.
        if not isfile(fasta_out):
            return
        try:
            with open(self.folder + ref_name, 'rt') as handle:
                ref = handle.read()
            with open(fasta_out, 'rt') as handle:
                comp = handle.read()
            self.assertEqual(ref, comp)
        finally:
            # Remove the temporary output even if the comparison fails.
            remove(fasta_out)
def main():
    """Train, evaluate and save one pysster model per RNA-binding protein.

    For each protein the function predicts annotated secondary structures
    for its four FASTA files, trains a model on the training files,
    predicts the test files, prints timing and a performance report, and
    writes plots, kernel visualizations, ``kernel_scores.txt`` and the
    pickled model into ``<NAME>_pysster/``.
    """
    rbp_names = ("PUM2", "QKI", "IGF2BP123", "SRSF1", "TAF2N", "NOVA")

    # Each entry: (train positive, train negative, test positive,
    # test negative, name). File names use the lower-cased protein name.
    RBPs = []
    for name in rbp_names:
        prefix = "data/" + name.lower()
        RBPs.append((
            prefix + ".train.positive.fasta",
            prefix + ".train.negative.fasta",
            prefix + ".test.positive.fasta",
            prefix + ".test.negative.fasta",
            name,
        ))

    for entry in RBPs:
        output_folder = entry[4] + "_pysster/"
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(output_folder, exist_ok=True)

        start = time()

        # predict secondary structures
        fasta_files = entry[:4]
        struct_files = [fasta + ".struct.gz" for fasta in fasta_files]
        for fasta, struct in zip(fasta_files, struct_files):
            utils.predict_structures(fasta, struct, annotate=True)

        # load data
        data = Data(struct_files[:2], ("ACGU", "HIMS"))
        # we need to have at least one test sequence, even though we don't
        # need it
        data.train_val_test_split(0.8, 0.1999)
        print(data.get_summary())

        # training
        params = {"kernel_len": 8}
        model = Model(params, data)
        model.train(data)

        # load and predict test data
        data_test = Data(struct_files[2:], ("ACGU", "HIMS"))
        predictions = model.predict(data_test, "all")

        stop = time()
        print("{}, time in seconds: {}".format(entry[4], stop-start))

        # performance evaluation
        labels = data_test.get_labels("all")
        utils.plot_roc(labels, predictions, output_folder+"roc.pdf")
        utils.plot_prec_recall(labels, predictions, output_folder+"prec.pdf")
        print(utils.get_performance_report(labels, predictions))

        # get motifs; only the score of each kernel is needed afterwards
        activations = model.get_max_activations(data_test, "all")
        scores = []
        for kernel in range(model.params["kernel_num"]):
            _, score = model.visualize_kernel(activations, data_test,
                                              kernel, output_folder)
            scores.append(score)

        # sort motifs by importance score (ascending, ties keep kernel order)
        sorted_idx = sorted(range(len(scores)), key=lambda idx: scores[idx])
        with open(output_folder+"kernel_scores.txt", "wt") as handle:
            for x in sorted_idx:
                line = "kernel {:>3}: {:.3f}".format(x, scores[x])
                print(line)
                handle.write(line + "\n")

        # save model to drive
        utils.save_model(model, "{}model.pkl".format(output_folder))
alu[i].seq = alu[i].seq.transcribe() rep = [rec for rec in SeqIO.parse("rep.fasta", "fasta") if rec.seq[150] == "A"] for i in range(len(rep)): rep[i].seq = rep[i].seq.transcribe() nonrep = [ rec for rec in SeqIO.parse("nonrep.fasta", "fasta") if rec.seq[150] == "A" ] for i in range(len(nonrep)): nonrep[i].seq = nonrep[i].seq.transcribe() random.seed(42) SeqIO.write(random.sample(alu, 50000), "alu.fasta", "fasta") SeqIO.write(random.sample(rep, 50000), "rep.fasta", "fasta") SeqIO.write(random.sample(nonrep, 50000), "nonrep.fasta", "fasta") print("predict secondary structures...") utils.predict_structures("alu.fasta", "alu.fa.gz", 15, True) utils.predict_structures("rep.fasta", "rep.fa.gz", 15, True) utils.predict_structures("nonrep.fasta", "nonrep.fa.gz", 15, True) print("clean up...") os.remove("grch37.fasta") os.remove("grch37.fasta.fai") os.remove("rep.bed.gz") os.remove("alu.fasta") os.remove("rep.fasta") os.remove("nonrep.fasta") os.remove("alu.bed.gz") os.remove("nonrep.bed.gz") os.remove("rep.bed.gz") os.remove("table1_full.txt.gz")