def predictSequences(fasta_file_path, print_fn=print_fn, out_dir="RF-HOT", threshold=0.5, model_type="RF-HOT"):
    """Read a FASTA file and delegate promoter prediction to the string-based path.

    Args:
        fasta_file_path: Path to an existing FASTA file with the sequences to score.
        print_fn: Logger taking ``(message, log_file)``; falls back to ``print``
            (console output) when ``None``.
        out_dir: Output folder; created if missing. The log file
            ``sequences.log.txt`` is written there.
        threshold: Prediction threshold, forwarded to
            ``predictSequencesFromString``.
        model_type: Model/data-type identifier ("RF-HOT", "RF-TETRA", "GRU",
            "LSTM"), forwarded unchanged.

    Raises:
        ValueError: If ``fasta_file_path`` does not exist.
    """
    start_time = time.time()
    log_file = os.path.join(out_dir, "sequences.log.txt")
    if print_fn is None:
        print("NO LOG FILE SPECIFIED. REDIRECTING OUTPUT TO CONSOLE.")
        print_fn = print
    if not os.path.exists(fasta_file_path):
        raise ValueError(
            "FASTA PATH {} DOES NOT EXISTS.".format(fasta_file_path))
    os.makedirs(out_dir, exist_ok=True)
    print_fn("\n\n READING FASTA FILE: {}".format(fasta_file_path), log_file)
    # BUG FIX: the file handle was opened and never closed; use a context
    # manager so it is released deterministically.
    with open(fasta_file_path, "r") as seqs_file:
        seqs_file_content = seqs_file.read()
    predictSequencesFromString(sequences_str=seqs_file_content,
                               print_fn=print_fn,
                               out_dir=out_dir,
                               threshold=threshold,
                               model_type=model_type,
                               log_file=log_file,
                               start_time=start_time)
def parseSequences(seqs, sequence_length=40, tokenizer_path="./models/tokenizer.data", data_type="RF-HOT", log_file=None, print_fn=print_fn):
    """Upper-case, trim, and convert raw sequences into model-ready features.

    Args:
        seqs: Sequence of nucleotide strings; must be non-empty. All sequences
            are assumed to share the length of the first one (only the first is
            measured) — TODO confirm with callers.
        sequence_length: Maximum sequence length; longer sequences are cut to
            their first ``sequence_length`` characters.
        tokenizer_path: Path to the tokenizer file, forwarded to ``dataConverter``.
        data_type: Encoding identifier (e.g. "RF-HOT"), forwarded to
            ``dataConverter``.
        log_file: Log destination; when ``None``, output is redirected to the
            console.
        print_fn: Logger taking ``(message, log_file)``.

    Returns:
        The DataFrame produced by ``dataConverter``.

    Raises:
        ValueError: If ``seqs`` is empty.
    """
    if len(seqs) == 0:
        # BUG FIX: the original had an unreachable `return` after this raise.
        raise ValueError("NO SEQUENCES TO PARSE. " + str(seqs))
    if log_file is None:
        print_fn = print
        print_fn("\n\n NO LOG FILE SPECIFIED. REDIRECTING OUTPUT TO CONSOLE.",
                 log_file)
    sample, sample_len = seqs[0], len(seqs[0])
    seqs = np.array([s.upper() for s in seqs])
    print_fn(
        "\n\n INPUT SHAPE {} \nSAMPLE WITH LEN {}: \n{}".format(
            len(seqs), sample_len, sample), log_file)
    if sample_len > sequence_length:
        seqs = np.array([s[0:sequence_length] for s in seqs])
        print_fn(
            "\n\n OUTPUT SAMPLE WITH LEN {}: \n{}".format(
                seqs.shape, seqs[0]), log_file)
    print_fn("\n\n CONVERTING DATA", log_file)
    data_df = dataConverter(seqs=seqs,
                            data_type=data_type,
                            tokenizer_path=tokenizer_path,
                            print_fn=print_fn,
                            log_file=log_file)
    return data_df
def predictSequencesFromString(sequences_str, print_fn=print_fn, out_dir="RF-HOT", threshold=0.5, model_type="RF-HOT", log_file=None, start_time=None):
    """Parse FASTA text, encode the sequences, score them, and save a CSV.

    Args:
        sequences_str: FASTA-formatted text (headers + sequences).
        print_fn: Logger taking ``(message, log_file)``; falls back to ``print``
            when ``log_file`` is ``None``.
        out_dir: Folder where ``sequences_predictions.csv`` is written (assumed
            to exist — TODO confirm callers create it).
        threshold: Kept for interface compatibility; not used in this function.
        model_type: "RF-HOT"/"RF-TETRA" load a joblib model; "GRU"/"LSTM" load
            a Keras ``.h5`` model.
        log_file: Log destination passed through to ``print_fn``.
        start_time: Epoch seconds used as the reference point for elapsed-time
            logs; defaults to the moment of this call.

    Returns:
        Tuple ``(chroms, seqs, y_pred, X)``.
    """
    # BUG FIX: the original default `start_time=time.time()` was evaluated once
    # at import time, so elapsed-time logs measured from module load rather
    # than from the call. The None sentinel restores the intended meaning.
    if start_time is None:
        start_time = time.time()

    def _elapsed():
        # Shared elapsed-time formatting, hoisted out of the repeated calls.
        return time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))

    if log_file is None:
        print_fn = print
        print_fn("\n\n NO LOG FILE SPECIFIED. REDIRECTING OUTPUT TO CONSOLE.",
                 log_file)
    seqs_str = StringIO(sequences_str)
    parsed_seqs = SeqIO.parse(seqs_str, "fasta")
    seqs_data = np.array([{
        "id": s.id,
        "seq": str(s.seq)
    } for s in parsed_seqs])
    print_fn(
        "\n\n SAMPLE: \n\n{}".format(
            seqs_data[0] if len(seqs_data) > 0 else seqs_data), log_file)
    chroms = [s["id"] for s in seqs_data]
    seqs = [s["seq"] for s in seqs_data]
    print_fn("\n\n # SEQS: {}. SAMPLE: {}".format(len(seqs), seqs[0]),
             log_file)
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    print_fn("\n\n CONVERTING SEQUENCES TO {} DATA TYPE".format(model_type),
             log_file)
    X = parseSequences(seqs,
                       print_fn=print_fn,
                       data_type=model_type,
                       log_file=log_file)
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    model = None
    if model_type in ("RF-HOT", "RF-TETRA"):
        model_path = os.path.join("models", "{}.model".format(model_type))
        print_fn("\n\n LOADING ML MODEL {}".format(model_path), log_file)
        model = joblib.load(model_path)
    if model_type in ("GRU", "LSTM"):
        model_version = "0" if model_type == "GRU" else "3"
        model_path = os.path.join(
            "models", "{}-{}.h5".format(model_type, model_version))
        print_fn("\n\n LOADING ML MODEL {}".format(model_path), log_file)
        model = tf.keras.models.load_model(model_path)
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    print_fn("\n\n PREDICTING SEQUNCES USING: \n\n{}".format(model), log_file)
    # NOTE(review): Keras models do not expose predict_proba in TF2 — confirm
    # the GRU/LSTM path is actually exercised with this call.
    y_probs = model.predict_proba(X)
    # Two-column output => take the positive-class column; otherwise column 0.
    y_pred = y_probs[:, 1] if y_probs.shape[1] == 2 else y_probs[:, 0]
    df = pd.DataFrame({"CHROM": chroms, "SEQ": seqs, "PRED": y_pred})
    pred_file_path = os.path.join(out_dir, "sequences_predictions.csv")
    print_fn(
        "\n\t PREDICTIONS GENERATED SUCCESSFULLY. SAMPLE: \n\n{}. \n\nSAVED AT {}"
        .format(df.head(), pred_file_path), log_file)
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    df.to_csv(pred_file_path, index=None, sep='\t', columns=None)
    return chroms, seqs, y_pred, X
def parseGenome40NTSequences(fasta_file_path,
                             out_dir="RF-HOT",
                             promoter_size=40,
                             step_size=1,
                             test_sample_size=None,
                             data_type="RF-HOT",
                             tokenizer_path="./models/tokenizer.data",
                             print_fn=print_fn):
    """Cut a genome FASTA into windows and dump encoded forward/inverse data.

    Slides a ``promoter_size``-NT window over the genome, encodes both the
    forward and the reverse-complement strand with ``dataConverter``, and
    saves the intermediates (``CHROM.data``, ``SEQS.data``, ``SEQS-INV.data``,
    ``<data_type>.data``, ``<data_type>-INV.data``) to ``out_dir`` via joblib.

    Args:
        fasta_file_path: Path to an existing genome FASTA file.
        out_dir: Output folder; created if missing.
        promoter_size: Sliding-window length in nucleotides.
        step_size: Sliding-window step.
        test_sample_size: If given, only the first N windows are processed.
        data_type: Encoding identifier, forwarded to ``dataConverter``.
        tokenizer_path: Tokenizer file, forwarded to ``dataConverter``.
        print_fn: Logger taking ``(message, log_file)``; ``None`` redirects to
            the console.

    Returns:
        The encoded forward-strand DataFrame ``X``.

    Raises:
        ValueError: If ``out_dir`` is ``None`` or ``fasta_file_path`` is missing.
    """
    if print_fn is None:
        print("NO LOG FILE SPECIFIED. REDIRECTING OUTPUT TO CONSOLE.")
        print_fn = print
    if out_dir is None:
        # BUG FIX: `raise "string"` is a TypeError in Python 3 (only
        # BaseException subclasses can be raised); raise ValueError instead.
        raise ValueError(
            "VARIABLE out_dir is None. Please specify the output folder.")
    if not os.path.exists(fasta_file_path):
        raise ValueError(
            "FASTA PATH {} DOES NOT EXISTS.".format(fasta_file_path))
    log_file = os.path.join(out_dir, "parse.log.txt")
    start_time = time.time()

    def _elapsed():
        # Shared elapsed-time formatting, hoisted out of the repeated calls.
        return time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))

    os.makedirs(out_dir, exist_ok=True)
    print_fn("\n\n CREATING OUTPUT FOLDER: {}".format(out_dir), log_file)
    chrom, cutted_seqs = genomeSlidingWindow(fasta_file_path=fasta_file_path,
                                             log_file=log_file,
                                             promoter_size=promoter_size,
                                             step_size=step_size,
                                             print_fn=print_fn)
    if test_sample_size is not None:
        print_fn(
            "\n\n TEST SAMPLE OF SIZE: {:,} REQUESTED. REDUCING SEQUENCES FROM {:,} TO {:,}"
            .format(test_sample_size, cutted_seqs.shape[0], test_sample_size),
            log_file)
        cutted_seqs = cutted_seqs[:test_sample_size]
    print_fn(
        "\n\n CONVERTING {} CUTTED {} NT SEQUENCES TO {} SEQUENCES USING MAPPING VALUES \n\n\t{}"
        .format(len(cutted_seqs), promoter_size, data_type,
                [{nt: charToBinary(nt)} for nt in "AGCT"]), log_file)
    print_fn("\n\n CONVERTING DATA", log_file)
    X = dataConverter(seqs=cutted_seqs,
                      data_type=data_type,
                      tokenizer_path=tokenizer_path,
                      print_fn=print_fn,
                      log_file=log_file)
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    print_fn(
        "\n\n {} SEQUENCES GENERATED SUCCESSFULLY. OUTPUT DATAFRAME SHAPE: {}".
        format(data_type, X.shape), log_file)
    print_fn("\n\n SAMPLE: \n\n{}".format(X.head()), log_file)
    forward_strand_output_path = os.path.join(out_dir,
                                              "{}.data".format(data_type))
    chrom_output_path = os.path.join(out_dir, "CHROM.data")
    seqs_output_path = os.path.join(out_dir, "SEQS.data")
    print_fn(
        "\n\n SAVING FORWARD STRAND HOT-ENCODED SEQUENCES TO BINARY FILE USING JOBLIB TO: {} "
        .format(forward_strand_output_path), log_file)
    joblib.dump(chrom, chrom_output_path)
    joblib.dump(cutted_seqs, seqs_output_path)
    joblib.dump(X, forward_strand_output_path)
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    print_fn(
        "\n\n FILE SAVED SUCCESSFULLY AT: \n\t{}".format(
            forward_strand_output_path), log_file)
    print_fn("\n\n GENERATING INVERSE STRAND SEQUENCES. ", log_file)
    # Reverse then complement == reverse complement of each window.
    inv_cutted_seqs = np.array([
        str(Seq.Seq(cutted_seqs[i_s][::-1]).complement())
        for i_s in progressbar.progressbar(range(0, len(cutted_seqs)))
    ])
    print_fn(
        '\n\n INVERSE STRAND SEQUENCES GENERATED SUCCESSFULLY. # OF SAMPLES: {:,}. \n\tSAMPLE: \n\t\tORIGINAL : {} \n\t\tINVERSE : {}'
        .format(
            inv_cutted_seqs.shape[0],
            cutted_seqs[0],
            inv_cutted_seqs[0],
        ), log_file)
    inv_seqs_output_path = os.path.join(out_dir, "SEQS-INV.data")
    joblib.dump(inv_cutted_seqs, inv_seqs_output_path)
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    print_fn(
        "\n\n CONVERTING {} INVERSE STRAND {} NT SEQUENCES TO {} SEQUENCES USING MAPPING VALUES \n\n\t{}"
        .format(len(inv_cutted_seqs), promoter_size, data_type,
                [{nt: charToBinary(nt)} for nt in "AGCT"]), log_file)
    print_fn("\n\n CONVERTING INVERSE DATA", log_file)
    X_INV = dataConverter(seqs=inv_cutted_seqs,
                          data_type=data_type,
                          tokenizer_path=tokenizer_path,
                          print_fn=print_fn,
                          log_file=log_file)
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    print_fn(
        "\n\n {} SEQUENCES GENERATED SUCCESSFULLY. OUTPUT DATAFRAME SHAPE: {}".
        format(data_type, X_INV.shape), log_file)
    print_fn("\n\n SAMPLE: \n\n{}".format(X_INV.head()), log_file)
    inverse_strand_output_path = os.path.join(
        out_dir, "{}-INV.data".format(data_type))
    print_fn(
        "\n\n SAVING INVERSE STRAND SEQUENCES TO BINARY FILE USING JOBLIB TO: {} "
        .format(inverse_strand_output_path), log_file)
    joblib.dump(X_INV, inverse_strand_output_path)
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    print_fn(
        "\n\n FILE SAVED SUCCESSFULLY AT: \n\t{}".format(
            inverse_strand_output_path), log_file)
    return X
def predictGenomeSequences(
    input_dir="RF-HOT",
    out_dir="RF-HOT",
    model_type="RF-HOT",
    threshold=0.5,
    print_fn=print_fn,
):
    """Score pre-parsed genome windows on both strands and write a BED-like CSV.

    Loads the artifacts produced by ``parseGenome40NTSequences`` from
    ``input_dir``, predicts promoter probabilities for the forward and
    inverse strands, and writes rows whose score exceeds ``threshold`` to
    ``out_dir/genome_predictions.csv``.

    Args:
        input_dir: Folder holding ``CHROM.data``, ``SEQS.data``,
            ``SEQS-INV.data`` and the encoded ``<model_type>[-INV].data`` files.
        out_dir: Folder for the log file and the predictions CSV.
        model_type: "RF-HOT"/"RF-TETRA" load a joblib model; "GRU"/"LSTM" load
            a Keras ``.h5`` model.
        threshold: Minimum score for a window to be kept in the output.
        print_fn: Logger taking ``(message, log_file)``; ``None`` redirects to
            the console.

    Raises:
        ValueError: If any required input path is missing.
    """
    if print_fn is None:
        print("NO LOG FILE SPECIFIED. REDIRECTING OUTPUT TO CONSOLE.")
        print_fn = print
    log_file = os.path.join(out_dir, "predict.log.txt")
    chrom_output_path = os.path.join(input_dir, "CHROM.data")
    seqs_output_path = os.path.join(input_dir, "SEQS.data")
    inv_seqs_output_path = os.path.join(input_dir, "SEQS-INV.data")
    forward_strand_seqs_file = os.path.join(input_dir,
                                            "{}.data".format(model_type))
    inverse_strand_seqs_file = os.path.join(input_dir,
                                            "{}-INV.data".format(model_type))
    # Collapse the six copy-pasted existence checks into a single loop.
    for required_path in (out_dir, forward_strand_seqs_file,
                          inverse_strand_seqs_file, chrom_output_path,
                          seqs_output_path, inv_seqs_output_path):
        if not os.path.exists(required_path):
            raise ValueError(
                "FILE PATH {} DOES NOT EXISTS. PLEASE PARSE THE GENOME FILE FIRST."
                .format(required_path))
    start_time = time.time()

    def _elapsed():
        # Shared elapsed-time formatting, hoisted out of the repeated calls.
        return time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))

    model = None
    if model_type in ("RF-HOT", "RF-TETRA"):
        model_path = os.path.join("models", "{}.model".format(model_type))
        print_fn(
            "\n\n LOADING MACHINE LEARNING MODEL AT: {} WITH SIZE: {:,.2f} MB".
            format(model_path,
                   Path(model_path).stat().st_size / 1000000), log_file)
        model = joblib.load(model_path)
    if model_type in ("GRU", "LSTM"):
        model_version = "0" if model_type == "GRU" else "3"
        model_path = os.path.join(
            "models", "{}-{}.h5".format(model_type, model_version))
        print_fn(
            "\n\n LOADING MACHINE LEARNING MODEL AT: {} WITH SIZE: {:,.2f} MB".
            format(model_path,
                   Path(model_path).stat().st_size / 1000000), log_file)
        model = tf.keras.models.load_model(model_path)
    print_fn("\n\n{}\n\n".format(str(model)), log_file)
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    print_fn(
        "\n\n LOADING FORWARD STRAND SEQUENCES CONVERTED TO HOT-ENCODED SEQUENCES: {} WITH SIZE: {:,.2f} MB"
        .format(forward_strand_seqs_file,
                Path(forward_strand_seqs_file).stat().st_size / 1000000),
        log_file)
    X = joblib.load(forward_strand_seqs_file)
    print_fn(
        "\n\n LOADING INVERSE STRAND SEQUENCES CONVERTED TO HOT-ENCODED SEQUENCES AT: {} WITH SIZE: {:,.2f} MB"
        .format(inverse_strand_seqs_file,
                Path(inverse_strand_seqs_file).stat().st_size / 1000000),
        log_file)
    X_INV = joblib.load(inverse_strand_seqs_file)
    print_fn(
        "\n\n FORWARD STRAND SEQS: {} \nINVERSE STRAND SEQS: {} \nML-MODEL: \n\n{}"
        .format(X.shape, X_INV.shape, str(model)), log_file)
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    print_fn(
        "\n\n GENERATING PREDICTIONS FOR FORWARD STRAND SEQUENCES WITH SHAPE: {}"
        .format(X.shape), log_file)
    # NOTE(review): Keras models do not expose predict_proba in TF2 — confirm
    # the GRU/LSTM path is actually exercised with these calls.
    y_probs = model.predict_proba(X)
    y_pred = y_probs[:, 1] if y_probs.shape[1] == 2 else y_probs[:, 0]
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    print_fn(
        "\n\n GENERATING PREDICTIONS FOR INVERSE STRAND SEQUENCES WITH SHAPE: {}"
        .format(X_INV.shape), log_file)
    y_inv_probs = model.predict_proba(X_INV)
    y_inv_pred = y_inv_probs[:, 1] if y_inv_probs.shape[
        1] == 2 else y_inv_probs[:, 0]
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
    print_fn("\n\n PREDICTIONS GENERATED SUCCESSFULLY", log_file)
    print_fn(
        "\n\n FORWARD SEQUENCE SAMPLE: \n\tSEQ: \n\n{} \n\tPREDICTION: {:.4f}\n"
        .format("".join(map(str, X.iloc[0].values)), y_pred[0]), log_file)
    print_fn(
        "\n\n INVERSE SEQUENCE SAMPLE: \n\tSEQ: {} \n\tPREDICTION: {:.4f}\n".
        format("".join(map(str, X_INV.iloc[0].values)), y_inv_pred[0]),
        log_file)
    print_fn(
        "\n\n FORWARD STRAND PREDICTIONS ABOVE THRESHOLD: {:,} and BELOW: {:,} FROM TOTAL {:,}"
        .format(len(y_pred[y_pred >= threshold]),
                len(y_pred[y_pred < threshold]), len(y_pred)), log_file)
    print_fn(
        "\n\n INVERSE STRAND PREDICTIONS ABOVE THRESHOLD: {:,} and BELOW: {:,} FROM TOTAL {:,}"
        .format(len(y_inv_pred[y_inv_pred >= threshold]),
                len(y_inv_pred[y_inv_pred < threshold]),
                len(y_inv_pred)), log_file)
    chrom = joblib.load(chrom_output_path)
    seqs = joblib.load(seqs_output_path)
    inv_seqs = joblib.load(inv_seqs_output_path)
    print_fn(
        "\n\n GENERATING DETECTED PROMOTERS' BED FILE BASED ON THRESHOLD: {:.3f} FOR CHROM: {}. # SEQS: {} # INV SEQS: {}. # SEQS ABOVE THRES: {} # INV SEQS ABOVE THRES: {}"
        .format(threshold, chrom, len(seqs), len(inv_seqs),
                len(y_pred[y_pred >= threshold]),
                len(y_inv_pred[y_inv_pred >= threshold])), log_file)
    # BUG FIX / PERF: DataFrame.append in a loop is O(n^2) and was removed in
    # pandas >= 2.0. Accumulate plain dicts and build the frame once.
    bed_columns = ['chrom', 'start', 'end', 'score', 'strand', 'sequence']
    rows = []
    for i_s, s in enumerate(seqs):
        pred_score = y_pred[i_s]
        if pred_score > threshold:
            rows.append({
                'chrom': chrom,
                'start': i_s,
                # NOTE(review): end = start + 39 hard-codes a 40-NT window —
                # confirm this matches the promoter_size used at parse time.
                'end': i_s + 39,
                'score': np.round(pred_score, 5),
                'strand': "+",
                'sequence': seqs[i_s]
            })
        inv_pred_score = y_inv_pred[i_s]
        if inv_pred_score > threshold:
            rows.append({
                'chrom': chrom,
                'start': i_s,
                'end': i_s + 39,
                'score': np.round(inv_pred_score, 5),
                'strand': "-",
                'sequence': inv_seqs[i_s]
            })
    df = pd.DataFrame(rows, columns=bed_columns)
    pred_file_path = os.path.join(out_dir, "genome_predictions.csv")
    print_fn(
        "\n\n SAVING BED FILE WITH SHAPE {} TO {}. SAMPLE: \n\n{}".format(
            df.shape, pred_file_path, df.head()), log_file)
    df.to_csv(pred_file_path, index=None, sep='\t', columns=None)
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(_elapsed()),
        log_file)
def genomeSlidingWindow(fasta_file_path, log_file=None, promoter_size=40, step_size=1, print_fn=print):
    """Join all chromosomes of a FASTA file and cut the genome into windows.

    Args:
        fasta_file_path: Path to the genome FASTA file; must not be ``None``.
        log_file: Log destination; ``None`` redirects output to the console.
        promoter_size: Window length in nucleotides.
        step_size: Step between consecutive window starts.

    Returns:
        Tuple ``(chrom, cutted_seqs)`` where ``chrom`` is the comma-joined
        chromosome IDs and ``cutted_seqs`` is a numpy array of window strings.

    Raises:
        ValueError: If ``fasta_file_path`` is ``None``.
    """
    if log_file is None:
        print("NO LOG FILE SPECIFIED. REDIRECTING OUTPUT TO CONSOLE.")
        print_fn = print
    if fasta_file_path is None:
        # BUG FIX: `raise "string"` is a TypeError in Python 3 (only
        # BaseException subclasses can be raised); raise ValueError instead.
        raise ValueError(
            "VARIABLE fasta_file_path is None. Please specify the genome FASTA file."
        )
    start_time = time.time()
    # BUG FIX: the file handle was opened and never closed; use a context
    # manager so it is released deterministically.
    with open(fasta_file_path, "r") as seqs_file:
        seqs_file_content = seqs_file.read()
    seqs_file_str = StringIO(seqs_file_content)
    parsed_seqs = SeqIO.parse(seqs_file_str, "fasta")
    seqs_data = np.array([{
        "id": s.id,
        "seq": str(s.seq)
    } for s in parsed_seqs])
    chroms = [s["id"] for s in seqs_data]
    genomes = [s["seq"] for s in seqs_data]
    # Free the large intermediates early to keep peak memory down (the
    # original guarded each del with `if x is not None`, but these locals are
    # always bound here, so the guards were dead code).
    del seqs_file_str
    del seqs_data
    print_fn("\n\n PRINTING CONTENT", log_file)
    for i_s, s in enumerate(genomes):
        print_fn(
            "{}. GENOME: {} - LENGTH: {}".format(i_s + 1, chroms[i_s],
                                                 len(s)), log_file)
    print_fn(
        "\n\n JOINING ALL CHROMS AND SEQS INTO A SINGLE FOR TETRA-NUCLEOTIDE SLIDING WINDOW",
        log_file)
    chrom = ",".join(chroms)
    genome = "".join(genomes)
    del chroms
    del genomes
    print_fn(
        "\n\n JOINED GENOME: {} - LENGTH: {:,}".format(chrom, len(genome)),
        log_file)
    # NOTE(review): this yields len(genome) - promoter_size - 1 windows and so
    # drops the final full window; preserved because downstream start/end
    # offsets are derived from the same indexing — confirm intended.
    n_samples = len(genome) - promoter_size - 1
    print_fn(
        "\n\n GENERATING PROMOTER SEQUENCES WITH WINDOW-SIZE: {} AND STEP: {}. EXPECTED SAMPLES: {:,}\n\n"
        .format(promoter_size, step_size, n_samples), log_file)
    cutted_seqs = np.array([
        genome[i:i + promoter_size]
        for i in progressbar.progressbar(range(0, n_samples, step_size))
    ])
    print_fn(
        "\n\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(
            time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))),
        log_file)
    del genome
    print_fn(
        '\n\n CUTTED {} NT SEQUENCES GENERATED SUCCESSFULLY. # OF SAMPLES: {:,} = {}. \n\tSAMPLE #1: {} \n\tSAMPLE #2: {} '
        .format(promoter_size, cutted_seqs.shape[0], cutted_seqs.shape,
                cutted_seqs[0] if len(cutted_seqs) > 0 else None,
                cutted_seqs[1] if len(cutted_seqs) > 1 else None), log_file)
    return chrom, cutted_seqs