# get model_id, model_name, filename, email in command line input model_id = sys.argv[1] model_name = sys.argv[2] filename = sys.argv[3] email = sys.argv[4] media_directory = 'media/' model_directory = 'saved_models/' training_file = pd.read_csv(media_directory + filename, delimiter=',') train_y = pd.DataFrame(training_file['label'].astype(np.int8), columns=['label']) feature_start_time = time.time() pos_ind = position_independent(training_file, 4).astype(np.int8) pos_spe = position_specific(training_file, 4).astype(np.int8) feature_end_time = time.time() print('Feature generation time: ' + str(feature_end_time - feature_start_time)) train_x = pd.concat([pos_ind, pos_spe], axis=1, sort=False) rf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=1, verbose=2) steps = [('SFM', SelectFromModel(estimator=rf, max_features=2899, threshold=-np.inf)), ('scaler', StandardScaler()), ('SVM', SVC(C=1,
# Prediction stage of the script: delete the stored ROC-curve image, load the
# pickled model, build the feature matrix for the uploaded CSV and run the
# classifier on it.

# Delete the ROC-curve image stored for this user/model pair (presumably left
# by an earlier run).
# NOTE(review): os.remove raises FileNotFoundError when the image is absent;
# confirm that the code preceding this statement guarantees it exists.
roc_curve_path = (
    static_directory + 'user_' + str(user_id) + '/' + model_id + '_roc_curve.png'
)
os.remove(roc_curve_path)

# Load the trained model. The context manager closes the file handle even if
# unpickling raises, which the bare open() call did not.
# NOTE(security): unpickling executes arbitrary code embedded in the file;
# only model files written by this application may ever reach this call.
with open(model_directory + model_id + '.pkl', 'rb') as f:
    model = pkl.load(f)

test_file = pd.read_csv(media_directory + prediction_file, delimiter=',')
test_file_x = test_file['sgRNA']

# Labels are picked up only when the CSV has exactly two columns; otherwise
# test_file_y stays an empty DataFrame.
test_file_y = pd.DataFrame(data=[])
if test_file.shape[1] == 2:
    test_file_y = test_file['label']

# --- feature generation -----------------------------------------------------
feature_start_time = time.time()
pos_ind = position_independent(test_file, 4).astype(np.int8)
pos_spe = position_specific(test_file, 4).astype(np.int8)
if str(model_type) == '1':
    # Model type '1' uses only the two positional feature sets.
    test_x = pd.concat([pos_ind, pos_spe], axis=1, sort=False)
else:
    # Every other model type additionally appends the gap features.
    gap = gap_features(test_file)
    test_x = pd.concat([pos_ind, pos_spe, gap], axis=1, sort=False)
feature_end_time = time.time()
print('Feature generation time: ' + str(feature_end_time - feature_start_time))

# --- prediction -------------------------------------------------------------
prediction_start_time = time.time()
prediction_y = model.predict(test_x)
prediction_y_proba = model.predict_proba(test_x)
prediction_end_time = time.time()
print('Prediction time: ' + str(prediction_end_time - prediction_start_time))