# Import data as pandas dataframe.
# NOTE(review): the original indentation of this fragment was lost. The
# parse/vectorise steps are assumed to belong to this `if`, and the model
# loading further down is assumed to run unconditionally -- confirm against
# the full script.
if test_data_gen in ('False', 'false'):
    print("Parsing data...")
    print("\n")
    data = pdp.pre_vec_parser(filepath, window_size)

    # Vectorise data
    print("Vectorising data...")
    print("\n")
    # Deliberately kept as `== False`: a plain truthiness test would also
    # route None or '' into the non-PSSM branch, which the original did not.
    if use_pssm == False:
        X_test, Y_test = ddp.skl_parser(data, window_size)
    else:
        pssm_loc = repo_loc + 'SignalP/input/test_pssms/'
        data_pssm = pdp.pssm_parser(data, window_size, pssm_loc)
        # Vectorise the output of pssm_parser. The original passed the raw
        # `data` frame here, leaving `data_pssm` computed but never used.
        X_test, Y_test = pdp.skl_pssm_parser(
            data_pssm, window_size, pssm_type='freq'
        )

# Test model
scores = oD()
scores['labels'] = np.array([-1, 0, 1])
# joblib.load unpickles the file, which can execute arbitrary code; only
# point `inpath + model_name` at model files from a trusted source.
model = joblib.load(inpath + model_name)
# Import data as pandas dataframe
for message in ("Parsing data...", "\n"):
    print(message)
data = pdp.pre_vec_parser(filepath, window_size)

# Vectorise data: PSSM-based features when requested, otherwise the plain
# window encoding from ddp.skl_parser.
for message in ("Vectorising data...", "\n"):
    print(message)
if use_pssm != False:
    data_pssm = pdp.pssm_parser(data, window_size, pssm_loc)
    X, Y = pdp.skl_pssm_parser(data_pssm, window_size, pssm_type=pssm_type)
else:
    X, Y = ddp.skl_parser(data, window_size)

# Train model: build the classifier, announce training, start the fit timer
# and prepare the score container with the three class labels.
model = RandomForestClassifier(
    n_estimators=n_estimators,
    class_weight='balanced',
)
for message in ("Training this magnificient model...", "\n"):
    print(message)
fit_start = time.time()
scores = oD()
class_labels = np.array([-1, 0, 1])
scores['labels'] = class_labels
# Starting script: for every candidate window size, grid-search the LinearSVC
# regularisation strength C and record the best score with its C value.
start = time.time()
f1_scorer = make_scorer(f1_score, labels=[-1, 1], average='macro')
final_list = oD()

for windows in window_size:
    # Parse and vectorise the data set for this window size.
    data = ddp.pre_vec_parser(filepath, windows)
    X, Y = ddp.skl_parser(data, windows)

    clf = LinearSVC(class_weight='balanced')
    parameters = {"C": [1, 2, 4, 8]}
    model_tunning = GridSearchCV(
        clf,
        param_grid=parameters,
        scoring=f1_scorer,
        n_jobs=-3,
    )
    model_tunning.fit(X, Y)

    # Keep only the best score and the C that achieved it.
    s, p = model_tunning.best_score_, model_tunning.best_params_
    final_list[str(windows)] = [s, p['C']]

end = time.time()
# One row per window size, keyed by the window size as a string.
best_table = pd.DataFrame.from_dict(final_list, orient='index')
# Cross-validated scoring of a class-balanced LinearSVC.
clf = LinearSVC(class_weight='balanced')
scores = oD()
scores['labels'] = np.array([-1, 0, 1])

# Running totals, one slot per label; each name gets its own zero array.
p, r, f, s, ft = (np.zeros(3) for _ in range(5))

for train_data, test_data in ddp.cv_data_gen(data, cv_sets, randomise=False):
    X_train, Y_train = ddp.skl_parser(train_data, windows)
    X_test, Y_test = ddp.skl_parser(test_data, windows)

    # Time only the fit call.
    fit_start = time.time()
    clf.fit(X_train, Y_train)
    fit_end = time.time()
    fit_time = fit_end - fit_start

    predicted = clf.predict(X_test)
    precision, recall, fscore, support = score(Y_test, predicted)

    # Accumulate this fold's metrics into the running totals.
    p, r, f = p + precision, r + recall, f + fscore