def SVM(X, y, X_ind, y_ind, is_reg=False):
    """Cross-validation and independent-set test for a Support Vector Machine.

    ``C`` and ``gamma`` are first selected by a grid search fitted on
    ``(X, y)``. A fresh model with the selected values is then trained on
    each of the 5 cross-validation folds; every fold model predicts its own
    held-out samples and the whole independent set.

    Arguments:
        X (ndarray): Feature data of training and validation set for
            cross-validation. m X n matrix, m is the No. of samples, n is
            the No. of features.
        y (ndarray): Label data of training and validation set for
            cross-validation. m-D vector, and m is the No. of samples.
        X_ind (ndarray): Feature data of the independent test set. It has
            the similar data structure as X.
        y_ind (ndarray): Label data of the independent test set. It has the
            similar data structure as y. Only its shape is used here.
        is_reg (bool, optional): define the model for regression (True) or
            classification (False) (Default: False)

    Returns:
        cvs (ndarray): cross-validation results, same shape as ``y``. For
            classification these are the values in column 1 of
            ``predict_proba``; for regression the predicted values.
        inds (ndarray): independent test results averaged over the fold
            models, same shape as ``y_ind``.
    """
    n_folds = 5
    if is_reg:
        folds = KFold(n_folds).split(X)
        estimator_cls = SVR
        estimator_kwargs = {}
    else:
        folds = StratifiedKFold(n_folds).split(X, y)
        estimator_cls = SVC
        # Probability estimates are required for predict_proba below.
        estimator_kwargs = {'probability': True}

    cvs = np.zeros(y.shape)
    inds = np.zeros(y_ind.shape)

    # The grid holds exactly two candidate values per hyper-parameter.
    gs = GridSearchCV(
        estimator_cls(**estimator_kwargs),
        {
            'C': 2.0**np.array([-5, 15]),
            'gamma': 2.0**np.array([-15, 5]),
        },
        n_jobs=5)
    gs.fit(X, y)
    params = gs.best_params_
    print(params)

    for trained, valided in folds:
        # Build the same estimator family that was tuned above; previously a
        # classifier was created here even when is_reg was True.
        model = estimator_cls(C=params['C'], gamma=params['gamma'],
                              **estimator_kwargs)
        model.fit(X[trained], y[trained])
        if is_reg:
            cvs[valided] = model.predict(X[valided])
            inds += model.predict(X_ind)
        else:
            cvs[valided] = model.predict_proba(X[valided])[:, 1]
            inds += model.predict_proba(X_ind)[:, 1]

    # Average the accumulated independent-set predictions over the folds.
    return cvs, inds / n_folds
class MySVM:
    """Wrapper bundling an SVM estimator with a train/test data split.

    The estimator is either loaded from ``models/<name>.joblib`` or created
    fresh: ``SVC(probability=True)`` when ``problem_type`` is
    ``'classification'``, ``SVR()`` for any other value.
    """

    def __init__(self, name, data, problem_type, load_from_file=False):
        """Store the data split and obtain the underlying estimator.

        ``data`` is indexed at positions 0-3 as
        (X_train, y_train, X_test, y_test).
        """
        self.name = name
        self._unpack_data(data)
        self.problem_type = problem_type

        if load_from_file:
            self._load_model(self.name)
            return

        is_classifier = problem_type == 'classification'
        self.model = SVC(probability=True) if is_classifier else SVR()

    def find_best_model(self, param_grid, save=False):
        """Grid-search ``param_grid`` on the training split (10-fold CV).

        The best estimator replaces ``self.model``; when ``save`` is true it
        is also dumped to ``models/<name>.joblib``. The fitted search object
        is returned.
        """
        grid_search = GridSearchCV(
            self.model,
            param_grid=param_grid,
            cv=10,
            verbose=1,
            n_jobs=-1,
        )
        grid_search.fit(self.X_train, self.y_train)
        print(grid_search.best_params_)

        self.model = grid_search.best_estimator_
        if save:
            print("saving model")
            dump(self.model, f'models/{self.name}.joblib')
        return grid_search

    def train(self, X, y):
        """Fit the current estimator on ``(X, y)``."""
        self.model.fit(X, y)

    def predict(self, X):
        """Predict for ``X``.

        Returns ``predict_proba`` output for classification, predictions
        reshaped to a single column for regression, and ``None`` for any
        other ``problem_type``.
        """
        kind = self.problem_type
        if kind == "classification":
            return self.model.predict_proba(X)
        if kind == "regression":
            predictions = self.model.predict(X)
            return predictions.reshape(-1, 1)
        return None

    def _unpack_data(self, data):
        """Copy the first four items of ``data`` onto the instance."""
        targets = ("X_train", "y_train", "X_test", "y_test")
        for position, attribute in enumerate(targets):
            setattr(self, attribute, data[position])

    def _load_model(self, name):
        """Load the estimator stored at ``models/<name>.joblib``."""
        model_path = f'models/{name}.joblib'
        self.model = load(model_path)
# NOTE(review): `model` and `Xnew` used by the next two statements are not
# defined in this fragment; presumably they come from an earlier notebook cell.
ynew = model.predict(Xnew)
print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))

# In[ ]:

from sklearn.linear_model import LogisticRegression
# `sklearn.datasets.samples_generator` is no longer importable in recent
# scikit-learn; `make_blobs` is exported from `sklearn.datasets` directly.
from sklearn.datasets import make_blobs

# generate 2d classification dataset
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)
# fit final model
model = LogisticRegression()
model.fit(X, y)
# new instances where we do not know the answer
Xnew, _ = make_blobs(n_samples=3, centers=2, n_features=2, random_state=1)
# make a prediction
ynew = model.predict_proba(Xnew)
# show the inputs and predicted probabilities
for x_row, class_probs in zip(Xnew, ynew):
    print("X=%s, Predicted=%s" % (x_row, class_probs))

# In[15]:

# creating model
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# `sklearn.externals.joblib` was removed from scikit-learn; prefer the
# standalone package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

dataframe = pandas.read_csv(
    "/home/seethalprince/cdc/CDC_Intern/Dataset/Data1.csv")
array = dataframe.values
def _score_svm_predictions(mode, model, test_features, test_labels):
    """Evaluate a fitted SVM on the test split and return a metric dict.

    For ``mode == 'classification'`` the metrics are AUROC, AveragePrecision
    and Accuracy; otherwise MAE, RMSE, MSE and R2. A metric that raises is
    reported on stdout and stored as ``np.nan`` so the remaining metrics are
    still computed.
    """
    predictions = model.predict(test_features)

    if mode == 'classification':
        # predict probabilities (needed for some metrics) and get probs of
        # positive class ([:, 1])
        prob_predictions = model.predict_proba(test_features)[:, 1]
        metrics_dict = {
            'AUROC':
            lambda: metrics.roc_auc_score(test_labels, prob_predictions),
            'AveragePrecision':
            lambda: metrics.average_precision_score(test_labels,
                                                    prob_predictions),
            'Accuracy':
            lambda: metrics.accuracy_score(test_labels, predictions),
        }
    else:
        metrics_dict = {
            'MAE':
            lambda: metrics.mean_absolute_error(test_labels, predictions),
            'RMSE':
            lambda: np.sqrt(
                metrics.mean_squared_error(test_labels, predictions)),
            'MSE':
            lambda: metrics.mean_squared_error(test_labels, predictions),
            'R2':
            lambda: metrics.r2_score(test_labels, predictions),
        }

    metric_values = {}
    for name, callable_metric in metrics_dict.items():
        # Deliberately best-effort: one failing metric must not discard the
        # others for this fold.
        try:
            metric_values[name] = callable_metric()
        except Exception as e:
            print(f'unable to calculate {name} metric')
            print(e)
            metric_values[name] = np.nan
    return metric_values


def cv(dataset, summary_df, cddd_model_dir, molbert_model_dir):
    """Train and evaluate SVMs on several molecular featurizations.

    For every (train, valid, test) index triple returned by ``get_data`` and
    for each of the four featurizers (cddd, molbert, ECFP4, rdkit_norm), an
    SVM with ``C=5.0`` is fitted on train+valid and scored on test. The
    metrics are written as JSON to
    ``./logs/<timestamp>/<dataset>/<fold>/<featurizer>_metrics.json``.

    Arguments:
        dataset: task name; passed to ``get_data`` and matched against the
            ``task_name`` column of ``summary_df``.
        summary_df: table with ``task_name`` and ``task_type`` columns; the
            first matching row decides between regression and
            classification.
        cddd_model_dir: passed to ``InferenceModel``.
        molbert_model_dir: passed to ``MolBertFeaturizer``.

    Raises:
        ValueError: if the task type is neither ``'regression'`` nor
            ``'classification'``.
    """
    df, indices = get_data(dataset)

    # The task type depends only on the dataset, so resolve and validate it
    # once, before loading the models and featurizing anything.
    mode = summary_df[summary_df['task_name'] ==
                      dataset].iloc[0]['task_type'].strip()
    if mode not in ('regression', 'classification'):
        raise ValueError(
            f'Mode has to be either classification or regression but was {mode}.'
        )

    cddd = InferenceModel(cddd_model_dir)  # type: ignore
    molbert = MolBertFeaturizer(molbert_model_dir,
                                embedding_type='average-1-cat-pooled',
                                max_seq_len=200,
                                device='cpu')  # type: ignore
    ecfp = MorganFPFeaturizer(fp_size=2048,
                              radius=2,
                              use_counts=True,
                              use_features=False)
    rdkit_norm = PhysChemFeaturizer(normalise=True)

    # Each entry maps a batch of SMILES to a feature matrix.
    fn_combos = [
        ('cddd', lambda smiles: cddd.seq_to_emb(smiles)),
        ('molbert', lambda smiles: molbert.transform(smiles)[0]),
        ('ECFP4', lambda smiles: ecfp.transform(smiles)[0]),
        ('rdkit_norm', lambda smiles: rdkit_norm.transform(smiles)[0]),
    ]

    # The label is taken from the last column of the data frame.
    label_column = df.columns[-1]

    for i, (train_idx, valid_idx, test_idx) in enumerate(indices):
        # combine train and valid set as SVMs don't use a validation set, but
        # NNs do. this way they use the same amount of data.
        train_df = pd.concat([df.iloc[train_idx], df.iloc[valid_idx]])
        test_df = df.iloc[test_idx]

        for feat_name, feat_fn in fn_combos:
            train_features = np.vstack([
                feat_fn(batch) for batch in batchify(train_df['SMILES'], 256)
            ])
            train_labels = train_df[label_column]

            test_features = np.vstack([
                feat_fn(batch) for batch in batchify(test_df['SMILES'], 256)
            ])
            test_labels = test_df[label_column]

            np.random.seed(i)

            if mode == 'regression':
                model = SVR(C=5.0)
            else:
                # C must be passed by keyword: recent scikit-learn releases
                # reject positional constructor arguments for SVC.
                model = SVC(C=5.0, probability=True)

            model.fit(train_features, train_labels)

            metric_values = _score_svm_predictions(mode, model, test_features,
                                                   test_labels)

            # NOTE(review): the timestamp is taken per (fold, featurizer), so
            # the outputs of one run can land in several timestamp
            # directories - confirm this is intended.
            default_path = os.path.join(
                './logs/',
                datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ'))
            output_dir = os.path.join(default_path, dataset, str(i))
            os.makedirs(output_dir, exist_ok=True)
            with open(os.path.join(output_dir, f'{feat_name}_metrics.json'),
                      'w+') as fp:
                json.dump(metric_values, fp)
class Log_reg():
    """Scaled RBF-kernel SVM classifier trained and evaluated from CSV files.

    The class keeps its historical name and method names, but the estimator
    is a support vector classifier. ``predict_test`` needs ``predict_proba``
    and classification metrics, which the previously used ``SVR`` cannot
    provide.
    """

    def __init__(self):
        # Local import so this class does not depend on SVC being imported at
        # the top of the file.
        from sklearn.svm import SVC

        # Same kernel settings as before; `epsilon` is dropped because it is
        # an SVR-only parameter, and `probability=True` enables predict_proba.
        self.log_reg = SVC(kernel='rbf',
                           degree=3,
                           gamma='auto',
                           coef0=0.0,
                           tol=1e-3,
                           C=1.0,
                           shrinking=True,
                           cache_size=200,
                           verbose=False,
                           max_iter=-1,
                           probability=True)
        self.ss = StandardScaler()

    def read_features_csv(self, file, My_Model):
        """Read ``file`` and split it into features ``X`` and labels ``Y``.

        Note: the first line of the file is not returned as data;
        ``pd.read_csv`` consumes it as the header row by default.

        With ``My_Model == 0`` the features are columns
        ``1 .. size - args.k - 2`` (end exclusive). Otherwise they are
        columns ``1 .. size - 2 * args.k - 2`` concatenated with columns
        ``size - args.k - 2 .. -2``. The label is always the second-to-last
        column. ``args`` is read from module scope.
        """
        df = pd.read_csv(file)
        columns_size = df.columns.size
        if My_Model == 0:
            X = df.iloc[:, 1:columns_size - args.k - 2]
        else:
            X = pd.concat([
                df.iloc[:, 1:columns_size - 2 * args.k - 2],
                df.iloc[:, columns_size - args.k - 2:-2]
            ],
                          axis=1)
        Y = df.iloc[:, -2]
        return X, Y

    def fit_logistic_regression(self, train_file):
        """Fit the scaler and the classifier on the data in ``train_file``."""
        x, y = self.read_features_csv(train_file, args.myModel)
        x = self.ss.fit_transform(x)
        self.log_reg.fit(x, y)

    def predict_test(self, test_file):
        """Evaluate the fitted classifier on ``test_file``.

        Prints accuracy, precision, recall, F1 and the estimator's ``score``.

        Returns:
            (y_p_pre, y): the ``predict_proba`` output and the true labels.
        """
        x, y = self.read_features_csv(test_file, args.myModel)
        # Reuse the scaling learned on the training data; re-fitting the
        # scaler here would standardise the test set with its own statistics.
        x = self.ss.transform(x)
        y_p_pre = self.log_reg.predict_proba(x)
        y_pre = self.log_reg.predict(x)
        accuracy = accuracy_score(y, y_pre)
        precision = precision_score(y, y_pre)
        recall = recall_score(y, y_pre)
        F1 = f1_score(y, y_pre)
        scores = self.log_reg.score(x, y)
        print("accuracy, precision, recall, F1:", accuracy, precision, recall,
              F1)
        print("scores:", scores)
        return y_p_pre, y