def optimise_model(x_train, x_val, y_train, y_val, penalty, grid):
    '''
    Optimise model with a grid search, using hold out set for validation,
    and evaluating with AUROC.

    Parameters
    ----------
    x_train, x_val : array-like
        Training / validation feature matrices.
    y_train, y_val : array-like
        Training / validation labels (multi-label, handled one-vs-rest).
    penalty : {'l2', 'l1'}
        Regularisation type for the logistic-regression base estimator.
    grid : iterable
        Candidate C values handed to ``grid_search``.

    Returns
    -------
    tuple
        ``(model, train_time)`` — the final fitted OneVsRestClassifier and
        the wall-clock fit time in seconds, rounded to 2 decimal places.

    Raises
    ------
    ValueError
        If ``penalty`` is not 'l1' or 'l2'.  (Previously an unrecognised
        penalty fell through and crashed later with a NameError on
        ``model``.)
    '''
    if penalty == 'l2':
        model = OneVsRestClassifier(LogisticRegression(max_iter=2000))
    elif penalty == 'l1':
        model = OneVsRestClassifier(
            LogisticRegression(max_iter=2000, penalty='l1', solver='saga',
                               tol=0.01))  # keeps training time down
    else:
        # Fail fast with a clear message instead of a NameError below.
        raise ValueError("penalty must be 'l1' or 'l2', got %r" % (penalty,))

    # perform grid search
    best_C = grid_search(x_train, x_val, y_train, y_val, model, grid)

    # concatenate training and validation sets
    x = np.concatenate((x_train, x_val))
    y = np.concatenate((y_train, y_val))

    # train final model on all non-test data with optimal C
    model.set_params(estimator__C=best_C)
    start = time.time()
    model.fit(x, y)
    end = time.time()
    train_time = round(end - start, 2)
    return model, train_time
def refit_model(model_instance, best_params: dict, x_train: list, y_train: list, binary: bool):
    """Re-fit an estimator on the training data with the given parameters.

    The estimator is wrapped in a ``OneVsRestClassifier`` when ``binary``
    is true; otherwise it is used as-is.  ``best_params`` is applied via
    ``set_params`` before fitting.

    Returns the fitted estimator (the wrapper when one was created).
    """
    estimator = OneVsRestClassifier(model_instance) if binary else model_instance
    estimator.set_params(**best_params)
    estimator.fit(x_train, y_train)
    return estimator
def optimize(log, filename, progress=False):
    """Grid-search a feature-selection + SVM pipeline over the mail corpus,
    report test-set performance, and pickle the best-parameter pipeline.

    log: logger used for progress messages.
    filename: path the (parameterised, unfitted) pipeline is pickled to.
    progress: forwarded to extract_mails.get_training_data (presumably a
        progress-display toggle -- TODO confirm against extract_mails).

    Returns the raw (data, labels) pair so callers can reuse the corpus.
    """
    log.info("getting data")
    data, labels = extract_mails.get_training_data(progress)
    log.info("splitting data")
    # 60/40 train/test split, fixed seed for reproducibility.
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.4, random_state=0)
    log.info("preprocessing data")
    # NOTE(review): vectorizer and binarizer are fitted on the FULL corpus
    # (data/labels include the held-out split), so test vocabulary leaks
    # into the training representation -- confirm this is intended.
    vectorizer = CountVectorizer()
    vectorizer.fit(data)
    X = vectorizer.transform(x_train)
    binarizer = MultiLabelBinarizer()
    binarizer.fit(labels)
    Y = binarizer.transform(y_train)
    # do a gridsearch for the best parameters
    log.info("doing gridsearch... this may take some time")
    pipe = Pipeline([
        ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=10))),
        ('classification', SVC())
    ])
    clf = OneVsRestClassifier(pipe)
    # Parameter names are prefixed with estimator__ because the pipeline is
    # wrapped in OneVsRestClassifier.
    parameters = {
        "estimator__feature_selection__threshold": ('mean', '0.5*mean', 0),
        "estimator__classification__kernel": ('linear', 'rbf'),
        "estimator__classification__C": (0.01, 0.1, 1, 10, 100)
    }
    grid_search = GridSearchCV(clf, parameters, n_jobs=-1, verbose=1, scoring='f1_samples', error_score=0)
    grid_search.fit(X, Y)
    print grid_search.best_score_
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print "\t{0}: {1}".format(param_name, best_parameters[param_name])
    log.info("evaluating classifier")
    # Evaluate the refitted best estimator on the held-out split.
    Xt = vectorizer.transform(x_test)
    preds = grid_search.best_estimator_.predict(Xt)
    real = binarizer.transform(y_test)
    print classification_report(real, preds, target_names=binarizer.classes_)
    # store the parameters from the best estimator and the pipeline,
    # so that the next time for training the best pipeline can be
    # used!
    clf.set_params(**best_parameters)
    atomic_pickle(clf, filename)
    return data, labels
def trainPredictorWithWeights(X, Y, ws):
    """Train a one-vs-rest linear SVM using per-sample weights.

    X: feature matrix.
    Y: label matrix (multi-label, one column per tag).
    ws: per-sample weights, same length as X.

    Returns the fitted OneVsRestClassifier.
    """
    classif = OneVsRestClassifier(SVC(kernel='linear'))
    # BUG FIX: sample_weight is a *fit* parameter, not an estimator
    # hyper-parameter -- the original classif.set_params(sample_weight=ws)
    # raises ValueError ("Invalid parameter sample_weight for estimator ...").
    # NOTE(review): forwarding sample_weight through OneVsRestClassifier.fit
    # requires a scikit-learn version that routes fit params to the base
    # estimator -- confirm against the pinned sklearn version.
    classif.fit(X, Y, sample_weight=ws)
    return classif
def main():
    """Train an RBF-kernel SVM (one-vs-rest) on the forest-cover data,
    evaluate it on a validation file, and write test predictions to CSV.

    Side effects: writes 'confusionmatrix_test.txt' (confusion matrix plus
    a classification report) and 'output_SVM.csv' (test predictions).
    """
    (trainX, trainY) = readTrainData()
    # Only the first 10 columns are normalised; the fitted scalars are
    # reused for the validation and test sets below.
    (first10columnX, scalars) = normalizeData(trainX[:, 0:10])
    clf = OneVsRestClassifier(svm.SVC(max_iter=16000))
    # Hyper-parameters chosen offline (see note below); set_params returns
    # the same estimator instance, so clfbest is clf.
    clfbest = clf.set_params(estimator__kernel='rbf',estimator__gamma=1.0,estimator__C=2.0)
    # NOTE(review): columns 10:50 are concatenated unscaled -- presumably
    # binary indicator columns; confirm against the dataset layout.
    trainX = np.concatenate((first10columnX[:,:], trainX[:,10:50]), axis = 1)
    clfbest.fit(trainX, trainY)
    '''
    the cross validation can be done manually but extremely time consuming
    on my 2013 Mac, it will take at least 90 min
    the parameters are chosen but a cross validation with much smaller input data.
    '''
    (validateX, validateY) = readTrainData("../forest_validation.csv")
    first10validateX = normalizeData(validateX[:, 0:10], scalars)[0]
    validateX = np.concatenate((first10validateX[:,:], validateX[:,10:50]), axis = 1)
    predictY = clfbest.predict(validateX)
    # Persist confusion matrix and per-class metrics for the validation set.
    resultfile = open('confusionmatrix_test.txt','w')
    resultfile.write(str(confusion_matrix(validateY, predictY)))
    resultfile.write('\n\n\n')
    resultfile.write(metrics.classification_report(validateY, predictY))
    resultfile.close()
    print 'accuracy: %.4f' % ( ( sum( predictY == validateY ) / float(len(predictY))))
    # Score the held-out test file with the same normalisation pipeline.
    testX = readTestData()
    first10testX = normalizeData(testX[:, 0:10], scalars)[0]
    testX = np.concatenate((first10testX[:,:], testX[:,10:50]), axis = 1)
    testY = clfbest.predict(testX)
    write2csv(testY, filename = "output_SVM.csv")
class ModelsClassical():
    """ Class for classical ML models """

    def __init__(self, type_class='binary', y=None, type_model='lin', **kwargs):
        """ Model selection

        type_class: {'binary', 'multy'}, default='binary'
            Classification type - binary or multiclass
        y: numpy.ndarray, default=None
            Target array. Needed to find the unique classes for
            multiclass classification.
        type_model: {'lin', 'lgbm', 'svm'}, default='lin'
            Model type
        **kwargs
            Hyper-parameters for the models
        """
        if type_model == 'lin':
            self.model = LogisticRegression(**kwargs)
        elif type_model == 'lgbm':
            self.model = LGBMClassifier(boosting_type='gbdt', n_jobs=20, **kwargs)
        elif type_model == 'svm':
            # Stub for now - this model type is not implemented
            pass
        # Non-LGBM multiclass models are wrapped in one-vs-rest; LGBM
        # handles multiclass natively via its objective.
        if type_class != 'binary' and type_model != 'lgbm':
            self.model = OneVsRestClassifier(self.model)
            self.mlb = MultiLabelBinarizer(classes=sorted(np.unique(y)))
        elif type_class != 'binary' and type_model == 'lgbm':
            self.model.set_params(objective='multiclass', metric='multi_logloss')
        self.type_class = type_class
        self.type_model = type_model
        self.kwargs = kwargs

    def fit(self, X_train, y_train, X_test=None, y_test=None):
        """ Model training

        X_train: pandas.DataFrame/numpy.ndarray/scipy.sparse.csr.csr_matrix
            Training dataset
        X_test: pandas.DataFrame/numpy.ndarray/scipy.sparse.csr.csr_matrix, default=None
            Test dataset (used as the LGBM eval set)
        y_train: pandas.Series/numpy.ndarray
            Target corresponding to the training dataset
        y_test: pandas.Series/numpy.ndarray, default=None
            Target corresponding to the test dataset
        """
        if self.type_class != 'binary' and self.type_model != 'lgbm':
            # NOTE(review): self.y_train is computed here but the fit below
            # still uses the raw y_train -- confirm this is intended.
            self.y_train = self.mlb.fit_transform(y_train.apply(lambda x: [x]))
        if self.type_model != 'lgbm':
            self.model.fit(X_train, y_train)
        else:
            self.model.fit(X_train, y_train, eval_set=(X_test, y_test),
                           early_stopping_rounds=100, verbose=False)

    def predict(self, X, threshold=False):
        """ Class prediction

        X: pandas.DataFrame/numpy.ndarray/scipy.sparse.csr.csr_matrix
            Dataset
        threshold: Probability threshold for the class.  When falsy
            (default False -- note a threshold of 0 also counts as falsy),
            plain model.predict is used instead.

        return: numpy.ndarray
            Array with the predicted classes
        """
        if threshold:
            def __proba_to_tag(line, thresh):
                """ Inner function for converting probabilities into a class
                according to the threshold.

                Example: three classes with probabilities [0.58, 0.32, 0.1].
                The first class is always 'others', which usually collects
                all uninteresting texts and is the largest class.  The
                resulting class is chosen as follows: if the probability of
                the first class (others) is the largest, take the maximum
                probability among all other classes; if that maximum exceeds
                the threshold the object is assigned to that class,
                otherwise it is assigned to the first class.

                line: numpy.ndarray
                    Array of per-class probabilities
                thresh: float
                    Threshold
                return: numpy.ndarray
                """
                max_line = max(line)
                if line[0] == max_line:
                    max_prob = max(line[1:])
                    if max_prob > thresh:
                        index_max = np.where(line==max_prob)[0][0]
                    else:
                        index_max = 0
                else:
                    index_max = np.where(line==max_line)[0][0]
                # One-hot encode the chosen class in place.
                for tag in range(len(line)):
                    if tag == index_max:
                        line[tag] = 1
                    else:
                        line[tag] = 0
                return line.astype(int)

            y_pred_proba = self.model.predict_proba(X)
            if self.type_class == 'binary':
                # Positive class iff its probability exceeds the threshold.
                y_final = (y_pred_proba[:,-1] > threshold)*1
            else:
                y_pred = np.apply_along_axis(__proba_to_tag, axis=1,
                                             arr=y_pred_proba, thresh=threshold)
                y_final = self.__transform_multyclass_to_one_list(y_pred)
            return y_final
        else:
            return self.model.predict(X)

    def predict_proba(self, X):
        """ Class-probability prediction

        X: pandas.DataFrame/numpy.ndarray/scipy.sparse.csr.csr_matrix
            Dataset

        return: numpy.ndarray
            Array with probabilities for each class
        """
        return self.model.predict_proba(X)

    def transform_multy_class(self, y):
        """ Transform the array into the form required for hyper-parameter
        tuning.  Each element of the array is wrapped in a list.

        y: numpy.ndarray
            Target variable

        return: numpy.ndarray
            Array with classes wrapped in a list for each object.
            NOTE(review): returns None for the 'lgbm' model type.
        """
        if self.type_model != 'lgbm':
            return self.mlb.fit_transform(y.apply(lambda x: [x]))

    def __transform_multyclass_to_one_list(self, y):
        """ Transform an array of one-hot rows (a one at the index of the
        predicted class) into an array of plain class indices.

        y: numpy.ndarray
            Target variable

        return: numpy.ndarray
            Array of class indices
        """
        def __transform_one_line(line):
            """ Inner function for transforming a single row

            line: numpy.ndarray
                A single row

            return: int
            """
            # All-zero rows (no class chosen) map to class 0.
            if np.max(line) == 0:
                return 0
            else:
                return np.where(line == 1)[0][0]
        return np.apply_along_axis(__transform_one_line, 1, y)
class MultiClassPredictions(InteractionPredictions):
    """ Class for making and storing OneVsRest predictions of synergies and
    antagonisms.

    Column i of predict_proba output corresponds to class_names[i]
    ('none', 'antag', 'syn').
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.class_names = ['none', 'antag', 'syn']
        self.one_vs_rest()

    def one_vs_rest(self):
        # Wrap the base classifier (set up by the parent class) in a
        # one-vs-rest meta-estimator.
        self.clf = OneVsRestClassifier(self.clf)

    def set_params(self, **kwargs):
        # for random forest
        self.clf.set_params(**kwargs)
        return self

    def _crossval_iter(self, train, test, cl):
        """Run one cross-validation fold.

        train/test: index arrays into self.X / self.y / self.combs.
        cl: key under which this fold's metrics are stored (fpr, tpr, auc,
            precision, recall, avprec, predicted, topfeat).
        """
        X_test = self.X[test]
        y_test = self.y[test]
        combs_test = self.combs[test]
        # Fit on the training fold, score the test fold.
        probas_ = self.clf.fit(self.X[train], self.y[train]).predict_proba(X_test)
        # predictions
        pred_dict = {'comb': combs_test}
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        precision = dict()
        recall = dict()
        average_precision = dict()
        n_classes = len(self.class_names)
        # Per-class ROC and precision-recall curves (one-vs-rest columns).
        for i in range(n_classes):
            pred_dict['score_' + str(self.class_names[i])] = probas_[:, i]
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], probas_[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
            precision[i], recall[i], _ = precision_recall_curve(
                y_test[:, i], probas_[:, i])
            average_precision[i] = average_precision_score(
                y_test[:, i], probas_[:, i])
        self.fpr[cl] = fpr
        self.tpr[cl] = tpr
        self.auc[cl] = roc_auc
        self.precision[cl] = precision
        self.recall[cl] = recall
        self.avprec[cl] = average_precision
        self.predicted[cl] = pd.DataFrame(pred_dict)
        # Top self.top feature importances per class, from the fitted
        # per-class estimators (assumes a tree-based base classifier).
        importances = [
            self.clf.estimators_[i].feature_importances_
            for i in range(n_classes)
        ]
        imp_list = [pd.DataFrame({'feat': np.argsort(imp)[::-1][:self.top],
                                  'importance': np.sort(imp)[::-1][:self.top],
                                  'type': i})
                    for imp, i in zip(importances, self.class_names)]
        imp_df = pd.concat(imp_list, ignore_index=True)
        self.topfeat[cl] = imp_df

    def aggregate_precision(self):
        """Collect per-fold average-precision scores into one DataFrame
        with a 'cvfold' column and one 'AP_<class>' column per class."""
        index = ['AP_' + lab for lab in self.class_names]
        ap_df = (pd.concat({k: pd.DataFrame(v.values(), index=index).T
                            for k, v in self.avprec.items()}).
                 reset_index().rename(columns={"level_0": "cvfold"}).
                 drop(columns=["level_1"]))
        return ap_df

    def aggregate_auc(self):
        """Collect per-fold ROC-AUC scores into one DataFrame with a
        'cvfold' column and one 'AUCROC_<class>' column per class."""
        index = ['AUCROC_' + lab for lab in self.class_names]
        auc_df = (pd.concat({k: pd.DataFrame(v.values(), index=index).T
                             for k, v in self.auc.items()}).
                  reset_index().rename(columns={"level_0": "cvfold"}).
                  drop(columns=["level_1"]))
        return auc_df

    def plot_ROC(self, figdir=None, fname=None,
                 title='One-vs-Rest ROC curves', sz=10):
        """Write a multi-page PDF (one page per CV fold) of per-class ROC
        curves.  No-op unless both figdir and fname are given."""
        class_names = ['none', 'antagonism', 'synergy']
        colors = cycle(['#808080', '#FFCC33', '#009999'])
        n_classes = 3
        if figdir is not None and fname is not None:
            with PdfPages(figdir + fname + '.pdf') as pdf:
                for cl in list(self.auc.keys()):
                    plt.figure(figsize=(sz, sz))
                    for i, color in zip(range(n_classes), colors):
                        plt.plot(
                            self.fpr[cl][i],
                            self.tpr[cl][i],
                            color=color,
                            lw=2,
                            label='ROC curve of class {0} (area = {1:0.2f})'
                            ''.format(class_names[i], self.auc[cl][i]))
                    # Diagonal = chance level.
                    plt.plot([0, 1], [0, 1], 'k--', lw=2)
                    plt.xlim([0.0, 1.0])
                    plt.ylim([0.0, 1.05])
                    plt.xlabel('False Positive Rate')
                    plt.ylabel('True Positive Rate')
                    plt.title(title + cl)
                    plt.legend(loc="lower right")
                    pdf.savefig()
                    plt.close()

    def plot_precision(self, figdir=None, fname=None,
                       title='One-vs-Rest Precision-Recall', sz=10):
        """Write a multi-page PDF (one page per CV fold) of per-class
        precision-recall curves with iso-F1 contours.  No-op unless both
        figdir and fname are given."""
        class_names = ['none', 'antagonism', 'synergy']
        colors = cycle(['#808080', '#FFCC33', '#009999'])
        n_classes = 3
        if figdir is not None and fname is not None:
            with PdfPages(figdir + fname + '.pdf') as pdf:
                for cl in list(self.avprec.keys()):
                    plt.figure(figsize=(sz, sz))
                    # Background iso-F1 curves for reference.
                    f_scores = np.linspace(0.2, 0.8, num=4)
                    for f_score in f_scores:
                        x = np.linspace(0.01, 1)
                        y_ = f_score * x / (2 * x - f_score)
                        plt.plot(x[y_ >= 0], y_[y_ >= 0],
                                 color='gray', alpha=0.2,
                                 label='iso-F1 curves')
                        plt.annotate('f1={0:0.1f}'.format(f_score),
                                     xy=(0.9, y_[45] + 0.02))
                    for i, color in zip(range(n_classes), colors):
                        plt.plot(
                            self.recall[cl][i],
                            self.precision[cl][i],
                            color=color,
                            lw=2,
                            label=
                            'Precision-recall of class {0} (area = {1:0.2f})'
                            ''.format(class_names[i], self.avprec[cl][i]))
                    plt.xlim([0.0, 1.0])
                    plt.ylim([0.0, 1.05])
                    plt.xlabel('Recall')
                    plt.ylabel('Precision')
                    plt.title(title + cl)
                    plt.legend(loc="lower right")
                    pdf.savefig()
                    plt.close()

    def save_metrics(self, outdir=None, fname=None):
        """Merge per-fold AUC and average-precision tables on 'cvfold' and
        write them as TSV.  No-op unless both outdir and fname are given."""
        auc_df = self.aggregate_auc()
        ap_df = self.aggregate_precision()
        metrics = pd.merge(auc_df, ap_df, on='cvfold', how='inner')
        if outdir is not None and fname is not None:
            metrics.to_csv(outdir + fname + '.tsv', sep="\t", index=False)
def run_classifier(clf,features,cases,bottom_inds,optimize_hyperparams=False): clf_name = clf.__class__.__name__ cases = np.array(cases) # Set up the cross_validation study if clf_name == 'CollaborativeFilter': cases = np.array(preprocess_recommendations(cases)) cy = [c[2] for c in cases] cases = np.array(cases) m_ind=cases[:,1] else: cy = cases[:,0] # Pre-Run Hyperparameter Optimization if optimize_hyperparams: param_dist = hyper_params[clf_name] else: opt_param_dist = optimal_params[clf_name] num_iterations = 1 if optimize_hyperparams else 100 shuffle = cross_validation.StratifiedShuffleSplit(y=cy, n_iter=num_iterations, test_size=0.1, random_state=None) scores =[]; Y_pred = []; Y_true = []; m_test_inds=[] # Run study for i,(train_index, test_index) in enumerate(shuffle): # Separate training/test set if (i%10)==0: print ' CV#%d of %d...'%(i,num_iterations) Y_train, Y_test = (cases[train_index],cases[test_index]) # Fit and predict using the models if clf_name == 'CollaborativeFilter': # Split the training data into X and y vectors #Y_train = rebalance_cases(Y_train) X_train = Y_train[:,:-1] Y_train = Y_train[:,-1] X_test = Y_test[:,:-1] Y_test = Y_test[:,-1] m_test_ind = m_ind[test_index] m_test_inds.append(m_test_ind) if optimize_hyperparams: # Run Parameter Search n_iter_search = 2 random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search, scoring='average_precision', n_jobs=3, refit=True, cv=4, verbose=1 ) start = time() random_search.fit(X_train,Y_train) print("RandomizedSearchCV took %.2f minutes for %d candidates" " parameter settings." 
% ((time() - start)/60.0, n_iter_search)) opt_report(random_search.grid_scores_,n_top=10) Y_hat=random_search.best_estimator_.predict_proba(X_test) else: clf.set_params(opt_param_dist) clf.fit(X_train,Y_train) Y_hat=clf.predict_proba(X_test) else: X_train, X_test = (features[train_index],features[test_index]) ovr = OneVsRestClassifier(clf) if optimize_hyperparams: n_iter_search = 400 random_search = RandomizedSearchCV(ovr, param_distributions=param_dist, n_iter=n_iter_search, # Average precisions scoring # doesn't seem to work in # multi-label case #scoring='average_precision', scoring='log_loss', n_jobs=3, refit=True, cv=4, verbose=1 ) start = time() random_search.fit(X_train,Y_train) print("RandomizedSearchCV took %.2f minutes for %d candidates" " parameter settings." % ((time() - start)/60.0, n_iter_search)) opt_report(random_search.grid_scores_,n_top=10) #clf = random_search.best_estimator_ Y_hat=random_search.best_estimator_.predict_proba(X_test) else: ovr.set_params(**opt_param_dist) ovr.fit(X_train,Y_train) Y_hat = ovr.predict_proba(X_test) #Y_hat = clf.predict_proba(X_test) # Collect the results Y_pred.append(Y_hat) Y_true.append(Y_test) Y_true=np.vstack(Y_true) Y_pred=np.vstack(Y_pred) # Now do the overall AUC scoring print 'Generating bootstrap samples...' 
A=np.vstack([Y_true.flatten(),Y_pred.flatten()]) A=A.transpose() auc_scores=[] for j in range(1000): B=resample(A) auc_scores.append(average_precision_score(B[:,0], B[:,1])) auc_scores=np.array(auc_scores) # Now just test PR on the k least popular methods if re.search('CollaborativeFilter',clf_name): m_test_inds=np.vstack(m_test_inds) m_test_ind = m_test_inds.flatten() ix=np.in1d(m_test_ind.ravel(), bottom_inds).reshape(m_test_ind.shape) A = np.vstack([Y_true.flatten()[ix],Y_pred.flatten()[ix]]) else: A=np.vstack([Y_true[:,bottom_inds].flatten(),Y_pred[:,bottom_inds].flatten()]) A=A.transpose() bottom_k_auc_scores=[] for j in range(1000): B=resample(A) bottom_k_auc_scores.append(average_precision_score(B[:,0], B[:,1])) bottom_k_auc_scores=np.array(bottom_k_auc_scores) return Y_pred,Y_true, auc_scores, bottom_k_auc_scores
class MeshTfidfSVM:
    """One-vs-rest SGD-SVM over tf-idf features, trained in label batches.

    Tags (label columns) are trained in batches of ``y_batch_size``; after
    each batch the fitted classifier is pickled under ``model_path`` so that
    only one batch's classifiers live in memory at a time.
    """

    def __init__(
        self, y_batch_size=256, nb_labels=None, model_path=None, threshold=0.5
    ):
        """
        y_batch_size: int, default 256. Size of column batches for Y
            i.e. tags that each classifier will train on
        nb_labels: int, default None. Number of tags that will be trained.
        model_path: path, default None. Model path being used to save
            intermediate classifiers
        threshold: float, default 0.5. Threshold probability on top of which
            a tag is assigned

        Note that model_path needs to be provided as it is used to save
        intermediate classifiers trained to reduce memory usage.
        """
        self.y_batch_size = y_batch_size
        self.model_path = model_path
        # BUG FIX: the constructor previously discarded its nb_labels
        # argument and always stored None.
        self.nb_labels = nb_labels
        self.threshold = threshold

    def _init_vectorizer(self):
        # Lazily created so set_params can run before any heavy object exists.
        self.vectorizer = TfidfVectorizer(
            stop_words="english", max_df=0.95, min_df=5, ngram_range=(1, 1)
        )

    def _init_classifier(self):
        # NOTE(review): SGDClassifier with loss="hinge" does not expose
        # predict_proba, which predict_proba() below relies on -- confirm
        # the loss is overridden via set_params before prediction.
        self.classifier = OneVsRestClassifier(SGDClassifier(loss="hinge", penalty="l2"))

    def set_params(self, **params):
        """Route 'tfidf'-/'svm'-prefixed params to the sub-estimators and
        pick up this wrapper's own settings (model_path, y_batch_size,
        nb_labels) when present."""
        if not hasattr(self, "vectorizer"):
            self._init_vectorizer()
        if not hasattr(self, "classifier"):
            self._init_classifier()
        tfidf_params = get_params_for_component(params, "tfidf")
        svm_params = get_params_for_component(params, "svm")
        self.vectorizer.set_params(**tfidf_params)
        self.classifier.set_params(**svm_params)
        # TODO: Create function that checks in params for arguments available in init
        if "model_path" in params:
            self.model_path = params["model_path"]
        if "y_batch_size" in params:
            self.y_batch_size = params["y_batch_size"]
        if "nb_labels" in params:
            self.nb_labels = params["nb_labels"]

    def fit(self, X, Y):
        """
        X: list of texts
        Y: sparse csr_matrix of tags assigned

        Fits the vectorizer on X, then trains one classifier per batch of
        label columns, pickling each under model_path.  Returns self.
        """
        if not hasattr(self, "vectorizer"):
            self._init_vectorizer()
        if not hasattr(self, "classifier"):
            self._init_classifier()
        # TODO: Currently Y is expected to be sparse, otherwise predict does not
        # work, add a check and warn user.
        print(f"Creating {self.model_path}")
        Path(self.model_path).mkdir(exist_ok=True)
        print("Fitting vectorizer")
        self.vectorizer.fit(X)
        with open(f"{self.model_path}/vectorizer.pkl", "wb") as f:
            f.write(pickle.dumps(self.vectorizer))
        print("Training model")
        self.nb_labels = Y.shape[1]
        # Hoisted out of the loop: the tf-idf features do not depend on the
        # tag batch, so transform once instead of once per batch.
        X_vec = self.vectorizer.transform(X)
        for tag_i in range(0, self.nb_labels, self.y_batch_size):
            print(tag_i)
            self.classifier.fit(X_vec, Y[:, tag_i : tag_i + self.y_batch_size])
            # TODO: Sparsify weights before saving
            with open(f"{self.model_path}/{tag_i}.pkl", "wb") as f:
                f.write(pickle.dumps(self.classifier))
        return self

    def predict(self, X):
        """Binary tag assignments: probability above self.threshold."""
        return self.predict_proba(X) > self.threshold

    def predict_proba(self, X):
        """Concatenate per-batch probabilities by reloading each pickled
        batch classifier from model_path."""
        # Features are batch-independent; compute them once.
        X_vec = self.vectorizer.transform(X)
        Y_pred_proba = []
        for tag_i in range(0, self.nb_labels, self.y_batch_size):
            with open(f"{self.model_path}/{tag_i}.pkl", "rb") as f:
                classifier = pickle.loads(f.read())
            Y_pred_proba_batch = classifier.predict_proba(X_vec)
            Y_pred_proba.append(Y_pred_proba_batch)
        Y_pred_proba = np.hstack(Y_pred_proba)
        return Y_pred_proba

    def save(self, model_path):
        """Write meta.json (name, approach, y_batch_size, nb_labels) to
        model_path; batch classifiers were already saved during fit."""
        if model_path != self.model_path:
            print(
                f"{model_path} is different from self.model_path {self.model_path}. This will result in model and meta.json be saved in different paths"
            )
        meta = {
            "name": "MeshTfidfSVM",
            "approach": "mesh-tfidf-svm",
            "y_batch_size": self.y_batch_size,
            "nb_labels": self.nb_labels,
        }
        meta_path = os.path.join(model_path, "meta.json")
        with open(meta_path, "w") as f:
            f.write(json.dumps(meta))

    def load(self, model_path):
        """Restore the vectorizer and meta settings from model_path; batch
        classifiers are loaded lazily in predict_proba."""
        vectorizer_path = os.path.join(model_path, "vectorizer.pkl")
        with open(vectorizer_path, "rb") as f:
            # NOTE(review): pickle.loads on a stored model file -- only load
            # model directories from trusted sources.
            self.vectorizer = pickle.loads(f.read())
        meta_path = os.path.join(model_path, "meta.json")
        with open(meta_path, "r") as f:
            meta = json.loads(f.read())
        self.set_params(**meta)
        self.model_path = model_path