import collections.abc

import numpy as np
from sklearn.linear_model import LogisticRegression

# update_info_dict, get_metric, and _binary_threshold are provided elsewhere in this module.


def train_logreg(trX, trY, vaX=None, vaY=None, teX=None, teY=None, penalty='l1',
                 max_iter=100, C=2 ** np.arange(-8, 1).astype(float), seed=42,
                 model=None, eval_test=True, neurons=None, drop_neurons=False,
                 report_metric='acc', automatic_thresholding=False,
                 threshold_metric='acc', micro=False):
    # if only a single value is provided for C, make it iterable so we can loop over it
    # (collections.Iterable was removed in Python 3.10; use collections.abc.Iterable)
    if not isinstance(C, collections.abc.Iterable):
        C = [C]

    # extract features for the given neuron indices (or drop them if requested)
    if neurons is not None:
        if drop_neurons:
            all_neurons = set(range(trX.shape[-1]))
            neurons = list(all_neurons - set(neurons))
        trX = trX[:, neurons]
        if vaX is not None:
            vaX = vaX[:, neurons]
        if teX is not None:
            teX = teX[:, neurons]

    n_classes = 1
    if len(trY.shape) > 1:
        n_classes = trY.shape[-1]

    # template stats dict used to accumulate tp/fp/tn/fn counts for metric computation;
    # defined up front so it also exists when a pretrained model is passed in
    blank_info_dict = {'fp': 0, 'tp': 0, 'fn': 0, 'tn': 0, 'std': 0.,
                       'metric': threshold_metric, 'micro': micro}

    # cross validation over C
    scores = []
    if model is None:
        for c in C:
            if n_classes <= 1:
                # liblinear supports the l1 penalty; newer sklearn defaults to lbfgs, which does not
                model = LogisticRegression(C=c, penalty=penalty, max_iter=max_iter,
                                           random_state=seed, solver='liblinear')
                model.fit(trX, trY)
                if vaX is not None:
                    info_dict = update_info_dict(blank_info_dict.copy(), vaY,
                                                 model.predict_proba(vaX)[:, -1])
                else:
                    info_dict = update_info_dict(blank_info_dict.copy(), trY,
                                                 model.predict_proba(trX)[:, -1])
                scores.append(get_metric(info_dict))
            else:
                # multilabel case: train one binary classifier per class
                info_dicts = []
                model = []
                for cls in range(n_classes):
                    _model = LogisticRegression(C=c, penalty=penalty, max_iter=max_iter,
                                                random_state=seed, solver='liblinear')
                    _model.fit(trX, trY[:, cls])
                    if vaX is not None:
                        info_dict = update_info_dict(blank_info_dict.copy(), vaY[:, cls],
                                                     _model.predict_proba(vaX)[:, -1])
                    else:
                        info_dict = update_info_dict(blank_info_dict.copy(), trY[:, cls],
                                                     _model.predict_proba(trX)[:, -1])
                    info_dicts.append(info_dict)
                    model.append(_model)
                scores.append(get_metric(info_dicts))
            print(scores[-1])
            del model

        # refit with the best-scoring C
        c = C[np.argmax(scores)]
        if n_classes <= 1:
            model = LogisticRegression(C=c, penalty=penalty, max_iter=max_iter,
                                       random_state=seed, solver='liblinear')
            model.fit(trX, trY)
        else:
            model = []
            for cls in range(n_classes):
                _model = LogisticRegression(C=c, penalty=penalty, max_iter=max_iter,
                                            random_state=seed, solver='liblinear')
                _model.fit(trX, trY[:, cls])
                model.append(_model)
    else:
        c = model.C

    # predict probabilities and score the regression model on train, val, and test
    # as appropriate; also count the nonzero regression weights
    # (the number of features used for modeling)
    scores = []
    if n_classes == 1:
        nnotzero = np.sum(model.coef_ != 0)
        preds = model.predict_proba(trX)[:, -1]
        train_score = get_metric(update_info_dict(blank_info_dict.copy(), trY, preds),
                                 report_metric)
    else:
        nnotzero = 0
        preds = []
        info_dicts = []
        for cls in range(n_classes):
            nnotzero += np.sum(model[cls].coef_ != 0)
            _preds = model[cls].predict_proba(trX)[:, -1]
            info_dicts.append(update_info_dict(blank_info_dict.copy(), trY[:, cls], _preds))
            preds.append(_preds)
        nnotzero /= n_classes
        train_score = get_metric(info_dicts, report_metric)
        preds = np.concatenate([p.reshape((-1, 1)) for p in preds], axis=1)
    scores.append(train_score * 100)

    if vaX is None:
        eval_data = trX
        eval_labels = trY
        val_score = train_score
    else:
        eval_data = vaX
        eval_labels = vaY
        if n_classes == 1:
            preds = model.predict_proba(vaX)[:, -1]
            val_score = get_metric(update_info_dict(blank_info_dict.copy(), vaY, preds),
                                   report_metric)
        else:
            preds = []
            info_dicts = []
            for cls in range(n_classes):
                _preds = model[cls].predict_proba(vaX)[:, -1]
                info_dicts.append(update_info_dict(blank_info_dict.copy(), vaY[:, cls], _preds))
                preds.append(_preds)
            val_score = get_metric(info_dicts, report_metric)
            preds = np.concatenate([p.reshape((-1, 1)) for p in preds], axis=1)
    val_preds = preds
    val_labels = eval_labels
    scores.append(val_score * 100)
    eval_score = val_score

    threshold = np.array([.5] * n_classes)
    if automatic_thresholding:
        _, threshold, _, _ = _binary_threshold(preds.reshape(-1, n_classes),
                                               eval_labels.reshape(-1, n_classes),
                                               threshold_metric, micro)
        # keep threshold as a 1-D array so per-class indexing below still works
        threshold = np.asarray(threshold).reshape(-1)

    if teX is not None and teY is not None and eval_test:
        eval_data = teX
        eval_labels = teY
        if n_classes == 1:
            preds = model.predict_proba(eval_data)[:, -1]
            eval_score = get_metric(update_info_dict(blank_info_dict.copy(), eval_labels, preds,
                                                     threshold=float(threshold.squeeze())),
                                    report_metric)
        else:
            preds = []
            for cls in range(n_classes):
                _preds = model[cls].predict_proba(eval_data)[:, -1]
                preds.append(_preds)
            preds = np.concatenate([p.reshape((-1, 1)) for p in preds], axis=1)
            info_dicts = []
            for cls in range(n_classes):
                info_dicts.append(update_info_dict(blank_info_dict.copy(),
                                                   eval_labels[:, cls], preds[:, cls],
                                                   threshold=threshold[cls]))
            eval_score = get_metric(info_dicts, report_metric)
    scores.append(eval_score * 100)
    return model, scores, preds, c, nnotzero
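# A minimal smoke test of train_logreg (a sketch: the synthetic features and labels
# below are purely illustrative, and update_info_dict/get_metric from this module
# must be importable for it to run):
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    trX = rng.randn(1000, 64)           # 1000 synthetic 64-dim feature vectors
    trY = (trX[:, 0] > 0).astype(int)   # binary labels tied to feature 0
    vaX = rng.randn(200, 64)
    vaY = (vaX[:, 0] > 0).astype(int)
    model, scores, preds, c, nnotzero = train_logreg(
        trX, trY, vaX=vaX, vaY=vaY,
        C=2.0 ** np.arange(-8, 1),      # sweep of l1 regularization strengths
        report_metric='acc')
    print('train/val accuracy (%):', scores[:2])
    print('best C:', c, '| nonzero weights:', nnotzero)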
# dense_features = ngramTrain.toarray()
# dense_test = test_features.toarray()

TfidfAccuracy = []
TfidfModel = []
for classifier in randmClassifiers:
    for x in range(0, 10):
        # sklearn estimators draw from numpy's RNG (or their own random_state),
        # so seed numpy rather than Python's random module
        np.random.seed(x)
        fit = classifier.fit(TfidfTrain, train['Label'])
        pred = fit.predict(TfidfTest)
        prob = fit.predict_proba(TfidfTest)[:, 1]  # positive-class probabilities
        # accuracy_score expects (y_true, y_pred)
        accuracy = metrics.accuracy_score(test['Label'], pred)
        TfidfAccuracy.append(accuracy)
        TfidfModel.append(classifier.__class__.__name__)
        print('Accuracy of ' + classifier.__class__.__name__ + ' is ' + str(accuracy))
rndmModelPerf = pd.DataFrame({'Model': TfidfModel, 'Acc': TfidfAccuracy})

# hashing vectorizer - tokenizes by hashing each token into one of a fixed
# number of feature columns (no vocabulary is stored)
hashingVectorizer = HashingVectorizer(n_features=50)
hashingTrain = hashingVectorizer.fit_transform(trainHeadlines)
hashingModel = LogisticRegression(solver='lbfgs')
hashingModel = hashingModel.fit(hashingTrain, train["Label"])
hashingTest = hashingVectorizer.transform(testHeadlines)
hashingPreds = hashingModel.predict(hashingTest)
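# To compare the hashing model against the TF-IDF results above, its accuracy can
# be computed the same way (a short follow-up sketch using the same `metrics` and
# `test` objects as the snippet above):
hashingAccuracy = metrics.accuracy_score(test['Label'], hashingPreds)
print('Accuracy of HashingVectorizer + LogisticRegression is ' + str(hashingAccuracy))
# note: n_features=50 is far below HashingVectorizer's default of 2**20, so many
# tokens will collide into the same column; raising it usually helps accuracy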