def __init__(self, w, fair_feature, X_train, y_train):
    """Build two logistic-regression classifiers from the weights in ``w``.

    Each classifier is fitted on ``X_train`` with the ``fair_feature``
    column removed, and then has its parameters overwritten: row 0 of
    ``w[i].value`` supplies the intercept and the remaining rows supply
    the coefficients.

    Args:
        w: Indexable holding two objects that expose a ``.value`` array.
        fair_feature: Name of the column excluded from the features.
        X_train: Training frame (indexed with ``.loc`` / ``.columns``).
        y_train: Training labels.
    """
    self.w = w
    self.fair_feature = fair_feature

    # Every column except the protected one is used as a feature.
    feature_mask = X_train.columns != fair_feature

    classifiers = []
    for group in (0, 1):
        model = LogisticRegression(random_state=0).fit(
            X_train.loc[:, feature_mask], y_train)
        params = w[group].value
        model.coef_ = params[1:].T
        model.intercept_ = params[0][0]
        classifiers.append(model)

    self.clf0, self.clf1 = classifiers
def train_clf(self, X, idxss, rs):
    """Train one linear classifier on the samples selected by ``idxss``.

    The estimator type depends on ``self.engine`` and ``self.loss``:
    liblinear-backed ``LogisticRegression`` / ``LinearSVC``, or
    ``SGDClassifier`` otherwise.

    Args:
        X: Feature source handed to ``self.build_XY``.
        idxss: Iterable of index collections; their total length drives
            the epoch count.
        rs: Random state forwarded to ``build_XY`` and the estimator.

    Returns:
        Tuple ``(clf, CLF(coef, intercept))`` with the fitted estimator
        and a ``CLF`` holding its (sparsified) parameters.
    """
    n_samples = sum(len(idx) for idx in idxss)
    n_epochs = self.compute_epochs(n_samples)
    penalty = 'l1' if self.optimization == 'fastxml' else 'l2'

    X_train, y_train = self.build_XY(X, idxss, rs)

    above_threshold = X_train.shape[0] > (
        self.auto_weight * self.max_leaf_size)
    use_liblinear = self.engine == 'liblinear' or (
        self.engine == 'auto' and above_threshold)

    if not use_liblinear:
        clf = SGDClassifier(loss=self.loss, penalty=penalty,
                            n_iter=n_epochs, alpha=self.alpha,
                            fit_intercept=self.bias,
                            class_weight='balanced', random_state=rs)
    elif self.loss == 'log':
        # Original note: "No control over penalty".
        clf = LogisticRegression(solver='liblinear', random_state=rs,
                                 tol=1, C=self.C, penalty=penalty)
    else:
        clf = LinearSVC(C=self.C, fit_intercept=self.bias,
                        max_iter=n_epochs, class_weight='balanced',
                        penalty=penalty, random_state=rs)

    clf.fit(X_train, y_train)

    # Original note: sparsifying "halves the memory requirement".
    clf.coef_ = sparsify(clf.coef_, self.eps)
    if self.bias:
        clf.intercept_ = clf.intercept_.astype('float32')

    return clf, CLF(clf.coef_, clf.intercept_)
def test_binary_predicting(self):
    """Test binary softmax classifier.

    Overrides a fitted scikit-learn ``LogisticRegression`` with known
    weights, builds the project's own ``models.LogisticRegression`` from
    the same weights, and checks that both predict ``y`` and agree on the
    predicted probabilities to within 1e-5.
    """
    target_num = 2
    (W, b), (X, y) = self.make_lr_data(target_num=target_num,
                                       dtype=glue.config.floatX)

    # When target_num == 2, LogisticRegression from scikit-learn uses
    # sigmoid, so does our LogisticRegression implementation.
    lr = LogisticRegression().fit(X, y)
    lr.coef_ = W.T
    lr.intercept_ = b
    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    self.assertTrue(np.all(lr.predict(X) == y))

    graph = G.Graph()
    with graph.as_default():
        input_var = G.make_placeholder('inputs',
                                       shape=(None, W.shape[0]),
                                       dtype=glue.config.floatX)
        input_layer = G.layers.InputLayer(input_var,
                                          shape=(None, W.shape[0]))
        lr2 = models.LogisticRegression('logistic', input_layer,
                                        target_num=target_num, W=W, b=b)
        predict_prob = G.layers.get_output(lr2)
        predict_label = G.op.argmax(predict_prob, axis=1)
        predict_fn = G.make_function(
            inputs=[input_var], outputs=[predict_prob, predict_label])

    with G.Session(graph):
        prob, predict = predict_fn(X)

    self.assertTrue(np.all(predict == y))
    err = np.max(abs(lr.predict_proba(X) - prob))
    self.assertLess(err, 1e-5)
def search_specificity(s, y, z, return_score=False, verbose=None):
    """Find specificity search ranking.

    For every query row of ``s`` a one-feature logistic model with
    intercept ``y[ref]`` and coefficient ``z[ref]`` scores
    ``s[query, ref]``; the query's rank is the 1-based position of its
    own index in the ``_matlab_sort`` ordering of that row of scores.

    Args:
        s: Square array indexed as ``s[query_idx, ref_idx]``.
        y: Per-reference intercepts.
        z: Per-reference coefficients.
        return_score: When true, also return the score matrix.
        verbose: Any non-``None`` value enables a progress bar.

    Returns:
        ``rank_s``, or ``(rank_s, r_s)`` when ``return_score`` is true.
    """
    logit = LogisticRegression()
    # predict_proba in current scikit-learn reads classes_; the model is
    # binary and only the positive-class column is used below.
    logit.classes_ = np.array([0, 1])

    n_queries = len(s)
    # The np.int alias was removed in NumPy 1.24; builtin int is the same
    # dtype.
    rank_s = np.zeros(n_queries, dtype=int)
    r_s = np.zeros((n_queries, n_queries))

    pbar = None
    if verbose is not None:
        pbar = ProgressBar(widgets=['Specificity search: ',
                                    SimpleProgress()],
                           maxval=n_queries).start()

    for query_idx in range(n_queries):
        if pbar is not None:
            pbar.update(query_idx + 1)
        for ref_idx, (this_y, this_z) in enumerate(zip(y, z)):
            logit.intercept_ = np.array([this_y])
            logit.coef_ = np.array([[this_z]])
            # scikit-learn expects a 2-D (n_samples, n_features) input.
            sample = np.atleast_2d(s[query_idx, ref_idx])
            r_s[query_idx, ref_idx] = logit.predict_proba(sample)[0][1]
        r_s[np.isnan(r_s)] = -np.inf
        idx_s = _matlab_sort(r_s[query_idx, :])
        # make matlab equiv. by adding 1
        rank_s[query_idx] = np.where(idx_s == query_idx)[0][0] + 1

    if pbar is not None:
        pbar.finish()

    if return_score:
        return rank_s, r_s
    return rank_s
def test_add_loss_output_cls(self):
    """Check that the ONNX log-loss output matches scikit-learn's log_loss.

    A ``LogisticRegression`` is converted to ONNX, a 'log' loss output is
    appended on the ``probabilities`` output, and the loss computed by
    ``OnnxInference`` is compared with ``sklearn``'s ``log_loss``.
    """
    from onnxcustom.utils.orttraining_helper import add_loss_output

    # Clipping epsilon shared by both loss computations. The previous
    # literal ``1 - 6`` evaluated to -5, which disabled clipping.
    eps = 1e-6

    X, y = make_classification(  # pylint: disable=W0632
        100, n_features=10)
    X = X.astype(numpy.float32)
    y = y.astype(numpy.int64)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    reg = LogisticRegression()
    reg.fit(X_train, y_train)
    reg.coef_ = reg.coef_.reshape((1, -1))

    onx = to_onnx(reg, X_train, target_opset=opset,
                  black_op={'LinearClassifier'},
                  options={'zipmap': False})
    onx_loss = add_loss_output(
        onx, 'log', output_index='probabilities', eps=eps)

    try:
        text = onnx_simple_text_plot(onx_loss)
    except RuntimeError:
        text = ""
    if text:
        self.assertIn("Clip(probabilities", text)

    oinf = OnnxInference(onx_loss)
    output = oinf.run({'X': X_test, 'label': y_test.reshape((-1, 1))})
    loss = output['loss']
    skl_loss = log_loss(y_test, reg.predict_proba(X_test), eps=eps)
    self.assertLess(numpy.abs(skl_loss - loss[0, 0]), 1e-5)
def copy_logistic_model(model: LogisticRegression, max_iter=10e5, penalty='none') -> LogisticRegression:
    """Return a new ``LogisticRegression`` carrying ``model``'s parameters.

    Args:
        model: Source estimator; its ``coef_``, ``classes_`` and
            ``intercept_`` are read.
        max_iter: Iteration limit for the new estimator. It is converted
            to ``int`` because the float default (``10e5``) is not an
            integer iteration count.
        penalty: Penalty setting for the new estimator.

    Returns:
        A fresh estimator whose fitted attributes are independent copies,
        so later changes to either model do not leak into the other.
    """
    import copy

    copied_model = LogisticRegression(max_iter=int(max_iter),
                                      penalty=penalty)
    copied_model.coef_ = model.coef_.copy()
    copied_model.classes_ = model.classes_.copy()
    # Previously the intercept array was shared between both models.
    copied_model.intercept_ = copy.copy(model.intercept_)
    return copied_model
def set_model_params(
    model: LogisticRegression, params: LogRegParams
) -> LogisticRegression:
    """Set the parameters of a sklearn LogisticRegression model.

    ``params[0]`` becomes ``model.coef_``; ``params[1]`` becomes
    ``model.intercept_`` only when ``model.fit_intercept`` is truthy.
    The same ``model`` object is returned.
    """
    weights = params[0]
    model.coef_ = weights
    if not model.fit_intercept:
        return model
    model.intercept_ = params[1]
    return model
def get_fitted_lr():
    """Rebuild a ``LogisticRegression`` from text files under ``output/news``.

    Reads ``lr_coef.txt``, ``lr_intercept.txt`` and ``lr_classes.txt``
    relative to ``project_path`` and assigns them to a new estimator.

    ``ndmin`` keeps the array ranks scikit-learn expects: ``np.loadtxt``
    would otherwise return a 1-D ``coef_`` for a single-row file and 0-D
    arrays for single-value files.
    """
    coef_ = np.loadtxt(project_path / "output/news/lr_coef.txt", ndmin=2)
    intercept_ = np.loadtxt(
        project_path / "output/news/lr_intercept.txt", ndmin=1)
    classes_ = np.loadtxt(
        project_path / "output/news/lr_classes.txt", ndmin=1)

    lr = LogisticRegression()
    lr.coef_, lr.intercept_, lr.classes_ = coef_, intercept_, classes_
    return lr
def deserialize_logistic_regression(model_dict):
    """Rebuild a ``LogisticRegression`` from its dictionary form.

    Args:
        model_dict: Mapping with ``"params"`` (constructor keyword
            arguments), ``"classes_"``, ``"coef_"``, ``"intercept_"`` and
            optionally ``"n_iter_"``.

    Returns:
        The reconstructed estimator.
    """
    # The params dict must be unpacked; passing it positionally binds the
    # whole dict to the first constructor parameter.
    model = LogisticRegression(**model_dict["params"])

    model.classes_ = np.array(model_dict["classes_"])
    model.coef_ = np.array(model_dict["coef_"])
    model.intercept_ = np.array(model_dict["intercept_"])
    # n_iter_ used to be filled from "intercept_" (copy-paste slip); it is
    # restored from its own key when the payload carries one.
    if "n_iter_" in model_dict:
        model.n_iter_ = np.array(model_dict["n_iter_"])
    return model
def deserialize_logistic_regression(model_dict):
    """Rebuild a ``LogisticRegression`` from its dictionary form.

    Args:
        model_dict: Mapping with ``'params'`` (constructor keyword
            arguments), ``'classes_'``, ``'coef_'``, ``'intercept_'`` and
            optionally ``'n_iter_'``.

    Returns:
        The reconstructed estimator.
    """
    # The params dict must be unpacked; passing it positionally binds the
    # whole dict to the first constructor parameter.
    model = LogisticRegression(**model_dict['params'])

    model.classes_ = np.array(model_dict['classes_'])
    model.coef_ = np.array(model_dict['coef_'])
    model.intercept_ = np.array(model_dict['intercept_'])
    # n_iter_ used to be filled from 'intercept_' (copy-paste slip); it is
    # restored from its own key when the payload carries one.
    if 'n_iter_' in model_dict:
        model.n_iter_ = np.array(model_dict['n_iter_'])
    return model
def choose(self, pvalues, method, outcome):
    '''
    Randomly choose the state of this node, conditioned on *pvalues*.

    String parent values select an entry of
    ``self.Vdataentry["hybcprob"]``; the remaining parent values are fed
    to a logistic-regression model built from that entry's
    ``mean_scal`` / ``mean_base`` parameters, and an outcome is sampled
    from the resulting class probabilities.

    Arguments:
        1. *pvalues* -- An array containing the assigned states of the
           node's parents. This must be in the same order as the parents
           appear in ``self.Vdataentry['parents']``.
        2. *method* -- Not used by this implementation.
        3. *outcome* -- Not used by this implementation.

    Returns the chosen class as a string. When the entry lists a single
    class, that class is returned without sampling.
    '''
    # NOTE: this changes the process-wide warning filters.
    warnings.filterwarnings("ignore", category=FutureWarning)
    # Unused draw kept so seeded random sequences stay unchanged.
    rand = random.random()

    # Split parents into discrete (string) and continuous values.
    dispvals = []
    lgpvals = []
    for pval in pvalues:
        if isinstance(pval, str):
            dispvals.append(pval)
        else:
            lgpvals.append(pval)

    # Find the parameter set matching the discrete parent states.
    lgdistribution = self.Vdataentry["hybcprob"][str(dispvals)]
    for pvalue in lgpvals:
        assert pvalue != 'default', \
            "Graph skeleton was not topologically ordered."

    classes = lgdistribution["classes"]
    if len(classes) <= 1:
        return str(classes[0])

    model = LogisticRegression(multi_class='multinomial',
                               solver='newton-cg', max_iter=100)
    model.classes_ = np.array(classes, dtype=object)
    model.coef_ = np.array(lgdistribution["mean_scal"],
                           dtype=float).reshape(-1, len(lgpvals))
    model.intercept_ = np.array(lgdistribution["mean_base"], dtype=float)
    distribution = model.predict_proba(
        np.array(lgpvals).reshape(1, -1))[0]

    # Choose by walking the cumulative distribution. Default to the last
    # class so that a draw falling above the accumulated total (rounding)
    # still yields an outcome instead of an unbound index.
    rand = random.random()
    rindex = len(classes) - 1
    ubound = 0
    for interval in range(len(classes)):
        ubound += distribution[interval]
        if rand < ubound:
            rindex = interval
            break
    return str(classes[rindex])
def changing_the_model_coefficients(X, y):
    """Fit a logistic regression, overwrite its parameters, report errors.

    After fitting on ``(X, y)`` the coefficients are replaced by
    ``[[-1, 1]]`` and the intercept by ``[-3]``; the resulting classifier
    is plotted and its number of misclassified samples is printed.
    """
    model = LogisticRegression()
    model.fit(X, y)

    # Hand-picked parameters replace the fitted ones.
    model.coef_, model.intercept_ = np.array([[-1, 1]]), np.array([-3])

    util.plot_classifier(X, y, model)

    predictions = model.predict(X)
    num_err = np.sum(y != predictions)
    print("Number of errors:", num_err)
def test(self, weights, intercepts):
    """Score externally supplied parameters on ``self.df``.

    A ``LogisticRegression`` is fitted on the frame's feature/label
    columns, its coefficients and intercepts are then overridden with
    the given values, and the accuracy on the same data is returned.
    """
    X_test = pd.DataFrame(self.df, columns=self.features)
    y_test = pd.DataFrame(self.df, columns=self.label).values.ravel()

    lr = LogisticRegression()
    lr.fit(X_test, y_test)

    # Replace the fitted parameters with the supplied ones.
    lr.coef_, lr.intercept_ = weights, intercepts

    return lr.score(X_test, y_test)
def LogR_predict():
    """Predict with a logistic regression rebuilt from the request form.

    Reads JSON from the ``X`` and ``params`` form fields, assigns the
    ``coef`` / ``inter`` / ``niter`` / ``classes`` entries to a new
    estimator and returns the predictions as JSON under ``pred``.
    """
    features = json.loads(request.form['X'])
    params = json.loads(request.form['params'])

    reg = LogisticRegression()
    # (estimator attribute, key in the posted params)
    attribute_keys = (
        ('coef_', 'coef'),
        ('intercept_', 'inter'),
        ('n_iter_', 'niter'),
        ('classes_', 'classes'),
    )
    for attribute, key in attribute_keys:
        setattr(reg, attribute, np.array(params[key]))

    predictions = reg.predict(features)
    return jsonify(pred=predictions.tolist())
def examineModel(X_train, y_train):
    """Print the training accuracy of the stored averaged logistic model.

    Parameters are read from the pickle at
    ``../models/avg_logistic_model`` (keys ``coeff_`` and ``intercept_``)
    and the class labels from ``util.classes``.
    """
    stored = util.openPkl("../models/avg_logistic_model")

    clf = LogisticRegression()
    clf.coef_ = stored['coeff_']
    clf.intercept_ = stored['intercept_']
    clf.classes_ = util.classes

    accuracy = clf.score(X_train, y_train)
    print("Averaged model train accuracy:", accuracy)

    # Predictions are computed but not used or returned here.
    avg_preds = clf.predict(X_train)
def resample_voting_weights(self, loc, scale):
    """
    Samples voting weights for the whole ensemble from a normal
    distribution, clipped to [0, 1], and rebuilds ``self.logistic_model``.

    :type loc: numpy.ndarray
    :param loc: the mean of the normal distribution.
    :type scale: float
    :param scale: the std deviation of the normal distribution.
    :rtype: Ensemble
    :return: returns self.
    """
    self.logistic_model = []
    all_preds = self.get_predictions(self.X_train)

    for j in range(self.n_classifiers):
        for c in range(self.n_classes):
            sampled = np.random.normal(loc=loc[j][c], scale=scale)
            self.voting_weights[j][c] = np.clip(
                sampled, a_min=0., a_max=1.)

    is_binary = self.n_classes == 2
    for i, that_class in enumerate(self.classes):
        if is_binary:
            features, targets = all_preds.T, self.y_train
        else:
            # One-vs-rest encoding of predictions and labels.
            binary_preds = (all_preds == that_class).astype(np.int32)
            features, targets = binary_preds.T, self.y_train == that_class

        logistic_regression = LogisticRegression()
        logistic_regression.fit(features, targets)
        # The fitted coefficients are replaced by the sampled weights.
        logistic_regression.coef_ = self.voting_weights[:, i].reshape(
            1, self.n_classifiers)
        self.logistic_model.append(logistic_regression)

        if is_binary:
            # A single model is built in the two-class case.
            break

    return self
def test(user, file):
    """Extract features from a user's recording and print a prediction.

    Reads ``<WORKING_DIRECTORY>/files/<user>/<file>.flac``, writes the
    extracted features to ``<WORKING_DIRECTORY>/test/<file>.npz``, loads
    the user's stored coefficients and prints the model's prediction.
    """
    import soundfile as sf

    features_path = os.path.join(WORKING_DIRECTORY + '/test',
                                 file + '.npz')
    audio_path = os.path.join(WORKING_DIRECTORY + '/files/' + user,
                              file + '.flac')

    data, sampleRate = sf.read(audio_path)
    extract_features(sampleRate, data, features_path, True)

    coefficients_path = os.path.join(WORKING_DIRECTORY + '/coefficients',
                                     user + '.npy')
    coefficients = np.load(coefficients_path)
    testSet = np.load(features_path)

    # NOTE(review): only coef_ is assigned; intercept_ and classes_ are
    # never set, and testSet comes straight from an .npz file — confirm
    # predict() works as intended here.
    lr = LogisticRegression()
    lr.coef_ = coefficients
    print(lr.predict(testSet))
def test_sklearn_train_lr_into_c(self):
    """Conversion must raise ``Float32InfError`` for an oversized weight.

    A binary logistic regression is fitted on two iris features, one
    coefficient is replaced by -3e250, and ``sklearn2graph`` is expected
    to raise.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.datasets import load_iris

    iris = load_iris()
    features = iris.data[:, :2]
    labels = iris.target
    # Merge class 2 into class 1 to obtain a binary problem.
    labels[labels == 2] = 1

    lr = LogisticRegression()
    lr.fit(features, labels)

    # We replace by double too big for floats.
    lr.coef_ = numpy.array([[2.45, -3e250]])

    def convert():
        return sklearn2graph(lr, output_names=['Prediction', 'Score'])

    self.assertRaise(convert, Float32InfError)
def pbubble(mlscore, hr):
    """Return the positive-class probability for each score pair.

    ``hr`` and ``mlscore`` are z-scaled with fixed constants, summed, and
    passed through a logistic model with fixed parameters. Entries whose
    combined score is not finite are returned as NaN.

    Args:
        mlscore: Array of scores; the result has its shape.
        hr: Array combined element-wise with ``mlscore``.

    Returns:
        Array shaped like ``mlscore``.
    """
    # Parameters for the logistic fit determined in Expert_votes.ipynb
    clf = LogisticRegression()
    clf.intercept_ = np.array([1.868260])
    clf.coef_ = np.array([[1.539796]])
    # predict_proba in current scikit-learn reads classes_; only the
    # positive-class column is used below.
    clf.classes_ = np.array([0, 1])

    # z-scaling and combining; see Expert_votes.ipynb
    s = (hr - .2143) / .11379 + (mlscore - .10044) / .5893
    good = np.isfinite(s)

    result = np.zeros_like(mlscore) * np.nan
    # Skip the model call when nothing is finite: scikit-learn rejects
    # zero-sample inputs.
    if np.any(good):
        result[good] = clf.predict_proba(s[good].reshape(-1, 1))[:, 1]
    return result
def predict_x_event_new(dname, ts, filters, B_coef, B_bias):
    """Predict events for several time series and compute the pooled ROC.

    For every entry of ``ts`` the dataset is built with
    ``make_dataset_fast``; series whose labels hold a single unique value
    are skipped. A logistic model with coefficients ``B_coef`` and
    intercept ``B_bias`` scores the samples, and the per-sample results
    are written to ``dname + <series> + 'minotaur_prediction.dat'``.

    Returns:
        ``(auc, fpr, tpr, y_hat, y_bin, y_cat)`` pooled over all
        processed series; the last three are lists.
    """
    thres = 0.99  # might change
    lvls = [0.2, 0.8, 0.95]

    res_y_hat = []
    res_y_bin = []
    res_y_cat = []
    all_y = []

    for series in ts:
        t, X, y = make_dataset_fast(dname, [series], filters)
        if len(np.unique(np.array(y))) == 1:
            print("Time series ", series, " has no events")
            continue

        # load model (new): a one-iteration fit initialises the estimator
        # before its parameters are overwritten.
        model = LogisticRegression(max_iter=1).fit(X, y)
        model.coef_ = B_coef
        model.intercept_ = B_bias

        y_hat = model.predict_proba(X)[:, 1]
        y_bin = y_hat < thres

        # Bucket probabilities into categories 0..3 by the level bounds.
        y_cat = np.zeros(len(y_hat))
        y_cat[(y_hat > lvls[0]) & (y_hat < lvls[1])] = 1
        y_cat[(y_hat > lvls[1]) & (y_hat < lvls[2])] = 2
        y_cat[y_hat > lvls[2]] = 3

        res_y_hat.extend(y_hat)
        res_y_bin.extend(y_bin)
        res_y_cat.extend(y_cat)
        all_y.extend(y)

        # The context manager closes the file even if a write fails.
        with open(dname + series + 'minotaur_prediction.dat', 'w') as fid:
            for ii in range(len(y)):
                fid.write('%.4f, %.4f, %.4f, %.4f, %.4f\n' %
                          (t[ii], y[ii], y_hat[ii], y_bin[ii], y_cat[ii]))

    res_fpr, res_tpr, _ = roc_curve(all_y, res_y_hat, pos_label=1)
    res_auc = roc_auc_score(all_y, res_y_hat)
    # res_fpr: false positive rate (false alarm)
    # res_tpr: true positive rate (hit)
    return res_auc, res_fpr, res_tpr, res_y_hat, res_y_bin, res_y_cat
def set_initial_params(model: LogisticRegression):
    """Initialise the model parameters with zeros.

    The fitted attributes do not exist until ``model.fit`` has run, yet
    the server requests initial parameters from clients at launch, so
    they are created here. See the
    sklearn.linear_model.LogisticRegression documentation for details.
    """
    n_classes = 10  # MNIST has 10 classes
    n_features = 784  # Number of features in dataset

    model.classes_ = np.arange(n_classes)
    model.coef_ = np.zeros((n_classes, n_features))
    if not model.fit_intercept:
        return
    model.intercept_ = np.zeros((n_classes,))
def load_model_info(model_info):
    """Return a longform model from a model info JSON object.

    Parameters
    ----------
    model_info : dict
        The JSON object containing the attributes of a model.

    Returns
    -------
    longform_model : py:class:`adeft.classify.AdeftClassifier`
        The classifier that was loaded from the given JSON object.
    """
    shortforms = model_info['shortforms']
    pos_labels = model_info['pos_labels']
    longform_model = AdeftClassifier(shortforms=shortforms,
                                     pos_labels=pos_labels)

    ngram_range = model_info['tfidf']['ngram_range']
    tfidf = TfidfVectorizer(ngram_range=ngram_range, stop_words='english')
    logit = LogisticRegression(multi_class='auto')

    tfidf.vocabulary_ = model_info['tfidf']['vocabulary_']
    tfidf.idf_ = model_info['tfidf']['idf_']
    logit.classes_ = np.array(model_info['logit']['classes_'],
                              dtype='<U64')
    logit.intercept_ = np.array(model_info['logit']['intercept_'])
    logit.coef_ = np.array(model_info['logit']['coef_'])

    estimator = Pipeline([('tfidf', tfidf), ('logit', logit)])
    longform_model.estimator = estimator

    # These attributes do not exist in older adeft models.
    # For backwards compatibility we check if they are present
    if 'stats' in model_info:
        longform_model.stats = model_info['stats']
    if 'std' in model_info:
        longform_model._std = np.array(model_info['std'])
    # 'version' is assigned here too; it used to be compared with ``==``,
    # so the stored version was never applied to the model.
    for key in ('timestamp', 'training_set_digest', 'params', 'version',
                'confusion_info', 'other_metadata'):
        if key in model_info:
            setattr(longform_model, key, model_info[key])
    return longform_model
def build_model_from_factors(factors, intercept, y, X):
    """Return a logistic regression whose coefficients come from ``factors``.

    ``factors`` holds per-factor entries (for example ``alias``,
    ``is_enabled``, ``weight``). ``preparelist`` turns them, together
    with ``X.columns`` and ``intercept``, into the coefficient list that
    replaces the coefficients fitted on a 75% training split.
    """
    coefficient_list = preparelist(factors, X.columns, intercept)

    # Only the training part of the split is used.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=.25, random_state=42)

    # It would be nice if we didn't have to fit before overriding.
    model = LogisticRegression()
    model.fit(X_train, y_train)
    model.coef_ = np.array([coefficient_list])
    return model
def predict_x_event(dataset_t, dataset_X, dataset_y, B_coef, B_bias, fname_out):
    '''
    Performs prediction for the data inside the dataset structure and
    computes the AUC performance of the predictive model.

    A logistic model with coefficients ``B_coef`` and intercept
    ``B_bias`` scores ``dataset_X``; one line per sample
    (time, label, probability, thresholded flag, category) is written to
    ``fname_out``.

    Returns ``(auc, fpr, tpr, y_hat, y_bin, y_cat)``.
    '''
    X = dataset_X
    y = dataset_y
    t = dataset_t

    thres = 0.99  # might change
    lvls = [0.2, 0.8, 0.95]

    # load model (new): a one-iteration fit initialises the estimator
    # before its parameters are overwritten.
    model = LogisticRegression(max_iter=1).fit(X, y)
    model.coef_ = B_coef
    model.intercept_ = B_bias

    y_hat = model.predict_proba(X)[:, 1]

    res_fpr, res_tpr, _ = roc_curve(y, y_hat, pos_label=1)
    res_auc = roc_auc_score(y, y_hat)
    # res_fpr: false positive rate (false alarm)
    # res_tpr: true positive rate (hit)

    res_y_bin = y_hat < thres

    # Bucket probabilities into categories 0..3 by the level bounds.
    y_cat = np.zeros(len(y_hat))
    y_cat[(y_hat > lvls[0]) & (y_hat < lvls[1])] = 1
    y_cat[(y_hat > lvls[1]) & (y_hat < lvls[2])] = 2
    y_cat[y_hat > lvls[2]] = 3

    # save; the context manager closes the file even if a write fails.
    with open(fname_out, 'w') as fid:
        for ii in range(len(y)):
            fid.write('%.4f, %.4f, %.4f, %.4f, %.4f\n' %
                      (t[ii], y[ii], y_hat[ii], res_y_bin[ii], y_cat[ii]))

    return res_auc, res_fpr, res_tpr, y_hat, res_y_bin, y_cat
def fed_integrate_model_lr(model1, model2):
    """Average two logistic-regression models into a new one.

    Coefficients and intercepts are summed with ``mulkeys_add_2Darray`` /
    ``mulkeys_add_array`` and halved; the class labels are taken from
    ``model1``.

    Returns:
        A new ``LogisticRegression(solver='sag')`` with the averaged
        parameters.
    """
    model = LogisticRegression(solver='sag')
    model.coef_ = mulkeys_add_2Darray(model1.coef_, model2.coef_) / 2
    # The intercept previously averaged model1 with itself, ignoring
    # model2's intercept.
    model.intercept_ = mulkeys_add_array(
        model1.intercept_, model2.intercept_) / 2
    model.classes_ = model1.classes_
    return model
def binary_logi(base_data, oneshot_data, param):
    """Fit a binary logistic regression separating base from one-shot data.

    ``base_data`` samples are labelled 0.0 and ``oneshot_data`` samples
    1.0. When ``param`` is given, ``param[:-1]`` and ``param[-1]`` seed
    ``coef_`` and ``intercept_`` of the warm-started estimator before
    fitting.

    Returns:
        ``np.c_[coef_, intercept_]`` of the fitted model.
    """
    base = list(base_data)
    oneshot = list(oneshot_data)
    tr = base + oneshot
    tr_label = [0.0] * len(base) + [1.0] * len(oneshot)
    tr, tr_label = shuffle(tr, tr_label)

    clf = LogisticRegression(warm_start=True)
    # Function-call form prints the same under Python 2 and, unlike the
    # bare print statement, also parses on Python 3.
    print(param)
    if param is not None:
        clf.coef_ = param[:-1]
        clf.intercept_ = param[-1]
    clf.fit(tr, tr_label)
    return np.c_[clf.coef_, clf.intercept_]
def evaluate_sample(sample, central_text):
    """Score candidates in ``sample`` with a fixed logistic model.

    Args:
        sample: DataFrame holding the feature columns listed in
            ``train_columns`` and the display columns listed in
            ``non_feature_columns``.
        central_text: Not used by this function.

    Returns:
        De-duplicated ``(candidate_text, score)`` pairs taken from the
        first 50 rows, sorted by score in descending order.
    """
    train_columns = [
        'CANDIDATE_CONTAINS_DIGITS', 'CANDIDATE_CONTAINS_PUNCTUATION',
        'CANDIDATE_IS_STOPWORD', 'CHAR_DISTANCE',
        'CHAR_DISTANCE_/_LEN_CAND_TEXT', 'CHAR_DISTANCE_/_LEN_ORIG_TEXT',
        'LEN_CAND_PRONOUNCE', 'LEN_ORIG_PRONOUNCE',
        'MEAN_CAND_PHRASE_FREQ_IN_CLIENT_VOCAB',
        'MEAN_CAND_PHRASE_TFIDF_IN_CLIENT_VOCAB',
        'MEAN_FREQUENCY_OF_CAND_WORD_BY_WORDFREQ',
        'MEAN_FREQUENCY_OF_ORIG_WORD_BY_WORDFREQ',
        'MEAN_ORIG_PHRASE_FREQ_IN_CLIENT_VOCAB',
        'MEAN_ORIG_PHRASE_TFIDF_IN_CLIENT_VOCAB',
        'ORIG_PHRASE_IS_STOPWORD', 'PHONEME_DISTANCE',
        'PHONEME_DISTANCE_/_LEN_CAND_PRONOUNCE',
        'PHONEME_DISTANCE_/_LEN_ORIG_PRONOUNCE'
    ]
    non_feature_columns = [
        'target',
        'badly_recognized_text',
        'candidate_text',
        'context',
    ]
    WEIGHTS = np.array([[
        0.00000000e+00, 1.19336941e+00, -9.37956263e-01, -9.10860368e-01,
        -4.40045761e-01, -4.56662605e-02, 2.95188685e-01, 1.39309686e-01,
        -2.00982855e-02, 8.42150367e-01, -1.83510718e-02, 1.41584803e-02,
        -2.58391711e-02, -1.73674143e-01, 3.30475285e-01, 1.14180462e-01,
        4.28573585e-04, 3.97560332e-04
    ]])
    BIAS = np.array([-1.24675658])

    X_test = sample[train_columns].to_numpy()

    clf = LogisticRegression()
    clf.coef_ = WEIGHTS
    clf.intercept_ = BIAS
    # predict_proba in current scikit-learn reads classes_; only the
    # positive-class column is used below.
    clf.classes_ = np.array([0, 1])
    ts_preds = clf.predict_proba(X_test)[:, 1]

    df_to_display = sample[non_feature_columns].copy()
    df_to_display['score'] = ts_preds
    df_to_display = df_to_display.head(50).sort_values(by='score',
                                                       ascending=False)

    # The set removes duplicate (text, score) pairs before the final sort.
    items = list({
        (df_to_display.candidate_text.iloc[i], df_to_display.score.iloc[i])
        for i in range(df_to_display.shape[0])
    })
    items.sort(key=lambda x: x[1], reverse=True)
    return items
def convert(self, model_dict):
    """Convert a HomoLR model dictionary to a scikit-learn estimator.

    Hyper-parameters come from ``model_dict["HomoLogisticRegressionMeta"]``
    and the weights, intercept and iteration count from
    ``model_dict["HomoLogisticRegressionParam"]``.
    """
    param_obj = model_dict["HomoLogisticRegressionParam"]
    meta_obj = model_dict["HomoLogisticRegressionMeta"]

    sk_lr_model = LogisticRegression(
        penalty=meta_obj.penalty.lower(),
        tol=meta_obj.tol,
        fit_intercept=meta_obj.fit_intercept,
        max_iter=meta_obj.max_iter,
    )

    # One row of coefficients, ordered like the header.
    header = param_obj.header
    ordered_weights = [param_obj.weight[header[index]]
                       for index in range(len(header))]
    sk_lr_model.coef_ = np.array([ordered_weights], dtype=float)
    sk_lr_model.intercept_ = np.array([param_obj.intercept])

    # hard-coded 0-1 classification as HomoLR only supports this for now
    sk_lr_model.classes_ = np.array([0., 1.])
    sk_lr_model.n_iter_ = [param_obj.iters]
    return sk_lr_model
def runClassification(spend, marital, hh_size, income):
    """Classify one instance with a logistic model using fixed parameters.

    Args:
        spend: TOTAL_SPEND feature value.
        marital: MARITAL feature value.
        hh_size: HH_SIZE feature value.
        income: INCOME_RANGE feature value.

    Returns:
        The predicted class label as a Python scalar.
    """
    # define the model
    model = LogisticRegression(max_iter=1000)

    # dummy training to initialize weights and biases of the model
    model.fit(np.array([[0, 0, 0, 0], [1, 1, 1, 1]]), [0, 1])

    # Assigning trained weights and biases to the model
    model.coef_ = np.array(
        [[0.01418876, -0.34609983, 0.14408751, 0.02292672]])  # weights
    # scikit-learn reads the bias from ``intercept_``. It was previously
    # stored in ``bias_``, which predict() never consults, so the
    # dummy-fit intercept was used instead.
    model.intercept_ = np.array([-0.3975337])  # bias

    # [[TOTAL_SPEND, MARITAL, HH_SIZE, INCOME_RANGE]]
    example_instance = np.array([[spend, marital, hh_size, income]])

    # Test the model
    prediction = model.predict(example_instance)
    return prediction[0].item()
def get_density_score(self, file):
    """Return the density score of a DICOM file.

    The manufacturer read from the DICOM selects between the ``GE_*`` and
    ``HO_*`` settings on ``self`` (normaliser config, coefficient and
    intercept files, scaler) used to assemble the ``DenseRisk`` model.
    """
    preprocessor = Preprocessor(width=256, height=224, interpolation=3)
    dicom = preprocessor.load_dicom(file)

    # 'GE' uses the GE_* settings; every other manufacturer uses HO_*.
    prefix = 'GE' if DicomManager.get_manufacturer(dicom) == 'GE' else 'HO'

    normalizer = CLAHENormalizer(
        **getattr(self, prefix + '_MD_NORMALIZER_CONFIG'))

    lr = LogisticRegression()
    lr.coef_ = np.load(getattr(self, prefix + '_DR_COEF'))
    lr.intercept_ = np.load(getattr(self, prefix + '_DR_INT'))

    model = DenseRisk(preprocessor=preprocessor, normalizer=normalizer)
    model.risk_model = lr
    model.num_feature = self.DR_NUM_FEATURES
    model.scaler = getattr(self, prefix + '_DR_SCALER')
    return model.get_density_score(file)
def load_model(serialization_dir):
    """Rebuild the classifier and vectorizer of a stored model.

    Hyper-parameters and vocabulary are read from ``args.model``; the
    fitted arrays are read from ``<serialization_dir>/archive``.

    Returns:
        Tuple ``(classifier, vect)``.
    """
    # NOTE(review): these two files are read from the module-level
    # ``args.model`` rather than ``serialization_dir`` — confirm intended.
    with open(os.path.join(args.model, "best_hyperparameters.json"),
              'r') as f:
        hyperparameters = json.load(f)

    # Vectorizer options are removed so that only classifier keyword
    # arguments remain in ``hyperparameters``.
    stop_words = 'english' if hyperparameters.pop('stopwords') == 1 else None
    weight = hyperparameters.pop('weight')
    ngram_range = sorted(
        int(x) for x in hyperparameters.pop('ngram_range').split())

    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words, lowercase=True,
                               ngram_range=ngram_range)
    else:
        vect = CountVectorizer(binary=(weight == 'binary'),
                               stop_words=stop_words, lowercase=True,
                               ngram_range=ngram_range)

    with open(os.path.join(args.model, "vocab.json"), 'r') as f:
        vect.vocabulary_ = json.load(f)

    for name in ('C', 'tol'):
        hyperparameters[name] = float(hyperparameters[name])
    classifier = LogisticRegression(**hyperparameters)

    archive_dir = os.path.join(serialization_dir, "archive")

    # The idf file is optional and only loaded when it exists.
    idf_path = os.path.join(archive_dir, "idf.npy")
    if os.path.exists(idf_path):
        vect.idf_ = np.load(idf_path)

    classifier.coef_ = np.load(os.path.join(archive_dir, "coef.npy"))
    classifier.intercept_ = np.load(
        os.path.join(archive_dir, "intercept.npy"))
    classifier.classes_ = np.load(os.path.join(archive_dir, "classes.npy"))
    return classifier, vect
def load_model_info(model_info):
    """Return a longform model from a model info JSON object.

    Parameters
    ----------
    model_info : dict
        The JSON object containing the attributes of a model.

    Returns
    -------
    longform_model : py:class:`adeft.classify.AdeftClassifier`
        The classifier that was loaded from the given JSON object.
    """
    tfidf_info = model_info['tfidf']
    logit_info = model_info['logit']

    longform_model = AdeftClassifier(shortforms=model_info['shortforms'],
                                     pos_labels=model_info['pos_labels'])

    tfidf = TfidfVectorizer(ngram_range=tfidf_info['ngram_range'],
                            stop_words='english')
    logit = LogisticRegression(multi_class='auto')

    # Restore the fitted state of both pipeline steps.
    tfidf.vocabulary_ = tfidf_info['vocabulary_']
    tfidf.idf_ = tfidf_info['idf_']
    logit.classes_ = np.array(logit_info['classes_'], dtype='<U64')
    logit.intercept_ = np.array(logit_info['intercept_'])
    logit.coef_ = np.array(logit_info['coef_'])

    longform_model.estimator = Pipeline([('tfidf', tfidf),
                                         ('logit', logit)])

    # Model statistics are optional.
    if 'stats' in model_info:
        longform_model.stats = model_info['stats']
    # Standard deviations for calculating feature importances are
    # optional as well.
    if 'std' in model_info:
        longform_model._std = np.array(model_info['std'])
    return longform_model
def test_categorical_predicting(self):
    """Test categorical softmax classifier.

    Overrides a fitted multinomial scikit-learn ``LogisticRegression``
    with known weights, builds the project's own
    ``models.LogisticRegression`` from the same weights, and checks that
    both predict ``y`` and agree on the probabilities to within 1e-5.
    """
    target_num = 5
    (W, b), (X, y) = self.make_lr_data(target_num=target_num,
                                       dtype=glue.config.floatX)

    lr = LogisticRegression(multi_class='multinomial',
                            solver='lbfgs').fit(X, y)
    lr.coef_ = W.T
    lr.intercept_ = b
    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    self.assertTrue(np.all(lr.predict(X) == y))

    graph = G.Graph()
    with graph.as_default():
        input_var = G.make_placeholder('inputs',
                                       shape=(None, W.shape[0]),
                                       dtype=glue.config.floatX)
        input_layer = G.layers.InputLayer(input_var,
                                          shape=(None, W.shape[0]))
        lr2 = models.LogisticRegression('logistic', input_layer,
                                        target_num=target_num, W=W, b=b)
        predict_prob = G.layers.get_output(lr2)
        predict_label = G.op.argmax(predict_prob, axis=1)
        predict_fn = G.make_function(
            inputs=[input_var], outputs=[predict_prob, predict_label])

    with G.Session(graph):
        prob, predict = predict_fn(X)

    self.assertTrue(np.all(predict == y))
    err = np.max(abs(lr.predict_proba(X) - prob))
    self.assertLess(err, 1e-5)
# Permutation test of decoding scores, one test per GAT estimator group.
# ``cls_all``, ``pln_all`` and ``data_path`` are defined outside this
# fragment.
data_cls = np.asarray(cls_all)
data_pln = np.asarray(pln_all)

# Load GAT model
gat = joblib.load(data_path + "decode_time_gen/gat_cp.jl")

# Setup data for epochs and cross validation
# Label 0 marks the ``cls`` samples and label 1 the ``pln`` samples.
X = np.vstack([data_cls, data_pln])
y = np.concatenate([np.zeros(len(data_cls)), np.ones(len(data_pln))])
cv = StratifiedKFold(n_splits=7, shuffle=True)

perm_score_results = []
for j, est in enumerate(gat.estimators_):
    # NOTE(review): ``tmp`` is unused, so each pass of this inner loop
    # rebuilds the same averaged model. The original nesting of the
    # statements below was lost in extraction — confirm which of them
    # belong inside this loop.
    for tmp in est:
        lr_mean = LogisticRegression(C=0.0001)
        # Average coefficients and intercepts over the models in ``est``.
        lr_mean.coef_ = np.asarray([lr.coef_ for lr in est]).mean(
            axis=0).squeeze()
        lr_mean.intercept_ = np.asarray([lr.intercept_
                                         for lr in est]).mean()
    # The j-th slice along the last axis of X is tested against y.
    score, perm_score, pval = permutation_test_score(
        lr_mean,
        X[:, :, j],
        y,
        cv=cv,
        scoring="roc_auc",
        n_permutations=2000)
    perm_score_results.append({
        "score": score,
        "perm_score": perm_score,
        "pval": pval
    })

# The list is written with joblib, although the file name ends in .npy.
joblib.dump(perm_score_results,
            data_path + "decode_time_gen/perm_score_results_cp.npy")
def logistic_from_weights(weights, intercept, classes=None):
    """Rebuild a trained logistic regression from its parameters.

    Args:
        weights: Value assigned to ``coef_``.
        intercept: Value assigned to ``intercept_``.
        classes: Optional class labels assigned to ``classes_``. The
            estimator's ``predict`` maps scores to these labels, so pass
            them when label predictions are needed. When omitted, the
            attribute is left unset, exactly as before.

    Returns:
        The reconstructed ``LogisticRegression``.
    """
    logreg = LogisticRegression()
    logreg.coef_ = weights
    logreg.intercept_ = intercept
    if classes is not None:
        logreg.classes_ = classes
    return logreg