def test_save_and_load_big():
    gb = setup_big()
    save(gb, 'test.json')
    gb2 = read('test.json')
    assert len(gb2.students) == 50
    assert len(gb2.gradeables) == 21
    assert len(gb2.scores) == 20 * 50 * 20
def save(self):
    if self.save_every <= 1:
        persist.save(UserData.data, self.persist)
        self.save_every = 3
    else:
        self.save_every -= 1
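# A minimal, self-contained sketch of the write-throttling pattern in the
# save() method above: flush to disk on the first call, then on every third
# call thereafter. Everything here (the Persisted class, the UserData
# stand-in, the fake persist function, the file path) is a hypothetical
# illustration, not part of the original code.
class UserData:
    data = {'volume': 7}

def fake_persist_save(data, path):
    print(f'wrote {data} to {path}')

class Persisted:
    def __init__(self, path):
        self.persist = path
        self.save_every = 1   # force a real write on the first call

    def save(self):
        if self.save_every <= 1:
            fake_persist_save(UserData.data, self.persist)
            self.save_every = 3   # reset the countdown
        else:
            self.save_every -= 1  # skip this call

p = Persisted('settings.json')
for _ in range(7):
    p.save()   # writes on calls 1, 4 and 7; the other calls only decrement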
def test_save_and_load():
    gb = setup()
    save(gb, 'test.json')
    gb2 = read('test.json')
    assert len(gb2.students) == 2
    assert len(gb2.categories) == 2
    assert len(gb2.gradeables) == 3
def predict(self, papers):
    """
    Generates predictions from the trained classifiers.
    Each binary classifier is applied once.

    Parameters
    ==========
    papers : pd.DataFrame
        Papers that we want to classify. Required column:
        tokens_baseline - previously tokenized title-abstract

    Returns
    =======
    scores : pd.DataFrame
        DataFrame containing the predictions generated by each model.
        Each column corresponds to a review group and the values in that
        column are the probabilities that each paper belongs to that
        review group.
    """
    scores = {}
    tokenized_papers = list(papers[self.tokens_col])

    # get embeddings for papers
    if check_persisted(f"{self.env['store_misc']}/embeddings",
                       f'{self.vectorizer_hash}_y', self.load_fresh):
        weighted_embeddings = load(f"{self.env['store_misc']}/embeddings",
                                   f'{self.vectorizer_hash}_y')
    else:
        self.embeddings_model = load_word2vec(self.env['word2vec_model'])
        vec = self.vectorizer
        X = vec.transform(tokenized_papers)
        weighted_embeddings = np.array(self.create_embeddings(X, vec))
        save(weighted_embeddings, f"{self.env['store_misc']}/embeddings",
             f'{self.vectorizer_hash}_y', persist=True)

    for model_group in tqdm(self.models, desc='Test Review Groups'):
        # get the classifier
        classifier = self.models[model_group]

        # predictions as probabilities
        y_preds = classifier.predict_proba(weighted_embeddings)
        probabilities = y_preds[:, 1]

        # store scores of model
        scores[model_group] = probabilities

    scores = pd.DataFrame.from_dict(scores)
    return scores
def update_settings():
    persist.settings['redId'] = sensors[0].sensor_id
    persist.settings['blueId'] = sensors[1].sensor_id
    persist.settings['greenId'] = sensors[2].sensor_id
    persist.settings['setpointC'] = controller.setpoint
    persist.settings['logEnabled'] = logging_enabled
    persist.settings['controlEnabled'] = controller.enabled
    persist.settings['mode'] = mode
    persist.settings['sg'] = specific_gravity
    persist.settings['fermentStart'] = ferment_start_time
    persist.save(SETTINGS_FILE)
def train_best_models(keys, X_train, y_train, best_models, prod_config,
                      local_paths):
    """
    Train and store model objects for each review group.

    Parameters
    ==========
    keys : list
        Key values to subset best_models by.
    X_train : DataFrame
        Training features.
    y_train : DataFrame
        Training labels.
    best_models : dict
        Dictionary where key=review group and value=dictionary with best
        algorithm and hyperparameters for that group and specified
        minimum recall value.
    prod_config : dict
        Config file for production pipeline. Includes recall values for
        each group and features to pull into training data.
    local_paths : dict
        Local directory paths, used to store models for production.

    Returns
    =======
    None
    """
    # subset best models
    best_models = {k: best_models[k] for k in keys}

    # loop through review groups and train each model on all data
    for review_group, params in best_models.items():
        print(f'training {review_group}')
        classifier = select_classifier(
            classifier_name=params['algorithm'],
            fold_hash='prod',
            target=None,
            classes=list(prod_config['classes']),
            model_parts={},
            hyperparameters=eval(params['hyperparameters']),
            seed=prod_config['seed'],
            citations_cols=prod_config['citations_cols'],
            env=local_paths,
            load_fresh=False)

        classifier.train(X_train, y_train[[review_group.lower()]])

        # store models locally
        save(object=classifier,
             location=local_paths['store_production_models'],
             filename=f"prod_models_{review_group}")
        print(f'training {review_group} done')
def __call__(self, *args, **kwds):
    key = (tuple(args), tuple(kwds.items()))
    h = hash(key)
    name = '%s/%s_%s.sobj' % (self.__dir, self.__func.__name__, h)
    if os.path.exists(name):
        key2, val = persist.load(name)
        if key == key2:
            # We save and test equality of keys to avoid the (extremely
            # remote) possibility of a hash collision. Correctness is
            # crucial in mathematics.
            return val
    val = self.__func(*args, **kwds)
    persist.save((key, val), name)
    return val
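# For context, the __call__ above belongs to a disk-backed memoization
# decorator. Below is a minimal, runnable sketch of such a class, assuming
# persist.save/persist.load behave like pickle; the class name disk_cached,
# the cache directory, and the pickle stand-ins are hypothetical, not the
# original API.
import os
import pickle

def _save(obj, name):
    with open(name, 'wb') as f:
        pickle.dump(obj, f)

def _load(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

class disk_cached:
    def __init__(self, func, dir='.cache'):
        os.makedirs(dir, exist_ok=True)
        self.__func = func
        self.__dir = dir

    def __call__(self, *args, **kwds):
        key = (tuple(args), tuple(kwds.items()))
        name = '%s/%s_%s.sobj' % (self.__dir, self.__func.__name__, hash(key))
        if os.path.exists(name):
            key2, val = _load(name)
            if key == key2:  # guard against hash collisions
                return val
        val = self.__func(*args, **kwds)
        _save((key, val), name)
        return val

@disk_cached
def slow_square(x):
    return x * x

slow_square(12)   # computed once, then cached under .cache/
slow_square(12)   # identical call is served from disk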
def perform_model_selection(evaluate_best_models=True):
    """
    Run model selection pipeline.
    """
    # load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # load production config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    if evaluate_best_models:
        # test best models for each review group
        scored_papers_test = load(location=local_paths['store_scored_papers'],
                                  filename='scored_papers')

        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        rg_list = []
        wrkld_reductions = []

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):
            rg = review_group.lower()

            # get thresholds
            thresholds = [upper_thresholds[rg], lower_thresholds[rg]]
            recall_at_threshold = [
                recall_at_thresholds[rg]['upper'],
                recall_at_thresholds[rg]['lower']
            ]
            workload_reduction = [keep[rg], consider[rg], discard[rg]]

            rg_list.append(rg)
            wrkld_reductions.append(workload_reduction)

        d = {'review_group': rg_list, 'workload_reduction': wrkld_reductions}
        df = pd.DataFrame.from_dict(d)
        plot_average_workload_reduction(df)

    connection.close()
    print("Model selection pipeline complete.")
def train(self, x_train, y_train):
    """
    Trains one AdaBoost classifier per review group and stores it in a
    dictionary that is a class attribute.

    Parameters
    ==========
    x_train : pd.DataFrame
        DataFrame with columns corresponding to features to include in
        the model.
    y_train : pd.DataFrame
        DataFrame containing the labels for each paper. Each column
        represents one review group with binary labels.

    Returns
    =======
    None
    """
    ### preprocess ###
    if self.tfidf:
        # check if vectorizer has been created before, if so load from file
        if check_persisted(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_X', self.load_fresh):
            vec = load(f"{self.env['store_misc']}/tfidf",
                       f'{self.vectorizer_hash}_vec')
            X = load(f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X')
        else:
            # get the tokenized papers
            tokenized_papers = list(x_train[self.tokens_col])
            vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                  max_features=self.max_vocab_f,
                                  strip_accents='unicode',
                                  token_pattern=self.token_pattern,
                                  min_df=self.min_df)

            # generate term document matrix (model inputs)
            X = vec.fit_transform(tokenized_papers)
            save(vec, f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_vec', persist=True)
            save(X, f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_X', persist=True)
        self.vectorizer = vec
    else:
        X = x_train

    # discard fold ID column from labels
    review_groups = [col for col in y_train.columns if not col == 'k']

    # train
    for review_group in tqdm(review_groups, desc='Train Review Groups'):
        # pull label column
        labels = y_train[review_group]

        # fit AdaBoost classifier
        classifier = AdaBoostClassifier(base_estimator=self.base_estimator,
                                        n_estimators=self.n_estimators,
                                        learning_rate=self.learning_rate,
                                        algorithm=self.algorithm,
                                        random_state=self.seed).fit(X, labels)

        # save classifier to class attribute
        self.models[review_group] = classifier
def train(self, x_train, y_train):
    """
    Trains one elastic net logistic classifier per review group. Saves
    the trained classifiers within self.models.

    Parameters
    ==========
    x_train : pandas DataFrame
        DataFrame containing the papers we aim to classify, with as
        columns (at least):
        tokens_baseline - previously tokenized title-abstract
    y_train : pandas DataFrame
        DataFrame containing the labels for each paper. Each column
        represents one review group with binary labels.
    """
    # check if vectorizer has been created before, if so load from file
    if check_persisted(f"{self.env['store_misc']}/pca",
                       f'{self.vectorizer_hash}_X', self.load_fresh):
        pca = load(f"{self.env['store_misc']}/pca",
                   f'{self.vectorizer_hash}_pca')
        vec = load(f"{self.env['store_misc']}/pca",
                   f'{self.vectorizer_hash}_vec')
        X = load(f"{self.env['store_misc']}/pca",
                 f'{self.vectorizer_hash}_X')
    else:
        # get the tokenized papers
        tokenized_papers = list(x_train[self.tokens_col])
        vec = TfidfVectorizer(ngram_range=self.ngram_range,
                              max_features=self.max_vocab_f,
                              strip_accents='unicode')

        # generate term document matrix (model inputs)
        X_tfidf = vec.fit_transform(tokenized_papers)

        # reduce dimensionality of the sparse tf-idf features with
        # truncated SVD
        pca = TruncatedSVD(n_components=self.n_components,
                           random_state=self.seed)
        X = pca.fit_transform(X_tfidf)

        save(pca, f"{self.env['store_misc']}/pca",
             f'{self.vectorizer_hash}_pca', persist=True)
        save(vec, f"{self.env['store_misc']}/pca",
             f'{self.vectorizer_hash}_vec', persist=True)
        save(X, f"{self.env['store_misc']}/pca",
             f'{self.vectorizer_hash}_X', persist=True)

    self.pca = pca
    self.vectorizer = vec

    # discard fold ID column from labels
    review_groups = [col for col in y_train.columns if not col == 'k']

    for review_group in tqdm(review_groups, desc='Train Review Groups'):
        # pull label column
        labels = y_train[review_group]

        # logistic classifier with elastic net regularisation
        classifier = SGDClassifier(loss="log",
                                   alpha=self.alpha,
                                   l1_ratio=self.l1_ratio,
                                   penalty="elasticnet").fit(X, labels)

        # save the model in dictionary of models
        self.models[review_group] = classifier
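# Design note on the snippet above: the tf-idf matrix is sparse, and
# sklearn's PCA would require densifying it, so TruncatedSVD performs the
# dimensionality reduction instead (tf-idf followed by truncated SVD is
# latent semantic analysis). A self-contained toy version of that two-step
# transform, with made-up documents:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['randomised trial of aspirin',
        'cohort study of statins',
        'aspirin versus placebo trial']

vec = TfidfVectorizer(strip_accents='unicode')
X_tfidf = vec.fit_transform(docs)   # sparse term-document matrix

svd = TruncatedSVD(n_components=2, random_state=0)
X = svd.fit_transform(X_tfidf)      # dense (3, 2) array fed to the classifier
print(X.shape)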
""" yacht control """ import sys import speak import time import persist from menu import * # the following are application specific so should be in the menu from switch import * from barometer import * persist.save(Switch(),'barotalk') persist.save(Switch(),'anchortalk') def visit(item) : action = item.getAttribute('action') if action == "" : text = item.getAttribute('title') else : text = eval(action) speak.say(text) print text name = sys.argv[1] menu = Menu(name) menu.run(visit)
def train(self, x_train, y_train):
    """
    Trains one random forest classifier per review group on weighted word
    embeddings. Saves the trained classifiers within self.models.

    Parameters
    ==========
    x_train : pandas DataFrame
        DataFrame containing the papers we aim to classify, with a
        column corresponding to the tokens to use.
    y_train : pandas DataFrame
        DataFrame containing the labels for each paper. Each column
        represents one review group with binary labels.
    """
    # check if vectorizer has been created before, if so load from file
    if check_persisted(f"{self.env['store_misc']}/tfidf",
                       f'{self.vectorizer_hash}_X', self.load_fresh):
        vec = load(f"{self.env['store_misc']}/tfidf",
                   f'{self.vectorizer_hash}_vec')
        X = load(f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_X')
    else:
        # get the tokenized papers
        tokenized_papers = list(x_train[self.tokens_col])
        vec = TfidfVectorizer(max_features=self.max_vocab_f,
                              strip_accents='unicode')

        # generate term document matrix (model inputs)
        X = vec.fit_transform(tokenized_papers)
        save(vec, f"{self.env['store_misc']}/tfidf",
             f'{self.vectorizer_hash}_vec', persist=True)
        save(X, f"{self.env['store_misc']}/tfidf",
             f'{self.vectorizer_hash}_X', persist=True)

    self.vectorizer = vec

    # check if embeddings have been created before, if so load from file
    if check_persisted(f"{self.env['store_misc']}/embeddings",
                       f'{self.vectorizer_hash}_X', self.load_fresh):
        weighted_embeddings = load(f"{self.env['store_misc']}/embeddings",
                                   f'{self.vectorizer_hash}_X')
    else:
        self.embeddings_model = load_word2vec(self.env['word2vec_model'])
        weighted_embeddings = np.array(self.create_embeddings(X, vec))
        save(weighted_embeddings, f"{self.env['store_misc']}/embeddings",
             f'{self.vectorizer_hash}_X', persist=True)

    # discard fold ID column from labels
    review_groups = [col for col in y_train.columns if not col == 'k']

    for review_group in tqdm(review_groups, desc='Train Review Groups'):
        # pull label column
        labels = y_train[review_group]

        # fit random forest model
        classifier = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            max_features=self.max_features,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.seed,
            n_jobs=self.n_jobs).fit(weighted_embeddings, labels)

        # save the model in dictionary of models
        self.models[review_group] = classifier
def run_pipeline(ignition_file, persist_all, load_all_fresh):
    """
    An ad hoc pipeline created to mirror the standard ML pipeline and
    work with citations data.

    Parameters
    ==========
    ignition_file : string
        Name of the yaml file for which you want to run an experiment.
    persist_all : boolean
        True if you want to persist all data for future use.
    load_all_fresh : boolean
        True if you want to avoid any persisted data and load new data
        from scratch.

    Returns
    =======
    None
    """
    model_parts = {}

    ##### 1. LOAD ENVIRONMENT DATA #####
    # load local paths
    local_paths_env = load_local_paths('local_paths.yaml')
    print('Local paths loaded.')

    # load ignition file
    ignition = load_config(local_paths_env['ignition_path'] + ignition_file)
    print('Ignition loaded.')

    # id used for persisting
    hash_id = create_hash_id(str(ignition['id']))
    print('Hash id created.')

    # create hyperparameter combinations (for k-folding)
    hyperparameters = expand_grid(ignition['hyperparameters'])

    # load environment file
    psql_env = load_psql_env(pgpass_path=local_paths_env['pgpass_path'])
    print('PSQL environment file loaded.')

    # initiate PSQL connection
    connection = SQLConn(psql_env)
    connection.open()

    ##### 2. LOAD TRAIN AND TEST DATA #####
    if check_persisted(local_paths_env['store_train_data'], f'{hash_id}_x',
                       load_all_fresh):
        print("Found data")
        # data loaded before: load from file
        X_train = load(local_paths_env['store_train_data'], f'{hash_id}_x')
        X_test = load(local_paths_env['store_test_data'], f'{hash_id}_x')
        y_train = load(local_paths_env['store_train_data'], f'{hash_id}_y')
        y_test = load(local_paths_env['store_test_data'], f'{hash_id}_y')
        print('Loaded data from file.')
    else:
        print("Data not found in storage - load from database")
        # data not loaded: pull from database and create features
        X_train, X_test, y_train, y_test = sample(
            ignition, connection, local_paths_env['store_features'])
        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")

        # add fold index column to data
        X_train, y_train = k_fold(X_train, y_train, ignition['k_folds'],
                                  ignition['k_folds_seed'])

        # save data to file for future use
        save(X_train, local_paths_env['store_train_data'], f'{hash_id}_x',
             persist_all)
        save(X_test, local_paths_env['store_test_data'], f'{hash_id}_x',
             persist_all)
        save(y_train, local_paths_env['store_train_data'], f'{hash_id}_y',
             persist_all)
        save(y_test, local_paths_env['store_test_data'], f'{hash_id}_y',
             persist_all)
    print('Data loading completed.')

    ##### 3. K-FOLDING #####
    # loop over folds
    for fold in tqdm(range(ignition['k_folds']), desc='Folds'):

        # get fold id hash (for persisting)
        fold_id = create_hash_id(str(ignition['id']) + str(fold))

        # get fold data
        fold_X_train = X_train[X_train['k'] != fold]
        fold_X_test = X_train[X_train['k'] == fold]
        fold_y_train = y_train[y_train['k'] != fold]
        fold_y_test = y_train[y_train['k'] == fold]

        # store fold features, if any
        fold_features = {}

        ##### 4. LOOP OVER HYPERPARAMETERS: TRAIN CLASSIFIER #####
        for hyperparam in tqdm(hyperparameters, desc='Hyperparameters'):

            # create hyperparam unique id and hyperparam-fold unique id
            hyperparam_id = create_hash_id(
                str(ignition['id']) + str(hyperparam))
            hyperparam_fold_id = create_hash_id(
                str(ignition['id']) + str(hyperparam) + str(fold))

            # create classifier of specified type and with specified target
            classifier = select_classifier(ignition["model_type"],
                                           fold_id,
                                           ignition["target"],
                                           ignition["classes"],
                                           fold_features,
                                           hyperparameters=hyperparam,
                                           seed=ignition['seed'],
                                           env=local_paths_env,
                                           load_fresh=load_all_fresh)

            # train classifier
            classifier.train(fold_X_train, fold_y_train)

            ##### 5. TEST CLASSIFIER #####
            # generate predictions from classifier
            y_probs = classifier.predict(fold_X_test)

            ##### 6. EVALUATION #####
            for recall in tqdm(ignition['recalls'], desc='Evaluations'):

                # compute evaluation metrics
                all_metrics = compute_metrics(
                    metric_names=ignition['metrics'],
                    y_true=fold_y_test.drop(columns=['k']),
                    y_pred=y_probs,
                    k=recall)

                # store results in database
                unique_id = create_hash_id(
                    str(ignition['id']) + str(hyperparam) + str(fold) +
                    str(recall))
                results_to_db(metrics=all_metrics,
                              table_name=ignition['results_table_name'],
                              ignition_id=ignition['id'],
                              hash_id=hyperparam_fold_id,
                              algorithm=ignition['model_type'],
                              hyperparameters=hyperparam,
                              fold=str(fold),
                              recall=recall,
                              unique_id=unique_id,
                              connection=connection)

    connection.close()
    print(f"Done running pipeline for ignition id: {ignition['id']}!")
def save_current(gb):
    persist.save(gb, gb.file_name())
    persist.log_config_warnings(gb)
def train(self, x_train, y_train):
    """
    Trains one XGBoost classifier per review group. Saves the trained
    classifiers within self.models.

    Parameters
    ==========
    x_train : pandas DataFrame
        DataFrame containing the papers we aim to classify, with a
        column for the tokens to use.
    y_train : pandas DataFrame
        DataFrame containing the labels for each paper. Each column
        represents one review group with binary labels.
    """
    if self.tfidf:
        # check if vectorizer has been created before, if so load from file
        if check_persisted(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_X', self.load_fresh):
            vec = load(f"{self.env['store_misc']}/tfidf",
                       f'{self.vectorizer_hash}_vec')
            X = load(f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X')
            X = hstack(
                [csr_matrix(x_train.drop(self.tokens_col, axis=1)), X])
        else:
            # get the tokenized papers
            tokenized_papers = list(x_train[self.tokens_col])
            vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                  max_features=self.max_vocab_f,
                                  strip_accents='unicode')

            # generate term document matrix (model inputs)
            X = vec.fit_transform(tokenized_papers)
            save(vec, f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_vec', persist=True)
            save(X, f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_X', persist=True)
            X = hstack(
                [csr_matrix(x_train.drop(self.tokens_col, axis=1)), X])
        self.vectorizer = vec
    else:
        X = x_train

    # discard fold ID column from labels
    review_groups = [col for col in y_train.columns if not col == 'k']

    for review_group in tqdm(review_groups, desc='Train Review Groups'):
        # pull label column
        labels = y_train[review_group]

        # create data structure for XGBoost
        data_dmatrix = xgb.DMatrix(data=X, label=labels)

        # create parameters for XGBoost
        params = {
            'objective': self.objective,
            'learning_rate': self.learning_rate,
            'max_depth': self.max_depth,
            'subsample': self.subsample,
            'colsample_bytree': self.colsample_bytree,
            'n_estimators': self.n_estimators,
            'gamma': self.gamma,
            'alpha': self.l1,
            'lambda': self.l2
        }

        # train XGBoost model
        self.models[review_group] = xgb.train(params, data_dmatrix)
def save_and_exit(gb):
    persist.save(gb, gb.file_name())
    persist.log_config_warnings(gb)
    menus.m_main.close()
def train(self, x_train, y_train):
    """
    Trains one elastic net logistic classifier per review group. Saves
    the trained classifiers within self.models.

    Parameters
    ==========
    x_train : pandas DataFrame
        DataFrame containing the papers we aim to classify, with at
        least the column:
        average_embeddings - average word embeddings for concatenated
        title and abstract
    y_train : pandas DataFrame
        DataFrame containing the labels for each paper. Each column
        represents one review group with binary labels.
    """
    # check if data has been created before, if so load from file
    if check_persisted(f"{self.env['store_misc']}/pca",
                       f'{self.fold_hash}_X', self.load_fresh):
        pca = load(f"{self.env['store_misc']}/pca", f'{self.fold_hash}_pca')
        X = load(f"{self.env['store_misc']}/pca", f'{self.fold_hash}_X')
    else:
        # convert input to format for classifier
        list_of_embeddings = list(x_train[self.embeddings_col])
        x_train = np.array(
            [[float(i) for i in embedding.strip('[]').split()]
             for embedding in list_of_embeddings])

        # reduce dimensionality of embeddings through PCA
        pca = PCA(n_components=self.n_components, random_state=self.seed)
        X = pca.fit_transform(x_train)

        save(pca, f"{self.env['store_misc']}/pca", f'{self.fold_hash}_pca',
             persist=True)
        save(X, f"{self.env['store_misc']}/pca", f'{self.fold_hash}_X',
             persist=True)

    self.pca = pca

    # discard fold ID column from labels
    review_groups = [col for col in y_train.columns if not col == 'k']

    for review_group in tqdm(review_groups, desc='Train Review Groups'):
        # pull label column
        labels = y_train[review_group]

        # logistic classifier with elastic net regularisation
        classifier = SGDClassifier(loss="log",
                                   alpha=self.alpha,
                                   l1_ratio=self.l1_ratio,
                                   penalty="elasticnet").fit(X, labels)

        # save the model in dictionary of models
        self.models[review_group] = classifier
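# The else-branch above assumes embeddings were stored as strings such as
# '[0.1 0.2 0.3]' and parses them back into a float matrix before PCA.
# A quick self-contained check of just that parsing step, with toy values:
import numpy as np

list_of_embeddings = ['[0.1 0.2 0.3]', '[0.4 0.5 0.6]']
x = np.array([[float(i) for i in embedding.strip('[]').split()]
              for embedding in list_of_embeddings])
print(x.shape)   # (2, 3)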
def train(self, x_train, y_train):
    """
    Trains one LightGBM classifier per review group. Saves the trained
    classifiers within self.models.

    Parameters
    ==========
    x_train : pandas DataFrame
        DataFrame containing the papers we aim to classify, with a
        column for the tokens to use.
    y_train : pandas DataFrame
        DataFrame containing the labels for each paper. Each column
        represents one review group with binary labels.
    """
    if self.tfidf:
        # check if vectorizer has been created before, if so load from file
        if check_persisted(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_X', self.load_fresh):
            vec = load(f"{self.env['store_misc']}/tfidf",
                       f'{self.vectorizer_hash}_vec')
            X = load(f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X')
        else:
            # get the tokenized papers
            tokenized_papers = list(x_train[self.tokens_col])
            vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                  max_features=self.max_vocab_f,
                                  strip_accents='unicode')

            # generate term document matrix (model inputs)
            X = vec.fit_transform(tokenized_papers)
            save(vec, f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_vec', persist=True)
            save(X, f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_X', persist=True)
        self.vectorizer = vec
    else:
        X = x_train

    # discard fold ID column from labels
    review_groups = [col for col in y_train.columns if not col == 'k']

    for review_group in tqdm(review_groups, desc='Train Review Groups'):
        # pull label column
        labels = y_train[review_group]

        # create data structure for LightGBM
        data_dmatrix = lgb.Dataset(data=X, label=labels)

        # create parameters for LightGBM
        params = {
            'task': self.task,
            'application': self.application,
            'num_iterations': self.num_iterations,
            'num_leaves': self.num_leaves,
            'device': self.device,
            'min_data_in_leaf': self.min_data_in_leaf,
            'feature_fraction': self.feature_fraction,
            'bagging_fraction': self.bagging_fraction,
            'min_gain_to_split': self.min_gain_to_split,
            'num_threads': self.num_threads,
            'max_depth': self.max_depth,
            'verbosity': -1
        }

        # train LightGBM model
        self.models[review_group] = lgb.train(params, data_dmatrix)
def train(self, x_train, y_train):
    """
    Trains one elastic net logistic classifier per review group. Saves
    the trained classifiers within self.models.

    Parameters
    ==========
    x_train : pandas DataFrame
        DataFrame containing the papers we aim to classify, with as
        columns (at least):
        tokens_baseline - previously tokenized title-abstract
    y_train : pandas DataFrame
        DataFrame containing the labels for each paper. Each column
        represents one review group with binary labels.
    """
    # check if data has been created before, if so load from file
    if check_persisted(f"{self.env['store_misc']}/tfidf",
                       f'{self.vectorizer_hash}_X', self.load_fresh):
        X = load(f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_X')

        # check for vectorizers
        if self.tokens_col is not None:
            vec = load(f"{self.env['store_misc']}/tfidf",
                       f'{self.vectorizer_hash}_vec')
            self.vectorizer = vec
        if self.tokens_col2 is not None:
            vec2 = load(f"{self.env['store_misc']}/tfidf",
                        f'{self.vectorizer_hash}_vec2')
            self.vectorizer2 = vec2
    else:
        if self.tokens_col is not None:
            # get the tokenized papers
            tokenized_papers = list(x_train[self.tokens_col])
            vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                  max_features=self.max_vocab_f,
                                  strip_accents='unicode',
                                  token_pattern=self.token_pattern,
                                  min_df=self.min_df)

            # generate term document matrix (model inputs)
            X = vec.fit_transform(tokenized_papers)
            save(vec, f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_vec', persist=True)
            self.vectorizer = vec

        if self.tokens_col2 is not None:
            tokenized_papers2 = x_train[self.tokens_col2].apply(
                lambda x: np.str_(x))
            vec2 = TfidfVectorizer(ngram_range=self.ngram_range,
                                   max_features=self.max_vocab_f2,
                                   strip_accents='unicode',
                                   token_pattern=self.token_pattern,
                                   min_df=self.min_df,
                                   decode_error='ignore')
            X2 = vec2.fit_transform(tokenized_papers2)
            try:
                X = hstack([X, X2])
            except NameError:
                # no first tf-idf block was built
                X = X2
            save(vec2, f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_vec2', persist=True)
            self.vectorizer2 = vec2

        if self.citations_cols is not None:
            X3 = csr_matrix(x_train[self.citations_cols].values)
            try:
                X = hstack([X, X3])
            except NameError:
                # no tf-idf blocks were built
                X = X3

        save(X, f"{self.env['store_misc']}/tfidf",
             f'{self.vectorizer_hash}_X', persist=True)

    # discard fold ID column from labels
    review_groups = [col for col in y_train.columns if not col == 'k']

    for review_group in tqdm(review_groups, desc='Train Review Groups'):
        # pull label column
        labels = y_train[review_group]

        # logistic classifier with elastic net regularisation
        classifier = SGDClassifier(loss="log",
                                   alpha=self.alpha,
                                   l1_ratio=self.l1_ratio,
                                   penalty="elasticnet").fit(X, labels)

        # save the model in dictionary of models
        self.models[review_group] = classifier
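# The snippet above assembles up to three feature blocks (two tf-idf
# matrices plus dense citation features) column-wise with scipy's sparse
# hstack. A self-contained toy version of that pattern; the documents and
# citation values here are made up for illustration:
import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer

titles = ['aspirin trial', 'statin cohort']
keywords = ['cardiology aspirin', 'cardiology statins']
citations = np.array([[3, 1], [7, 0]])   # hypothetical citation features

X1 = TfidfVectorizer().fit_transform(titles)
X2 = TfidfVectorizer().fit_transform(keywords)
X3 = csr_matrix(citations)

X = hstack([X1, X2, X3])   # one wide sparse design matrix
print(X.shape)             # (2, n_title_terms + n_keyword_terms + 2)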
def perform_model_selection(evaluate_best_models=True):
    """
    Run model selection pipeline.
    """
    # load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # load production config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # load results table from database
    results_df = pull_results(ignition_ids=[
        '1', '2', '4', '5', '6', '10', '15', '16', '17', '18', '19', '20',
        '21'
    ])

    # get a dataframe of best algorithm x hyperparameters for each
    # review group x recall
    best_df = get_best_algorithm_hyperparameter_onestep(results_df=results_df)

    # get dictionary of algorithms and hyperparameters for each review
    # group based on recall in production config file
    best_models = choose_models_with_recall(
        models_df=best_df,
        group_min_recalls=prod_config['review_groups_recall'])

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    # train best models for each review group
    train_best_models_mp(X_train,
                         y_train,
                         best_models=best_models,
                         prod_config=prod_config,
                         local_paths=local_paths,
                         cores=3)

    if evaluate_best_models:
        # test best models for each review group
        scored_papers_test = score_papers(
            X_test,
            prod_config,
            models_path=local_paths['store_production_models'])

        save(object=scored_papers_test,
             location=local_paths['store_scored_papers'],
             filename='scored_papers_citations')

        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_precision=0.95, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):
            rg = review_group.lower()

            # get thresholds
            thresholds = [upper_thresholds[rg], lower_thresholds[rg]]
            recall_at_threshold = [
                recall_at_thresholds[rg]['upper'],
                recall_at_thresholds[rg]['lower']
            ]
            workload_reduction = [keep[rg], consider[rg], discard[rg]]

            # evaluate scores
            for recall in tqdm(prod_config['recalls'], desc='Evaluations'):

                # calculate precisions
                precisions = evaluate_precision_at_k_recall(
                    class_true=y_test, class_prob=y_pred_test, k=recall)

                # store results in database
                production_results_to_db(
                    table_name=prod_config['results_table_name'],
                    unique_id=f"{rg}_{recall}",
                    review_group=rg,
                    algorithm=best_models[review_group]['algorithm'],
                    hyperparameters=best_models[review_group]
                    ['hyperparameters'],
                    recall=recall,
                    precision=precisions[rg],
                    thresholds=thresholds,
                    recall_at_threshold=recall_at_threshold,
                    workload_reduction=workload_reduction,
                    connection=connection)

    connection.close()
    print("Model selection pipeline complete.")
def train(self, x_train, y_train):
    """
    Trains one random forest classifier per review group. Saves the
    trained classifiers within self.models.

    Parameters
    ==========
    x_train : pandas DataFrame
        DataFrame containing the papers we aim to classify, with as
        columns (at least):
        tokens_baseline - previously tokenized title-abstract
    y_train : pandas DataFrame
        DataFrame containing the labels for each paper. Each column
        represents one review group with binary labels.
    """
    # check if vectorizer has been created before, if so load from file
    if check_persisted(f"{self.env['store_misc']}/tfidf",
                       f'{self.vectorizer_hash}_X', self.load_fresh):
        vec = load(f"{self.env['store_misc']}/tfidf",
                   f'{self.vectorizer_hash}_vec')
        X = load(f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_X')
        if self.tokens_col2 is not None:
            vec2 = load(f"{self.env['store_misc']}/tfidf",
                        f'{self.vectorizer_hash}_vec2')
            self.vectorizer2 = vec2
    else:
        # get the tokenized papers
        tokenized_papers = list(x_train[self.tokens_col])
        vec = TfidfVectorizer(ngram_range=self.ngram_range,
                              max_features=self.max_vocab_f,
                              strip_accents='unicode',
                              token_pattern=self.token_pattern,
                              min_df=self.min_df)

        # generate term document matrix (model inputs)
        X = vec.fit_transform(tokenized_papers)

        if self.tokens_col2 is not None:
            tokenized_papers2 = x_train[self.tokens_col2].apply(
                lambda x: np.str_(x))
            vec2 = TfidfVectorizer(ngram_range=self.ngram_range,
                                   max_features=self.max_vocab_f2,
                                   strip_accents='unicode',
                                   token_pattern=self.token_pattern,
                                   min_df=self.min_df,
                                   decode_error='ignore')
            X2 = vec2.fit_transform(tokenized_papers2)
            X = hstack([X, X2])
            save(vec2, f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_vec2', persist=True)
            self.vectorizer2 = vec2

        save(vec, f"{self.env['store_misc']}/tfidf",
             f'{self.vectorizer_hash}_vec', persist=True)
        save(X, f"{self.env['store_misc']}/tfidf",
             f'{self.vectorizer_hash}_X', persist=True)

    self.vectorizer = vec

    # discard fold ID column from labels
    review_groups = [col for col in y_train.columns if not col == 'k']

    for review_group in tqdm(review_groups, desc='Train Review Groups'):
        # pull label column
        labels = y_train[review_group]

        # fit random forest model
        classifier = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            max_features=self.max_features,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.seed,
            n_jobs=self.n_jobs).fit(X, labels)

        # save the model in dictionary of models
        self.models[review_group] = classifier