def new_aggregate_flow(data, channel):
    """Store ``data`` under ``channel`` in the persisted state of its project.

    The state saved for ``data['project_id']`` is loaded, the entry for
    ``channel`` is overwritten with ``data``, and the result is written back
    and echoed to stdout.

    Returns ``False`` while the state holds fewer than two keys, otherwise
    the updated state object.
    """
    project_id = data['project_id']

    state = persist.load(project_id)
    state[channel] = data
    persist.write(project_id, state)
    print("PERSIST: ", state)

    # the aggregate is only handed back once at least two channels reported
    return state if len(state.keys()) >= 2 else False
def predict(self, papers):
    """
    Scores papers with every classifier stored in self.models.

    Parameters
    ==========
    papers : pd.DataFrame
        Papers to classify. Must contain the column named by
        self.tokens_col.

    Returns
    =======
    scores : pd.DataFrame
        One column per key of self.models; each column holds the second
        column of that classifier's predict_proba output.
    """
    tokens = list(papers[self.tokens_col])

    # embeddings are persisted under the vectorizer hash; rebuild only if
    # they are not available
    store = f"{self.env['store_misc']}/embeddings"
    cache_name = f'{self.vectorizer_hash}_y'
    if check_persisted(store, cache_name, self.load_fresh):
        embeddings = load(store, cache_name)
    else:
        self.embeddings_model = load_word2vec(self.env['word2vec_model'])
        vectorizer = self.vectorizer
        term_matrix = vectorizer.transform(tokens)
        embeddings = np.array(self.create_embeddings(term_matrix, vectorizer))
        save(embeddings, store, cache_name, persist=True)

    # apply each binary classifier once and keep column 1 of its output
    per_group = {}
    for group in tqdm(self.models, desc='Test Review Groups'):
        model = self.models[group]
        per_group[group] = model.predict_proba(embeddings)[:, 1]

    return pd.DataFrame.from_dict(per_group)
def __call__(self, *args, **kwds):
    """
    Call the wrapped function through an on-disk cache.

    The cache file name is built from the cache directory, the wrapped
    function's name and the hash of the call arguments. When a file with
    that name exists and the key stored in it equals the current key, the
    stored value is returned; otherwise the function is called and the
    (key, value) pair is saved.
    """
    key = (tuple(args), tuple(kwds.items()))
    h = hash(key)
    # ``__name__`` exists on function objects in both Python 2 and 3,
    # whereas ``func_name`` is Python 2 only.
    name = '%s/%s_%s.sobj' % (self.__dir, self.__func.__name__, h)
    if os.path.exists(name):
        key2, val = persist.load(name)
        if key == key2:
            # We save and test equality of keys to avoid the (extremely
            # remote) possibility of a hash collision.
            return val
    val = self.__func(*args, **kwds)
    persist.save((key, val), name)
    return val
def __call__(self, *args, **kwds):
    """
    Invoke the wrapped function, reusing a value saved on disk if present.

    A cache path is derived from the cache directory, the function name and
    the hash of the arguments. A saved entry is only trusted when its stored
    key equals the current one; otherwise the function is evaluated and the
    (key, value) pair is saved under that path.
    """
    cache_key = (tuple(args), tuple(kwds.items()))
    path = '%s/%s_%s.sobj' % (
        self.__dir, self.__func.__name__, hash(cache_key))

    if os.path.exists(path):
        stored_key, stored_val = persist.load(path)
        # comparing the full keys protects against two different argument
        # sets hashing to the same file name
        if cache_key == stored_key:
            return stored_val

    result = self.__func(*args, **kwds)
    persist.save((cache_key, result), path)
    return result
def train(self, x_train, y_train):
    """
    Fits one SGD classifier (log loss, elastic-net penalty) per review
    group on SVD-reduced TF-IDF features and stores the fitted models in
    self.models.

    Parameters
    ==========
    x_train : pandas DataFrame
        Papers to classify; must contain the column named by
        self.tokens_col.

    y_train : pandas DataFrame
        Labels, one column per review group. A column named 'k' is treated
        as the fold ID and skipped.
    """
    store = f"{self.env['store_misc']}/pca"
    prefix = self.vectorizer_hash

    if check_persisted(store, f'{prefix}_X', self.load_fresh):
        # everything was built on an earlier run: reuse it
        reducer = load(store, f'{prefix}_pca')
        vectorizer = load(store, f'{prefix}_vec')
        features = load(store, f'{prefix}_X')
    else:
        documents = list(x_train[self.tokens_col])

        # term-document matrix
        vectorizer = TfidfVectorizer(ngram_range=self.ngram_range,
                                     max_features=self.max_vocab_f,
                                     strip_accents='unicode')
        tfidf_matrix = vectorizer.fit_transform(documents)

        # dimensionality reduction of the TF-IDF features
        reducer = TruncatedSVD(n_components=self.n_components,
                               random_state=self.seed)
        features = reducer.fit_transform(tfidf_matrix)

        save(reducer, store, f'{prefix}_pca', persist=True)
        save(vectorizer, store, f'{prefix}_vec', persist=True)
        save(features, store, f'{prefix}_X', persist=True)

    self.pca = reducer
    self.vectorizer = vectorizer

    # every label column except the fold ID gets its own model
    group_names = [name for name in y_train.columns if name != 'k']
    for group in tqdm(group_names, desc='Train Review Groups'):
        model = SGDClassifier(loss="log",
                              alpha=self.alpha,
                              l1_ratio=self.l1_ratio,
                              penalty="elasticnet")
        self.models[group] = model.fit(features, y_train[group])
def train(self, x_train, y_train):
    """
    Fits one random forest classifier per review group on embeddings
    produced by self.create_embeddings from the TF-IDF matrix, and stores
    the fitted models in self.models.

    Parameters
    ==========
    x_train : pandas DataFrame
        Papers to classify; must contain the column named by
        self.tokens_col.

    y_train : pandas DataFrame
        Labels, one column per review group. A column named 'k' is treated
        as the fold ID and skipped.
    """
    prefix = self.vectorizer_hash

    # --- TF-IDF step: reuse persisted vectorizer/matrix when available ---
    tfidf_store = f"{self.env['store_misc']}/tfidf"
    if check_persisted(tfidf_store, f'{prefix}_X', self.load_fresh):
        vectorizer = load(tfidf_store, f'{prefix}_vec')
        term_matrix = load(tfidf_store, f'{prefix}_X')
    else:
        documents = list(x_train[self.tokens_col])
        vectorizer = TfidfVectorizer(max_features=self.max_vocab_f,
                                     strip_accents='unicode')
        term_matrix = vectorizer.fit_transform(documents)
        save(vectorizer, tfidf_store, f'{prefix}_vec', persist=True)
        save(term_matrix, tfidf_store, f'{prefix}_X', persist=True)
    self.vectorizer = vectorizer

    # --- embedding step: reuse persisted embeddings when available ---
    emb_store = f"{self.env['store_misc']}/embeddings"
    if check_persisted(emb_store, f'{prefix}_X', self.load_fresh):
        embeddings = load(emb_store, f'{prefix}_X')
    else:
        self.embeddings_model = load_word2vec(self.env['word2vec_model'])
        embeddings = np.array(
            self.create_embeddings(term_matrix, vectorizer))
        save(embeddings, emb_store, f'{prefix}_X', persist=True)

    # every label column except the fold ID gets its own model
    group_names = [name for name in y_train.columns if name != 'k']
    for group in tqdm(group_names, desc='Train Review Groups'):
        forest = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            max_features=self.max_features,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.seed,
            n_jobs=self.n_jobs)
        self.models[group] = forest.fit(embeddings, y_train[group])
def run_pipeline(ignition_file, persist_all, load_all_fresh):
    """
    An adhoc pipeline created to mirror the standard ML pipeline and work
    with citations data.

    For every fold and every hyperparameter combination a classifier is
    trained on the remaining folds, used to predict the held-out fold, and
    its metrics at each configured recall are written to the results table.

    Parameters:
    ===========
    ignition_file: string
        name of the yaml file for which you want to run an experiment

    persist_all: boolean
        T if you want to persist all data for future use

    load_all_fresh: boolean
        T if you want to avoid any persisted data and load new data from
        scratch

    Returns:
    ========
    None
    """
    ##### 1. LOAD ENVIRONMENT DATA #####

    # load local paths
    local_paths_env = load_local_paths('local_paths.yaml')
    print('Local paths loaded.')

    # load ignition file
    ignition = load_config(local_paths_env['ignition_path'] + ignition_file)
    print('Ignition loaded.')

    # id used for persisting
    hash_id = create_hash_id(str(ignition['id']))
    print('Hash id created.')

    # create hyperparameter combinations (for k-folding)
    hyperparameters = expand_grid(ignition['hyperparameters'])

    # load environment file
    psql_env = load_psql_env(pgpass_path=local_paths_env['pgpass_path'])
    print('PSQL environment file loaded.')

    # Initiate PSQL Connection
    connection = SQLConn(psql_env)
    connection.open()

    # everything below uses the open connection; the finally clause makes
    # sure it is closed even when a step raises
    try:
        ##### 2. LOAD TRAIN AND TEST DATA #####
        if check_persisted(local_paths_env['store_train_data'],
                           f'{hash_id}_x', load_all_fresh):
            print("Found data")

            # data loaded before: load from file
            X_train = load(local_paths_env['store_train_data'],
                           f'{hash_id}_x')
            X_test = load(local_paths_env['store_test_data'],
                          f'{hash_id}_x')
            y_train = load(local_paths_env['store_train_data'],
                           f'{hash_id}_y')
            y_test = load(local_paths_env['store_test_data'],
                          f'{hash_id}_y')
            print('Loaded data from file.')
        else:
            print("Data not found in storage - load from database")

            # data not loaded: pull from database and create features
            X_train, X_test, y_train, y_test = sample(
                ignition, connection, local_paths_env['store_features'])
            print(f"X_train shape: {X_train.shape}")
            print(f"X_test shape: {X_test.shape}")
            print(f"y_train shape: {y_train.shape}")
            print(f"y_test shape: {y_test.shape}")

            # add fold index column to data
            X_train, y_train = k_fold(X_train, y_train,
                                      ignition['k_folds'],
                                      ignition['k_folds_seed'])

            # save data to file for future use
            save(X_train, local_paths_env['store_train_data'],
                 f'{hash_id}_x', persist_all)
            save(X_test, local_paths_env['store_test_data'],
                 f'{hash_id}_x', persist_all)
            save(y_train, local_paths_env['store_train_data'],
                 f'{hash_id}_y', persist_all)
            save(y_test, local_paths_env['store_test_data'],
                 f'{hash_id}_y', persist_all)

        print('Data loading completed.')

        ##### 3. K-FOLDING #####
        for fold in tqdm(range(ignition['k_folds']), desc='Folds'):
            # get fold id hash (for persisting)
            fold_id = create_hash_id(str(ignition['id']) + str(fold))

            # rows whose 'k' equals the fold are held out for testing
            fold_X_train = X_train[X_train['k'] != fold]
            fold_X_test = X_train[X_train['k'] == fold]
            fold_y_train = y_train[y_train['k'] != fold]
            fold_y_test = y_train[y_train['k'] == fold]

            # store fold features, if any
            fold_features = {}

            ##### 4. LOOP OVER HYPERPARAMETERS: TRAIN CLASSIFIER #####
            for hyperparam in tqdm(hyperparameters, desc='Hyperparameters'):
                # hyperparam-fold unique id
                hyperparam_fold_id = create_hash_id(
                    str(ignition['id']) + str(hyperparam) + str(fold))

                # NOTE: skipping combinations already present in the
                # results table (check_val_in_db) is currently disabled.

                # create classifier of specified type and target
                classifier = select_classifier(ignition["model_type"],
                                               fold_id,
                                               ignition["target"],
                                               ignition["classes"],
                                               fold_features,
                                               hyperparameters=hyperparam,
                                               seed=ignition['seed'],
                                               env=local_paths_env,
                                               load_fresh=load_all_fresh)

                # train classifier
                classifier.train(fold_X_train, fold_y_train)

                ##### 5. TEST CLASSIFIER #####
                y_probs = classifier.predict(fold_X_test)

                ##### 6. EVALUATION #####
                for recall in tqdm(ignition['recalls'], desc='Evaluations'):
                    # compute evaluation metrics
                    all_metrics = compute_metrics(
                        metric_names=ignition['metrics'],
                        y_true=fold_y_test.drop(columns=['k']),
                        y_pred=y_probs,
                        k=recall)

                    # store results in database
                    unique_id = create_hash_id(
                        str(ignition['id']) + str(hyperparam) + str(fold) +
                        str(recall))
                    results_to_db(metrics=all_metrics,
                                  table_name=ignition['results_table_name'],
                                  ignition_id=ignition['id'],
                                  hash_id=hyperparam_fold_id,
                                  algorithm=ignition['model_type'],
                                  hyperparameters=hyperparam,
                                  fold=str(fold),
                                  recall=recall,
                                  unique_id=unique_id,
                                  connection=connection)
    finally:
        connection.close()

    print(f"Done running pipeline for ignition id: {ignition['id']}!")
def train(self, x_train, y_train):
    """
    Trains one XGBoost booster per review group and stores the trained
    boosters within self.models.

    When self.tfidf is set, the features are the non-token columns of
    x_train stacked with the TF-IDF matrix of the token column; otherwise
    x_train is used as is.

    Parameters
    ==========
    x_train : pandas DataFrame
        DataFrame containing the papers we aim to classify, with a column
        for the token to use.

    y_train : pandas DataFrame
        DataFrame containing the labels for each paper. Each column
        represents one review group with binary labels. A column named 'k'
        is treated as the fold ID and skipped.
    """
    if self.tfidf:
        store = f"{self.env['store_misc']}/tfidf"

        # check if vectorizer has been created before, if so load from file
        if check_persisted(store, f'{self.vectorizer_hash}_X',
                           self.load_fresh):
            vec = load(store, f'{self.vectorizer_hash}_vec')
            X = load(store, f'{self.vectorizer_hash}_X')
        else:
            # get the tokenized papers
            tokenized_papers = list(x_train[self.tokens_col])
            vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                  max_features=self.max_vocab_f,
                                  strip_accents='unicode')

            # generate term document matrix (model inputs)
            X = vec.fit_transform(tokenized_papers)
            save(vec, store, f'{self.vectorizer_hash}_vec', persist=True)
            save(X, store, f'{self.vectorizer_hash}_X', persist=True)

        # in both cases the persisted object is the bare TF-IDF matrix; the
        # remaining columns of x_train are put in front of it here
        X = hstack([csr_matrix(x_train.drop(self.tokens_col, axis=1)), X])
        self.vectorizer = vec
    else:
        X = x_train

    # booster parameters are identical for every review group
    # NOTE(review): the native xgb.train API takes the number of boosting
    # rounds via num_boost_round; 'n_estimators' inside params is
    # presumably not honoured here - confirm against the xgboost docs.
    params = {
        'objective': self.objective,
        'learning_rate': self.learning_rate,
        'max_depth': self.max_depth,
        'subsample': self.subsample,
        'colsample_bytree': self.colsample_bytree,
        'n_estimators': self.n_estimators,
        'gamma': self.gamma,
        'alpha': self.l1,
        'lambda': self.l2,
    }

    # discard fold ID column from labels
    review_groups = [col for col in y_train.columns if not col == 'k']

    for review_group in tqdm(review_groups, desc='Train Review Groups'):
        # pull label column
        labels = y_train[review_group]

        # Create data structure for XGBoost
        data_dmatrix = xgb.DMatrix(data=X, label=labels)

        self.models[review_group] = xgb.train(params, data_dmatrix)
def load(self):
    """Fill ``UserData.data`` from the persisted store named by ``self.persist``.

    Loading is best-effort: if ``persist.load`` raises, the class-level data
    is reset to an empty dict instead of propagating the error.
    """
    try:
        UserData.data = persist.load(self.persist)
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed
        UserData.data = {}
def train(self, x_train, y_train):
    """
    Fits one LightGBM model per review group and stores the trained models
    in self.models.

    When self.tfidf is set, the features are the TF-IDF matrix of the token
    column (loaded from the persisted store when available); otherwise
    x_train is used as is.

    Parameters
    ==========
    x_train : pandas DataFrame
        Papers to classify, with a column for the token to use.

    y_train : pandas DataFrame
        Labels, one column per review group. A column named 'k' is treated
        as the fold ID and skipped.
    """
    if not self.tfidf:
        features = x_train
    else:
        store = f"{self.env['store_misc']}/tfidf"
        prefix = self.vectorizer_hash

        if check_persisted(store, f'{prefix}_X', self.load_fresh):
            # vectorizer and matrix were built on an earlier run
            vectorizer = load(store, f'{prefix}_vec')
            features = load(store, f'{prefix}_X')
        else:
            documents = list(x_train[self.tokens_col])
            vectorizer = TfidfVectorizer(ngram_range=self.ngram_range,
                                         max_features=self.max_vocab_f,
                                         strip_accents='unicode')
            features = vectorizer.fit_transform(documents)
            save(vectorizer, store, f'{prefix}_vec', persist=True)
            save(features, store, f'{prefix}_X', persist=True)

        self.vectorizer = vectorizer

    # every label column except the fold ID gets its own model
    group_names = [name for name in y_train.columns if name != 'k']

    for group in tqdm(group_names, desc='Train Review Groups'):
        train_set = lgb.Dataset(data=features, label=y_train[group])

        # parameters handed to lgb.train
        lgb_params = {
            'task': self.task,
            'application': self.application,
            'num_iterations': self.num_iterations,
            'num_leaves': self.num_leaves,
            'device': self.device,
            'min_data_in_leaf': self.min_data_in_leaf,
            'feature_fraction': self.feature_fraction,
            'bagging_fraction': self.bagging_fraction,
            'min_gain_to_split': self.min_gain_to_split,
            'num_threads': self.num_threads,
            'max_depth': self.max_depth,
            'verbosity': -1
        }

        self.models[group] = lgb.train(lgb_params, train_set)
def train(self, x_train, y_train):
    """
    Trains one elastic logistic classifier per review group.
    Saves the trained classifiers within self.models.

    The feature matrix is assembled from up to three optional parts, in
    this order: TF-IDF of self.tokens_col, TF-IDF of self.tokens_col2, and
    the raw values of self.citations_cols. A part is skipped when its
    attribute is None.

    Parameters
    ==========
    x_train : pandas DataFrame
        DataFrame containing the papers we aim to classify, holding the
        columns named by the configured attributes above.

    y_train : pandas DataFrame
        DataFrame containing the labels for each paper. Each column
        represents one review group with binary labels. A column named 'k'
        is treated as the fold ID and skipped.

    Raises
    ======
    ValueError
        If no feature source is configured, so no feature matrix can be
        built.
    """
    store = f"{self.env['store_misc']}/tfidf"

    # check if data has been created before, if so load from file
    if check_persisted(store, f'{self.vectorizer_hash}_X', self.load_fresh):
        X = load(store, f'{self.vectorizer_hash}_X')

        # check for vectorizers
        if self.tokens_col is not None:
            vec = load(store, f'{self.vectorizer_hash}_vec')
            self.vectorizer = vec
        if self.tokens_col2 is not None:
            vec2 = load(store, f'{self.vectorizer_hash}_vec2')
            self.vectorizer2 = vec2
    else:
        # Collect the available feature blocks explicitly instead of relying
        # on a bare ``except`` around hstack to detect a missing first
        # block; that also hid genuine stacking errors and silently dropped
        # features.
        feature_blocks = []

        if self.tokens_col is not None:
            # get the tokenized papers
            tokenized_papers = list(x_train[self.tokens_col])
            vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                  max_features=self.max_vocab_f,
                                  strip_accents='unicode',
                                  token_pattern=self.token_pattern,
                                  min_df=self.min_df)

            # generate term document matrix (model inputs)
            feature_blocks.append(vec.fit_transform(tokenized_papers))
            save(vec, store, f'{self.vectorizer_hash}_vec', persist=True)
            self.vectorizer = vec

        if self.tokens_col2 is not None:
            tokenized_papers2 = x_train[self.tokens_col2].apply(
                lambda x: np.str_(x))
            vec2 = TfidfVectorizer(ngram_range=self.ngram_range,
                                   max_features=self.max_vocab_f2,
                                   strip_accents='unicode',
                                   token_pattern=self.token_pattern,
                                   min_df=self.min_df,
                                   decode_error='ignore')
            feature_blocks.append(vec2.fit_transform(tokenized_papers2))
            save(vec2, store, f'{self.vectorizer_hash}_vec2', persist=True)
            self.vectorizer2 = vec2

        if self.citations_cols is not None:
            feature_blocks.append(
                csr_matrix(x_train[self.citations_cols].values))

        if not feature_blocks:
            raise ValueError(
                "No feature source configured: tokens_col, tokens_col2 and "
                "citations_cols are all None.")

        # stack pairwise, left to right, exactly as the blocks were added
        X = feature_blocks[0]
        for block in feature_blocks[1:]:
            X = hstack([X, block])

        save(X, store, f'{self.vectorizer_hash}_X', persist=True)

    # discard fold ID column from labels
    review_groups = [col for col in y_train.columns if not col == 'k']

    for review_group in tqdm(review_groups, desc='Train Review Groups'):
        # pull label column
        labels = y_train[review_group]

        # logistic classifier
        classifier = SGDClassifier(loss="log",
                                   alpha=self.alpha,
                                   l1_ratio=self.l1_ratio,
                                   penalty="elasticnet").fit(X, labels)

        # save the model in dictionary of models
        self.models[review_group] = classifier
def train(self, x_train, y_train):
    """
    Fits one random forest classifier per review group on TF-IDF features
    and stores the fitted models in self.models.

    The features are the TF-IDF matrix of self.tokens_col, with the TF-IDF
    matrix of self.tokens_col2 stacked to its right when that attribute is
    not None.

    Parameters
    ==========
    x_train : pandas DataFrame
        Papers to classify; must contain the configured token column(s).

    y_train : pandas DataFrame
        Labels, one column per review group. A column named 'k' is treated
        as the fold ID and skipped.
    """
    store = f"{self.env['store_misc']}/tfidf"
    prefix = self.vectorizer_hash
    has_second_column = self.tokens_col2 is not None

    if check_persisted(store, f'{prefix}_X', self.load_fresh):
        # reuse what an earlier run persisted
        primary_vec = load(store, f'{prefix}_vec')
        features = load(store, f'{prefix}_X')
        if has_second_column:
            self.vectorizer2 = load(store, f'{prefix}_vec2')
    else:
        documents = list(x_train[self.tokens_col])
        primary_vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                      max_features=self.max_vocab_f,
                                      strip_accents='unicode',
                                      token_pattern=self.token_pattern,
                                      min_df=self.min_df)
        features = primary_vec.fit_transform(documents)

        if has_second_column:
            # second text column: values are coerced to strings first
            documents2 = x_train[self.tokens_col2].apply(
                lambda value: np.str_(value))
            secondary_vec = TfidfVectorizer(
                ngram_range=self.ngram_range,
                max_features=self.max_vocab_f2,
                strip_accents='unicode',
                token_pattern=self.token_pattern,
                min_df=self.min_df,
                decode_error='ignore')
            features2 = secondary_vec.fit_transform(documents2)
            features = hstack([features, features2])
            save(secondary_vec, store, f'{prefix}_vec2', persist=True)
            self.vectorizer2 = secondary_vec

        save(primary_vec, store, f'{prefix}_vec', persist=True)
        save(features, store, f'{prefix}_X', persist=True)

    self.vectorizer = primary_vec

    # every label column except the fold ID gets its own model
    group_names = [name for name in y_train.columns if name != 'k']
    for group in tqdm(group_names, desc='Train Review Groups'):
        forest = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            max_features=self.max_features,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.seed,
            n_jobs=self.n_jobs)
        self.models[group] = forest.fit(features, y_train[group])
def train(self, x_train, y_train):
    """
    Fits one SGD classifier (log loss, elastic-net penalty) per review
    group on PCA-reduced embeddings and stores the fitted models in
    self.models.

    Parameters
    ==========
    x_train : pandas DataFrame
        Papers to classify; must contain the column named by
        self.embeddings_col. Each value is parsed as a string of
        whitespace-separated floats, optionally wrapped in square brackets.

    y_train : pandas DataFrame
        Labels, one column per review group. A column named 'k' is treated
        as the fold ID and skipped.
    """
    store = f"{self.env['store_misc']}/pca"
    prefix = self.fold_hash

    if check_persisted(store, f'{prefix}_X', self.load_fresh):
        # reducer and reduced matrix were built on an earlier run
        reducer = load(store, f'{prefix}_pca')
        features = load(store, f'{prefix}_X')
    else:
        # turn each embedding string into a row of floats
        rows = []
        for raw in list(x_train[self.embeddings_col]):
            rows.append([float(piece) for piece in raw.strip('[]').split()])
        x_train = np.array(rows)

        # dimensionality reduction of the embeddings
        reducer = PCA(n_components=self.n_components,
                      random_state=self.seed)
        features = reducer.fit_transform(x_train)

        save(reducer, store, f'{prefix}_pca', persist=True)
        save(features, store, f'{prefix}_X', persist=True)

    self.pca = reducer

    # every label column except the fold ID gets its own model
    group_names = [name for name in y_train.columns if name != 'k']
    for group in tqdm(group_names, desc='Train Review Groups'):
        model = SGDClassifier(loss="log",
                              alpha=self.alpha,
                              l1_ratio=self.l1_ratio,
                              penalty="elasticnet")
        self.models[group] = model.fit(features, y_train[group])
from flask import Flask, jsonify
import subprocess

# Module logger: DEBUG level, records formatted by ``appformatter`` and
# emitted through a stream handler.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
appformatter = logging.Formatter(
    '[%(asctime)s] - %(name)s - %(levelname)s - %(message)s')
consolehandler = logging.StreamHandler()
consolehandler.setFormatter(appformatter)
log.addHandler(consolehandler)

# File handed to persist.load below.
SETTINGS_FILE = 'persist.json'
# NOTE(review): unit is not visible here - presumably seconds; confirm.
SENSOR_UPDATE_INTERVAL = 5

# ------------------------------------------------------------------------- SETTINGS
# Runs at import time; the return value is not used here.
persist.load(SETTINGS_FILE)

# ------------------------------------------------------------------------- BREWERY
# One temperature sensor per label.
sensors = [
    TemperatureSensor("red"),
    TemperatureSensor("blue"),
    TemperatureSensor("green")
]
controller = TemperatureController(control_pin=11, time_period=10)

# Module-level state: current mode and the start time of each stage
# (None until set elsewhere).
mode = "IDLE"
prep_start_time = None
boil_start_time = None
mash_start_time = None
ferment_start_time = None
def train(self, x_train, y_train):
    """
    Fits one AdaBoost classifier per review group and keeps the fitted
    models in the self.models dictionary.

    When self.tfidf is set, the features are the TF-IDF matrix of the token
    column (loaded from the persisted store when available); otherwise
    x_train is used as is.

    Parameters
    ==========
    x_train : pd.DataFrame
        Dataframe with columns corresponding to features to include in the
        model.

    y_train : pd.DataFrame
        Labels, one column per review group. A column named 'k' is treated
        as the fold ID and skipped.

    Returns
    =======
    None
    """
    ### preprocess ###
    if not self.tfidf:
        features = x_train
    else:
        store = f"{self.env['store_misc']}/tfidf"
        prefix = self.vectorizer_hash

        if check_persisted(store, f'{prefix}_X', self.load_fresh):
            # vectorizer and matrix were built on an earlier run
            vectorizer = load(store, f'{prefix}_vec')
            features = load(store, f'{prefix}_X')
        else:
            documents = list(x_train[self.tokens_col])
            vectorizer = TfidfVectorizer(ngram_range=self.ngram_range,
                                         max_features=self.max_vocab_f,
                                         strip_accents='unicode',
                                         token_pattern=self.token_pattern,
                                         min_df=self.min_df)
            features = vectorizer.fit_transform(documents)
            save(vectorizer, store, f'{prefix}_vec', persist=True)
            save(features, store, f'{prefix}_X', persist=True)

        self.vectorizer = vectorizer

    # every label column except the fold ID gets its own model
    group_names = [name for name in y_train.columns if name != 'k']

    ### train ###
    for group in tqdm(group_names, desc='Train Review Groups'):
        booster = AdaBoostClassifier(base_estimator=self.base_estimator,
                                     n_estimators=self.n_estimators,
                                     learning_rate=self.learning_rate,
                                     algorithm=self.algorithm,
                                     random_state=self.seed)
        self.models[group] = booster.fit(features, y_train[group])
def perform_model_selection(evaluate_best_models=True):
    """
    Run model selection pipeline.

    Loads the local paths and production config, opens a database
    connection and samples the data. When ``evaluate_best_models`` is true,
    the persisted scored papers are compared with the test labels:
    upper/lower thresholds are computed and saved, workload reductions are
    derived per review group, and their average is plotted.

    Parameters
    ==========
    evaluate_best_models : bool
        Whether to run the threshold / workload-reduction evaluation.

    Returns
    =======
    None
    """
    # Load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # Load product config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # the finally clause closes the connection even when a step raises
    try:
        # Pull data (only the test labels are used below)
        _X_train, _X_test, _y_train, y_test = sample(
            ignition=prod_config,
            connection=connection,
            local_features_path=local_paths['store_features'])

        if evaluate_best_models:
            review_groups = prod_config['review_groups_recall'].keys()

            # Test best models for each review group
            scored_papers_test = load(
                location=local_paths['store_scored_papers'],
                filename='scored_papers')

            # keep only the columns that belong to a configured review
            # group (config keys are matched against upper-cased names)
            y_pred_test = scored_papers_test[[
                col for col in scored_papers_test.columns
                if col.upper() in review_groups
            ]]
            y_test = y_test[[
                col for col in y_test.columns
                if col.upper() in review_groups
            ]]

            # calculate thresholds
            upper_thresholds, lower_thresholds, _recall_at_thresholds = \
                get_thresholds(y_test, y_pred_test, minimum_recall=0.99)

            # persist thresholds for production
            save(upper_thresholds, local_paths['store_production_models'],
                 'upper_thresholds')
            save(lower_thresholds, local_paths['store_production_models'],
                 'lower_thresholds')

            # calculate workload reductions
            keep, consider, discard = get_workload_reduction(
                y_test, y_pred_test, upper_thresholds, lower_thresholds)

            rg_list = []
            wrkld_reductions = []

            # loop over review groups
            for review_group in tqdm(review_groups, desc='Review Group'):
                rg = review_group.lower()
                rg_list.append(rg)
                wrkld_reductions.append(
                    [keep[rg], consider[rg], discard[rg]])

            d = {
                'review_group': rg_list,
                'workload_reduction': wrkld_reductions
            }
            df = pd.DataFrame.from_dict(d)
            plot_average_workload_reduction(df)
    finally:
        connection.close()

    print("Model selection pipeline complete.")
def load(self):
    """Fill ``FaceData.data`` from the persisted store named by ``self.persist``.

    Loading is best-effort: if ``persist.load`` raises, the class-level data
    is reset to an empty dict instead of propagating the error.
    """
    try:
        FaceData.data = persist.load(self.persist)
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed
        FaceData.data = {}