Example #1
def test_save_and_load_big():
    gb = setup_big()
    save(gb, 'test.json')
    gb2 = read('test.json')
    assert len(gb2.students) == 50
    assert len(gb2.gradeables) == 21
    assert len(gb2.scores) == 20 * 50 * 20
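For context, a minimal sketch of the save/read pair these round-trip tests assume. The JSON helpers below are hypothetical; to_dict and Gradebook.from_dict are illustrative names, not the project's actual API:

import json

def save(gb, path):
    # write the gradebook's dict representation to a JSON file
    with open(path, 'w') as f:
        json.dump(gb.to_dict(), f)

def read(path):
    # rebuild a gradebook object from its JSON representation
    with open(path) as f:
        return Gradebook.from_dict(json.load(f))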
Example #2
    def save(self):
        # persist only every third call, to limit disk writes
        if self.save_every <= 1:
            persist.save(UserData.data, self.persist)
            self.save_every = 3
        else:
            self.save_every -= 1
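A hypothetical driver showing the throttling, assuming save_every starts at 3 and user_data is an instance of this class: data only reaches disk on every third call.

for call in range(1, 7):
    user_data.save()   # persists on calls 3 and 6 only, then the counter resets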
Example #3
def test_save_and_load():
    gb = setup()
    save(gb, 'test.json')
    gb2 = read('test.json')
    assert len(gb2.students) == 2
    assert len(gb2.categories) == 2
    assert len(gb2.gradeables) == 3
Example #4
    def predict(self, papers):
        """
        Generates predictions from the trained classifiers. Each binary
        classifier is applied once.

        Parameters
        ==========

        papers : pd.DataFrame
            papers that we want to classify. Required column:
                tokens_baseline - previously tokenized title-abstract

        Returns
        =======
        
        scores : pd.DataFrame
            Dataframe containing the predictions generated by each model.
            Each column corresponds to a review group and the values in
            that column are the probabilities that each paper belongs to
            that review group.
        """

        scores = {}

        tokenized_papers = list(papers[self.tokens_col])

        # get embeddings for papers
        if check_persisted(f"{self.env['store_misc']}/embeddings",
                           f'{self.vectorizer_hash}_y', self.load_fresh):
            weighted_embeddings = load(f"{self.env['store_misc']}/embeddings",
                                       f'{self.vectorizer_hash}_y')

        else:
            self.embeddings_model = load_word2vec(self.env['word2vec_model'])
            vec = self.vectorizer
            X = vec.transform(tokenized_papers)
            weighted_embeddings = np.array(self.create_embeddings(X, vec))
            save(weighted_embeddings,
                 f"{self.env['store_misc']}/embeddings",
                 f'{self.vectorizer_hash}_y',
                 persist=True)

        for model_group in tqdm(self.models, desc='Test Review Groups'):

            # get the classifier
            classifier = self.models[model_group]

            # predictions as probabilities
            y_preds = classifier.predict_proba(weighted_embeddings)

            probabilities = y_preds[:, 1]

            # store scores of model
            scores[model_group] = probabilities

        scores = pd.DataFrame.from_dict(scores)

        return scores
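A hypothetical call, assuming self.tokens_col is 'tokens_baseline' and clf is a trained instance of this class:

import pandas as pd

papers = pd.DataFrame({'tokens_baseline': ['randomised controlled trial of ...',
                                           'retrospective cohort study of ...']})
scores = clf.predict(papers)   # one probability column per review group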
Example #5
File: main.py Project: nagyben/rpi-brew
def update_settings():
    persist.settings['redId'] = sensors[0].sensor_id
    persist.settings['blueId'] = sensors[1].sensor_id
    persist.settings['greenId'] = sensors[2].sensor_id
    persist.settings['setpointC'] = controller.setpoint
    persist.settings['logEnabled'] = logging_enabled
    persist.settings['controlEnabled'] = controller.enabled
    persist.settings['mode'] = mode
    persist.settings['sg'] = specific_gravity
    persist.settings['fermentStart'] = ferment_start_time
    persist.save(SETTINGS_FILE)
Example #6
def train_best_models(keys, X_train, y_train, best_models, prod_config,
                      local_paths):
    """
    Train and store model objects for each review group.
    Parameters
    ==========
    keys : list
        Key values used to subset best_models.
    X_train : DataFrame
        Training features
    y_train : DataFrame
        Training labels
    best_models : dict
        Dictionary where key=review group and value=dictionary with the best
        algorithm and hyperparameters for that group and the specified minimum
        recall value.
    prod_config : dict
        Config file for the production pipeline. Includes recall values for
        each group and features to pull into the training data.
    local_paths : dict
        Local directory paths, used to store models for production.

    Returns
    =======
    None
    """
    # subset best models
    best_models = {k: best_models[k] for k in keys}

    # Loop through review groups and train a model on all data
    for review_group, params in best_models.items():

        print(f'training {review_group}')

        classifier = select_classifier(
            classifier_name=params['algorithm'],
            fold_hash='prod',
            target=None,
            classes=list(prod_config['classes']),
            model_parts={},
            hyperparameters=eval(params['hyperparameters']),
            seed=prod_config['seed'],
            citations_cols=prod_config['citations_cols'],
            env=local_paths,
            load_fresh=False)
        classifier.train(X_train, y_train[[review_group.lower()]])

        # Store models locally
        save(object=classifier,
             location=local_paths['store_production_models'],
             filename=f"prod_models_{review_group}")
        print(f'training {review_group} done')
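A hypothetical invocation, subsetting to two illustrative review group keys (the key names are assumptions):

train_best_models(keys=['hiv_aids', 'stroke'],
                  X_train=X_train, y_train=y_train,
                  best_models=best_models,
                  prod_config=prod_config,
                  local_paths=local_paths)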
Example #7
    def __call__(self, *args, **kwds):
        key = (tuple(args), tuple(kwds.items()))
        h = hash(key)
        name = '%s/%s_%s.sobj' % (self.__dir, self.__func.__name__, h)

        if os.path.exists(name):
            key2, val = persist.load(name)
            if key == key2:
                # We save and test equality of keys to avoid
                # the (extremely remote) possibility of a hash
                # collision.  Correctness is crucial in mathematics.
                return val

        val = self.__func(*args, **kwds)
        persist.save((key, val), name)
        return val
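Used as a decorator, this memoizes results on disk, keyed by a hash of the arguments. A sketch, assuming the class is exposed as func_persist:

import time

@func_persist
def slow_square(n):
    time.sleep(2)          # stand-in for an expensive computation
    return n * n

slow_square(12)   # computed, then written to <dir>/slow_square_<hash>.sobj
slow_square(12)   # loaded from disk; the stored key is compared for equality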
Example #8
File: func_persist.py Project: CETHop/sage
    def __call__(self, *args, **kwds):
        key = (tuple(args), tuple(kwds.items()))
        h = hash(key)
        name = '%s/%s_%s.sobj' % (self.__dir, self.__func.__name__, h)

        if os.path.exists(name):
            key2, val = persist.load(name)
            if key == key2:
                # We save and test equality of keys to avoid
                # the (extremely remote) possibility of a hash
                # collision.  Correctness is crucial in mathematics.
                return val

        val = self.__func(*args, **kwds)
        persist.save((key, val), name)
        return val
Example #9
def perform_model_selection(evaluate_best_models=True):
    """
    Run model selection pipeline.
    """
    # Load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # Load production config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # Pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    if evaluate_best_models:

        # Test best models for each review group
        scored_papers_test = load(location=local_paths['store_scored_papers'],
                                  filename='scored_papers')

        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        rg_list = []
        wrkld_reductions = []

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):

            rg = review_group.lower()

            # get thresholds
            thresholds = [upper_thresholds[rg], lower_thresholds[rg]]
            recall_at_threshold = [
                recall_at_thresholds[rg]['upper'],
                recall_at_thresholds[rg]['lower']
            ]
            workload_reduction = [keep[rg], consider[rg], discard[rg]]

            rg_list.append(rg)
            wrkld_reductions.append(workload_reduction)

        d = {'review_group': rg_list, 'workload_reduction': wrkld_reductions}
        df = pd.DataFrame.from_dict(d)
        plot_average_workload_reduction(df)

    connection.close()

    print("Model selection pipeline complete.")
Example #10
    def train(self, x_train, y_train):
        """
        Trains a classifier for each review group and stores
        it in a dictionary that is a class attribute.

        Parameters
        ==========
        x_train : pd.DataFrame
            Dataframe with columns corresponding to features to include in the model.
        y_train : pd.DataFrame
            DataFrame containing the labels for each paper. Each column represents one
            review group with binary labels.

        Returns
        =======
        None
        """
        ### preprocess ###
        if self.tfidf:

            # check if vectorizer has been created before, if so load from file
            if check_persisted(f"{self.env['store_misc']}/tfidf",
                               f'{self.vectorizer_hash}_X', self.load_fresh):

                vec = load(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_vec')
                X = load(f"{self.env['store_misc']}/tfidf",
                         f'{self.vectorizer_hash}_X')

            else:

                # get the tokenized papers
                tokenized_papers = list(x_train[self.tokens_col])

                vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                      max_features=self.max_vocab_f,
                                      strip_accents='unicode',
                                      token_pattern=self.token_pattern,
                                      min_df=self.min_df)

                # generate term document matrix (model inputs)
                X = vec.fit_transform(tokenized_papers)

                save(vec,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_vec',
                     persist=True)
                save(X,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X',
                     persist=True)

            self.vectorizer = vec

        else:

            X = x_train

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if not col == 'k']

        # train
        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # fit an AdaBoost classifier for this review group
            classifier = AdaBoostClassifier(base_estimator=self.base_estimator,
                                            n_estimators=self.n_estimators,
                                            learning_rate=self.learning_rate,
                                            algorithm=self.algorithm,
                                            random_state=self.seed).fit(
                                                X, labels)

            # save classifier to class attribute
            self.models[review_group] = classifier
Example #11
    def train(self, x_train, y_train):
        """
        Trains one elastic logistic classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with as columns (at least):
                tokens_baseline - previously tokenized title-abstract

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        # check if vectorizer has been created before, if so load from file
        if check_persisted(f"{self.env['store_misc']}/pca",
                           f'{self.vectorizer_hash}_X', self.load_fresh):

            pca = load(f"{self.env['store_misc']}/pca",
                       f'{self.vectorizer_hash}_pca')
            vec = load(f"{self.env['store_misc']}/pca",
                       f'{self.vectorizer_hash}_vec')
            X = load(f"{self.env['store_misc']}/pca",
                     f'{self.vectorizer_hash}_X')

        else:

            # get the tokenized papers
            tokenized_papers = list(x_train[self.tokens_col])

            vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                  max_features=self.max_vocab_f,
                                  strip_accents='unicode')

            # generate term document matrix (model inputs)
            X_tfidf = vec.fit_transform(tokenized_papers)

            # reduce dimensionality of tf-idf features with truncated SVD (LSA)
            pca = TruncatedSVD(n_components=self.n_components,
                               random_state=self.seed)
            X = pca.fit_transform(X_tfidf)

            save(pca,
                 f"{self.env['store_misc']}/pca",
                 f'{self.vectorizer_hash}_pca',
                 persist=True)
            save(vec,
                 f"{self.env['store_misc']}/pca",
                 f'{self.vectorizer_hash}_vec',
                 persist=True)
            save(X,
                 f"{self.env['store_misc']}/pca",
                 f'{self.vectorizer_hash}_X',
                 persist=True)

        self.pca = pca
        self.vectorizer = vec

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if not col == 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # logistic classifier
            classifier = SGDClassifier(loss="log",
                                       alpha=self.alpha,
                                       l1_ratio=self.l1_ratio,
                                       penalty="elasticnet").fit(X, labels)

            # save the model in dictionary of models
            self.models[review_group] = classifier
Example #12
""" yacht control  

"""

import sys
import speak
import time
import persist

from menu import *

# the following are application specific so  should be in the menu
from switch import *
from barometer import *

persist.save(Switch(),'barotalk') 
persist.save(Switch(),'anchortalk')
            
def visit(item) :
   action = item.getAttribute('action')
   if action == "" :
      text = item.getAttribute('title')
   else : 
      text = eval(action)
   speak.say(text)
   print text

name = sys.argv[1]
menu = Menu(name)
menu.run(visit)  
    def train(self, x_train, y_train):
        """
        Trains one random forest classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with a column corresponding to the tokens to use.

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        # check if vectorizer has been created before, if so load from file
        if check_persisted(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_X', self.load_fresh):

            vec = load(f"{self.env['store_misc']}/tfidf",
                       f'{self.vectorizer_hash}_vec')
            X = load(f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X')

        else:

            # get the tokenized papers
            tokenized_papers = list(x_train[self.tokens_col])

            vec = TfidfVectorizer(max_features=self.max_vocab_f,
                                  strip_accents='unicode')

            # generate term document matrix (model inputs)
            X = vec.fit_transform(tokenized_papers)

            save(vec,
                 f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_vec',
                 persist=True)
            save(X,
                 f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_X',
                 persist=True)

        self.vectorizer = vec

        if check_persisted(f"{self.env['store_misc']}/embeddings",
                           f'{self.vectorizer_hash}_X', self.load_fresh):
            weighted_embeddings = load(f"{self.env['store_misc']}/embeddings",
                                       f'{self.vectorizer_hash}_X')

        else:
            self.embeddings_model = load_word2vec(self.env['word2vec_model'])
            weighted_embeddings = np.array(self.create_embeddings(X, vec))
            save(weighted_embeddings,
                 f"{self.env['store_misc']}/embeddings",
                 f'{self.vectorizer_hash}_X',
                 persist=True)

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if not col == 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # initiate random forest model
            classifier = RandomForestClassifier(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.seed,
                n_jobs=self.n_jobs).fit(weighted_embeddings, labels)

            # save the model in dictionary of models
            self.models[review_group] = classifier
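create_embeddings is not shown in this listing; below is a plausible sketch of what it might do, tf-idf-weighted averaging of word vectors, with every name here an assumption rather than the project's actual implementation:

import numpy as np

def create_embeddings(X, vec, embeddings_model, dim=300):
    # X: sparse tf-idf matrix (n_docs x n_terms); vec: the fitted TfidfVectorizer
    vocab = vec.get_feature_names_out()
    doc_vectors = []
    for row in X:
        weights = row.toarray().ravel()
        acc, total = np.zeros(dim), 0.0
        for idx in weights.nonzero()[0]:
            if vocab[idx] in embeddings_model:
                acc += weights[idx] * embeddings_model[vocab[idx]]
                total += weights[idx]
        # normalize by total tf-idf weight of in-vocabulary terms
        doc_vectors.append(acc / total if total else acc)
    return doc_vectors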
Example #14
def run_pipeline(ignition_file, persist_all, load_all_fresh):
    """
    An ad hoc pipeline created to mirror the standard ML pipeline and work
    with citations data.

    Parameters:
    ===========
    ignition_file: string
        name of the yaml file for which you want to run an experiment

    persist_all: boolean
        True if you want to persist all data for future use

    load_all_fresh: boolean
        True if you want to avoid any persisted data and load new data from scratch

    Returns:
    ========
    None
    """

    model_parts = {}

    ##### 1. LOAD ENVIRONMENT DATA #####

    # load local paths
    local_paths_env = load_local_paths('local_paths.yaml')
    print('Local paths loaded.')

    # load ignition file
    ignition = load_config(local_paths_env['ignition_path'] + ignition_file)
    print('Ignition loaded.')

    # id used for persisting
    hash_id = create_hash_id(str(ignition['id']))
    print('Hash id created.')

    # create hyperparameter combinations (for k-folding)
    hyperparameters = expand_grid(ignition['hyperparameters'])

    # load environment file
    psql_env = load_psql_env(pgpass_path=local_paths_env['pgpass_path'])
    print('PSQL environment file loaded.')

    # Initiate PSQL Connection
    connection = SQLConn(psql_env)
    connection.open()

    ##### 2. LOAD TRAIN AND TEST DATA #####

    if check_persisted(local_paths_env['store_train_data'], f'{hash_id}_x',
                       load_all_fresh):

        print("Found data")

        # data loaded before: load from file
        X_train = load(local_paths_env['store_train_data'], f'{hash_id}_x')
        X_test = load(local_paths_env['store_test_data'], f'{hash_id}_x')
        y_train = load(local_paths_env['store_train_data'], f'{hash_id}_y')
        y_test = load(local_paths_env['store_test_data'], f'{hash_id}_y')

        print('Loaded data from file.')

    else:

        print("Data not found in storage - load from database")

        # data not loaded: pull from database and create features
        X_train, X_test, y_train, y_test = sample(
            ignition, connection, local_paths_env['store_features'])
        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")

        # add fold index column to data
        X_train, y_train = k_fold(X_train, y_train, ignition['k_folds'],
                                  ignition['k_folds_seed'])

        # save data to file for future use
        save(X_train, local_paths_env['store_train_data'], f'{hash_id}_x',
             persist_all)
        save(X_test, local_paths_env['store_test_data'], f'{hash_id}_x',
             persist_all)
        save(y_train, local_paths_env['store_train_data'], f'{hash_id}_y',
             persist_all)
        save(y_test, local_paths_env['store_test_data'], f'{hash_id}_y',
             persist_all)

    print('Data loading completed.')

    ##### 3. K-FOLDING #####

    # loop over folds
    for fold in tqdm(range(ignition['k_folds']), desc='Folds'):

        # get fold id hash (for persisting)
        fold_id = create_hash_id(str(ignition['id']) + str(fold))

        # get fold data
        fold_X_train = X_train[X_train['k'] != fold]
        fold_X_test = X_train[X_train['k'] == fold]
        fold_y_train = y_train[y_train['k'] != fold]
        fold_y_test = y_train[y_train['k'] == fold]

        # store fold features, if any
        fold_features = {}

        ##### 4. LOOP OVER HYPERPARAMETERS: TRAIN CLASSIFIER #####

        for hyperparam in tqdm(hyperparameters, desc='Hyperparameters'):

            # create hyperparam unique id and hyperparam-fold unique id
            hyperparam_id = create_hash_id(
                str(ignition['id']) + str(hyperparam))
            hyperparam_fold_id = create_hash_id(
                str(ignition['id']) + str(hyperparam) + str(fold))

            # create classifier of specified type and with specified target
            classifier = select_classifier(ignition["model_type"],
                                           fold_id,
                                           ignition["target"],
                                           ignition["classes"],
                                           fold_features,
                                           hyperparameters=hyperparam,
                                           seed=ignition['seed'],
                                           env=local_paths_env,
                                           load_fresh=load_all_fresh)

            # train classifier
            classifier.train(fold_X_train, fold_y_train)

            ##### 5. TEST CLASSIFIER #####

            # generate predictions from classifier
            y_probs = classifier.predict(fold_X_test)

            ##### 6. EVALUATION #####

            for recall in tqdm(ignition['recalls'], desc='Evaluations'):

                # compute evaluation metrics
                all_metrics = compute_metrics(
                    metric_names=ignition['metrics'],
                    y_true=fold_y_test.drop(columns=['k']),
                    y_pred=y_probs,
                    k=recall)

                # store results in database
                unique_id = create_hash_id(
                    str(ignition['id']) + str(hyperparam) + str(fold) +
                    str(recall))

                results_to_db(metrics=all_metrics,
                              table_name=ignition['results_table_name'],
                              ignition_id=ignition['id'],
                              hash_id=hyperparam_fold_id,
                              algorithm=ignition['model_type'],
                              hyperparameters=hyperparam,
                              fold=str(fold),
                              recall=recall,
                              unique_id=unique_id,
                              connection=connection)

    connection.close()
    print(f"Done running pipeline for ignition id: {ignition['id']}!")
Example #15
def save_current(gb):
    persist.save(gb, gb.file_name())
    persist.log_config_warnings(gb)
Example #16
    def train(self, x_train, y_train):
        """
        Trains one XGBoost classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with a column for the token to use.

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        if self.tfidf:

            # check if vectorizer has been created before, if so load from file
            if check_persisted(f"{self.env['store_misc']}/tfidf",
                               f'{self.vectorizer_hash}_X', self.load_fresh):

                vec = load(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_vec')
                X = load(f"{self.env['store_misc']}/tfidf",
                         f'{self.vectorizer_hash}_X')
                X = hstack(
                    [csr_matrix(x_train.drop(self.tokens_col, axis=1)), X])

            else:

                # get the tokenized papers
                tokenized_papers = list(x_train[self.tokens_col])

                vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                      max_features=self.max_vocab_f,
                                      strip_accents='unicode')

                # generate term document matrix (model inputs)
                X = vec.fit_transform(tokenized_papers)
                save(vec,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_vec',
                     persist=True)
                save(X,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X',
                     persist=True)
                X = hstack(
                    [csr_matrix(x_train.drop(self.tokens_col, axis=1)), X])

            self.vectorizer = vec

        else:
            X = x_train

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if not col == 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # Create data structure for XGBoost
            data_dmatrix = xgb.DMatrix(data=X, label=labels)

            # create parameters for xgboost
            params = {
                'objective': self.objective,
                'learning_rate': self.learning_rate,
                'max_depth': self.max_depth,
                'subsample': self.subsample,
                'colsample_bytree': self.colsample_bytree,
                'gamma': self.gamma,
                'alpha': self.l1,
                'lambda': self.l2
            }
            # train xgboost; n_estimators is the number of boosting rounds,
            # which xgb.train takes as num_boost_round rather than a parameter
            self.models[review_group] = xgb.train(params, data_dmatrix,
                                                  num_boost_round=self.n_estimators)
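Since xgb.train returns a Booster rather than an sklearn estimator, scoring later also needs a DMatrix. A sketch, with X_test hypothetical and assuming a binary logistic objective:

booster = self.models[review_group]
y_prob = booster.predict(xgb.DMatrix(X_test))   # positive-class probabilities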
Example #17
def save_and_exit(gb):
    persist.save(gb, gb.file_name())
    persist.log_config_warnings(gb)
    menus.m_main.close()
Example #18
    def train(self, x_train, y_train):
        """
        Trains one elastic logistic classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with at least the column:
                average_embeddings - average word embeddings for concatenated title and abstract

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        # check if data has been created before, if so load from file
        if check_persisted(f"{self.env['store_misc']}/pca",
                           f'{self.fold_hash}_X', self.load_fresh):

            pca = load(f"{self.env['store_misc']}/pca",
                       f'{self.fold_hash}_pca')
            X = load(f"{self.env['store_misc']}/pca", f'{self.fold_hash}_X')

        else:

            # convert input to format for classifier
            list_of_embeddings = list(x_train[self.embeddings_col])
            x_train = np.array(
                [[float(i) for i in embedding.strip('[]').split()]
                 for embedding in list_of_embeddings])

            # reduce dimensionality of embeddings through PCA
            pca = PCA(n_components=self.n_components, random_state=self.seed)
            X = pca.fit_transform(x_train)

            save(pca,
                 f"{self.env['store_misc']}/pca",
                 f'{self.fold_hash}_pca',
                 persist=True)
            save(X,
                 f"{self.env['store_misc']}/pca",
                 f'{self.fold_hash}_X',
                 persist=True)

        self.pca = pca

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if not col == 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # logistic classifier
            classifier = SGDClassifier(loss="log",
                                       alpha=self.alpha,
                                       l1_ratio=self.l1_ratio,
                                       penalty="elasticnet").fit(X, labels)

            # save the model in dictionary of models
            self.models[review_group] = classifier
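A runnability note for the SGDClassifier examples here: loss="log" was renamed to "log_loss" in scikit-learn 1.1 and removed in 1.3, so on recent versions fitting raises. A version-tolerant sketch:

import sklearn
from sklearn.linear_model import SGDClassifier

# 'log' became 'log_loss' in scikit-learn 1.1 and was removed in 1.3
major, minor = (int(v) for v in sklearn.__version__.split('.')[:2])
loss_name = 'log_loss' if (major, minor) >= (1, 1) else 'log'
classifier = SGDClassifier(loss=loss_name, penalty='elasticnet')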
Example #19
    def train(self, x_train, y_train):
        """
        Trains one classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with a column for the token to use.

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        if self.tfidf:

            # check if vectorizer has been created before, if so load from file
            if check_persisted(f"{self.env['store_misc']}/tfidf",
                               f'{self.vectorizer_hash}_X', self.load_fresh):

                vec = load(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_vec')
                X = load(f"{self.env['store_misc']}/tfidf",
                         f'{self.vectorizer_hash}_X')

            else:

                # get the tokenized papers
                tokenized_papers = list(x_train[self.tokens_col])

                vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                      max_features=self.max_vocab_f,
                                      strip_accents='unicode')

                # generate term document matrix (model inputs)
                X = vec.fit_transform(tokenized_papers)
                save(vec,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_vec',
                     persist=True)
                save(X,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X',
                     persist=True)

            self.vectorizer = vec

        else:
            X = x_train

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if not col == 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # create the LightGBM training dataset
            train_set = lgb.Dataset(data=X, label=labels)

            # creating parameters for light gbm
            params = {
                'task': self.task,
                'application': self.application,
                'num_iterations': self.num_iterations,
                'num_leaves': self.num_leaves,
                'device': self.device,
                'min_data_in_leaf': self.min_data_in_leaf,
                'feature_fraction': self.feature_fraction,
                'bagging_fraction': self.bagging_fraction,
                'min_gain_to_split': self.min_gain_to_split,
                'num_threads': self.num_threads,
                'max_depth': self.max_depth,
                'verbosity': -1
            }
            # train the LightGBM model
            self.models[review_group] = lgb.train(params, train_set)
Example #20
    def train(self, x_train, y_train):
        """
        Trains one elastic logistic classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with as columns (at least):
                tokens_baseline - previously tokenized title-abstract

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        # check if data has been created before, if so load from file
        if check_persisted(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_X', self.load_fresh):

            X = load(f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X')

            # check for vectorizers
            if self.tokens_col is not None:
                vec = load(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_vec')
                self.vectorizer = vec

            if self.tokens_col2 is not None:
                vec2 = load(f"{self.env['store_misc']}/tfidf",
                            f'{self.vectorizer_hash}_vec2')
                self.vectorizer2 = vec2

        else:

            if self.tokens_col is not None:

                # get the tokenized papers
                tokenized_papers = list(x_train[self.tokens_col])
                vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                      max_features=self.max_vocab_f,
                                      strip_accents='unicode',
                                      token_pattern=self.token_pattern,
                                      min_df=self.min_df)

                # generate term document matrix (model inputs)
                X = vec.fit_transform(tokenized_papers)

                save(vec,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_vec',
                     persist=True)
                self.vectorizer = vec

            if self.tokens_col2 is not None:

                tokenized_papers2 = x_train[self.tokens_col2].apply(
                    lambda x: np.str_(x))
                vec2 = TfidfVectorizer(ngram_range=self.ngram_range,
                                       max_features=self.max_vocab_f2,
                                       strip_accents='unicode',
                                       token_pattern=self.token_pattern,
                                       min_df=self.min_df,
                                       decode_error='ignore')

                X2 = vec2.fit_transform(tokenized_papers2)

                # X is undefined when no first vectorizer was fitted
                try:
                    X = hstack([X, X2])
                except NameError:
                    X = X2

                save(vec2,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_vec2',
                     persist=True)
                self.vectorizer2 = vec2

            if self.citations_cols is not None:

                X3 = csr_matrix(x_train[self.citations_cols].values)

                # X is undefined when no text features were built
                try:
                    X = hstack([X, X3])
                except NameError:
                    X = X3

            save(X,
                 f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_X',
                 persist=True)

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if not col == 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # logistic classifier
            classifier = SGDClassifier(loss="log",
                                       alpha=self.alpha,
                                       l1_ratio=self.l1_ratio,
                                       penalty="elasticnet").fit(X, labels)

            # save the model in dictionary of models
            self.models[review_group] = classifier
Example #21
def perform_model_selection(evaluate_best_models=True):
    """
    Run model selection pipeline.
    """
    # Load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # Load production config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # Load results table from dbs
    results_df = pull_results(ignition_ids=[
        '1', '2', '4', '5', '6', '10', '15', '16', '17', '18', '19', '20', '21'
    ])

    # Get a dataframe of best algorithm x hyperparameters for each RG x recall
    best_df = get_best_algorithm_hyperparameter_onestep(results_df=results_df)

    # Get dictionary of algorithms and hyperparameters for each
    # review group based on recall in product config file
    best_models = choose_models_with_recall(
        models_df=best_df,
        group_min_recalls=prod_config['review_groups_recall'])

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # Pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    # Train best models for each review group
    train_best_models_mp(X_train,
                         y_train,
                         best_models=best_models,
                         prod_config=prod_config,
                         local_paths=local_paths,
                         cores=3)

    if evaluate_best_models:

        # Test best models for each review group
        scored_papers_test = score_papers(
            X_test,
            prod_config,
            models_path=local_paths['store_production_models'])
        save(object=scored_papers_test,
             location=local_paths['store_scored_papers'],
             filename='scored_papers_citations')

        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_precision=0.95, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):

            rg = review_group.lower()

            # get thresholds
            thresholds = [upper_thresholds[rg], lower_thresholds[rg]]
            recall_at_threshold = [
                recall_at_thresholds[rg]['upper'],
                recall_at_thresholds[rg]['lower']
            ]
            workload_reduction = [keep[rg], consider[rg], discard[rg]]

            # evaluate scores
            for recall in tqdm(prod_config['recalls'], desc='Evaluations'):

                # calculate precisions
                precisions = evaluate_precision_at_k_recall(
                    class_true=y_test, class_prob=y_pred_test, k=recall)

                # store results in database
                production_results_to_db(
                    table_name=prod_config['results_table_name'],
                    unique_id=f"{rg}_{recall}",
                    review_group=rg,
                    algorithm=best_models[review_group]['algorithm'],
                    hyperparameters=best_models[review_group]
                    ['hyperparameters'],
                    recall=recall,
                    precision=precisions[rg],
                    thresholds=thresholds,
                    recall_at_threshold=recall_at_threshold,
                    workload_reduction=workload_reduction,
                    connection=connection)

    connection.close()

    print("Model selection pipeline complete.")
Example #22
    def train(self, x_train, y_train):
        """
        Trains one random forest classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with as columns (at least):
                tokens_baseline - previously tokenized title-abstract

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        # check if vectorizer has been created before, if so load from file
        if check_persisted(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_X', self.load_fresh):

            vec = load(f"{self.env['store_misc']}/tfidf",
                       f'{self.vectorizer_hash}_vec')
            X = load(f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X')

            if self.tokens_col2 is not None:
                vec2 = load(f"{self.env['store_misc']}/tfidf",
                            f'{self.vectorizer_hash}_vec2')
                self.vectorizer2 = vec2

        else:

            # get the tokenized papers
            tokenized_papers = list(x_train[self.tokens_col])
            vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                  max_features=self.max_vocab_f,
                                  strip_accents='unicode',
                                  token_pattern=self.token_pattern,
                                  min_df=self.min_df)

            # generate term document matrix (model inputs)
            X = vec.fit_transform(tokenized_papers)

            if self.tokens_col2 is not None:

                tokenized_papers2 = x_train[self.tokens_col2].apply(
                    lambda x: np.str_(x))
                vec2 = TfidfVectorizer(ngram_range=self.ngram_range,
                                       max_features=self.max_vocab_f2,
                                       strip_accents='unicode',
                                       token_pattern=self.token_pattern,
                                       min_df=self.min_df,
                                       decode_error='ignore')

                X2 = vec2.fit_transform(tokenized_papers2)
                X = hstack([X, X2])

                save(vec2,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_vec2',
                     persist=True)
                self.vectorizer2 = vec2

            save(vec,
                 f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_vec',
                 persist=True)
            save(X,
                 f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_X',
                 persist=True)

        self.vectorizer = vec

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if not col == 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # initiate random forest model
            classifier = RandomForestClassifier(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.seed,
                n_jobs=self.n_jobs).fit(X, labels)

            # save the model in dictionary of models
            self.models[review_group] = classifier