Example #1
def new_aggregate_flow(data, channel):
    # merge the incoming payload into the record persisted for this project
    current = persist.load(data['project_id'])
    current[channel] = data
    persist.write(data['project_id'], current)
    print("PERSIST: ", current)
    # report the aggregate only once at least two channels have contributed
    if len(current) < 2:
        return False
    return current
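A minimal usage sketch for the flow above, assuming persist.load returns a dict keyed by channel (empty for a new project) and persist.write stores it back under the project id; the payloads and channel names are hypothetical:

# hypothetical payloads; the aggregate is returned once two channels have reported
first = new_aggregate_flow({'project_id': 'p1', 'value': 1}, channel='alpha')
assert first is False  # only one channel persisted so far

second = new_aggregate_flow({'project_id': 'p1', 'value': 2}, channel='beta')
# second now holds both entries: {'alpha': {...}, 'beta': {...}}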
Example #2
    def predict(self, papers):
        """
        Generates predictions from the trained classifiers. Each binary
        classifier is applied once.

        Parameters
        ==========

        papers : pd.DataFrame
            papers that we want to classify. Required column:
                tokens_baseline - previously tokenized title-abstract

        Returns
        =======
        
        scores : pd.DataFrame
            Dataframe containing the predictions generated by each model.
            Each column corresponds to a review group and the values in
            that column are the probabilities that each paper belongs to
            that review group.
        """

        scores = {}

        tokenized_papers = list(papers[self.tokens_col])

        # get embeddings for papers
        if check_persisted(f"{self.env['store_misc']}/embeddings",
                           f'{self.vectorizer_hash}_y', self.load_fresh):
            weighted_embeddings = load(f"{self.env['store_misc']}/embeddings",
                                       f'{self.vectorizer_hash}_y')

        else:
            self.embeddings_model = load_word2vec(self.env['word2vec_model'])
            vec = self.vectorizer
            X = vec.transform(tokenized_papers)
            weighted_embeddings = np.array(self.create_embeddings(X, vec))
            save(weighted_embeddings,
                 f"{self.env['store_misc']}/embeddings",
                 f'{self.vectorizer_hash}_y',
                 persist=True)

        for model_group in tqdm(self.models, desc='Test Review Groups'):

            # get the classifier
            classifier = self.models[model_group]

            # predictions as probabilities
            y_preds = classifier.predict_proba(weighted_embeddings)

            probabilities = y_preds[:, 1]

            # store scores of model
            scores[model_group] = probabilities

        scores = pd.DataFrame.from_dict(scores)

        return scores
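A sketch of consuming the scores DataFrame returned above, assuming clf is a trained instance of this class; the 0.5 cut-off and the argmax are illustrative choices, not part of the source:

scores = clf.predict(papers)        # one column of probabilities per review group
flagged = scores >= 0.5             # hypothetical per-group decision threshold
best_group = scores.idxmax(axis=1)  # single most likely review group per paper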
Example #3
File: func_persist.py Project: CETHop/sage
    def __call__(self, *args, **kwds):
        key = (tuple(args), tuple(kwds.items()))
        h = hash(key)
        name = '%s/%s_%s.sobj' % (self.__dir, self.__func.func_name, h)  # Python 2 attribute; Example #4 is the Python 3 form

        if os.path.exists(name):
            key2, val = persist.load(name)
            if key == key2:
                # We save and test equality of keys to avoid
                # the (extremely remote) possibility of a hash
                # collision.  Correctness is crucial in mathematics.
                return val

        val = self.__func(*args, **kwds)
        persist.save((key, val), name)
        return val
Example #4
    def __call__(self, *args, **kwds):
        key = (tuple(args), tuple(kwds.items()))
        h = hash(key)
        name = '%s/%s_%s.sobj' % (self.__dir, self.__func.__name__, h)

        if os.path.exists(name):
            key2, val = persist.load(name)
            if key == key2:
                # We save and test equality of keys to avoid
                # the (extremely remote) possibility of a hash
                # collision.  Correctness is crucial in mathematics.
                return val

        val = self.__func(*args, **kwds)
        persist.save((key, val), name)
        return val
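Examples #3 and #4 are the Python 2 and Python 3 variants of the same __call__; in the Sage project it backs the func_persist decorator. A minimal usage sketch, assuming that decorator (the import path and the persist/ cache directory are taken from Sage and should be treated as assumptions):

from sage.misc.func_persist import func_persist  # import path assumed

@func_persist
def slow_square(n):
    return n * n

slow_square(12)  # computed, then (key, value) saved to persist/slow_square_<hash>.sobj (directory name assumed)
slow_square(12)  # cache hit: value loaded from disk, key re-checked to rule out hash collisions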
Example #5
    def train(self, x_train, y_train):
        """
        Trains one elastic logistic classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with as columns (at least):
                tokens_baseline - previously tokenized title-abstract

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        # check whether the PCA, vectorizer and inputs were persisted before; if so, load from file
        if check_persisted(f"{self.env['store_misc']}/pca",
                           f'{self.vectorizer_hash}_X', self.load_fresh):

            pca = load(f"{self.env['store_misc']}/pca",
                       f'{self.vectorizer_hash}_pca')
            vec = load(f"{self.env['store_misc']}/pca",
                       f'{self.vectorizer_hash}_vec')
            X = load(f"{self.env['store_misc']}/pca",
                     f'{self.vectorizer_hash}_X')

        else:

            # get the tokenized papers
            tokenized_papers = list(x_train[self.tokens_col])

            vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                  max_features=self.max_vocab_f,
                                  strip_accents='unicode')

            # generate term document matrix (model inputs)
            X_tfidf = vec.fit_transform(tokenized_papers)

            # reduce dimensionality of tf-idf features via truncated SVD (PCA-like, works on sparse input)
            pca = TruncatedSVD(n_components=self.n_components,
                               random_state=self.seed)
            X = pca.fit_transform(X_tfidf)

            save(pca,
                 f"{self.env['store_misc']}/pca",
                 f'{self.vectorizer_hash}_pca',
                 persist=True)
            save(vec,
                 f"{self.env['store_misc']}/pca",
                 f'{self.vectorizer_hash}_vec',
                 persist=True)
            save(X,
                 f"{self.env['store_misc']}/pca",
                 f'{self.vectorizer_hash}_X',
                 persist=True)

        self.pca = pca
        self.vectorizer = vec

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if col != 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # logistic classifier
            classifier = SGDClassifier(loss="log",
                                       alpha=self.alpha,
                                       l1_ratio=self.l1_ratio,
                                       penalty="elasticnet").fit(X, labels)

            # save the model in dictionary of models
            self.models[review_group] = classifier
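A usage sketch for this train method; the class name, constructor, and new_tokens below are hypothetical stand-ins (the source only shows the method), but the pca, vectorizer, and models attributes are the ones the method stores:

# hypothetical instance; the constructor is not shown in the source
model = PCALogisticModel(env=env, tokens_col='tokens_baseline', load_fresh=False)
model.train(x_train, y_train)

# score new token strings with the fitted pieces the method kept on self
X_new = model.pca.transform(model.vectorizer.transform(new_tokens))
probs = {g: clf.predict_proba(X_new)[:, 1] for g, clf in model.models.items()}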
Example #6
    def train(self, x_train, y_train):
        """
        Trains one random forest classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with a column corresponding to the tokens to use.

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        # check if vectorizer has been created before, if so load from file
        if check_persisted(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_X', self.load_fresh):

            vec = load(f"{self.env['store_misc']}/tfidf",
                       f'{self.vectorizer_hash}_vec')
            X = load(f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X')

        else:

            # get the tokenized papers
            tokenized_papers = list(x_train[self.tokens_col])

            vec = TfidfVectorizer(max_features=self.max_vocab_f,
                                  strip_accents='unicode')

            # generate term document matrix (model inputs)
            X = vec.fit_transform(tokenized_papers)

            save(vec,
                 f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_vec',
                 persist=True)
            save(X,
                 f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_X',
                 persist=True)

        self.vectorizer = vec

        if check_persisted(f"{self.env['store_misc']}/embeddings",
                           f'{self.vectorizer_hash}_X', self.load_fresh):
            weighted_embeddings = load(f"{self.env['store_misc']}/embeddings",
                                       f'{self.vectorizer_hash}_X')

        else:
            self.embeddings_model = load_word2vec(self.env['word2vec_model'])
            weighted_embeddings = np.array(self.create_embeddings(X, vec))
            save(weighted_embeddings,
                 f"{self.env['store_misc']}/embeddings",
                 f'{self.vectorizer_hash}_X',
                 persist=True)

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if col != 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # initialize random forest model
            classifier = RandomForestClassifier(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.seed,
                n_jobs=self.n_jobs).fit(weighted_embeddings, labels)

            # save the model in dictionary of models
            self.models[review_group] = classifier
Example #7
def run_pipeline(ignition_file, persist_all, load_all_fresh):
    """
    An ad hoc pipeline created to mirror the standard ML pipeline and work
    with citations data.

    Parameters:
    ===========
    ignition_file: string
        name of the yaml file for which you want to run an experiment

    persist_all: boolean
        True if you want to persist all data for future use

    load_all_fresh: boolean
        True if you want to bypass any persisted data and load new data from scratch

    Returns:
    ========
    None
    """

    model_parts = {}

    ##### 1. LOAD ENVIRONMENT DATA #####

    # load local paths
    local_paths_env = load_local_paths('local_paths.yaml')
    print('Local paths loaded.')

    # load ignition file
    ignition = load_config(local_paths_env['ignition_path'] + ignition_file)
    print('Ignition loaded.')

    # id used for persisting
    hash_id = create_hash_id(str(ignition['id']))
    print('Hash id created.')

    # create hyperparameter combinations (for k-folding)
    hyperparameters = expand_grid(ignition['hyperparameters'])

    # load environment file
    psql_env = load_psql_env(pgpass_path=local_paths_env['pgpass_path'])
    print('PSQL environment file loaded.')

    # Initiate PSQL Connection
    connection = SQLConn(psql_env)
    connection.open()

    ##### 2. LOAD TRAIN AND TEST DATA #####

    if check_persisted(local_paths_env['store_train_data'], f'{hash_id}_x',
                       load_all_fresh):

        print("Found data")

        # data loaded before: load from file
        X_train = load(local_paths_env['store_train_data'], f'{hash_id}_x')
        X_test = load(local_paths_env['store_test_data'], f'{hash_id}_x')
        y_train = load(local_paths_env['store_train_data'], f'{hash_id}_y')
        y_test = load(local_paths_env['store_test_data'], f'{hash_id}_y')

        print('Loaded data from file.')

    else:

        print("Data not found in storage - load from database")

        # data not loaded: pull from database and create features
        X_train, X_test, y_train, y_test = sample(
            ignition, connection, local_paths_env['store_features'])
        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")

        # add fold index column to data
        X_train, y_train = k_fold(X_train, y_train, ignition['k_folds'],
                                  ignition['k_folds_seed'])

        # save data to file for future use
        save(X_train, local_paths_env['store_train_data'], f'{hash_id}_x',
             persist_all)
        save(X_test, local_paths_env['store_test_data'], f'{hash_id}_x',
             persist_all)
        save(y_train, local_paths_env['store_train_data'], f'{hash_id}_y',
             persist_all)
        save(y_test, local_paths_env['store_test_data'], f'{hash_id}_y',
             persist_all)

    print('Data loading completed.')

    ##### 3. K-FOLDING #####

    # loop over folds
    for fold in tqdm(range(ignition['k_folds']), desc='Folds'):

        # get fold id hash (for persisting)
        fold_id = create_hash_id(str(ignition['id']) + str(fold))

        # get fold data
        fold_X_train = X_train[X_train['k'] != fold]
        fold_X_test = X_train[X_train['k'] == fold]
        fold_y_train = y_train[y_train['k'] != fold]
        fold_y_test = y_train[y_train['k'] == fold]

        # store fold features, if any
        fold_features = {}

        ##### 4. LOOP OVER HYPERPARAMETERS: TRAIN CLASSIFIER #####

        for hyperparam in tqdm(hyperparameters, desc='Hyperparameters'):

            # create hyperparam unique id and hyperparam-fold unique id
            hyperparam_id = create_hash_id(
                str(ignition['id']) + str(hyperparam))
            hyperparam_fold_id = create_hash_id(
                str(ignition['id']) + str(hyperparam) + str(fold))

            # if not check_val_in_db(connection, ignition['results_table_name'],
            # 'results', 'hash_id', hyperparam_fold_id, len(ignition['recalls'])):

            # create classifier of specified type and with specified target
            classifier = select_classifier(ignition["model_type"],
                                           fold_id,
                                           ignition["target"],
                                           ignition["classes"],
                                           fold_features,
                                           hyperparameters=hyperparam,
                                           seed=ignition['seed'],
                                           env=local_paths_env,
                                           load_fresh=load_all_fresh)
            #print('Classifier created.')

            # train classifier
            classifier.train(fold_X_train, fold_y_train)

            ##### 5. TEST CLASSIFIER #####

            # generate predictions from classifier
            y_probs = classifier.predict(fold_X_test)

            ##### 6. EVALUATION #####

            for recall in tqdm(ignition['recalls'], desc='Evaluations'):

                # compute evaluation metrics
                all_metrics = compute_metrics(
                    metric_names=ignition['metrics'],
                    y_true=fold_y_test.drop(columns=['k']),
                    y_pred=y_probs,
                    k=recall)

                # store results in database
                unique_id = create_hash_id(
                    str(ignition['id']) + str(hyperparam) + str(fold) +
                    str(recall))

                results_to_db(metrics=all_metrics,
                              table_name=ignition['results_table_name'],
                              ignition_id=ignition['id'],
                              hash_id=hyperparam_fold_id,
                              algorithm=ignition['model_type'],
                              hyperparameters=hyperparam,
                              fold=str(fold),
                              recall=recall,
                              unique_id=unique_id,
                              connection=connection)

    connection.close()
    print(f"Done running pipeline for ignition id: {ignition['id']}!")
Example #8
    def train(self, x_train, y_train):
        """
        Trains one XGBoost classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with a column for the token to use.

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        if self.tfidf:

            # check if vectorizer has been created before, if so load from file
            if check_persisted(f"{self.env['store_misc']}/tfidf",
                               f'{self.vectorizer_hash}_X', self.load_fresh):

                vec = load(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_vec')
                X = load(f"{self.env['store_misc']}/tfidf",
                         f'{self.vectorizer_hash}_X')
                X = hstack(
                    [csr_matrix(x_train.drop(self.tokens_col, axis=1)), X])

            else:

                # get the tokenized papers
                tokenized_papers = list(x_train[self.tokens_col])

                vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                      max_features=self.max_vocab_f,
                                      strip_accents='unicode')

                # generate term document matrix (model inputs)
                X = vec.fit_transform(tokenized_papers)
                save(vec,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_vec',
                     persist=True)
                save(X,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X',
                     persist=True)
                X = hstack(
                    [csr_matrix(x_train.drop(self.tokens_col, axis=1)), X])

            self.vectorizer = vec

        else:
            X = x_train

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if col != 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # Create data structure for XGBoost
            data_dmatrix = xgb.DMatrix(data=X, label=labels)

            # creating parameters for xgboost
            params = {
                'objective': self.objective,
                'learning_rate': self.learning_rate,
                'max_depth': self.max_depth,
                'subsample': self.subsample,
                'colsample_bytree': self.colsample_bytree,
                'n_estimators': self.n_estimators,
                'gamma': self.gamma,
                'alpha': self.l1,
                'lambda': self.l2
            }
            # xgboost
            self.models[review_group] = xgb.train(params, data_dmatrix)
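xgb.train returns a Booster rather than a scikit-learn estimator, so later scoring needs the features wrapped in a DMatrix again; a sketch, where models stands for the self.models dict built above and X_new is a hypothetical matrix with the same columns as X:

import xgboost as xgb

dnew = xgb.DMatrix(X_new)              # X_new: hypothetical features, same layout as X
probs = {group: booster.predict(dnew)  # probabilities, given a binary objective
         for group, booster in models.items()}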
Example #9
    def load(self):
        try:
            UserData.data = persist.load(self.persist)
        except Exception:
            # fall back to an empty store when nothing has been persisted yet
            UserData.data = {}
Example #10
    def train(self, x_train, y_train):
        """
        Trains one LightGBM classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with a column for the token to use.

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        if self.tfidf:

            # check if vectorizer has been created before, if so load from file
            if check_persisted(f"{self.env['store_misc']}/tfidf",
                               f'{self.vectorizer_hash}_X', self.load_fresh):

                vec = load(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_vec')
                X = load(f"{self.env['store_misc']}/tfidf",
                         f'{self.vectorizer_hash}_X')
                #X = hstack([csr_matrix(x_train.drop(self.tokens_col, axis=1)),X])

            else:

                # get the tokenized papers
                tokenized_papers = list(x_train[self.tokens_col])

                vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                      max_features=self.max_vocab_f,
                                      strip_accents='unicode')

                # generate term document matrix (model inputs)
                X = vec.fit_transform(tokenized_papers)
                save(vec,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_vec',
                     persist=True)
                save(X,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X',
                     persist=True)
                #X = hstack([csr_matrix(x_train.drop(self.tokens_col, axis=1)),X])

            self.vectorizer = vec

        else:
            X = x_train

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if col != 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # Create data structure for light gbm
            data_dmatrix = lgb.Dataset(data=X, label=labels)

            # creating parameters for light gbm
            params = {
                'task': self.task,
                'application': self.application,
                'num_iterations': self.num_iterations,
                'num_leaves': self.num_leaves,
                'device': self.device,
                'min_data_in_leaf': self.min_data_in_leaf,
                'feature_fraction': self.feature_fraction,
                'bagging_fraction': self.bagging_fraction,
                'min_gain_to_split': self.min_gain_to_split,
                'num_threads': self.num_threads,
                'max_depth': self.max_depth,
                'verbosity': -1
            }
            # light gbm
            self.models[review_group] = lgb.train(params, data_dmatrix)
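A LightGBM Booster.predict, by contrast, accepts the raw feature matrix directly, so no Dataset wrapper is needed at scoring time; a sketch under the same assumptions as the XGBoost note above:

probs = {group: booster.predict(X_new)  # X_new: hypothetical features, same layout as X
         for group, booster in models.items()}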
Example #11
    def train(self, x_train, y_train):
        """
        Trains one elastic logistic classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with as columns (at least):
                tokens_baseline - previously tokenized title-abstract

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        # check if data has been created before, if so load from file
        if check_persisted(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_X', self.load_fresh):

            X = load(f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X')

            # check for vectorizers
            if self.tokens_col is not None:
                vec = load(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_vec')
                self.vectorizer = vec

            if self.tokens_col2 is not None:
                vec2 = load(f"{self.env['store_misc']}/tfidf",
                            f'{self.vectorizer_hash}_vec2')
                self.vectorizer2 = vec2

        else:

            if self.tokens_col is not None:

                # get the tokenized papers
                tokenized_papers = list(x_train[self.tokens_col])
                vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                      max_features=self.max_vocab_f,
                                      strip_accents='unicode',
                                      token_pattern=self.token_pattern,
                                      min_df=self.min_df)

                # generate term document matrix (model inputs)
                X = vec.fit_transform(tokenized_papers)

                save(vec,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_vec',
                     persist=True)
                self.vectorizer = vec

            if self.tokens_col2 is not None:

                tokenized_papers2 = x_train[self.tokens_col2].apply(
                    lambda x: np.str_(x))
                vec2 = TfidfVectorizer(ngram_range=self.ngram_range,
                                       max_features=self.max_vocab_f2,
                                       strip_accents='unicode',
                                       token_pattern=self.token_pattern,
                                       min_df=self.min_df,
                                       decode_error='ignore')

                X2 = vec2.fit_transform(tokenized_papers2)

                try:
                    X = hstack([X, X2])
                except NameError:
                    # X is undefined when tokens_col is None
                    X = X2

                save(vec2,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_vec2',
                     persist=True)
                self.vectorizer2 = vec2

            if self.citations_cols is not None:

                X3 = csr_matrix(x_train[self.citations_cols].values)

                try:
                    X = hstack([X, X3])
                except NameError:
                    # X is undefined when no token columns were vectorized
                    X = X3

            save(X,
                 f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_X',
                 persist=True)

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if col != 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # logistic classifier
            classifier = SGDClassifier(loss="log",
                                       alpha=self.alpha,
                                       l1_ratio=self.l1_ratio,
                                       penalty="elasticnet").fit(X, labels)

            # save the model in dictionary of models
            self.models[review_group] = classifier
Example #12
    def train(self, x_train, y_train):
        """
        Trains one random forest classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with as columns (at least):
                tokens_baseline - previously tokenized title-abstract

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        # check if vectorizer has been created before, if so load from file
        if check_persisted(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_X', self.load_fresh):

            vec = load(f"{self.env['store_misc']}/tfidf",
                       f'{self.vectorizer_hash}_vec')
            X = load(f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X')

            if self.tokens_col2 is not None:
                vec2 = load(f"{self.env['store_misc']}/tfidf",
                            f'{self.vectorizer_hash}_vec2')
                self.vectorizer2 = vec2

        else:

            # get the tokenized papers
            tokenized_papers = list(x_train[self.tokens_col])
            vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                  max_features=self.max_vocab_f,
                                  strip_accents='unicode',
                                  token_pattern=self.token_pattern,
                                  min_df=self.min_df)

            # generate term document matrix (model inputs)
            X = vec.fit_transform(tokenized_papers)

            if self.tokens_col2 is not None:

                tokenized_papers2 = x_train[self.tokens_col2].apply(
                    lambda x: np.str_(x))
                vec2 = TfidfVectorizer(ngram_range=self.ngram_range,
                                       max_features=self.max_vocab_f2,
                                       strip_accents='unicode',
                                       token_pattern=self.token_pattern,
                                       min_df=self.min_df,
                                       decode_error='ignore')

                X2 = vec2.fit_transform(tokenized_papers2)
                X = hstack([X, X2])

                save(vec2,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_vec2',
                     persist=True)
                self.vectorizer2 = vec2

            save(vec,
                 f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_vec',
                 persist=True)
            save(X,
                 f"{self.env['store_misc']}/tfidf",
                 f'{self.vectorizer_hash}_X',
                 persist=True)

        self.vectorizer = vec

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if col != 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # initialize random forest model
            classifier = RandomForestClassifier(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.seed,
                n_jobs=self.n_jobs).fit(X, labels)

            # save the model in dictionary of models
            self.models[review_group] = classifier
Example #13
    def train(self, x_train, y_train):
        """
        Trains one elastic logistic classifier per review group. Saves the trained
        classifiers within self.models.

        Parameters
        ==========

        x_train : pandas DataFrame
            DataFrame containing the papers we aim to
            classify, with at least the column:
                average_embeddings - average word embeddings for concatenated title and abstract

        y_train : pandas DataFrame
            DataFrame containing the labels for each paper. Each
            column represents one review group with binary labels.
        """

        # check if data has been created before, if so load from file
        if check_persisted(f"{self.env['store_misc']}/pca",
                           f'{self.fold_hash}_X', self.load_fresh):

            pca = load(f"{self.env['store_misc']}/pca",
                       f'{self.fold_hash}_pca')
            X = load(f"{self.env['store_misc']}/pca", f'{self.fold_hash}_X')

        else:

            # convert input to format for classifier
            list_of_embeddings = list(x_train[self.embeddings_col])
            x_train = np.array(
                [[float(i) for i in embedding.strip('[]').split()]
                 for embedding in list_of_embeddings])

            # reduce dimensionality of embeddings through PCA
            pca = PCA(n_components=self.n_components, random_state=self.seed)
            X = pca.fit_transform(x_train)

            save(pca,
                 f"{self.env['store_misc']}/pca",
                 f'{self.fold_hash}_pca',
                 persist=True)
            save(X,
                 f"{self.env['store_misc']}/pca",
                 f'{self.fold_hash}_X',
                 persist=True)

        self.pca = pca

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if col != 'k']

        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # logistic classifier
            classifier = SGDClassifier(loss="log",
                                       alpha=self.alpha,
                                       l1_ratio=self.l1_ratio,
                                       penalty="elasticnet").fit(X, labels)

            # save the model in dictionary of models
            self.models[review_group] = classifier
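The fitted PCA is kept on self.pca so the matching projection can be reused at prediction time; a sketch of that step, assuming test embeddings are stored as the same bracketed strings and using a trained instance named model (hypothetical) in place of self:

import numpy as np

# parse the stored embedding strings exactly as in train, then reuse the fitted PCA
list_of_embeddings = list(x_test[model.embeddings_col])
x_arr = np.array([[float(i) for i in emb.strip('[]').split()]
                  for emb in list_of_embeddings])
X_test = model.pca.transform(x_arr)  # transform only; never re-fit on test data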
Example #14
File: main.py Project: nagyben/rpi-brew
from flask import Flask, jsonify
import logging
import subprocess

log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
appformatter = logging.Formatter(
    '[%(asctime)s] - %(name)s - %(levelname)s - %(message)s')
consolehandler = logging.StreamHandler()
consolehandler.setFormatter(appformatter)
log.addHandler(consolehandler)

SETTINGS_FILE = 'persist.json'
SENSOR_UPDATE_INTERVAL = 5

# ------------------------------------------------------------------------- SETTINGS
persist.load(SETTINGS_FILE)

# ------------------------------------------------------------------------- BREWERY
sensors = [
    TemperatureSensor("red"),
    TemperatureSensor("blue"),
    TemperatureSensor("green")
]

controller = TemperatureController(control_pin=11, time_period=10)

mode = "IDLE"
prep_start_time = None
boil_start_time = None
mash_start_time = None
ferment_start_time = None
Example #15
    def train(self, x_train, y_train):
        """
        Trains one classifier for each review group and stores
        it in a dictionary that is a class attribute.

        Parameters
        ==========
        x_train : pd.DataFrame
            Dataframe with columns corresponding to features to include in the model.
        y_train : pd.DataFrame
            DataFrame containing the labels for each paper. Each column represents one
            review group with binary labels.

        Returns
        =======
        None
        """
        ### preprocess ###
        if self.tfidf:

            # check if vectorizer has been created before, if so load from file
            if check_persisted(f"{self.env['store_misc']}/tfidf",
                               f'{self.vectorizer_hash}_X', self.load_fresh):

                vec = load(f"{self.env['store_misc']}/tfidf",
                           f'{self.vectorizer_hash}_vec')
                X = load(f"{self.env['store_misc']}/tfidf",
                         f'{self.vectorizer_hash}_X')

            else:

                # get the tokenized papers
                tokenized_papers = list(x_train[self.tokens_col])

                vec = TfidfVectorizer(ngram_range=self.ngram_range,
                                      max_features=self.max_vocab_f,
                                      strip_accents='unicode',
                                      token_pattern=self.token_pattern,
                                      min_df=self.min_df)

                # generate term document matrix (model inputs)
                X = vec.fit_transform(tokenized_papers)

                save(vec,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_vec',
                     persist=True)
                save(X,
                     f"{self.env['store_misc']}/tfidf",
                     f'{self.vectorizer_hash}_X',
                     persist=True)

            self.vectorizer = vec

        else:

            X = x_train

        # discard fold ID column from labels
        review_groups = [col for col in y_train.columns if col != 'k']

        # train
        for review_group in tqdm(review_groups, desc='Train Review Groups'):

            # pull label column
            labels = y_train[review_group]

            # fit an AdaBoost classifier per review group
            classifier = AdaBoostClassifier(base_estimator=self.base_estimator,
                                            n_estimators=self.n_estimators,
                                            learning_rate=self.learning_rate,
                                            algorithm=self.algorithm,
                                            random_state=self.seed).fit(
                                                X, labels)

            # save classifier to class attribute
            self.models[review_group] = classifier
Example #16
def perform_model_selection(evaluate_best_models=True):
    """
    Run model selection pipeline.
    """
    # Load local paths file
    local_paths = load_local_paths('local_paths.yaml')

    # Load product config file
    prod_config = load_config('../prod/prod_config.yaml', append_static=False)

    # SQL set up
    psql_env = load_psql_env(pgpass_path=local_paths['pgpass_path'])
    connection = SQLConn(psql_env)
    connection.open()

    # Pull data
    X_train, X_test, y_train, y_test = sample(
        ignition=prod_config,
        connection=connection,
        local_features_path=local_paths['store_features'])

    if evaluate_best_models:

        # Test best models for each review group
        scored_papers_test = load(location=local_paths['store_scored_papers'],
                                  filename='scored_papers')

        y_pred_test = scored_papers_test[[
            col for col in scored_papers_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]
        y_test = y_test[[
            col for col in y_test.columns
            if col.upper() in prod_config['review_groups_recall'].keys()
        ]]

        # calculate thresholds
        upper_thresholds, lower_thresholds, recall_at_thresholds = get_thresholds(
            y_test, y_pred_test, minimum_recall=0.99)

        # persist thresholds for production
        save(upper_thresholds, local_paths['store_production_models'],
             'upper_thresholds')
        save(lower_thresholds, local_paths['store_production_models'],
             'lower_thresholds')

        # calculate workload reductions
        keep, consider, discard = get_workload_reduction(
            y_test, y_pred_test, upper_thresholds, lower_thresholds)

        rg_list = []
        wrkld_reductions = []

        # loop over review groups
        for review_group in tqdm(prod_config['review_groups_recall'].keys(),
                                 desc='Review Group'):

            rg = review_group.lower()

            # get thresholds
            thresholds = [upper_thresholds[rg], lower_thresholds[rg]]
            recall_at_threshold = [
                recall_at_thresholds[rg]['upper'],
                recall_at_thresholds[rg]['lower']
            ]
            workload_reduction = [keep[rg], consider[rg], discard[rg]]

            rg_list.append(rg)
            wrkld_reductions.append(workload_reduction)

        d = {'review_group': rg_list, 'workload_reduction': wrkld_reductions}
        df = pd.DataFrame.from_dict(d)
        plot_average_workload_reduction(df)

    connection.close()

    print("Model selection pipeline complete.")
Example #17
    def load(self):
        try:
            FaceData.data = persist.load(self.persist)
        except Exception:
            # fall back to an empty store when nothing has been persisted yet
            FaceData.data = {}