    def __init__(self,
                 fold_hash,
                 ngram_range=(1, 3),
                 max_vocab_f=75000,
                 alpha=0.0001,
                 min_df=3,
                 tokens_col="tokens_baseline",
                 env=None,
                 load_fresh=False):

        # identify which fold model was trained on
        self.fold_hash = fold_hash

        # model contents
        self.models = defaultdict(dict)
        self.vectorizer = None

        # parameters for tf-idf vectorizer
        self.ngram_range = ngram_range
        self.max_vocab_f = max_vocab_f
        self.min_df = min_df
        self.tokens_col = tokens_col
        self.vectorizer_hash = create_hash_id(
            f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.min_df}{self.tokens_col}'
        )

        # parameter for logistic regression
        self.alpha = alpha

        self.env = env
        self.load_fresh = load_fresh
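
    # Hypothetical sketch (not in the source): one way a companion train()
    # method could consume the stored parameters. The method name, the
    # 'default' model key, and the TfidfVectorizer/SGDClassifier pairing are
    # assumptions; alpha maps to SGDClassifier's regularization strength.
    def train(self, X_train, y_train):
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import SGDClassifier

        # fit tf-idf on the pre-tokenized text column
        self.vectorizer = TfidfVectorizer(ngram_range=self.ngram_range,
                                          max_features=self.max_vocab_f,
                                          min_df=self.min_df)
        features = self.vectorizer.fit_transform(X_train[self.tokens_col])

        # logistic regression fit with SGD (loss='log' in older scikit-learn)
        model = SGDClassifier(loss='log_loss', alpha=self.alpha)
        model.fit(features, y_train)
        self.models['default'] = model
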
    def __init__(self,
                 fold_hash,
                 target=None,
                 nthread=1,
                 tfidf=False,
                 ngram_range=3,
                 max_vocab_f=75000,
                 tokens_col="tokens_baseline",
                 task="train",
                 application="binary",
                 num_iterations=100,
                 num_leaves=31,
                 device="cpu",
                 min_data_in_leaf=20,
                 feature_fraction=1,
                 bagging_fraction=1,
                 min_gain_to_split=0.1,
                 num_threads=0,
                 max_depth=100,
                 token_pattern='alpha',
                 env=None,
                 load_fresh=False):

        # identify which fold model was trained on
        self.fold_hash = fold_hash

        # prediction target and thread count supplied by the caller
        self.target = target
        self.nthread = nthread

        # model contents
        self.models = defaultdict(dict)
        self.vectorizer = None

        # parameters for tf-idf vectorizer
        self.tfidf = tfidf
        self.ngram_range = (1, ngram_range)
        self.max_vocab_f = max_vocab_f
        self.tokens_col = tokens_col
        self.token_pattern = token_pattern
        self.vectorizer_hash = create_hash_id(
            f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.tokens_col}'
        )

        # parameter for LightGBM
        self.task = task
        self.application = application
        self.num_iterations = num_iterations
        self.num_leaves = num_leaves
        self.device = device
        self.min_data_in_leaf = min_data_in_leaf
        self.feature_fraction = feature_fraction
        self.bagging_fraction = bagging_fraction
        self.min_gain_to_split = min_gain_to_split
        self.num_threads = num_threads
        self.max_depth = max_depth

        self.env = env
        self.load_fresh = load_fresh
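
    # Hypothetical sketch (not in the source): packing the stored attributes
    # into a LightGBM parameter dict and training with lgb.train(). The
    # method names are assumptions; 'application' is LightGBM's alias for
    # 'objective'.
    def _lgb_params(self):
        return {'task': self.task,
                'application': self.application,
                'num_iterations': self.num_iterations,
                'num_leaves': self.num_leaves,
                'device': self.device,
                'min_data_in_leaf': self.min_data_in_leaf,
                'feature_fraction': self.feature_fraction,
                'bagging_fraction': self.bagging_fraction,
                'min_gain_to_split': self.min_gain_to_split,
                'num_threads': self.num_threads,
                'max_depth': self.max_depth}

    def train(self, features, labels):
        import lightgbm as lgb
        data = lgb.Dataset(features, label=labels)
        self.models['default'] = lgb.train(self._lgb_params(), data)
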
    def __init__(self,
                 fold_hash,
                 seed,
                 ngram_range=3,
                 max_vocab_f=75000,
                 max_vocab_f2=100,
                 min_df=3,
                 n_estimators=10,
                 max_depth=None,
                 max_features='auto',
                 n_jobs=3,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 tokens_col="tokens_no_stopwords",
                 tokens_col2=None,
                 token_pattern="default",
                 env=None,
                 load_fresh=False):

        # identify which fold model was trained on
        self.fold_hash = fold_hash

        # model contents
        self.models = {}
        self.vectorizer = None
        self.vectorizer2 = None

        # parameters for tf-idf vectorizer
        self.ngram_range = (1, ngram_range)
        self.max_vocab_f = max_vocab_f
        self.max_vocab_f2 = max_vocab_f2
        self.min_df = min_df
        self.tokens_col = tokens_col
        self.tokens_col2 = tokens_col2
        if token_pattern == 'alpha':
            self.token_pattern = r'(?u)\b[A-Za-z]+\b'
        else:
            self.token_pattern = r'(?u)\b\w\w+\b'
        self.vectorizer_hash = create_hash_id(
            f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.min_df}{self.tokens_col}'
        )

        # parameter for model
        self.seed = seed
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.n_jobs = n_jobs

        self.env = env
        self.load_fresh = load_fresh
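
    # Hypothetical sketch (not in the source): building the random forest
    # from the stored parameters. Note that max_features='auto' was removed
    # in scikit-learn 1.3; 'sqrt' is the equivalent setting for classifiers.
    def _build_forest(self):
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(n_estimators=self.n_estimators,
                                      max_depth=self.max_depth,
                                      max_features=self.max_features,
                                      min_samples_split=self.min_samples_split,
                                      min_samples_leaf=self.min_samples_leaf,
                                      n_jobs=self.n_jobs,
                                      random_state=self.seed)
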
    def __init__(self,
                 fold_hash,
                 target=None,
                 nthread=1,
                 tfidf=False,
                 ngram_range=(1, 3),
                 max_vocab_f=75000,
                 tokens_col="tokens_baseline",
                 learning_rate=0.1,
                 max_depth=5,
                 subsample=1,
                 colsample_bytree=1,
                 n_estimators=10,
                 objective="binary:logistic",
                 gamma=None,
                 l1=0,
                 l2=1,
                 env=None,
                 load_fresh=False):

        # identify which fold model was trained on
        self.fold_hash = fold_hash

        # prediction target and thread count supplied by the caller
        self.target = target
        self.nthread = nthread

        # model contents
        self.models = defaultdict(dict)
        self.vectorizer = None

        # parameters for tf-idf vectorizer
        self.tfidf = tfidf
        self.ngram_range = ngram_range
        self.max_vocab_f = max_vocab_f
        self.tokens_col = tokens_col
        self.vectorizer_hash = create_hash_id(
            f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.tokens_col}'
        )

        # parameter for XGBoost
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.n_estimators = n_estimators
        self.objective = objective
        self.gamma = gamma
        self.l1 = l1
        self.l2 = l2

        self.env = env
        self.load_fresh = load_fresh
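
    # Hypothetical sketch (not in the source): mapping the stored attributes
    # onto XGBoost's scikit-learn wrapper; l1/l2 correspond to reg_alpha and
    # reg_lambda in the XGBoost API. The method name is an assumption.
    def _build_xgb(self):
        from xgboost import XGBClassifier
        return XGBClassifier(learning_rate=self.learning_rate,
                             max_depth=self.max_depth,
                             subsample=self.subsample,
                             colsample_bytree=self.colsample_bytree,
                             n_estimators=self.n_estimators,
                             objective=self.objective,
                             gamma=self.gamma,
                             reg_alpha=self.l1,
                             reg_lambda=self.l2)
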
    def __init__(self,
                 fold_hash,
                 ngram_range=3,
                 max_vocab_f=75000,
                 max_vocab_f2=100,
                 alpha=0.0001,
                 l1_ratio=0.15,
                 min_df=3,
                 tokens_col=None,
                 tokens_col2=None,
                 token_pattern='default',
                 citations_cols=None,
                 env=None,
                 load_fresh=False):

        # identify which fold model was trained on
        self.fold_hash = fold_hash

        # model contents
        self.models = defaultdict(dict)
        self.vectorizer = None
        self.vectorizer2 = None

        # parameters for tf-idf vectorizer
        self.ngram_range = (1, ngram_range)
        self.max_vocab_f = max_vocab_f
        self.max_vocab_f2 = max_vocab_f2
        self.min_df = min_df
        self.tokens_col = tokens_col
        self.tokens_col2 = tokens_col2
        if token_pattern == 'alpha':
            self.token_pattern = r'(?u)\b[A-Za-z]+\b'
        else:
            self.token_pattern = r'(?u)\b\w\w+\b'
        self.vectorizer_hash = create_hash_id(
            f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.min_df}{self.tokens_col}'
        )

        # parameter for logistic regression with elastic net
        self.alpha = alpha
        self.l1_ratio = l1_ratio

        # parameters for citations
        self.citations_cols = citations_cols

        self.env = env
        self.load_fresh = load_fresh
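
    # Hypothetical sketch (not in the source): combining the two tf-idf
    # blocks with the numeric citation columns, then fitting an elastic-net
    # logistic regression (SGDClassifier with penalty='elasticnet'). The
    # helper names are assumptions.
    def _combine_features(self, X):
        from scipy.sparse import hstack
        blocks = [self.vectorizer.transform(X[self.tokens_col])]
        if self.tokens_col2 is not None:
            blocks.append(self.vectorizer2.transform(X[self.tokens_col2]))
        if self.citations_cols is not None:
            blocks.append(X[self.citations_cols].values)
        return hstack(blocks).tocsr()

    def _build_model(self):
        from sklearn.linear_model import SGDClassifier
        return SGDClassifier(loss='log_loss', penalty='elasticnet',
                             alpha=self.alpha, l1_ratio=self.l1_ratio)
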
    def __init__(self,
                 fold_hash,
                 seed,
                 env,
                 ngram_range=(1, 1),
                 max_vocab_f=75000,
                 min_df=3,
                 n_estimators=10,
                 max_depth=None,
                 max_features='auto',
                 n_jobs=8,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 tokens_col="tokens_no_stopwords",
                 load_fresh=False):

        # identify which fold model was trained on
        self.fold_hash = fold_hash

        # model contents
        self.models = {}
        self.vectorizer = None

        # parameters for tf-idf vectorizer
        self.ngram_range = ngram_range
        self.max_vocab_f = max_vocab_f
        self.min_df = min_df
        self.tokens_col = tokens_col

        self.vectorizer_hash = create_hash_id(
            f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.min_df}{self.tokens_col}'
        )

        # parameter for model
        self.seed = seed
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.n_jobs = n_jobs

        self.env = env
        self.load_fresh = load_fresh
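
    # Hypothetical sketch (not in the source): because this variant requires
    # env, the fitted vectorizer can be cached under vectorizer_hash with the
    # same check_persisted/load/save helpers that run_pipeline uses below.
    # The 'store_features' key is an assumption borrowed from run_pipeline.
    def _get_vectorizer(self, X_train):
        from sklearn.feature_extraction.text import TfidfVectorizer
        path = self.env['store_features']
        if check_persisted(path, self.vectorizer_hash, self.load_fresh):
            return load(path, self.vectorizer_hash)
        vectorizer = TfidfVectorizer(ngram_range=self.ngram_range,
                                     max_features=self.max_vocab_f,
                                     min_df=self.min_df)
        vectorizer.fit(X_train[self.tokens_col])
        save(vectorizer, path, self.vectorizer_hash, True)
        return vectorizer
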
    def __init__(self,
                 fold_hash,
                 base_estimator=DecisionTreeClassifier(max_depth=1),
                 n_estimators=50,
                 learning_rate=1.0,
                 algorithm='SAMME.R',
                 seed=None,
                 tfidf=False,
                 ngram_range=3,
                 max_vocab_f=75000,
                 min_df=3,
                 tokens_col="tokens_baseline",
                 token_pattern="default",
                 env=None,
                 load_fresh=False):

        # Persist parameters
        self.fold_hash = fold_hash

        # AdaBoost parameters
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.algorithm = algorithm
        self.seed = seed

        # Features parameters
        self.tfidf = tfidf
        self.ngram_range = (1, ngram_range)
        self.max_vocab_f = max_vocab_f
        self.min_df = min_df
        self.tokens_col = tokens_col
        self.token_pattern = token_pattern
        self.vectorizer_hash = create_hash_id(
            f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.min_df}{self.tokens_col}'
        )

        # Models dictionary
        self.models = {}

        self.env = env
        self.load_fresh = load_fresh
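
    # Hypothetical sketch (not in the source): building the AdaBoost ensemble
    # from the stored parameters. base_estimator and algorithm='SAMME.R' are
    # deprecated in newer scikit-learn (use estimator and 'SAMME' instead).
    def _build_adaboost(self):
        from sklearn.ensemble import AdaBoostClassifier
        return AdaBoostClassifier(base_estimator=self.base_estimator,
                                  n_estimators=self.n_estimators,
                                  learning_rate=self.learning_rate,
                                  algorithm=self.algorithm,
                                  random_state=self.seed)
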
    def __init__(self,
                 fold_hash,
                 seed,
                 ngram_range=(1, 2),
                 max_vocab_f=10000,
                 alpha=0.0001,
                 l1_ratio=0.15,
                 min_df=3,
                 tokens_col="tokens_baseline",
                 n_components=1000,
                 env=None,
                 load_fresh=False):

        # identify which fold model was trained on
        self.fold_hash = fold_hash

        # model contents
        self.models = defaultdict(dict)
        self.pca = None

        # parameters for tf-idf vectorizer
        self.ngram_range = ngram_range
        self.max_vocab_f = max_vocab_f
        self.min_df = min_df
        self.tokens_col = tokens_col
        self.vectorizer = None
        self.vectorizer_hash = create_hash_id(
            f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.min_df}{self.tokens_col}'
        )

        # parameter for logistic regression with elastic net
        self.alpha = alpha
        self.l1_ratio = l1_ratio

        # parameter for pca
        self.n_components = n_components
        self.seed = seed

        self.env = env
        self.load_fresh = load_fresh
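
    # Hypothetical sketch (not in the source): reducing the tf-idf matrix to
    # n_components before the elastic-net model. TruncatedSVD is used here
    # because it accepts sparse input; the original class may instead apply
    # PCA to a densified matrix.
    def _reduce(self, features):
        from sklearn.decomposition import TruncatedSVD
        self.pca = TruncatedSVD(n_components=self.n_components,
                                random_state=self.seed)
        return self.pca.fit_transform(features)
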
def run_pipeline(ignition_file, persist_all, load_all_fresh):
    """
    An adhoc pipeline created to mirror the standard ML pipeline and work
    with citations data.

    Parameters:
    ===========
    ignition_file: string
        name of the yaml file for which you want to run an experiment

    persist_all: boolean
        T if you want to persist all data for future use

    load_all_fresh: boolean
        T if you want to avoid any persisted data and load new data from scrath

    Returns:
    ========
    None
    """

    model_parts = {}

    ##### 1. LOAD ENVIRONMENT DATA #####

    # load local paths
    local_paths_env = load_local_paths('local_paths.yaml')
    print('Local paths loaded.')

    # load ignition file
    ignition = load_config(local_paths_env['ignition_path'] + ignition_file)
    print('Ignition loaded.')

    # id used for persisting
    hash_id = create_hash_id(str(ignition['id']))
    print('Hash id created.')
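
    # create_hash_id is defined elsewhere in the project; an assumed
    # equivalent is a short, stable digest of the input string, e.g.
    #   hashlib.md5(str(ignition['id']).encode('utf-8')).hexdigest()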

    # create hyperparameter combinations (for k-folding)
    hyperparameters = expand_grid(ignition['hyperparameters'])
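
    # expand_grid enumerates every combination of the hyperparameter grid; an
    # assumed equivalent with itertools.product:
    #   keys = sorted(grid)
    #   combos = [dict(zip(keys, v)) for v in product(*(grid[k] for k in keys))]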

    # load environment file
    psql_env = load_psql_env(pgpass_path=local_paths_env['pgpass_path'])
    print('PSQL environment file loaded.')

    # Initiate PSQL Connection
    connection = SQLConn(psql_env)
    connection.open()

    ##### 2. LOAD TRAIN AND TEST DATA #####

    if check_persisted(local_paths_env['store_train_data'], f'{hash_id}_x',
                       load_all_fresh):

        print("Found data")

        # data loaded before: load from file
        X_train = load(local_paths_env['store_train_data'], f'{hash_id}_x')
        X_test = load(local_paths_env['store_test_data'], f'{hash_id}_x')
        y_train = load(local_paths_env['store_train_data'], f'{hash_id}_y')
        y_test = load(local_paths_env['store_test_data'], f'{hash_id}_y')

        print('Loaded data from file.')

    else:

        print("Data not found in storage - load from database")

        # data not loaded: pull from database and create features
        X_train, X_test, y_train, y_test = sample(
            ignition, connection, local_paths_env['store_features'])
        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")

        # add fold index column to data
        X_train, y_train = k_fold(X_train, y_train, ignition['k_folds'],
                                  ignition['k_folds_seed'])
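
        # k_fold is assumed to append an integer fold-index column 'k' to
        # both frames; that column is filtered on during k-folding below and
        # dropped before evaluation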

        # save data to file for future use
        save(X_train, local_paths_env['store_train_data'], f'{hash_id}_x',
             persist_all)
        save(X_test, local_paths_env['store_test_data'], f'{hash_id}_x',
             persist_all)
        save(y_train, local_paths_env['store_train_data'], f'{hash_id}_y',
             persist_all)
        save(y_test, local_paths_env['store_test_data'], f'{hash_id}_y',
             persist_all)

    print('Data loading completed.')

    ##### 3. K-FOLDING #####

    # loop over folds
    for fold in tqdm(range(ignition['k_folds']), desc='Folds'):

        # get fold id hash (for persisting)
        fold_id = create_hash_id(str(ignition['id']) + str(fold))

        # get fold data
        fold_X_train = X_train[X_train['k'] != fold]
        fold_X_test = X_train[X_train['k'] == fold]
        fold_y_train = y_train[y_train['k'] != fold]
        fold_y_test = y_train[y_train['k'] == fold]

        # store fold features, if any
        fold_features = {}

        ##### 4. LOOP OVER HYPERPARAMETERS: TRAIN CLASSIFIER #####

        for hyperparam in tqdm(hyperparameters, desc='Hyperparameters'):

            # create hyperparam unique id and hyperparam-fold unique id
            hyperparam_id = create_hash_id(
                str(ignition['id']) + str(hyperparam))
            hyperparam_fold_id = create_hash_id(
                str(ignition['id']) + str(hyperparam) + str(fold))

            # if not check_val_in_db(connection, ignition['results_table_name'],
            # 'results', 'hash_id', hyperparam_fold_id, len(ignition['recalls'])):

            # create classifier of specified type and with specified target
            classifier = select_classifier(ignition["model_type"],
                                           fold_id,
                                           ignition["target"],
                                           ignition["classes"],
                                           fold_features,
                                           hyperparameters=hyperparam,
                                           seed=ignition['seed'],
                                           env=local_paths_env,
                                           load_fresh=load_all_fresh)

            # train classifier
            classifier.train(fold_X_train, fold_y_train)

            ##### 5. TEST CLASSIFIER #####

            # generate predictions from classifier
            y_probs = classifier.predict(fold_X_test)

            ##### 6. EVALUATION #####

            for recall in tqdm(ignition['recalls'], desc='Evaluations'):

                # compute evaluation metrics
                all_metrics = compute_metrics(
                    metric_names=ignition['metrics'],
                    y_true=fold_y_test.drop(columns=['k']),
                    y_pred=y_probs,
                    k=recall)

                # store results in database
                unique_id = create_hash_id(
                    str(ignition['id']) + str(hyperparam) + str(fold) +
                    str(recall))

                results_to_db(metrics=all_metrics,
                              table_name=ignition['results_table_name'],
                              ignition_id=ignition['id'],
                              hash_id=hyperparam_fold_id,
                              algorithm=ignition['model_type'],
                              hyperparameters=hyperparam,
                              fold=str(fold),
                              recall=recall,
                              unique_id=unique_id,
                              connection=connection)

    connection.close()
    print(f"Done running pipeline for ignition id: {ignition['id']}!")