def __init__(self, fold_hash, ngram_range=(1, 3), max_vocab_f=75000,
             alpha=0.0001, min_df=3, tokens_col="tokens_baseline",
             env=None, load_fresh=False):

    # identify which fold model was trained on
    self.fold_hash = fold_hash

    # model contents
    self.models = defaultdict(dict)
    self.vectorizer = None

    # parameters for tf-idf vectorizer
    self.ngram_range = ngram_range
    self.max_vocab_f = max_vocab_f
    self.min_df = min_df
    self.tokens_col = tokens_col
    self.vectorizer_hash = create_hash_id(
        f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.min_df}{self.tokens_col}')

    # parameter for logistic regression
    self.alpha = alpha

    self.env = env
    self.load_fresh = load_fresh
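# Illustrative sketch (not part of the original class): how the parameters
# stored above would typically wire a TF-IDF vectorizer to an SGD-trained
# logistic regression. The helper name and the fit flow are assumptions
# for illustration only.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

def _example_sgd_logit(docs, labels, ngram_range=(1, 3), max_vocab_f=75000,
                       min_df=3, alpha=0.0001):
    # build the sparse TF-IDF matrix from the chosen token column
    vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                                 max_features=max_vocab_f, min_df=min_df)
    X = vectorizer.fit_transform(docs)
    # loss='log_loss' gives logistic regression fitted by SGD
    # (older scikit-learn releases spell it loss='log')
    model = SGDClassifier(loss='log_loss', alpha=alpha)
    model.fit(X, labels)
    return vectorizer, model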
def __init__(self, fold_hash, target=None, nthread=1, tfidf=False,
             ngram_range=3, max_vocab_f=75000, tokens_col="tokens_baseline",
             task="train", application="binary", num_iterations=100,
             num_leaves=31, device="cpu", min_data_in_leaf=20,
             feature_fraction=1, bagging_fraction=1, min_gain_to_split=0.1,
             num_threads=0, max_depth=100, token_pattern='alpha',
             env=None, load_fresh=False):

    # identify which fold model was trained on
    self.fold_hash = fold_hash

    # model contents
    self.models = defaultdict(dict)
    self.vectorizer = None

    # parameters for tf-idf vectorizer
    self.tfidf = tfidf
    self.ngram_range = (1, ngram_range)
    self.max_vocab_f = max_vocab_f
    self.tokens_col = tokens_col
    self.token_pattern = token_pattern
    self.vectorizer_hash = create_hash_id(
        f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.tokens_col}')

    # parameters for LightGBM
    self.task = task
    self.application = application
    self.num_iterations = num_iterations
    self.num_leaves = num_leaves
    self.device = device
    self.min_data_in_leaf = min_data_in_leaf
    self.feature_fraction = feature_fraction
    self.bagging_fraction = bagging_fraction
    self.min_gain_to_split = min_gain_to_split
    self.num_threads = num_threads
    self.max_depth = max_depth

    self.env = env
    self.load_fresh = load_fresh
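# Illustrative sketch (not the repo's training code): the attribute names
# above match LightGBM's native parameter names, so training presumably
# assembles a params dict along these lines. `application` is LightGBM's
# alias for `objective`; `task` is only meaningful to the CLI version and
# is omitted here. `clf` stands in for an instance of the class above.
import lightgbm as lgb

def _example_lgbm_train(X, y, clf):
    params = {
        'objective': clf.application,            # e.g. 'binary'
        'num_leaves': clf.num_leaves,
        'min_data_in_leaf': clf.min_data_in_leaf,
        'feature_fraction': clf.feature_fraction,
        'bagging_fraction': clf.bagging_fraction,
        'min_gain_to_split': clf.min_gain_to_split,
        'num_threads': clf.num_threads,
        'max_depth': clf.max_depth,
        'device': clf.device,                    # 'cpu' or 'gpu'
    }
    return lgb.train(params, lgb.Dataset(X, label=y),
                     num_boost_round=clf.num_iterations)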
def __init__(self, fold_hash, seed, ngram_range=3, max_vocab_f=75000,
             max_vocab_f2=100, min_df=3, n_estimators=10, max_depth=None,
             max_features='auto', n_jobs=3, min_samples_split=2,
             min_samples_leaf=1, tokens_col="tokens_no_stopwords",
             tokens_col2=None, token_pattern="default", env=None,
             load_fresh=False):

    # identify which fold model was trained on
    self.fold_hash = fold_hash

    # model contents
    self.models = {}
    self.vectorizer = None
    self.vectorizer2 = None

    # parameters for tf-idf vectorizer
    self.ngram_range = (1, ngram_range)
    self.max_vocab_f = max_vocab_f
    self.max_vocab_f2 = max_vocab_f2
    self.min_df = min_df
    self.tokens_col = tokens_col
    self.tokens_col2 = tokens_col2
    if token_pattern == 'alpha':
        self.token_pattern = r'(?u)\b[A-Za-z]+\b'
    else:
        self.token_pattern = r'(?u)\b\w\w+\b'
    self.vectorizer_hash = create_hash_id(
        f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.min_df}{self.tokens_col}')

    # parameters for the model
    self.seed = seed
    self.n_estimators = n_estimators
    self.max_depth = max_depth
    self.max_features = max_features
    self.min_samples_leaf = min_samples_leaf
    self.min_samples_split = min_samples_split
    self.n_jobs = n_jobs

    self.env = env
    self.load_fresh = load_fresh
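# Quick illustration of the two token_pattern choices above: 'alpha' keeps
# purely alphabetic tokens of any length, while the default (scikit-learn's
# own default pattern) keeps word characters but drops single-character
# tokens. Runnable as-is.
import re

assert re.findall(r'(?u)\b[A-Za-z]+\b', "p53 binds DNA in 2 steps") == \
    ['binds', 'DNA', 'in', 'steps']
assert re.findall(r'(?u)\b\w\w+\b', "p53 binds DNA in 2 steps") == \
    ['p53', 'binds', 'DNA', 'in', 'steps']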
def __init__(self, fold_hash, target=None, nthread=1, tfidf=False,
             ngram_range=(1, 3), max_vocab_f=75000,
             tokens_col="tokens_baseline", learning_rate=0.1, max_depth=5,
             subsample=1, colsample_bytree=1, n_estimators=10,
             objective="binary:logistic", gamma=None, l1=0, l2=1,
             env=None, load_fresh=False):

    # identify which fold model was trained on
    self.fold_hash = fold_hash

    # model contents
    self.models = defaultdict(dict)
    self.vectorizer = None

    # parameters for tf-idf vectorizer
    self.tfidf = tfidf
    self.ngram_range = ngram_range
    self.max_vocab_f = max_vocab_f
    self.tokens_col = tokens_col
    self.vectorizer_hash = create_hash_id(
        f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.tokens_col}')

    # parameters for XGBoost
    self.learning_rate = learning_rate
    self.max_depth = max_depth
    self.subsample = subsample
    self.colsample_bytree = colsample_bytree
    self.n_estimators = n_estimators
    self.objective = objective
    self.gamma = gamma
    self.l1 = l1
    self.l2 = l2

    self.env = env
    self.load_fresh = load_fresh
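# Illustrative sketch (assumption, not the repo's code): how the stored
# attributes would map onto xgboost's scikit-learn wrapper. Note that the
# l1/l2 attributes correspond to xgboost's reg_alpha/reg_lambda. `clf`
# stands in for an instance of the class above.
from xgboost import XGBClassifier

def _example_xgb(clf):
    return XGBClassifier(learning_rate=clf.learning_rate,
                         max_depth=clf.max_depth,
                         subsample=clf.subsample,
                         colsample_bytree=clf.colsample_bytree,
                         n_estimators=clf.n_estimators,
                         objective=clf.objective,   # 'binary:logistic'
                         gamma=clf.gamma,
                         reg_alpha=clf.l1,
                         reg_lambda=clf.l2)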
def __init__(self, fold_hash, ngram_range=3, max_vocab_f=75000,
             max_vocab_f2=100, alpha=0.0001, l1_ratio=0.15, min_df=3,
             tokens_col=None, tokens_col2=None, token_pattern='default',
             citations_cols=None, env=None, load_fresh=False):

    # identify which fold model was trained on
    self.fold_hash = fold_hash

    # model contents
    self.models = defaultdict(dict)
    self.vectorizer = None
    self.vectorizer2 = None

    # parameters for tf-idf vectorizer
    self.ngram_range = (1, ngram_range)
    self.max_vocab_f = max_vocab_f
    self.max_vocab_f2 = max_vocab_f2
    self.min_df = min_df
    self.tokens_col = tokens_col
    self.tokens_col2 = tokens_col2
    if token_pattern == 'alpha':
        self.token_pattern = r'(?u)\b[A-Za-z]+\b'
    else:
        self.token_pattern = r'(?u)\b\w\w+\b'
    self.vectorizer_hash = create_hash_id(
        f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.min_df}{self.tokens_col}')

    # parameters for logistic regression with elastic net
    self.alpha = alpha
    self.l1_ratio = l1_ratio

    # parameters for citations
    self.citations_cols = citations_cols

    self.env = env
    self.load_fresh = load_fresh
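# Illustrative sketch (assumption): with two token columns plus numeric
# citation columns, the feature matrix is presumably built by horizontally
# stacking both TF-IDF outputs with the citation features before fitting
# the elastic-net model. Function name and flow are hypothetical.
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

def _example_combined_features(df, labels, tokens_col, tokens_col2,
                               citations_cols, alpha=0.0001, l1_ratio=0.15):
    v1 = TfidfVectorizer(max_features=75000)
    v2 = TfidfVectorizer(max_features=100)
    # sparse TF-IDF blocks and dense citation columns, side by side
    X = hstack([v1.fit_transform(df[tokens_col]),
                v2.fit_transform(df[tokens_col2]),
                np.asarray(df[citations_cols], dtype=float)])
    # penalty='elasticnet' is what makes alpha and l1_ratio meaningful
    model = SGDClassifier(loss='log_loss', penalty='elasticnet',
                          alpha=alpha, l1_ratio=l1_ratio)
    model.fit(X, labels)
    return model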
def __init__(self, fold_hash, seed, env, ngram_range=(1, 1),
             max_vocab_f=75000, min_df=3, n_estimators=10, max_depth=None,
             max_features='auto', n_jobs=8, min_samples_split=2,
             min_samples_leaf=1, tokens_col="tokens_no_stopwords",
             load_fresh=False):

    # identify which fold model was trained on
    self.fold_hash = fold_hash

    # model contents
    self.models = {}
    self.vectorizer = None

    # parameters for tf-idf vectorizer
    self.ngram_range = ngram_range
    self.max_vocab_f = max_vocab_f
    self.min_df = min_df
    self.tokens_col = tokens_col
    self.vectorizer_hash = create_hash_id(
        f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.min_df}{self.tokens_col}')

    # parameters for the model
    self.seed = seed
    self.n_estimators = n_estimators
    self.max_depth = max_depth
    self.max_features = max_features
    self.min_samples_leaf = min_samples_leaf
    self.min_samples_split = min_samples_split
    self.n_jobs = n_jobs

    self.env = env
    self.load_fresh = load_fresh
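# Illustrative sketch (assumption): the stored attributes correspond
# directly to scikit-learn's RandomForestClassifier arguments, with `seed`
# presumably feeding random_state. Note max_features='auto' was removed in
# newer scikit-learn; 'sqrt' is the equivalent for classifiers.
from sklearn.ensemble import RandomForestClassifier

def _example_rf(clf):
    return RandomForestClassifier(n_estimators=clf.n_estimators,
                                  max_depth=clf.max_depth,
                                  max_features=clf.max_features,
                                  min_samples_split=clf.min_samples_split,
                                  min_samples_leaf=clf.min_samples_leaf,
                                  n_jobs=clf.n_jobs,
                                  random_state=clf.seed)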
def __init__(self, fold_hash, base_estimator=DecisionTreeClassifier(max_depth=1),
             n_estimators=50, learning_rate=1.0, algorithm='SAMME.R',
             seed=None, tfidf=False, ngram_range=3, max_vocab_f=75000,
             min_df=3, tokens_col="tokens_baseline", token_pattern="default",
             env=None, load_fresh=False):

    # Persist parameters
    self.fold_hash = fold_hash

    # AdaBoost parameters
    self.base_estimator = base_estimator
    self.n_estimators = n_estimators
    self.learning_rate = learning_rate
    self.algorithm = algorithm
    self.seed = seed

    # Feature parameters
    self.tfidf = tfidf
    self.ngram_range = (1, ngram_range)
    self.max_vocab_f = max_vocab_f
    self.min_df = min_df
    self.tokens_col = tokens_col
    self.token_pattern = token_pattern
    self.vectorizer_hash = create_hash_id(
        f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.tokens_col}')

    # Models dictionary
    self.models = {}

    self.env = env
    self.load_fresh = load_fresh
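# Illustrative sketch (assumption): the AdaBoost attributes above mirror
# scikit-learn's AdaBoostClassifier signature. Newer scikit-learn renames
# base_estimator to `estimator` and deprecates algorithm='SAMME.R'. `clf`
# stands in for an instance of the class above.
from sklearn.ensemble import AdaBoostClassifier

def _example_adaboost(clf):
    return AdaBoostClassifier(base_estimator=clf.base_estimator,
                              n_estimators=clf.n_estimators,
                              learning_rate=clf.learning_rate,
                              algorithm=clf.algorithm,
                              random_state=clf.seed)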
def __init__(self, fold_hash, seed, ngram_range=(1, 2), max_vocab_f=10000,
             alpha=0.0001, l1_ratio=0.15, min_df=3,
             tokens_col="tokens_baseline", n_components=1000,
             env=None, load_fresh=False):

    # identify which fold model was trained on
    self.fold_hash = fold_hash

    # model contents
    self.models = defaultdict(dict)
    self.pca = None

    # parameters for tf-idf vectorizer
    self.ngram_range = ngram_range
    self.max_vocab_f = max_vocab_f
    self.min_df = min_df
    self.tokens_col = tokens_col
    self.vectorizer = None
    self.vectorizer_hash = create_hash_id(
        f'{self.fold_hash}{self.ngram_range}{self.max_vocab_f}{self.min_df}{self.tokens_col}')

    # parameters for logistic regression with elastic net
    self.alpha = alpha
    self.l1_ratio = l1_ratio

    # parameter for pca
    self.n_components = n_components

    self.seed = seed
    self.env = env
    self.load_fresh = load_fresh
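# Illustrative sketch (assumption): chaining TF-IDF -> PCA -> elastic-net
# SGD with the defaults stored above. PCA needs a dense matrix, hence the
# toarray() call; TruncatedSVD is the usual alternative that accepts sparse
# input directly. Function name and flow are hypothetical.
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

def _example_pca_sgd(docs, labels, n_components=1000, seed=0,
                     alpha=0.0001, l1_ratio=0.15):
    X = TfidfVectorizer(ngram_range=(1, 2), max_features=10000,
                        min_df=3).fit_transform(docs)
    X_reduced = PCA(n_components=n_components,
                    random_state=seed).fit_transform(X.toarray())
    model = SGDClassifier(loss='log_loss', penalty='elasticnet',
                          alpha=alpha, l1_ratio=l1_ratio, random_state=seed)
    model.fit(X_reduced, labels)
    return model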
def run_pipeline(ignition_file, persist_all, load_all_fresh):
    """
    An ad hoc pipeline created to mirror the standard ML pipeline and
    work with citations data.

    Parameters:
    ===========
    ignition_file: string
        name of the yaml file for which you want to run an experiment

    persist_all: boolean
        True if you want to persist all data for future use

    load_all_fresh: boolean
        True if you want to avoid any persisted data and load new data
        from scratch

    Returns:
    ========
    None
    """

    model_parts = {}

    ##### 1. LOAD ENVIRONMENT DATA #####

    # load local paths
    local_paths_env = load_local_paths('local_paths.yaml')
    print('Local paths loaded.')

    # load ignition file
    ignition = load_config(local_paths_env['ignition_path'] + ignition_file)
    print('Ignition loaded.')

    # id used for persisting
    hash_id = create_hash_id(str(ignition['id']))
    print('Hash id created.')

    # create hyperparameter combinations (for k-folding)
    hyperparameters = expand_grid(ignition['hyperparameters'])

    # load environment file
    psql_env = load_psql_env(pgpass_path=local_paths_env['pgpass_path'])
    print('PSQL environment file loaded.')

    # initiate PSQL connection
    connection = SQLConn(psql_env)
    connection.open()

    ##### 2. LOAD TRAIN AND TEST DATA #####

    if check_persisted(local_paths_env['store_train_data'], f'{hash_id}_x', load_all_fresh):
        print("Found data")
        # data loaded before: load from file
        X_train = load(local_paths_env['store_train_data'], f'{hash_id}_x')
        X_test = load(local_paths_env['store_test_data'], f'{hash_id}_x')
        y_train = load(local_paths_env['store_train_data'], f'{hash_id}_y')
        y_test = load(local_paths_env['store_test_data'], f'{hash_id}_y')
        print('Loaded data from file.')
    else:
        print("Data not found in storage - load from database")
        # data not loaded: pull from database and create features
        X_train, X_test, y_train, y_test = sample(
            ignition, connection, local_paths_env['store_features'])
        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")

        # add fold index column to data
        X_train, y_train = k_fold(X_train, y_train, ignition['k_folds'],
                                  ignition['k_folds_seed'])

        # save data to file for future use
        save(X_train, local_paths_env['store_train_data'], f'{hash_id}_x', persist_all)
        save(X_test, local_paths_env['store_test_data'], f'{hash_id}_x', persist_all)
        save(y_train, local_paths_env['store_train_data'], f'{hash_id}_y', persist_all)
        save(y_test, local_paths_env['store_test_data'], f'{hash_id}_y', persist_all)

    print('Data loading completed.')

    ##### 3. K-FOLDING #####

    # loop over folds
    for fold in tqdm(range(ignition['k_folds']), desc='Folds'):

        # get fold id hash (for persisting)
        fold_id = create_hash_id(str(ignition['id']) + str(fold))

        # get fold data
        fold_X_train = X_train[X_train['k'] != fold]
        fold_X_test = X_train[X_train['k'] == fold]
        fold_y_train = y_train[y_train['k'] != fold]
        fold_y_test = y_train[y_train['k'] == fold]

        # store fold features, if any
        fold_features = {}

        ##### 4. LOOP OVER HYPERPARAMETERS: TRAIN CLASSIFIER #####

        for hyperparam in tqdm(hyperparameters, desc='Hyperparameters'):

            # create hyperparam unique id and hyperparam-fold unique id
            hyperparam_id = create_hash_id(
                str(ignition['id']) + str(hyperparam))
            hyperparam_fold_id = create_hash_id(
                str(ignition['id']) + str(hyperparam) + str(fold))

            # if not check_val_in_db(connection, ignition['results_table_name'],
            #                        'results', 'hash_id', hyperparam_fold_id,
            #                        len(ignition['recalls'])):

            # create classifier of specified type and with specified target
            classifier = select_classifier(ignition["model_type"], fold_id,
                                           ignition["target"],
                                           ignition["classes"],
                                           fold_features,
                                           hyperparameters=hyperparam,
                                           seed=ignition['seed'],
                                           env=local_paths_env,
                                           load_fresh=load_all_fresh)
            # print('Classifier created.')

            # train classifier
            classifier.train(fold_X_train, fold_y_train)

            ##### 5. TEST CLASSIFIER #####

            # generate predictions from classifier
            y_probs = classifier.predict(fold_X_test)

            ##### 6. EVALUATION #####

            for recall in tqdm(ignition['recalls'], desc='Evaluations'):

                # compute evaluation metrics
                all_metrics = compute_metrics(
                    metric_names=ignition['metrics'],
                    y_true=fold_y_test.drop(columns=['k']),
                    y_pred=y_probs, k=recall)

                # store results in database
                unique_id = create_hash_id(str(ignition['id']) + str(hyperparam)
                                           + str(fold) + str(recall))
                results_to_db(metrics=all_metrics,
                              table_name=ignition['results_table_name'],
                              ignition_id=ignition['id'],
                              hash_id=hyperparam_fold_id,
                              algorithm=ignition['model_type'],
                              hyperparameters=hyperparam,
                              fold=str(fold), recall=recall,
                              unique_id=unique_id,
                              connection=connection)

    connection.close()
    print(f"Done running pipeline for ignition id: {ignition['id']}!")
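# Example invocation (the yaml file name is hypothetical): run one
# experiment described by an ignition file, persisting intermediate data
# and reusing anything already stored on disk.
if __name__ == '__main__':
    run_pipeline(ignition_file='experiment_01.yaml',
                 persist_all=True,
                 load_all_fresh=False)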