def run_tfidf(fold, args):
    """Wrapper that runs the TFIDF process after preprocessing.

    Parameters
    ----------
    fold : dict
        id_llamado lists used to filter features and labels
    args : dict
        Dictionary of parameters to be passed into the TFIDF algorithm

    Returns
    -------
    pd.DataFrame, pd.DataFrame
        Train and test dataframes of document features
    """
    fold_id = {'fold_id': generate_id(str(fold) + str(args['params']))}

    if check_if_local_exists(fold_id, 'tfidf-train', ['fold_id']):
        tfidf_features_train = get_local(fold_id, 'tfidf-train',
                                         id_keys=['fold_id'],
                                         as_type='.parquet.gz')
        tfidf_features_test = get_local(fold_id, 'tfidf-test',
                                        id_keys=['fold_id'],
                                        as_type='.parquet.gz')
    else:
        # Get the processed list of texts for both train and test
        train_id, train_text = tfidf_preprocess(fold['train'])
        test_id, test_text = tfidf_preprocess(fold['test'])

        stop_words = set(stopwords.words('spanish'))

        # Fit the TFIDF encoder on the training texts only
        tfidf_encode = vector_fit(train_text, args['params'], stop_words)

        # Encode train and test documents
        tfidf_features_train = vector_transform(train_text, train_id,
                                                tfidf_encode)
        tfidf_features_test = vector_transform(test_text, test_id,
                                               tfidf_encode)

        persist_local(tfidf_encode, args, 'tfidf', ['experiment_id'],
                      as_type='.p')
        persist_local(tfidf_features_train, fold_id, 'tfidf-train', ['fold_id'])
        persist_local(tfidf_features_test, fold_id, 'tfidf-test', ['fold_id'])

    return tfidf_features_train, tfidf_features_test
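# The helpers used above (tfidf_preprocess, vector_fit, vector_transform)
# are defined elsewhere in the pipeline. A minimal sketch of what the
# fit/transform pair could look like with scikit-learn's TfidfVectorizer
# (scikit-learn >= 1.0), assuming `texts` is a list of preprocessed strings
# and `ids` the matching id_llamado values; the _sketch names are
# illustrative, not the project's actual implementations:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


def vector_fit_sketch(texts, params, stop_words):
    # Fit vocabulary and IDF weights on training documents only,
    # so nothing leaks from the test fold.
    vectorizer = TfidfVectorizer(stop_words=list(stop_words), **params)
    vectorizer.fit(texts)
    return vectorizer


def vector_transform_sketch(texts, ids, vectorizer):
    # Encode documents and index the resulting matrix by id_llamado.
    matrix = vectorizer.transform(texts)
    return pd.DataFrame(matrix.toarray(), index=ids,
                        columns=vectorizer.get_feature_names_out())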
def create_labels(args):
    """Obtain a dataframe of labels for the cohort defined in the experiment file.

    Parameters
    ----------
    args : dict
        Minimum set of parameters, including the experiment_id

    Returns
    -------
    pd.DataFrame
        Dataframe of IDs and labels
    """
    experiment = get_experiment(args['experiment_id'])
    features = get_local(args, 'features')['id_llamado']

    query = """
    select distinct labels.id_llamado as id_llamado,
           tipo_procedimiento_codigo,
           labels.reception_date,
           {label_target} as target
    from semantic.labels labels
    join semantic.tenders tenders on labels.id_llamado = tenders.id_llamado
    where labels.id_llamado in ({cohort})
    """.format(cohort=experiment['cohort_config']['query'],
               label_target=experiment['label_config']['query'])

    con = utils.connect_to_database()
    labels = pd.read_sql_query(query, con)

    # Keep only labels for which features exist
    labels = labels[labels['id_llamado'].isin(features)]

    persist_local(labels, args, 'labels')

    return labels
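# For illustration only: the two placeholders above are filled from the
# experiment file fetched by get_experiment. A hypothetical experiment
# YAML might contain:
#
#   cohort_config:
#     query: "select id_llamado from semantic.tenders where ..."
#   label_config:
#     query: "bool_of_effective_complaints"
#
# so {cohort} becomes a subquery inlined into the `where ... in (...)`
# clause and {label_target} a SQL expression aliased as `target`. These
# names are examples, not the project's real configuration values.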
def generate_temporal_folds(experiment, args):
    """Given a label table and temporal parameters, generate temporal folds.

    Parameters
    ----------
    experiment : dict
        Parameters to perform the experiment
    args : dict
        Minimum set of parameters to run the pipeline

    Returns
    -------
    list of dicts
        All folds information, by as-of date. It carries the ids used to
        filter the tables.
    """
    params = experiment['validation']['parameters']
    current_aod = dt.datetime.strptime(params['as_of_date'], '%Y-%m-%d')

    labels = get_local(args, 'labels')
    X = labels[['id_llamado', 'reception_date']]
    y = labels['target']

    k = 1
    folds = []
    while True:
        test_end = current_aod + dt.timedelta(days=params['test_lag'])
        if test_end > dt.datetime.strptime(params['test_date_limit'],
                                           '%Y-%m-%d'):
            break
        if params['number_of_folds'] is not None \
                and k > params['number_of_folds']:
            break

        # If train_lag is 'all', train_start is set to a dummy old date
        # (2000-01-01)
        train_start = (current_aod
                       - dt.timedelta(days=params['train_lag'])
                       - dt.timedelta(days=params['blind_gap'])) \
            if params['train_lag'] != 'all' else dt.datetime(2000, 1, 1)
        train_end = current_aod - dt.timedelta(days=params['blind_gap'])

        train_ids = X.query(
            f"reception_date >= '{train_start}' and "
            f"reception_date <= '{train_end}'")['id_llamado']
        test_ids = X.query(
            f"reception_date >= '{current_aod}' and "
            f"reception_date <= '{test_end}'")['id_llamado']

        folds.append({
            'name': dt.datetime.strftime(current_aod, '%Y-%m-%d'),
            'train': train_ids.tolist(),
            'test': test_ids.tolist()
        })

        current_aod = current_aod + dt.timedelta(days=params['aod_lag'])
        k = k + 1

    persist_local(folds, args, 'folds', as_type='.p')

    return folds
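# A worked example of the window arithmetic above, assuming as_of_date
# '2017-12-26', train_lag=365, blind_gap=90 and test_lag=90 (illustrative
# values; the real ones come from the experiment file):
import datetime as dt

_aod = dt.datetime(2017, 12, 26)
_train_start = _aod - dt.timedelta(days=365) - dt.timedelta(days=90)  # 2016-09-27
_train_end = _aod - dt.timedelta(days=90)                             # 2017-09-27
_test_end = _aod + dt.timedelta(days=90)                              # 2018-03-26
# Train covers reception_date in [_train_start, _train_end]; test covers
# [as_of_date, _test_end]. The blind_gap keeps a buffer between train and
# test so slowly-maturing labels cannot leak across the split.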
def generate_kfolds(experiment, args):
    """Given a label table, generate random stratified folds.

    Parameters
    ----------
    experiment : dict
        Parameters to perform the experiment
    args : dict
        Minimum set of parameters to run the pipeline

    Returns
    -------
    list of dicts
        All folds information. It carries the ids used to filter the tables.
    """
    # shuffle=True is required for random_state to take effect and matches
    # the docstring's promise of random folds.
    skf = StratifiedKFold(
        n_splits=experiment['validation']['parameters']['number_of_folds'],
        shuffle=True,
        random_state=experiment['model_config']['random_seed'])

    labels = get_local(args, 'labels')
    X = labels['id_llamado']
    y = labels['target']

    folds = []
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        folds.append({
            'name': i,
            'train': X[train_index].tolist(),
            'test': X[test_index].tolist()
        })

    return folds
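# A minimal, self-contained illustration of the fold structure produced
# above, using toy data (the real pipeline reads labels from local storage):
import pandas as pd
from sklearn.model_selection import StratifiedKFold

_toy = pd.DataFrame({'id_llamado': range(10), 'target': [0, 1] * 5})
_skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for _i, (_train_idx, _test_idx) in enumerate(
        _skf.split(_toy['id_llamado'], _toy['target'])):
    print({'name': _i,
           'train': _toy['id_llamado'].iloc[_train_idx].tolist(),
           'test': _toy['id_llamado'].iloc[_test_idx].tolist()})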
def get_data(selected_learners):
    final = []
    for args in selected_learners.to_dict('records'):
        args['fold_name'] = '2017-12-26'
        try:
            final.append(dict(
                learner_id=args['learner_id'],
                experiment_id=args['experiment_id'],
                results=get_local(
                    args, 'predictions',
                    id_keys=['experiment_id', 'approach_id',
                             'learner_id', 'fold_name'])
                .merge(get_local(args, 'labels').set_index('id_llamado'),
                       left_index=True, right_index=True)
                [['reception_date', 'prediction', 'target']]))
        except Exception as e:
            # Skip learners whose artifacts are missing, but report them
            print(f'Skipping learner {args["learner_id"]}: {e}')
    return final
def save_model(production_path, ids):
    model = get_local(ids.to_dict(),
                      folder='models',
                      id_keys=['experiment_id', 'approach_id', 'learner_id'],
                      as_type='.p')
    persist_local(model,
                  args={**ids.to_dict(), 'preffix': 'model'},
                  folder=None,
                  id_keys=['preffix', 'experiment_id', 'approach_id',
                           'learner_id'],
                  as_type='.p',
                  save_path=production_path)
def complaints_per_fold(args, data):
    folds = pickle.load(
        open(f'/data/persist/folds/{args["experiment_id"]}.p', 'rb'))
    labels = get_local(args, 'labels').set_index('id_llamado')[['target']]

    complaints = []
    for fold in folds:
        if fold['name'] in list(data['fold'].unique()):
            # Share of complaints (positive labels) in the test set, in percent
            complaints.append({
                'complaints': 100 * labels.loc[fold['test']].sum().values[0]
                / len(labels.loc[fold['test']]),
                'fold': fold['name']
            })
    return pd.DataFrame(complaints)
def save_preprocessor(production_path, ids, max_fold):
    content = get_local({**ids.to_dict(), 'fold': max_fold},
                        folder='preprocessing',
                        id_keys=['experiment_id', 'approach_id', 'fold'],
                        as_type='.dill')
    persist_local(content,
                  args={**ids.to_dict(),
                        'fold': max_fold,
                        'preffix': 'prepro'},
                  folder=None,
                  id_keys=['preffix', 'experiment_id', 'approach_id', 'fold'],
                  as_type='.dill',
                  save_path=production_path)
""" good_learners.index[1] df = pd.read_sql_query(query.format(learner_id=good_learners.index[0]), con) # %matplotlib inline import numpy as np df['importance_log'] = df['importance'].apply(lambda x: np.log10(x) + 1) df df.head(20).plot( x='feature', y='importance_log', kind='barh', ) df.head(10) # ### Features from pipeline.data_persistence import get_local features = get_local(args, 'features').set_index('id_llamado') features test('a')
groups:
    CD: director1
    CO: director2
    LPN: director3
    LPI: director3
    LC: director3
    CE: director3
"""
evaluation = yaml.safe_load(evaluation)

experiment_id = 7136875758
features = get_local({'experiment_id': experiment_id},
                     'features').set_index('id_llamado')
labels = get_local({'experiment_id': experiment_id},
                   'labels').set_index('id_llamado')

# Simulate predictions for the most recent `length` labels
length = 10000
labels = labels.sort_values(by='reception_date', ascending=False)
predictions = labels[:length][[]]  # empty frame, keeps only the index
predictions['prediction'] = [random.random() for i in range(length)]
observertions = labels[:length]


# +
def classifing(df, args):
import pipeline.preprocessing as pr
import pipeline.pipeline as pi
import pipeline.data_persistence as dp
import pipeline.model_data_prep as mp
import utils.utils as utils

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import preprocessing

scaler = StandardScaler()

args = dict(experiment_id=3113067571)

folds = mp.create_folds(args)
features = dp.get_local(args, 'features').set_index('id_llamado')
labels = dp.get_local(args, 'labels').set_index('id_llamado')[['target']]
approaches = dp.get_approaches(args['experiment_id'])
approach = approaches[0]


# +
def split_by_type(data, numeric_types=['int64', 'float64'],
                  object_types=['object']):
    return {
        'numeric': data.select_dtypes(numeric_types),
        'object': data.select_dtypes(object_types),
    }
def get_features(ids):
    return list(
        get_local({'experiment_id': ids['experiment_id']},
                  'features').set_index('id_llamado').columns)
def loop_the_grid(args):
    """
    Given the experiment file with experiment parameters, the list of
    temporal folds, and the data prepared by model_data_prep, loop through
    the temporal folds and the approaches specified in the experiment file,
    calculating the metrics the experiment file asks for.

    Parameters
    ----------
    args : dict
        Minimum set of arguments to start functions.
    """
    experiment = get_experiment(args['experiment_id'])
    approaches = get_approaches(args['experiment_id'])

    features = get_local(args, 'features').set_index('id_llamado')
    labels = get_local(args, 'labels').set_index('id_llamado')

    # Check if text processing is needed
    if 'textprocessing' in experiment:
        args_tfidf = {
            'params': experiment['textprocessing']['tfidf'],
            'experiment_id': args['experiment_id']
        }
    else:
        args_tfidf = {}

    print('Approaches: ', ', '.join([k['name'] for k in approaches]))

    for fold in tqdm(args['folds'], desc='Folds'):
        args['fold_name'] = fold['name']

        original_train_dict, original_test_dict = generate_folds_matrices(
            features, labels, fold, args_tfidf)

        for approach in tqdm(approaches, desc='Approaches'):
            args['approach_id'] = approach['approach_id']
            args['approach_name'] = approach['name']

            train_dict, test_dict = apply_preprocessing(
                approach, original_train_dict, original_test_dict, args)

            for hyperparameters in tqdm(
                    generate_hyperparameters_combinations(
                        approach['hyperparameters']), desc='Hyper'):
                args['hyperparameters'] = hyperparameters
                args = persist_learner(args)

                try:
                    max_run_time(experiment['model_config']['max_seconds'])

                    mod = importlib.import_module(
                        f"pipeline.approaches.{approach['python_path'][:-3]}")
                    model = mod.fit(args, train_dict=train_dict)
                    predictions = mod.predict(
                        model, test_features=test_dict['features'])

                    evaluations = evaluate(obs=test_dict['labels'],
                                           pred=predictions,
                                           evaluation=experiment['evaluation'])
                    feature_importance = get_feature_importance(
                        model, test_dict['features'])

                    persist_local(predictions, args, 'predictions',
                                  ['experiment_id', 'approach_id',
                                   'learner_id', 'fold_name'])
                    persist_local(model, args, 'models',
                                  ['experiment_id', 'approach_id',
                                   'learner_id'], '.p')
                    persist_evaluation(evaluations, args)
                    persist_feature_importance(feature_importance, args)

                except TimeoutError:
                    error = f'timeout < {experiment["model_config"]["max_seconds"]}'
                    persist_errors(error, args)
                    if experiment['model_config']['errors']:
                        raise
                    continue

                except Exception as e:
                    persist_errors(e, args)
                    if experiment['model_config']['errors']:
                        raise
                    continue
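# Each approach referenced in the experiment file is a module under
# pipeline/approaches/ exposing the fit()/predict() pair called above.
# A minimal sketch of such a module, assuming train_dict carries
# 'features' and 'labels' DataFrames indexed by id_llamado (the exact
# contents come from apply_preprocessing); the RandomForest choice is
# illustrative, not the project's actual approach:
from sklearn.ensemble import RandomForestClassifier


def fit(args, train_dict):
    # Hyperparameters selected by loop_the_grid arrive through args.
    model = RandomForestClassifier(**args['hyperparameters'])
    model.fit(train_dict['features'], train_dict['labels']['target'])
    return model


def predict(model, test_features):
    # Score of the positive class, used for ranking and evaluation.
    return model.predict_proba(test_features)[:, 1]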