예제 #1
0
def run_tfidf(fold, args):
    """Return the TFIDF feature tables for one fold, reusing a local cache.

    A cache key is derived from the string form of ``fold`` concatenated with
    the string form of ``args['params']``. When a ``'tfidf-train'`` artifact
    exists for that key, both the train and the test tables are read back
    from local storage (only the train artifact's existence is checked).
    Otherwise the texts are preprocessed, an encoder is fitted on the train
    texts only, both sets are transformed, and the encoder and both tables
    are persisted.

    Parameters
    ----------
    fold : dict
        Must provide ``'train'`` and ``'test'`` entries; each one is handed
        to ``tfidf_preprocess`` (documented by the original author as
        id_llamado lists).
    args : dict
        Must provide ``'params'`` (forwarded to ``vector_fit``). The encoder
        is persisted with ``args`` and the ``'experiment_id'`` id key.

    Returns
    -------
    tuple
        ``(train_features, test_features)``; described by the original
        author as pandas DataFrames of document features.
    """

    cache_key = {'fold_id': generate_id(str(fold) + str(args['params']))}

    if not check_if_local_exists(cache_key, 'tfidf-train', ['fold_id']):
        # Nothing cached for this fold/parameter combination: build it.
        train_ids, train_docs = tfidf_preprocess(fold['train'])
        test_ids, test_docs = tfidf_preprocess(fold['test'])

        spanish_stop_words = set(stopwords.words('spanish'))

        # The encoder only ever sees the train texts.
        encoder = vector_fit(train_docs, args['params'], spanish_stop_words)

        train_features = vector_transform(train_docs, train_ids, encoder)
        test_features = vector_transform(test_docs, test_ids, encoder)

        persist_local(encoder, args, 'tfidf', ['experiment_id'], as_type='.p')
        persist_local(train_features, cache_key, 'tfidf-train', ['fold_id'])
        persist_local(test_features, cache_key, 'tfidf-test', ['fold_id'])

        return train_features, test_features

    # Cached artifacts exist: read train first, then test.
    train_features, test_features = [
        get_local(cache_key,
                  artifact,
                  id_keys=['fold_id'],
                  as_type='.parquet.gz')
        for artifact in ('tfidf-train', 'tfidf-test')
    ]

    return train_features, test_features
def create_labels(args):
    """Build the label table for an experiment's cohort and persist it.

    The experiment configuration is loaded from ``args['experiment_id']``.
    Its cohort query and label query are interpolated into a SQL statement
    that joins ``semantic.labels`` with ``semantic.tenders``. The result is
    restricted to the ``id_llamado`` values present in the locally stored
    ``'features'`` table and persisted under ``'labels'``.

    Parameters
    ----------
    args : dict
        Must contain ``'experiment_id'``; it is also passed as-is to
        ``get_local`` and ``persist_local``.

    Returns
    -------
    pd.DataFrame
        The persisted label table, with the columns selected by the query
        (``id_llamado``, ``tipo_procedimiento_codigo``, ``reception_date``,
        ``target``).
    """

    experiment = get_experiment(args['experiment_id'])
    features = get_local(args, 'features')['id_llamado']

    # NOTE: the cohort and label fragments are raw SQL taken from the
    # experiment configuration and inserted verbatim. They cannot be bound
    # as parameters, so the experiment file must come from a trusted source.
    query = """
        select distinct labels.id_llamado as id_llamado, tipo_procedimiento_codigo, 
        labels.reception_date, {label_target} as target
        from semantic.labels labels
        join semantic.tenders tenders
        on labels.id_llamado = tenders.id_llamado
        where labels.id_llamado in ({cohort})
    """.format(cohort=experiment['cohort_config']['query'],
               label_target=experiment['label_config']['query'])

    con = utils.connect_to_database()
    labels = pd.read_sql_query(query, con)

    # Keep only the tenders for which features were generated.
    labels = labels[labels['id_llamado'].isin(features)]

    persist_local(labels, args, 'labels')

    # Returned as documented; callers that ignore the result are unaffected.
    return labels
def generate_temporal_folds(experiment, args):
    """Generate temporal train/test folds from the stored label table.

    Starting at ``as_of_date`` and stepping forward by ``aod_lag`` days, one
    fold is produced per as-of date until the test window would end after
    ``test_date_limit`` or ``number_of_folds`` folds have been produced.
    For each as-of date:

    * the test window is ``[as_of_date, as_of_date + test_lag]``;
    * the train window ends ``blind_gap`` days before the as-of date and
      starts ``train_lag`` days before that end, or at 2000-01-01 when
      ``train_lag`` is ``'all'``.

    Both windows are inclusive and are matched against ``reception_date``.
    The resulting list is persisted under ``'folds'`` as ``'.p'``.

    Parameters
    ----------
    experiment : dict
        Experiment definition; ``experiment['validation']['parameters']``
        must provide ``as_of_date``, ``test_date_limit`` (both
        ``'%Y-%m-%d'`` strings), ``test_lag``, ``train_lag``, ``blind_gap``,
        ``aod_lag`` (days) and ``number_of_folds`` (int or None).
    args : dict
        Minimum set of parameters to run the pipeline; passed to
        ``get_local`` and ``persist_local``.

    Returns
    -------
    list of dicts
        One dict per fold with ``'name'`` (the as-of date as
        ``'%Y-%m-%d'``), ``'train'`` and ``'test'`` (lists of id_llamado).

    Raises
    ------
    ValueError
        If another fold would be generated while ``number_of_folds`` is None
        and ``aod_lag`` is not positive: the as-of date would never pass
        ``test_date_limit`` and the loop would not terminate.
    """

    params = experiment['validation']['parameters']
    date_format = '%Y-%m-%d'

    current_aod = dt.datetime.strptime(params['as_of_date'], date_format)
    # Loop invariant: parse the limit once instead of on every iteration.
    test_date_limit = dt.datetime.strptime(params['test_date_limit'],
                                           date_format)

    labels = get_local(args, 'labels')
    dated_ids = labels[['id_llamado', 'reception_date']]

    fold_number = 1
    folds = []
    while True:
        test_end = current_aod + dt.timedelta(days=params['test_lag'])

        if test_end > test_date_limit:
            break

        if (params['number_of_folds'] is not None
                and fold_number > params['number_of_folds']):
            break

        if params['number_of_folds'] is None and params['aod_lag'] <= 0:
            raise ValueError(
                "'aod_lag' must be positive when 'number_of_folds' is None; "
                "otherwise the as-of date never reaches 'test_date_limit'")

        train_end = current_aod - dt.timedelta(days=params['blind_gap'])
        if params['train_lag'] != 'all':
            train_start = train_end - dt.timedelta(days=params['train_lag'])
        else:
            # 'all' means "use the whole history": a dummy old start date.
            train_start = dt.datetime(2000, 1, 1)

        train_ids = dated_ids.query(
            f"reception_date >= '{train_start}' and reception_date <= '{train_end}'")['id_llamado']
        test_ids = dated_ids.query(
            f"reception_date >= '{current_aod}' and reception_date <= '{test_end}'")['id_llamado']

        folds.append({
            'name': dt.datetime.strftime(current_aod, date_format),
            'train': train_ids.tolist(),
            'test': test_ids.tolist()
        })

        current_aod = current_aod + dt.timedelta(days=params['aod_lag'])
        fold_number += 1

    persist_local(folds, args, 'folds', as_type='.p')

    return folds
def generate_kfolds(experiment, args):
    """Generate random stratified folds from the stored label table.

    Parameters
    ----------
    experiment : dict
        Experiment definition. Reads
        ``experiment['validation']['parameters']['number_of_folds']`` for the
        number of splits and ``experiment['model_config']['random_seed']``
        for the shuffling seed.
    args : dict
        Minimum set of parameters to run the pipeline; passed to
        ``get_local`` to load the ``'labels'`` table.

    Returns
    -------
    list of dicts
        One dict per fold with ``'name'`` (the fold number, starting at 0),
        ``'train'`` and ``'test'`` (lists of id_llamado).
    """

    # shuffle=True is required for random_state to take effect: without it
    # older scikit-learn releases silently ignore the seed and current ones
    # raise a ValueError.
    skf = StratifiedKFold(
        n_splits=experiment['validation']['parameters']['number_of_folds'],
        shuffle=True,
        random_state=experiment['model_config']['random_seed'])

    labels = get_local(args, 'labels')
    ids = labels['id_llamado']
    targets = labels['target']

    # split() yields positional indices, so select with .iloc; plain
    # ids[...] is label-based and breaks when the index is not 0..n-1.
    return [{
        'name': fold_number,
        'train': ids.iloc[train_positions].tolist(),
        'test': ids.iloc[test_positions].tolist()
    } for fold_number, (train_positions,
                        test_positions) in enumerate(skf.split(ids, targets))]
def get_data(selected_learners, fold_name='2017-12-26'):
    """Collect predictions joined with labels for each selected learner.

    Every row of ``selected_learners`` is turned into an argument dict,
    tagged with ``fold_name``, and used to load the learner's stored
    predictions and the experiment's labels. The two are joined on their
    indexes (labels are indexed by ``id_llamado``).

    Loading is best-effort: a learner whose data cannot be loaded or merged
    is reported on stdout and skipped.

    Parameters
    ----------
    selected_learners : pd.DataFrame
        One row per learner; rows are expected to carry ``learner_id``,
        ``experiment_id`` and ``approach_id``.
    fold_name : str, optional
        Name of the fold whose predictions are loaded. Defaults to
        ``'2017-12-26'``, the value that used to be hard-coded.

    Returns
    -------
    list of dicts
        One dict per successfully loaded learner with ``learner_id``,
        ``experiment_id`` and ``results`` (columns ``reception_date``,
        ``prediction``, ``target``).
    """

    final = []
    for args in selected_learners.to_dict('records'):

        args['fold_name'] = fold_name

        try:
            predictions = get_local(
                args,
                'predictions',
                id_keys=['experiment_id', 'approach_id', 'learner_id',
                         'fold_name'])
            labels = get_local(args, 'labels').set_index('id_llamado')

            results = predictions.merge(
                labels, left_index=True,
                right_index=True)[['reception_date', 'prediction', 'target']]

            final.append(
                dict(learner_id=args['learner_id'],
                     experiment_id=args['experiment_id'],
                     results=results))
        except Exception as error:
            # Best-effort by design, but say which learner failed and why.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print(f"debug: skipping learner {args.get('learner_id')!r}: "
                  f"{error!r}")

    return final
예제 #6
0
def save_model(production_path, ids):
    """Copy a stored model to ``production_path``.

    The model is read from the ``'models'`` folder using the
    ``experiment_id``, ``approach_id`` and ``learner_id`` entries of ``ids``
    and written again with an extra ``'preffix'`` id of ``'model'``.

    Parameters
    ----------
    production_path :
        Passed to ``persist_local`` as ``save_path``.
    ids :
        Object exposing ``to_dict()`` (presumably a pandas Series - verify
        against callers) that yields the id values.
    """

    model_keys = ('experiment_id', 'approach_id', 'learner_id')

    stored_model = get_local(ids.to_dict(),
                             folder='models',
                             id_keys=list(model_keys),
                             as_type='.p')

    # Same ids as the source, plus the prefix used in the target name.
    production_args = dict(ids.to_dict(), preffix='model')

    persist_local(stored_model,
                  args=production_args,
                  folder=None,
                  id_keys=['preffix', *model_keys],
                  as_type='.p',
                  save_path=production_path)
    def complaints_per_fold(args, data):
        """Compute the percentage of positive labels in each fold's test set.

        Only folds whose name appears in ``data['fold']`` are reported.

        Parameters
        ----------
        args : dict
            Must contain ``'experiment_id'`` (used to locate the pickled
            folds file); also passed to ``get_local`` to load the labels.
        data : pd.DataFrame
            Must have a ``'fold'`` column with the fold names of interest.

        Returns
        -------
        pd.DataFrame
            One row per reported fold with columns ``'complaints'``
            (100 * sum of ``target`` / number of test rows) and ``'fold'``.
        """

        # NOTE: pickle.load can execute arbitrary code; this path must only
        # ever contain files written by the pipeline itself.
        with open(f'/data/persist/folds/{args["experiment_id"]}.p',
                  'rb') as folds_file:
            folds = pickle.load(folds_file)
        labels = get_local(args, 'labels').set_index('id_llamado')[['target']]

        # Loop invariant: compute the requested fold names once.
        requested_folds = list(data['fold'].unique())

        complaints = []
        for fold in folds:
            if fold['name'] not in requested_folds:
                continue

            test_labels = labels.loc[fold['test']]
            complaints.append({
                'complaints':
                100 * test_labels.sum().values[0] / len(test_labels),
                'fold':
                fold['name']
            })

        return pd.DataFrame(complaints)
예제 #8
0
def save_preprocessor(production_path, ids, max_fold):
    """Copy a stored preprocessing artifact to ``production_path``.

    The artifact is read from the ``'preprocessing'`` folder using the
    ``experiment_id`` and ``approach_id`` entries of ``ids`` plus
    ``max_fold`` as the ``fold`` id, and written again with an extra
    ``'preffix'`` id of ``'prepro'``.

    Parameters
    ----------
    production_path :
        Passed to ``persist_local`` as ``save_path``.
    ids :
        Object exposing ``to_dict()`` (presumably a pandas Series - verify
        against callers) that yields the id values.
    max_fold :
        Value stored under the ``'fold'`` id for both the read and the write.
    """

    source_args = dict(ids.to_dict(), fold=max_fold)
    content = get_local(source_args,
                        folder='preprocessing',
                        id_keys=['experiment_id', 'approach_id', 'fold'],
                        as_type='.dill')

    # Same ids as the source, plus the prefix used in the target name.
    target_args = dict(ids.to_dict(), fold=max_fold, preffix='prepro')
    persist_local(content,
                  args=target_args,
                  folder=None,
                  id_keys=['preffix', 'experiment_id', 'approach_id', 'fold'],
                  as_type='.dill',
                  save_path=production_path)
"""

good_learners.index[1]

df = pd.read_sql_query(query.format(learner_id=good_learners.index[0]), con)

# %matplotlib inline

import numpy as np

df['importance_log'] = df['importance'].apply(lambda x: np.log10(x) + 1)

df

df.head(20).plot(
    x='feature',
    y='importance_log',
    kind='barh',
)

df.head(10)

# ### Features

from pipeline.data_persistence import get_local
features = get_local(args, 'features').set_index('id_llamado')

features

test('a')
예제 #10
0
    groups:
        CD: director1
        CO: director2
        LPN: director3
        LPI: director3
        LC: director3
        CE: director3
    
"""

# Parse the evaluation configuration. safe_load builds plain Python objects
# only; yaml.load without an explicit Loader is unsafe and is rejected by
# PyYAML 6+.
evaluation = yaml.safe_load(evaluation)

experiment_id = 7136875758

# Feature and label tables of the experiment, both indexed by id_llamado.
features = get_local({
    'experiment_id': experiment_id
}, 'features').set_index('id_llamado')
labels = get_local({
    'experiment_id': experiment_id
}, 'labels').set_index('id_llamado')

# Build random predictions for the `length` most recent tenders.
length = 10000
labels = labels.sort_values(by='reception_date', ascending=False)
# Copy the column-less slice so the assignment below does not write into a
# view of `labels`; size the random list by the rows actually available so
# it also works when there are fewer than `length` labels.
predictions = labels[:length][[]].copy()
predictions['prediction'] = [random.random() for _ in range(len(predictions))]
observertions = labels[:length]


# +
def classifing(df, args):
import pipeline.preprocessing as pr
import pipeline.pipeline as pi
import pipeline.data_persistence as dp
import pipeline.model_data_prep as mp
import utils.utils as utils

from sklearn.preprocessing import StandardScaler, OneHotEncoder
scaler = StandardScaler()

from sklearn import preprocessing

# Minimal pipeline arguments: only the experiment to inspect.
args = dict(experiment_id=3113067571)

# Folds for this experiment, as produced by model_data_prep.create_folds.
folds = mp.create_folds(args)

# Locally stored feature and label tables, indexed by id_llamado; only the
# 'target' column of the labels is kept.
features = dp.get_local(args, 'features').set_index('id_llamado')
labels = dp.get_local(args, 'labels').set_index('id_llamado')[['target']]

# Approaches registered for the experiment.
approaches = dp.get_approaches(args['experiment_id'])

# Work with the first approach only.
approach = approaches[0]


# +
def split_by_type(data,
                  numeric_types=['int64', 'float64'],
                  object_types=['object']):

    return {
        'numeric': data.select_dtypes(numeric_types),
        'object': data.select_dtypes(object_types),
예제 #12
0
def get_features(ids):
    """Return the feature column names stored for an experiment.

    Loads the ``'features'`` table for ``ids['experiment_id']``, moves
    ``id_llamado`` into the index and returns the remaining column labels
    as a list.
    """

    lookup = {'experiment_id': ids['experiment_id']}
    feature_table = get_local(lookup, 'features').set_index('id_llamado')

    return list(feature_table.columns)
def loop_the_grid(args):
    """
    Train, predict and evaluate every learner of an experiment.

    The experiment definition and its approaches are loaded from
    ``args['experiment_id']``, together with the locally stored feature and
    label tables (indexed by ``id_llamado``). The function then iterates
    over three nested levels:

    1. every fold in ``args['folds']`` - train/test matrices are generated;
    2. every approach - the approach's preprocessing is applied;
    3. every hyperparameter combination of the approach - the learner is
       registered, fitted, used to predict on the test features, evaluated,
       and its predictions, model, evaluations and feature importances are
       persisted.

    Any exception raised while handling one hyperparameter combination is
    recorded with ``persist_errors``. It is re-raised when
    ``experiment['model_config']['errors']`` is truthy; otherwise the loop
    moves on to the next combination.

    ``args`` is updated while looping (``fold_name``, ``approach_id``,
    ``approach_name``, ``hyperparameters``) and rebound to the value
    returned by ``persist_learner``. No value is returned in the visible
    code.

    Parameters
    ----------
    args: dictionary
        Minimum set of arguments to start functions. Must contain
        ``'experiment_id'`` and ``'folds'``.
    """

    experiment = get_experiment(args['experiment_id'])
    approaches = get_approaches(args['experiment_id'])

    features = get_local(args, 'features').set_index('id_llamado')
    labels = get_local(args, 'labels').set_index('id_llamado')

    # Text processing is optional: pass the TFIDF settings along only when
    # the experiment defines a 'textprocessing' section, else an empty dict.
    if 'textprocessing' in experiment:
        args_tfidf = {}
        args_tfidf['params'] = experiment['textprocessing']['tfidf']
        args_tfidf['experiment_id'] = args['experiment_id']
    else:
        args_tfidf = {}

    print('Approaches: ', ', '.join([k['name'] for k in approaches]))

    for fold in tqdm(args['folds'], desc='Folds'):

        args['fold_name'] = fold['name']

        # Built once per fold and shared by every approach below.
        original_train_dict, original_test_dict = generate_folds_matrices(
            features, labels, fold, args_tfidf)

        for approach in tqdm(approaches, desc='Approaches'):

            args['approach_id'] = approach['approach_id']
            args['approach_name'] = approach['name']

            # Approach-specific preprocessing of the fold matrices.
            train_dict, test_dict = \
            apply_preprocessing(approach, original_train_dict, original_test_dict,
                                                        args)

            for hyperparameters in tqdm(generate_hyperparameters_combinations(
                    approach['hyperparameters']),
                                        desc='Hyper'):

                args['hyperparameters'] = hyperparameters
                # persist_learner returns the args used from here on
                # (presumably enriched with 'learner_id' - TODO confirm).
                args = persist_learner(args)

                try:
                    # NOTE(review): presumably arms a time limit that raises
                    # TimeoutError after max_seconds - confirm in
                    # max_run_time.
                    max_run_time(experiment['model_config']['max_seconds'])

                    # Drop the last three characters of python_path
                    # (presumably the '.py' extension) to get the module
                    # name under pipeline.approaches.
                    mod = importlib.import_module(
                        f"pipeline.approaches.{approach['python_path'][:-3]}")
                    model = mod.fit(args, train_dict=train_dict)

                    predictions = mod.predict(
                        model, test_features=test_dict['features'])

                    evaluations = evaluate(obs=test_dict['labels'],
                                           pred=predictions,
                                           evaluation=experiment['evaluation'])

                    feature_importance = get_feature_importance(
                        model, test_dict['features'])

                    # Predictions are stored per fold; the model is stored
                    # per learner (no fold_name in its id keys).
                    persist_local(predictions, args, 'predictions', [
                        'experiment_id', 'approach_id', 'learner_id',
                        'fold_name'
                    ])
                    persist_local(
                        model, args, 'models',
                        ['experiment_id', 'approach_id', 'learner_id'], '.p')
                    persist_evaluation(evaluations, args)
                    persist_feature_importance(feature_importance, args)

                except TimeoutError as error:
                    # Record a readable timeout message instead of the
                    # exception object itself.
                    error = f'timeout < {experiment["model_config"]["max_seconds"]}'
                    persist_errors(error, args)

                    # Fail fast only when the experiment asks for it.
                    if experiment['model_config']['errors']:
                        raise

                    continue

                except Exception as e:
                    # Any other failure: record it and, unless the
                    # experiment asks to fail fast, move on to the next
                    # hyperparameter combination.
                    persist_errors(e, args)
                    if experiment['model_config']['errors']:
                        raise
                    continue