Example #1
def copy_production_schema_to_sqlite(production_path):
    """Mirror every table of the production schema into a local SQLite file."""

    con_sqlite = sqlite3.connect(production_path + 'features.db')

    engine = connect_to_database()

    with engine.connect() as con:

        metadata = sqlalchemy.MetaData(bind=con, schema='production')
        metadata.reflect()

    # Reflected table keys are qualified as 'production.<table_name>'
    for schema, table in map(lambda x: x.split('.'), metadata.tables.keys()):

        content = pd.read_sql_table(table,
                                    schema=schema,
                                    con=engine)
        content.to_sql(table, con=con_sqlite, if_exists='replace', index=False)
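# Usage sketch (hypothetical path): assuming production_path ends with '/',
# this mirrors the production schema into <production_path>features.db, which
# can then be queried offline with plain sqlite3.
copy_production_schema_to_sqlite('/data/production/')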
Example #2
def load_documents_to_database(file_path, method, table, schema, how='append'):
    """
    Load csv file with text characteristics into DB.
    The csv is expected to be the output of text extraction, with the columns:
        filename, is_extractable, number_of_pages

    Parameters
    ----------
    file_path: string
        Complete path to csv file to load.
    method: string
        Method used to extract the text: 'pdfminer', 'pypdf', 'tika'.
    table: string
        Name of the target table to upload data.
    schema: string
        Name of the schema where the target table is.
    how: string
        In case the table already exists, what should happen: 'fail', 'replace', 'append' (default).
    """

    documents_df = pd.read_csv(file_path, delimiter=',')
    documents_df['method'] = method

    load_df = join_filename_with_id(df=documents_df)

    load_pandas_to_db(df=load_df, table=table, schema=schema, how=how)
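# Usage sketch: load the output of a pdfminer extraction run. The target
# table/schema names are assumptions based on other snippets in this file
# (semantic.documents); adjust to the actual pipeline configuration.
load_documents_to_database(file_path='/data/extraction/pdfminer_output.csv',
                           method='pdfminer',
                           table='documents',
                           schema='semantic',
                           how='append')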
def create_labels(args):
    """
    Obtain a dataframe of labels from the experiment file corresponding
    to the cohort and persist it locally.

    Parameters
    ----------
    args: dict
        Arguments containing the experiment_id of the experiment file with
        model parameters

    Return
    ---------
    None
        The dataframe of IDs and labels is persisted via persist_local
    """

    experiment = get_experiment(args['experiment_id'])
    features = get_local(args, 'features')['id_llamado']

    query ="""
        select distinct labels.id_llamado as id_llamado, tipo_procedimiento_codigo, 
        labels.reception_date, {label_target} as target
        from semantic.labels labels
        join semantic.tenders tenders
        on labels.id_llamado = tenders.id_llamado
        where labels.id_llamado in ({cohort})
    """.format(cohort=experiment['cohort_config']['query'],
               label_target=experiment['label_config']['query'])

    con = utils.connect_to_database()
    labels = pd.read_sql_query(query, con)

    labels = labels[labels['id_llamado'].isin(features)]

    persist_local(labels, args, 'labels')
def get_data_from_db(query, as_pandas=False):
    """
    Gets query result from database

    Parameters
    ----------
    query: string
        SQL query

    as_pandas: bool
        Whether to return the result as a pd.DataFrame

    Return
    ------
    list of rows or pd.DataFrame
        Content of the query
    """

    with utils.connect_to_database().connect() as con:

        result = con.execute(query)

        # Fetch before the connection closes; the ResultProxy cannot be
        # consumed after the `with` block exits.
        columns = result.keys()
        rows = result.fetchall()

    if as_pandas:

        return pd.DataFrame(rows, columns=columns)

    return rows
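# Usage sketch: the same helper serves raw rows or a DataFrame. The table
# names are taken from other queries in this file and are illustrative only.
rows = get_data_from_db("select count(*) from semantic.tenders")
approaches_df = get_data_from_db("select * from experiments.approaches",
                                 as_pandas=True)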
Example #5
def create_production_schema_postgresql():

    to_production_sql = Path(
        os.path.abspath('joaoc-experiment-checks')
    ).parent.parent.parent / 'sql' / 'production' / 'create-production-schema.sql'

    with connect_to_database().connect() as con:
        # Split the script on ';' and drop the trailing empty statement
        for query in to_production_sql.read_text().split(';')[:-1]:
            con.execute(query)
def create_features(args):
    """
    Obtain the features specified in the experiment file, looping over all
    configured feature tables, and persist the combined result locally.

    Parameters:
    ------------
    args: dict
        Arguments containing the experiment_id of the experiment file with
        model parameters

    Return:
    ------------
    None
        The dataframe of features corresponding to the cohort is persisted
        via persist_local
    """

    experiment = get_experiment(args['experiment_id'])

    query_config = """with cd_tenders as (
            {cohort}
            )
                select cd_tenders.id_llamado, {columns}
                from cd_tenders
                left join {table} as feature_table
                on cd_tenders.id_llamado = feature_table.id_llamado
        """

    con = utils.connect_to_database()

    features_combined = pd.DataFrame()

    for feature_config in experiment['features']:

        query = query_config.format(cohort=experiment['cohort_config']['query'],
                                    columns=','.join(
                                        feature_config['columns']),
                                    table=feature_config['table'])

        features = pd.read_sql_query(query, con)

        if features_combined.empty:
            features_combined = features
        else:
            features_combined = features_combined.merge(
                features, on='id_llamado', how='inner')

    features_combined = features_combined.dropna()

    persist_local(features_combined, args, 'features')
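# Minimal sketch of the experiment structure that create_labels() and
# create_features() expect, inferred from the keys they access. Real
# experiment files live elsewhere in the repo and contain more settings;
# the queries and column below are placeholders.
example_experiment = {
    'cohort_config': {'query': 'select id_llamado from semantic.tenders'},
    'label_config': {'query': 'some_label_column'},
    'features': [
        {'table': 'semantic.tenders',
         'columns': ['monto_global_estimado_planificacion']},
    ],
}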
Example #7
def fetch_data(experiment_id):
    """Fetch evaluation scores and approach names for one experiment."""

    con = utils.connect_to_database()

    query = f"""
    select a.*, b.name
    from experiments.evaluations a
    inner join experiments.approaches b
    on a.experiment_id = b.experiment_id and a.approach_id = b.approach_id 
    where a.experiment_id = {experiment_id} 
    """

    return pd.read_sql_query(query, con)[[
        'learner_id', 'fold', 'eval_metric', 'score', 'experiment_id',
        'approach_id', 'name'
    ]]
Example #8
def get_file_dict(id_llamado):
    """
    Obtain a dictionary of full filepaths given the input IDs

    Parameters
    ----------
    id_llamado : list
        list of IDs to obtain the full filepaths

    Returns
    -------
    file_path_dict : dictionary
        A dictionary with two parallel lists, 'id_llamado' and 'fullpath',
        holding the IDs and their corresponding full paths
    """
    con = connect_to_database()

    query = """
    select id_llamado, filename, method
    from semantic.documents
    where is_extractable = true and method = 'tika'
    """

    # Load full list of all id_llamados and their corresponding filenames
    full_list = pd.read_sql_query(query, con)

    # Get the subset based on id_llamados (copy to avoid mutating a view)
    subset_list = full_list[full_list['id_llamado'].isin(id_llamado)].copy()

    # Change the filename extension to .txt instead of .pdf or .PDF
    if len(subset_list) != 0:
        subset_list['filename'] = subset_list.apply(
            lambda x: x['filename'].lower().replace('.pdf', '') + '.txt',
            axis=1)
    else:
        print('Empty dataframe')

    # Generate the full path
    subset_list['fullpath'] = document_path + \
        subset_list['method'] + '/' + subset_list['filename']

    # Extract full path and id_llamado
    file_path_dict = {}
    file_path_dict['id_llamado'] = subset_list['id_llamado'].tolist()
    file_path_dict['fullpath'] = subset_list['fullpath'].tolist()

    return file_path_dict
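# Usage sketch: the returned dict holds two parallel lists, so the extracted
# text files can be read back by zipping them together. The IDs below are
# placeholders.
files = get_file_dict([111111, 222222])
for id_llamado, path in zip(files['id_llamado'], files['fullpath']):
    with open(path) as f:
        text = f.read()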
def insert_to_db(data, schema, table, how='append'):
    """
    Inserts a dictionary (or list of dictionaries) as rows in the
    corresponding table in the database

    Parameters
    ----------
    data: dictionary or list of dictionaries
        Dict with keys being the columns of the table

    schema: string
        Schema name

    table: string
        Table name

    TODO: how: string

    Returns
    -------
    None
        Rows violating an integrity constraint are silently skipped
    """

    engine = utils.connect_to_database()
    con = engine.connect()

    metadata = sqlalchemy.MetaData(bind=con, schema=schema)
    metadata.reflect()

    if isinstance(data, dict):
        data = [data]

    try:
        metadata.tables[f'{schema}.{table}'].insert().execute(data)

    except sqlalchemy.exc.IntegrityError:
        # Row(s) already exist (duplicate key); skip silently
        pass

    con.close()
    engine.dispose()
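# Usage sketch: insert a single row given as a dict whose keys match the
# table columns; rows violating an integrity constraint are skipped. The
# column values below are illustrative only.
insert_to_db({'experiment_id': 1, 'approach_id': 1, 'name': 'baseline'},
             schema='experiments', table='approaches')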
def prepare_to_selection(selected_learners, configuration):

    con = connect_to_database()
    
    selected_data = get_data(selected_learners)

    bias = download_bias_data(query=configuration['query'], con=con)
    bias = filter_bias_data(bias, configuration['groups'])

    for data in selected_data:
        data['labels'] = label_data(data['results'])
        data['bias'] = add_bias(data['labels'], bias)

        data['bias']['model_id'] = data['learner_id']
        data['aequitas'] = get_aequitas(data['bias'])

    melted_aequitas = melt_aequitas(selected_data)
    
    return melted_aequitas, selected_data
def fetch_data(selector_config):
    """Fetch evaluation scores and approach names for the experiment in selector_config."""

    con = connect_to_database()

    query = f"""
    select evaluation.*, approach.name
    from (select *
    from experiments.evaluations
    where experiment_id = {selector_config['experiment_id']}) evaluation
    left join experiments.approaches approach
    on evaluation.experiment_id = approach.experiment_id
    and evaluation.approach_id = approach.approach_id
    """

    data = pd.read_sql_query(query, con)[[
        'learner_id', 'fold', 'name', 'eval_metric', 'score', 'experiment_id',
        'approach_id'
    ]]

    return data
def do_plots(experiment_id):

    # Get data on experiment results from database

    con = utils.connect_to_database()

    query = """
    select evaluation.*,approach.name
    from experiments.evaluations evaluation
    left join experiments.approaches approach
    on evaluation.approach_id = approach.approach_id
    """

    df = pd.read_sql_query(query, con)

    # Subselect data on specific experiment id
    data = df.loc[df['experiment_id'] == experiment_id]

    # One colour per (metric, approach) combination so that approaches are
    # visually distinguishable within each figure
    n = data['eval_metric'].nunique() * data['approach_id'].nunique()
    color = iter(cm.rainbow(np.linspace(0, 1, n)))

    # Set font size
    plt.rcParams.update({'font.size': 14})

    # Loop to create one fig per metric and a line per learner
    for metric in data['eval_metric'].unique():

        fig, ax1 = plt.subplots(figsize=(15, 8))

        ax1.set_title(f"Metric: {metric}")
        ax1.set_ylabel('score')

        # check if it is k-fold or temporal-fold
        if '-' in data['fold'].iloc[0]:
            ax1.set_xlabel('time')
            plt.xticks(rotation=90)
        else:
            ax1.set_xlabel('fold')

        for approach in data['approach_id'].unique():

            c = next(color)

            for learner in data['learner_id'].unique():

                data_to_plot = data[(data['learner_id'] == learner)
                                    & (data['approach_id'] == approach) &
                                    (data['eval_metric'] == metric)]

                approach_name = data_to_plot['name'].unique()

                ax1.plot(data_to_plot['fold'], data_to_plot['score'], c=c)

                ax1.legend(approach_name)

        persist_local(data=fig,
                      args={
                          'experiment_id': experiment_id,
                          'eval_metric': metric
                      },
                      folder='evaluation_plots',
                      id_keys=['experiment_id', 'eval_metric'],
                      as_type='.png')
# +
import seaborn as sns

fontsize = 20  # General fontsize


def get_index_of_line(line, df, forbiden='forest'):
    # `axis` is expected to be a matplotlib/seaborn Axes object created
    # elsewhere in the notebook
    line = [i for i in df.unique() if ((line in i) & (forbiden not in i))][0]
    return [i.get_text()
            for i in axis.lines[0].axes.get_legend().texts].index(line) - 1


# -

con = connect_to_database()

# Add here your parameters:
macro_experiment_id = 3481383696

args = dict(experiment_id=macro_experiment_id)

query = f"""
select *
from experiments.errors
where experiment_id in (
    select experiment_id 
    from experiments.parallelization 
    where macro_experiment_id = {macro_experiment_id})
order by created_on desc
"""
Example #14
def list_documents_to_extract(output_file):
    """
    Get the list of documents (complete path) from which to extract text and characteristics.

    Parameters
    ----------
    output_file: string
        Output file where previous runs are stored.

    Returns
    -------
    docs_to_check: list
        List of documents.
    """

    docs_to_check = []
    checked_files = set(check_or_create_output_file(output_file))

    con = connect_to_database()

    # Retrieve all documents from DB
    query = f"select id_llamado, nombre_archivo as filename" \
        f" from raw.pbc_adenda" \
        f" where lower(tipo_documento) != 'adenda'" \
        f" and right(lower(nombre_archivo), 3) = 'pdf'"
    all_documents = pd.read_sql_query(query, con)

    # List document folders
    first_pbc = sorted([
        file for file in os.listdir(first_pbc_path)
        if file.lower().endswith('.pdf')
    ])
    all_pbc = sorted([
        file for file in os.listdir(all_pbc_path)
        if file.lower().endswith('.pdf')
    ])
    all_pbc_2 = sorted([
        file for file in os.listdir(more_pbc_path)
        if file.lower().endswith('.pdf')
    ])

    set_first_pbc = set(first_pbc)
    set_all_pbc = set(all_pbc + all_pbc_2)

    ids = all_documents['id_llamado'].unique()
    for i in tqdm(range(len(ids))):
        id = ids[i]
        docs_id = all_documents[all_documents['id_llamado'] ==
                                id].filename.tolist()
        set_docs = set(docs_id)

        # First, check if there are documents in the first PBC folder
        intersect_first = list(set_docs & set_first_pbc)
        len_intersect = len(intersect_first)

        # If there is more than one document in first PBC, there is a problem
        if len_intersect > 1:
            logging.error(f'{id} has {len_intersect} documents in first PBC.')
            continue

        # If there is one document in first PBC, we add it to the output list
        elif len_intersect == 1:
            curr_doc = intersect_first[0]
            if curr_doc in checked_files:
                continue
            else:
                docs_to_check.append(f"{first_pbc_path}/{curr_doc}")

        else:
            intersect_all = list(set_docs & set_all_pbc)

            # If there is no document in first PBC, but only one document existing, we add it
            if len(intersect_all) == 1:
                curr_doc = intersect_all[0]
                if curr_doc in checked_files:
                    continue
                else:
                    if curr_doc in all_pbc:
                        docs_to_check.append(f"{all_pbc_path}/{curr_doc}")
                    else:
                        docs_to_check.append(f"{more_pbc_path}/{curr_doc}")
            # If there is no document in first PBC and zero or more than one
            # match in the other PBC folders, we have a problem
            else:
                logging.error(
                    f'{id} has {len(intersect_all)} documents in PBC folders.')
                continue

    return sorted(list(set(docs_to_check)))
def generate_prioritized_risk_list(data_path,
                                   n_top_features=3,
                                   save_csv=True,
                                   output_folder=''):
    """
    Given a csv file path containing tenders' minimum information, prioritize them according to their risk
    and flag those that should be reviewed thoroughly (flagged as quality_review=1).

    Parameters
    -----------
    data_path: string
        Complete path of the csv file where the tenders information is. 
        The minimum information required is: 
            - id_llamado
            - convocante
            - tipo_entidad
            - tipo_procedimiento_codigo
            - categoria
            - _objeto_licitacion
            - monto_global_estimado_planificacion
            - _moneda
            - reception_date
    n_top_features: int
        Number of top features wanted. By default, the top 3 features will be displayed.
    save_csv: bool
        Save output to csv. By default, the output csv will be saved.
    output_folder: string
        Folder where to output the csv file. By default, the root path.

    Returns
    -------
    csv_df: DataFrame
        Prioritized dataframe.
    """

    # Load arguments of the best model
    args = pickle.load(open(f"{production_path}/best_model_args.p", 'rb'))

    # Connect to DB
    con = connect_to_database()

    # Load data
    original_df = pd.read_csv(data_path)

    # Read data and prepare it to start the ML pipeline
    features_df = generate_features_from_new_data(original_df, args, con)

    # Load encoders (model and preprocessor)
    model = load_encoder(args, type='model')
    preprocessor = load_encoder(args, type='preprocessing')

    # Apply preprocessing
    features, _ = preprocessing.run(preprocessors=list(preprocessor),
                                    data=features_df,
                                    preprocessing=preprocessor,
                                    fit=False)

    # Predict risk scores
    scores = [p[1] for p in model.predict_proba(features)]

    # Get important features for each individual tender
    features_list = get_feature_importance(model, features, n_top_features)

    # Define final output csv file
    csv_df = original_df.copy()
    csv_df['complaint_risk'] = scores
    csv_df = pd.concat(
        [csv_df,
         pd.DataFrame(features_list).add_prefix('risk_factor_')],
        axis=1)

    # Sort by risk score
    csv_df = csv_df.sort_values(by=[
        'complaint_risk'
    ], ascending=False).drop(columns=['complaint_risk']).reset_index(drop=True)

    # Flag quality reviews, according to best model k%
    n_flags = max(1, round(args['k'] * csv_df.shape[0]))
    csv_df['quality_review'] = [1] * n_flags + [0] * (csv_df.shape[0] - n_flags)

    if save_csv:
        csv_df.to_csv(
            f'{output_folder}/{dt.date.today()}_prioritized_tenders.csv',
            index=False,
            header=True)

    return csv_df
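# Usage sketch (hypothetical paths): prioritize a new batch of tenders and
# write the ranked csv, flagging the top k% for quality review.
prioritized = generate_prioritized_risk_list(data_path='/data/new_tenders.csv',
                                             n_top_features=3,
                                             save_csv=True,
                                             output_folder='/data/output')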