Exemplo n.º 1
0
def _es_linker(project_id, data_params, module_params):
    '''
    Run the 'es_linker' module on a link project and write the result.

    An ESLinker is built for `project_id`, its `linker` method is invoked
    for the 'es_linker' module, and `write_data()` is called afterwards.

    ARGUMENTS (GET):
        project_id: ID for "link" project

    ARGUMENTS (POST):
        - data_params: not read by this function; None is passed to the
            linker in its place. (Earlier documentation listed the keys
            "module_name" and "file_name" for the source to fetch from.)
        - module_params: forwarded unchanged to the linker. Documented keys:
                {
                "index_name": name of the Elasticsearch index to fetch from
                "query_template": (no description available)
                "threshold": minimum value of score for this
                             query_template for a match
                "must": terms to filter by field
                        (AND: will include ONLY IF ALL are in text)
                "must_not": terms to exclude by field from search
                            (OR: will exclude if ANY is found)
                "exact_pairs": (optional)
                "non_matching_pairs": (optional)
                }

    RETURNS:
        The second element of the pair returned by the linker call
        (the run information).
    '''
    # NOTE(review): the original author flagged that it is unclear which
    # project / which ID is meant here -- still to be confirmed.
    link_project = ESLinker(project_id)

    # The linker returns a pair; only the run information is needed here.
    (_linked_data, run_summary) = link_project.linker(
        'es_linker', None, module_params)

    link_project.write_data()
    return run_summary
Exemplo n.º 2
0
def _dedupe_linker(project_id, *argv):
    '''
    Run the 'dedupe_linker' module on a link project and write the result.

    Unlike other modules, linker modules take paths as input in addition
    to module parameters; both are generated here from the project itself.

    ARGUMENTS (GET):
        project_id: ID for "link" project

    ARGUMENTS (POST):
        - data_params: none
        - module_params: none
        (any extra positional arguments are accepted and ignored)

    RETURNS:
        An empty dict.

    # Todo: deprecate
    '''
    # Per the original note: ref and source are loaded by default.
    link_project = ESLinker(project_id=project_id)

    dedupe_paths = link_project._gen_paths_dedupe()

    # The dedupe variable definition is derived from the stored
    # column matches of the project.
    column_matches = link_project.read_col_matches()
    variable_definition = link_project._gen_dedupe_variable_definition(
        column_matches)

    # TODO: This should probably be moved
    logging.info('Performing linking')

    # Perform linking; no column selection is applied on either side.
    link_project.linker(
        'dedupe_linker',
        dedupe_paths,
        {
            'variable_definition': variable_definition,
            'selected_columns_from_source': None,
            'selected_columns_from_ref': None
        })

    logging.info('Writing data')
    # Write transformations and log
    link_project.write_data()

    # Report where the output ended up, based on the in-memory data info.
    written_info = link_project.mem_data_info
    output_path = link_project.path_to(written_info['module_name'],
                                       written_info['file_name'])
    logging.info('Wrote data to: {0}'.format(output_path))

    return {}