Example No. 1
def extract_csv(classification_csv,
                config,
                output_dir=os.path.abspath('.'),
                output_name='extractions',
                order=False,
                verbose=False):
    config = get_file_instance(config)
    with config as config_in:
        config_yaml = yaml.load(config_in, Loader=yaml.SafeLoader)

    extractor_config = config_yaml['extractor_config']
    workflow_id = config_yaml['workflow_id']
    version = config_yaml['workflow_version']

    blank_extracted_data = OrderedDict([('classification_id', []),
                                        ('user_name', []), ('user_id', []),
                                        ('workflow_id', []), ('task', []),
                                        ('created_at', []), ('subject_id', []),
                                        ('extractor', []), ('data', [])])

    extracted_data = {}

    classification_csv = get_file_instance(classification_csv)
    with classification_csv as classification_csv_in:
        classifications = pandas.read_csv(classification_csv_in,
                                          encoding='utf-8',
                                          dtype={'workflow_version': str})

    wdx = classifications.workflow_id == workflow_id
    assert (
        wdx.sum() >
        0), 'There are no classifications matching the configured workflow ID'
    if '.' in version:
        vdx = classifications.workflow_version == version
    else:
        vdx = classifications.workflow_version.apply(
            get_major_version) == version

    assert (
        vdx.sum() > 0
    ), 'There are no classifications matching the configured version number'
    assert (
        (vdx & wdx).sum() > 0
    ), 'There are no classifications matching the combined workflow ID and version number'

    widgets = [
        'Extracting: ',
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA()
    ]
    pbar = progressbar.ProgressBar(widgets=widgets,
                                   max_value=(wdx & vdx).sum())
    counter = 0
    pbar.start()
    for cdx, classification in classifications[wdx & vdx].iterrows():
        classification_by_task = annotation_by_task(
            {'annotations': json.loads(classification.annotations)})
        for extractor_name, keywords in extractor_config.items():
            extractor_key = extractor_name
            if 'shape_extractor' in extractor_name:
                extractor_key = 'shape_extractor'
            for keyword in keywords:
                if extractor_key in extractors.extractors:
                    try:
                        extract = extractors.extractors[extractor_key](
                            copy.deepcopy(classification_by_task), **keyword)
                    except Exception:
                        if verbose:
                            print()
                            print('Incorrectly formatted annotation')
                            print(classification)
                            print(extractor_key)
                            print(classification_by_task)
                            print(keyword)
                        continue
                    if isinstance(extract, list):
                        for e in extract:
                            extracted_data.setdefault(
                                extractor_name,
                                copy.deepcopy(blank_extracted_data))
                            extracted_data[extractor_name][
                                'classification_id'].append(
                                    classification.classification_id)
                            extracted_data[extractor_name]['user_name'].append(
                                classification.user_name)
                            extracted_data[extractor_name]['user_id'].append(
                                classification.user_id)
                            extracted_data[extractor_name][
                                'workflow_id'].append(
                                    classification.workflow_id)
                            extracted_data[extractor_name]['task'].append(
                                keyword['task'])
                            extracted_data[extractor_name][
                                'created_at'].append(classification.created_at)
                            extracted_data[extractor_name][
                                'subject_id'].append(
                                    classification.subject_ids)
                            extracted_data[extractor_name]['extractor'].append(
                                extractor_name)
                            extracted_data[extractor_name]['data'].append(e)
                    else:
                        extracted_data.setdefault(
                            extractor_name,
                            copy.deepcopy(blank_extracted_data))
                        extracted_data[extractor_name][
                            'classification_id'].append(
                                classification.classification_id)
                        extracted_data[extractor_name]['user_name'].append(
                            classification.user_name)
                        extracted_data[extractor_name]['user_id'].append(
                            classification.user_id)
                        extracted_data[extractor_name]['workflow_id'].append(
                            classification.workflow_id)
                        extracted_data[extractor_name]['task'].append(
                            keyword['task'])
                        extracted_data[extractor_name]['created_at'].append(
                            classification.created_at)
                        extracted_data[extractor_name]['subject_id'].append(
                            classification.subject_ids)
                        extracted_data[extractor_name]['extractor'].append(
                            extractor_name)
                        extracted_data[extractor_name]['data'].append(extract)
        counter += 1
        pbar.update(counter)
    pbar.finish()

    # create one flat csv file for each extractor used
    output_base_name, output_ext = os.path.splitext(output_name)
    output_files = []
    for extractor_name, data in extracted_data.items():
        output_path = os.path.join(
            output_dir, '{0}_{1}.csv'.format(extractor_name, output_base_name))
        output_files.append(output_path)
        flat_extract = flatten_data(data)
        if order:
            flat_extract = order_columns(flat_extract, front=['choice'])
        flat_extract.to_csv(output_path, index=False, encoding='utf-8')
    return output_files
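
A minimal usage sketch for extract_csv (the file names and the config YAML name below are placeholders; the config is only assumed to carry the 'extractor_config', 'workflow_id', and 'workflow_version' keys read at the top of the function):

# Hypothetical invocation; the paths are placeholders, not files shipped with the project.
output_files = extract_csv(
    'my-project-classifications.csv',              # Panoptes classification export (assumed name)
    'Extractor_config_workflow_1234_V1.1.yaml',    # extractor config (assumed name)
    output_dir='.',
    output_name='extractions',
    order=True,     # put the 'choice' column first in each output CSV
    verbose=True    # report annotations the extractor cannot parse
)
# Returns one CSV path per extractor, e.g. '<extractor_name>_extractions.csv'.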
def reduce_csv(
    extracted_csv,
    reducer_config,
    filter='first',
    output_name='reductions',
    output_dir=CURRENT_PATH,
    order=False,
    stream=False,
    cpu_count=1
):
    extracted_csv = get_file_instance(extracted_csv)
    with extracted_csv as extracted_csv_in:
        extracted = pandas.read_csv(
            extracted_csv_in,
            infer_datetime_format=True,
            parse_dates=['created_at'],
            encoding='utf-8'
        )

    extracted.sort_values(['subject_id', 'created_at'], inplace=True)
    resume = False
    subjects = extracted.subject_id.unique()
    tasks = extracted.task.unique()
    workflow_id = extracted.workflow_id.iloc[0]

    reducer_config = get_file_instance(reducer_config)
    with reducer_config as config:
        config_yaml = yaml.load(config, Loader=yaml.SafeLoader)

    assert (len(config_yaml['reducer_config']) == 1), 'There must be only one reducer in the config file.'
    for key, value in config_yaml['reducer_config'].items():
        reducer_name = key
        keywords = value
    assert (reducer_name in reducers.reducers), 'The reducer in the config file does not exist.'

    output_base_name, _ = os.path.splitext(output_name)
    output_path = os.path.join(output_dir, '{0}_{1}.csv'.format(reducer_name, output_base_name))

    if stream:
        if os.path.isfile(output_path):
            print('resuming from last run')
            resume = True
            with open(output_path, 'r', encoding='utf-8') as reduced_file:
                reduced_csv = pandas.read_csv(reduced_file, encoding='utf-8')
                subjects = np.setdiff1d(subjects, reduced_csv.subject_id)

    reduced_data = []
    sdx = 0

    apply_keywords = {
        'reducer_name': reducer_name,
        'workflow_id': workflow_id,
        'filter': filter,
        'keywords': keywords
    }

    widgets = [
        'Reducing: ',
        progressbar.Percentage(),
        ' ', progressbar.Bar(),
        ' ', progressbar.ETA()
    ]
    number_of_rows = len(subjects) * len(tasks)
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=number_of_rows)

    def callback(reduced_data_list):
        nonlocal reduced_data
        nonlocal sdx
        nonlocal pbar
        nonlocal stream
        reduced_data += reduced_data_list
        if stream:
            if (sdx == 0) and (not resume):
                pandas.DataFrame(reduced_data).to_csv(
                    output_path,
                    mode='w',
                    index=False,
                    encoding='utf-8'
                )
            else:
                pandas.DataFrame(reduced_data).to_csv(
                    output_path,
                    mode='a',
                    index=False,
                    header=False,
                    encoding='utf-8'
                )
            reduced_data.clear()
        sdx += 1
        pbar.update(sdx)

    pbar.start()
    if cpu_count > 1:
        pool = Pool(cpu_count)
    for subject in subjects:
        idx = extracted.subject_id == subject
        for task in tasks:
            jdx = extracted.task == task
            classifications = extracted[idx & jdx]
            if cpu_count > 1:
                pool.apply_async(
                    reduce_subject,
                    args=(
                        subject,
                        classifications,
                        task
                    ),
                    kwds=apply_keywords,
                    callback=callback
                )
            else:
                reduced_data_list = reduce_subject(
                    subject,
                    classifications,
                    task,
                    **apply_keywords
                )
                callback(reduced_data_list)
    if cpu_count > 1:
        pool.close()
        pool.join()
    pbar.finish()

    if stream:
        reduced_csv = pandas.read_csv(output_path, encoding='utf-8')
        if 'data' in reduced_csv:
            def eval_func(a):
                # pandas uses a local namespace, make sure it has the correct imports
                from collections import OrderedDict  # noqa
                from numpy import nan  # noqa
                return eval(a)
            reduced_csv.data = reduced_csv.data.apply(eval_func)
            flat_reduced_data = flatten_data(reduced_csv)
        else:
            return output_path
    else:
        non_flat_data = pandas.DataFrame(reduced_data)
        flat_reduced_data = flatten_data(non_flat_data)
    if order:
        flat_reduced_data = order_columns(flat_reduced_data, front=['choice', 'total_vote_count', 'choice_count'])
    flat_reduced_data.to_csv(output_path, index=False, encoding='utf-8')
    return output_path
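
A sketch of how this reduce_csv might be called on one of the extraction files produced above (the extraction CSV and reducer config names are placeholders):

# Hypothetical call; file names are placeholders.  With stream=True the
# reductions are appended to the output file as they are computed, and an
# existing output file is resumed rather than overwritten.
output_path = reduce_csv(
    'question_extractor_extractions.csv',
    'Reducer_config_workflow_1234_V1.1_question_extractor.yaml',
    filter='first',   # keep only each user's first classification of a subject
    stream=True,
    cpu_count=4       # reduce subjects in a multiprocessing Pool
)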
    def test_order_columns(self):
        '''Test order columns'''
        result = csv_utils.order_columns(unordered_data, front=['choice'])
        assert_frame_equal(result, ordered_data)
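
The test relies on module-level unordered_data and ordered_data fixtures that are not part of this excerpt; a minimal sketch of what they could look like (the values are invented, only the column order matters):

import pandas
# Hypothetical fixtures: the same frame without and with 'choice' leading.
unordered_data = pandas.DataFrame({
    'data.value': [1, 2],
    'choice': ['cat', 'dog'],
})
ordered_data = pandas.DataFrame({
    'choice': ['cat', 'dog'],
    'data.value': [1, 2],
})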
Example No. 4
def extract_csv(
    classification_csv,
    config,
    output_dir=CURRENT_PATH,
    output_name='extractions',
    order=False,
    verbose=False,
    cpu_count=1
):
    config = get_file_instance(config)
    with config as config_in:
        config_yaml = yaml.load(config_in, Loader=yaml.SafeLoader)

    extractor_config = config_yaml['extractor_config']
    workflow_id = config_yaml['workflow_id']
    version = config_yaml['workflow_version']
    number_of_extractors = sum(len(value) for value in extractor_config.values())

    extracted_data = defaultdict(list)

    classification_csv = get_file_instance(classification_csv)
    with classification_csv as classification_csv_in:
        classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})

    wdx = classifications.workflow_id == workflow_id
    assert (wdx.sum() > 0), 'There are no classifications matching the configured workflow ID'
    if '.' in version:
        vdx = classifications.workflow_version == version
    else:
        vdx = classifications.workflow_version.apply(get_major_version) == version

    assert (vdx.sum() > 0), 'There are no classifications matching the configured version number'
    assert ((vdx & wdx).sum() > 0), 'There are no classifications matching the combined workflow ID and version number'

    widgets = [
        'Extracting: ',
        progressbar.Percentage(),
        ' ', progressbar.Bar(),
        ' ', progressbar.ETA()
    ]
    max_pbar = (wdx & vdx).sum() * number_of_extractors
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=max_pbar)
    counter = 0

    def callback(name_with_row):
        nonlocal extracted_data
        nonlocal counter
        nonlocal pbar
        extractor_name, new_extract_row = name_with_row
        if new_extract_row is not None:
            extracted_data[extractor_name] += new_extract_row
        counter += 1
        pbar.update(counter)

    pbar.start()
    if cpu_count > 1:
        pool = Pool(cpu_count)
    for _, classification in classifications[wdx & vdx].iterrows():
        classification_by_task = annotation_by_task({
            'annotations': json.loads(classification.annotations),
            'metadata': json.loads(classification.metadata)
        })
        classification_info = {
            'classification_id': classification.classification_id,
            'user_name': classification.user_name,
            'user_id': classification.user_id,
            'workflow_id': classification.workflow_id,
            'created_at': classification.created_at,
            'subject_ids': classification.subject_ids
        }
        for extractor_name, keywords in extractor_config.items():
            extractor_key = extractor_name
            if 'shape_extractor' in extractor_name:
                extractor_key = 'shape_extractor'
            for keyword in keywords:
                if extractor_key in extractors.extractors:
                    if cpu_count > 1:
                        pool.apply_async(
                            extract_classification,
                            args=(
                                copy.deepcopy(classification_by_task),
                                classification_info,
                                extractor_key,
                                extractor_name,
                                keyword,
                                verbose
                            ),
                            callback=callback
                        )
                    else:
                        name_with_row = extract_classification(
                            copy.deepcopy(classification_by_task),
                            classification_info,
                            extractor_key,
                            extractor_name,
                            keyword,
                            verbose
                        )
                        callback(name_with_row)
                else:
                    callback((None, None))
    if cpu_count > 1:
        pool.close()
        pool.join()
    pbar.finish()

    # create one flat csv file for each extractor used
    output_base_name, _ = os.path.splitext(output_name)
    output_files = []
    for extractor_name, data in extracted_data.items():
        output_path = os.path.join(output_dir, '{0}_{1}.csv'.format(extractor_name, output_base_name))
        output_files.append(output_path)
        non_flat_extract = pandas.DataFrame(data)
        flat_extract = flatten_data(non_flat_extract)
        if order:
            flat_extract = order_columns(flat_extract, front=['choice'])
        flat_extract.to_csv(output_path, index=False, encoding='utf-8')
    return output_files
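
The parallel path hands each classification to an extract_classification helper that is not shown in this listing; the callback only assumes it returns an (extractor_name, rows_or_None) tuple. A rough sketch of such a helper, inferred from the serial version in Example No. 1 and not the project's actual implementation:

def extract_classification_sketch(classification_by_task, classification_info,
                                  extractor_key, extractor_name, keyword, verbose):
    # Inferred sketch: run one extractor on one classification and return
    # (extractor_name, rows or None) in the shape callback() expects.
    try:
        extract = extractors.extractors[extractor_key](classification_by_task, **keyword)
    except Exception:
        if verbose:
            print('Incorrectly formatted annotation')
        return extractor_name, None
    extract_list = extract if isinstance(extract, list) else [extract]
    new_rows = [
        dict(classification_info, task=keyword['task'], extractor=extractor_name, data=e)
        for e in extract_list
    ]
    return extractor_name, new_rows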
Example No. 5
def reduce_csv(extracted_csv,
               reducer_config,
               filter='first',
               output_name='reductions',
               output_dir=os.path.abspath('.'),
               order=False,
               stream=False):
    extracted_csv = get_file_instance(extracted_csv)
    with extracted_csv as extracted_csv_in:
        extracted = pandas.read_csv(extracted_csv_in,
                                    infer_datetime_format=True,
                                    parse_dates=['created_at'],
                                    encoding='utf-8')

    extracted.sort_values(['subject_id', 'created_at'], inplace=True)
    resume = False
    subjects = extracted.subject_id.unique()
    tasks = extracted.task.unique()
    workflow_id = extracted.workflow_id.iloc[0]

    reducer_config = get_file_instance(reducer_config)
    with reducer_config as config:
        config_yaml = yaml.load(config, Loader=yaml.SafeLoader)

    assert (len(config_yaml['reducer_config']) == 1
            ), 'There must be only one reducer in the config file.'
    for key, value in config_yaml['reducer_config'].items():
        reducer_name = key
        keywords = value
    assert (reducer_name in reducers.reducers
            ), 'The reducer in the config file does not exist.'

    output_base_name, output_ext = os.path.splitext(output_name)
    output_path = os.path.join(
        output_dir, '{0}_{1}.csv'.format(reducer_name, output_base_name))

    if stream:
        if os.path.isfile(output_path):
            print('resuming from last run')
            resume = True
            with open(output_path, 'r', encoding='utf-8') as reduced_file:
                reduced_csv = pandas.read_csv(reduced_file, encoding='utf-8')
                subjects = np.setdiff1d(subjects, reduced_csv.subject_id)

    blank_reduced_data = OrderedDict([('subject_id', []), ('workflow_id', []),
                                      ('task', []), ('reducer', []),
                                      ('data', [])])

    reduced_data = copy.deepcopy(blank_reduced_data)

    widgets = [
        'Reducing: ',
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA()
    ]

    pbar = progressbar.ProgressBar(widgets=widgets, max_value=len(subjects))
    pbar.start()
    for sdx, subject in enumerate(subjects):
        idx = extracted.subject_id == subject
        for task in tasks:
            jdx = extracted.task == task
            classifications = extracted[idx & jdx]
            classifications = classifications.drop_duplicates()
            if filter in FILTER_TYPES:
                classifications = classifications.groupby(
                    ['user_name'],
                    group_keys=False).apply(FILTER_TYPES[filter])
            data = [unflatten_data(c) for cdx, c in classifications.iterrows()]
            reduction = reducers.reducers[reducer_name](data, **keywords)
            if isinstance(reduction, list):
                for r in reduction:
                    reduced_data['subject_id'].append(subject)
                    reduced_data['workflow_id'].append(workflow_id)
                    reduced_data['task'].append(task)
                    reduced_data['reducer'].append(reducer_name)
                    reduced_data['data'].append(r)
            else:
                reduced_data['subject_id'].append(subject)
                reduced_data['workflow_id'].append(workflow_id)
                reduced_data['task'].append(task)
                reduced_data['reducer'].append(reducer_name)
                reduced_data['data'].append(reduction)
        if stream:
            if (sdx == 0) and (not resume):
                pandas.DataFrame(reduced_data).to_csv(output_path,
                                                      mode='w',
                                                      index=False,
                                                      encoding='utf-8')
            else:
                pandas.DataFrame(reduced_data).to_csv(output_path,
                                                      mode='a',
                                                      index=False,
                                                      header=False,
                                                      encoding='utf-8')
            reduced_data = copy.deepcopy(blank_reduced_data)
        pbar.update(sdx + 1)
    pbar.finish()

    if stream:
        reduced_csv = pandas.read_csv(output_path, encoding='utf-8')
        if 'data' in reduced_csv:
            reduced_csv.data = reduced_csv.data.apply(eval)
            flat_reduced_data = flatten_data(reduced_csv)
        else:
            return output_path
    else:
        flat_reduced_data = flatten_data(reduced_data)
    if order:
        flat_reduced_data = order_columns(
            flat_reduced_data,
            front=['choice', 'total_vote_count', 'choice_count'])
    flat_reduced_data.to_csv(output_path, index=False, encoding='utf-8')
    return output_path
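
This version looks the filter keyword up in a FILTER_TYPES mapping that the excerpt does not define; a plausible sketch, assuming the usual 'first'/'last'/'all' choices (the project's real mapping may differ):

# Hypothetical FILTER_TYPES: each value is applied to one user_name group.
FILTER_TYPES = {
    'first': lambda group: group.iloc[[0]],    # keep the user's first classification
    'last': lambda group: group.iloc[[-1]],    # keep the user's last classification
    'all': lambda group: group,                # keep every classification
}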