def reduce_subject(
    subject,
    classifications,
    task,
    reducer_name=None,
    workflow_id=None,
    filter=None,
    keywords={}
):
    reduced_data_list = []
    classifications = classifications.drop_duplicates()
    unique_users = classifications['user_name'].unique().shape[0]
    if (filter in FILTER_TYPES) and (unique_users < classifications.shape[0]):
        classifications = classifications.groupby(['user_name'], group_keys=False).apply(FILTER_TYPES[filter])
    data = [unflatten_data(c) for cdx, c in classifications.iterrows()]
    user_ids = [c.user_id for cdx, c in classifications.iterrows()]
    reduction = reducers.reducers[reducer_name](data, user_id=user_ids, **keywords)
    if isinstance(reduction, list):
        for r in reduction:
            reduced_data_list.append(OrderedDict([
                ('subject_id', subject),
                ('workflow_id', workflow_id),
                ('task', task),
                ('reducer', reducer_name),
                ('data', r)
            ]))
    else:
        reduced_data_list.append(OrderedDict([
            ('subject_id', subject),
            ('workflow_id', workflow_id),
            ('task', task),
            ('reducer', reducer_name),
            ('data', reduction)
        ]))
    return reduced_data_list
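A minimal usage sketch for reduce_subject, assuming the surrounding module (reducers, FILTER_TYPES, unflatten_data) is importable; the CSV path, task label, and reducer name below are hypothetical placeholders.

# Hypothetical usage sketch: reduce the classifications for one subject.
# The CSV path, task, and reducer name are placeholders; reduce_subject
# expects a flattened extract table with subject_id, user_name, user_id,
# and workflow_id columns plus the flattened data columns.
import pandas

extracted = pandas.read_csv('question_extractor_extractions.csv')  # placeholder path
subject_id = extracted.subject_id.iloc[0]
classifications = extracted[extracted.subject_id == subject_id]

rows = reduce_subject(
    subject_id,
    classifications,
    task='T0',                        # placeholder task label
    reducer_name='question_reducer',  # must be a key in reducers.reducers
    workflow_id=classifications.workflow_id.iloc[0],
    filter='first',                   # one of the FILTER_TYPES keys, or None to skip filtering
    keywords={}
)
# rows is a list of OrderedDicts ready to be appended to a reductions table.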
Example #2
def most_common_text(input_file,
                     output_folder,
                     reducer_key=None,
                     strip_sw=False,
                     csv=None,
                     metadata=None):
    reducer_table = pandas.read_csv(input_file)
    if reducer_key is not None:
        edx = reducer_table.reducer_key == reducer_key
        table_to_loop = reducer_table[edx]
    else:
        table_to_loop = reducer_table
    subject_csv = []
    if metadata is not None:
        subjects = pandas.read_csv(metadata)
        subjects.metadata = subjects.metadata.apply(eval)
    counter = 0
    # `widgets` is assumed to be defined at module level (see the same list in reduce_csv below)
    pbar = progressbar.ProgressBar(widgets=widgets,
                                   max_value=len(table_to_loop))
    pbar.start()
    for _, reduction in table_to_loop.iterrows():
        page_csv = []
        pages = []
        data = unflatten_data(reduction)
        frames = sorted([k for k in data.keys() if 'frame' in k])
        if 'transcribed_lines' not in data:
            # this was reduced before v3.4, add in additional data
            number_of_lines = 0
            low_consensus_lines = 0
            for frame in frames:
                for line in data[frame]:
                    _, consensus_text = consensus_score(line['clusters_text'])
                    line['low_consensus'] = line['consensus_score'] < 3
                    line['consensus_text'] = consensus_text
                    line.setdefault('user_ids', None)
                    if line['low_consensus']:
                        low_consensus_lines += 1
                    number_of_lines += 1
            data['transcribed_lines'] = number_of_lines
            data['low_consensus_lines'] = low_consensus_lines
        data.setdefault('parameters', None)
        data.setdefault('reducer', None)
        subject_row = OrderedDict([
            ('zooniverse_subject_id', reduction.subject_id),
            ('number_of_pages', len(frames)),
            ('transcribed_lines', data['transcribed_lines']),
            ('low_consensus_lines', data['low_consensus_lines']),
            ('reducer', data['reducer']),
            ('reducer_parameters', data['parameters'])
        ])
        if metadata is not None:
            idx = (subjects.subject_id == reduction.subject_id) & (
                subjects.workflow_id == reduction.workflow_id)
            if idx.sum() > 0:
                subject_row['metadata'] = subjects[idx].iloc[0].metadata
        subject_csv.append(subject_row)
        line_counter = 0
        for frame in frames:
            page_number = int(frame[-1]) + 1
            lines = []
            for line in data[frame]:
                line_counter += 1
                if 'consensus_text' not in line:
                    # One version of aggregation defined `transcribed_lines` but not `consensus_text`
                    _, consensus_text = consensus_score(line['clusters_text'])
                    line['consensus_text'] = consensus_text
                line.setdefault('low_consensus', line['consensus_score'] < 3)
                line.setdefault('user_ids', None)
                text = line['consensus_text']
                if strip_sw:
                    text = text.replace('<sw-', '<')
                    text = text.replace('</sw-', '</')
                lines.append(text)
                page_row = OrderedDict([
                    ('line_number', line_counter),
                    ('page_number', page_number),
                    ('column_number', line['gutter_label'] + 1),
                    ('text', text), ('slope', line['line_slope']),
                    ('consensus_score', line['consensus_score']),
                    ('number_transcribers', line['number_views']),
                    ('low_consensus', line['low_consensus']),
                    ('start', {
                        'x': line['clusters_x'][0],
                        'y': line['clusters_y'][0],
                    }),
                    ('end', {
                        'x': line['clusters_x'][1],
                        'y': line['clusters_y'][1],
                    }), ('user_ids', line['user_ids'])
                ])
                page_csv.append(page_row)
            pages.append('\n'.join(lines))
        subject_dir = os.path.join(output_folder, str(reduction.subject_id))
        if not os.path.isdir(subject_dir):
            os.mkdir(subject_dir)
        transcription = '\n\n'.join(pages)
        with open(os.path.join(subject_dir, 'transcription.txt'),
                  'w') as transcription_out:
            transcription_out.write(transcription)
        page_dataframe = json_normalize(page_csv)
        page_csv_out = os.path.join(subject_dir, 'line_metadata.csv')
        page_dataframe.to_csv(page_csv_out, index=False)
        with open(os.path.join(subject_dir, 'aggregation_data.json'),
                  'w') as json_out:
            json.dump(data, json_out, cls=MyEncoder, indent=2)
        counter += 1
        pbar.update(counter)
    subject_dataframe = json_normalize(subject_csv)
    subject_csv_out = os.path.join(output_folder, 'subject_metadata.csv')
    subject_dataframe.to_csv(subject_csv_out, index=False)
    pbar.finish()
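A hypothetical call to most_common_text; the input reductions CSV, output folder, and subject metadata dump are placeholder names, and the module-level helpers it relies on (consensus_score, json_normalize, MyEncoder, widgets) are assumed to be available.

# Hypothetical usage sketch for most_common_text. File names are placeholders;
# the output folder is assumed to exist before the call.
most_common_text(
    'text_reducer_reductions.csv',    # reduced transcription data (placeholder)
    'transcription_output',           # per-subject folders are created inside this directory
    reducer_key=None,                 # or a reducer_key value to restrict which rows are processed
    strip_sw=True,                    # strip the 'sw-' prefix from tags in the consensus text
    metadata='subject_metadata.csv'   # optional dump with subject_id, workflow_id, and metadata columns
)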
    def test_unflatten_data(self):
        '''Test unflattening of data without the renest keyword'''
        result = csv_utils.unflatten_data(flat_row, renest=False)
        self.assertDictEqual(result, expected_unflatten)
Example #4
def reduce_csv(extracted_csv,
               reducer_config,
               filter='first',
               output_name='reductions',
               output_dir=os.path.abspath('.'),
               order=False,
               stream=False):
    extracted_csv = get_file_instance(extracted_csv)
    with extracted_csv as extracted_csv_in:
        extracted = pandas.read_csv(extracted_csv_in,
                                    infer_datetime_format=True,
                                    parse_dates=['created_at'],
                                    encoding='utf-8')

    extracted.sort_values(['subject_id', 'created_at'], inplace=True)
    resume = False
    subjects = extracted.subject_id.unique()
    tasks = extracted.task.unique()
    workflow_id = extracted.workflow_id.iloc[0]

    reducer_config = get_file_instance(reducer_config)
    with reducer_config as config:
        config_yaml = yaml.load(config, Loader=yaml.SafeLoader)

    assert (len(config_yaml['reducer_config']) == 1
            ), 'There must be only one reducer in the config file.'
    for key, value in config_yaml['reducer_config'].items():
        reducer_name = key
        keywords = value
    assert (reducer_name in reducers.reducers
            ), 'The reducer in the config files does not exist.'

    output_base_name, output_ext = os.path.splitext(output_name)
    output_path = os.path.join(
        output_dir, '{0}_{1}.csv'.format(reducer_name, output_base_name))

    if stream:
        if os.path.isfile(output_path):
            print('resuming from last run')
            resume = True
            with open(output_path, 'r', encoding='utf-8') as reduced_file:
                reduced_csv = pandas.read_csv(reduced_file, encoding='utf-8')
                subjects = np.setdiff1d(subjects, reduced_csv.subject_id)

    blank_reduced_data = OrderedDict([('subject_id', []), ('workflow_id', []),
                                      ('task', []), ('reducer', []),
                                      ('data', [])])

    reduced_data = copy.deepcopy(blank_reduced_data)

    widgets = [
        'Reducing: ',
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA()
    ]

    pbar = progressbar.ProgressBar(widgets=widgets, max_value=len(subjects))
    pbar.start()
    for sdx, subject in enumerate(subjects):
        idx = extracted.subject_id == subject
        for task in tasks:
            jdx = extracted.task == task
            classifications = extracted[idx & jdx]
            classifications = classifications.drop_duplicates()
            if filter in FILTER_TYPES:
                classifications = classifications.groupby(
                    ['user_name'],
                    group_keys=False).apply(FILTER_TYPES[filter])
            data = [unflatten_data(c) for cdx, c in classifications.iterrows()]
            reduction = reducers.reducers[reducer_name](data, **keywords)
            if isinstance(reduction, list):
                for r in reduction:
                    reduced_data['subject_id'].append(subject)
                    reduced_data['workflow_id'].append(workflow_id)
                    reduced_data['task'].append(task)
                    reduced_data['reducer'].append(reducer_name)
                    reduced_data['data'].append(r)
            else:
                reduced_data['subject_id'].append(subject)
                reduced_data['workflow_id'].append(workflow_id)
                reduced_data['task'].append(task)
                reduced_data['reducer'].append(reducer_name)
                reduced_data['data'].append(reduction)
        if stream:
            if (sdx == 0) and (not resume):
                pandas.DataFrame(reduced_data).to_csv(output_path,
                                                      mode='w',
                                                      index=False,
                                                      encoding='utf-8')
            else:
                pandas.DataFrame(reduced_data).to_csv(output_path,
                                                      mode='a',
                                                      index=False,
                                                      header=False,
                                                      encoding='utf-8')
            reduced_data = copy.deepcopy(blank_reduced_data)
        pbar.update(sdx + 1)
    pbar.finish()

    if stream:
        reduced_csv = pandas.read_csv(output_path, encoding='utf-8')
        if 'data' in reduced_csv:
            reduced_csv.data = reduced_csv.data.apply(eval)
            flat_reduced_data = flatten_data(reduced_csv)
        else:
            return output_path
    else:
        flat_reduced_data = flatten_data(reduced_data)
    if order:
        flat_reduced_data = order_columns(
            flat_reduced_data,
            front=['choice', 'total_vote_count', 'choice_count'])
    flat_reduced_data.to_csv(output_path, index=False, encoding='utf-8')
    return output_path
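A hypothetical end-to-end call to reduce_csv. The extract CSV and YAML config names are placeholders; the config must contain a single reducer_config entry whose key is a registered reducer name and whose value is the keyword dictionary passed to that reducer.

# Hypothetical usage sketch for reduce_csv. File names are placeholders.
# The YAML config must contain exactly one reducer, e.g.:
#
#   reducer_config:
#       question_reducer: {}
#
output_path = reduce_csv(
    'question_extractor_extractions.csv',    # flattened extracts (placeholder)
    'Reducer_config_workflow_1234_V1.yaml',  # one-reducer config (placeholder)
    filter='first',       # keep each user's first classification per subject
    output_name='reductions',
    output_dir='.',
    order=False,
    stream=True           # write per-subject rows as they finish and allow resuming
)
print(output_path)        # e.g. './question_reducer_reductions.csv'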
Example #5
    def test_unflatten_array(self):
        '''Test unflattening of data with array as a string'''
        result = csv_utils.unflatten_data(flat_row_array)
        self.assertDictEqual(result, expected_unflatten_array)
Example #6
    def test_unflatten_text_2(self):
        '''Test unflattening of data with text containing brackets'''
        result = csv_utils.unflatten_data(flat_row_text_2)
        self.assertDictEqual(result, expected_unflatten_text_2)
Example #7
    def test_unflatten_text(self):
        '''Test unflattening of data with numbers as text'''
        result = csv_utils.unflatten_data(flat_row_text)
        self.assertDictEqual(result, expected_unflatten_text)