def reduce_subject(
    subject,
    classifications,
    task,
    reducer_name=None,
    workflow_id=None,
    filter=None,
    keywords={}
):
    # Reduce all extracts for a single subject/task pair with the named reducer.
    reduced_data_list = []
    classifications = classifications.drop_duplicates()
    unique_users = classifications['user_name'].unique().shape[0]
    # Only apply the per-user filter when at least one user classified more than once
    if (filter in FILTER_TYPES) and (unique_users < classifications.shape[0]):
        classifications = classifications.groupby(
            ['user_name'],
            group_keys=False
        ).apply(FILTER_TYPES[filter])
    data = [unflatten_data(c) for cdx, c in classifications.iterrows()]
    user_ids = [c.user_id for cdx, c in classifications.iterrows()]
    reduction = reducers.reducers[reducer_name](data, user_id=user_ids, **keywords)
    # Some reducers return a list of reductions, others a single reduction
    if isinstance(reduction, list):
        for r in reduction:
            reduced_data_list.append(OrderedDict([
                ('subject_id', subject),
                ('workflow_id', workflow_id),
                ('task', task),
                ('reducer', reducer_name),
                ('data', r)
            ]))
    else:
        reduced_data_list.append(OrderedDict([
            ('subject_id', subject),
            ('workflow_id', workflow_id),
            ('task', task),
            ('reducer', reducer_name),
            ('data', reduction)
        ]))
    return reduced_data_list
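# Illustrative usage sketch (not part of the library): build a tiny,
# hypothetical extract table and reduce it for one subject/task pair.
# The column values, subject/workflow ids, and the 'question_reducer'
# name are assumptions made for this example only.
def _example_reduce_subject():
    import pandas
    classifications = pandas.DataFrame({
        'user_name': ['alice', 'bob'],
        'user_id': [1, 2],
        'data.yes': [1, None],
        'data.no': [None, 1],
    })
    return reduce_subject(
        12345,                            # hypothetical subject id
        classifications,
        'T0',
        reducer_name='question_reducer',  # assumes this reducer is registered
        workflow_id=678,
        filter='first'
    )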
def most_common_text(input_file, output_folder, reducer_key=None, strip_sw=False, csv=None, metadata=None):
    # Export consensus transcriptions: one folder per subject containing the
    # plain-text transcription, per-line metadata, and the aggregation JSON,
    # plus a subject-level summary CSV for the whole run.
    reducer_table = pandas.read_csv(input_file)
    if reducer_key is not None:
        edx = reducer_table.reducer_key == reducer_key
        table_to_loop = reducer_table[edx]
    else:
        table_to_loop = reducer_table
    subject_csv = []
    if metadata is not None:
        subjects = pandas.read_csv(metadata)
        subjects.metadata = subjects.metadata.apply(eval)
    counter = 0
    # `widgets` is assumed to be the module-level progressbar widget list
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=len(table_to_loop))
    pbar.start()
    for _, reduction in table_to_loop.iterrows():
        page_csv = []
        pages = []
        data = unflatten_data(reduction)
        frames = sorted([k for k in data.keys() if 'frame' in k])
        if 'transcribed_lines' not in data:
            # this was reduced before v3.4, add in additional data
            number_of_lines = 0
            low_consensus_lines = 0
            for frame in frames:
                for line in data[frame]:
                    _, consensus_text = consensus_score(line['clusters_text'])
                    line['low_consensus'] = line['consensus_score'] < 3
                    line['consensus_text'] = consensus_text
                    line.setdefault('user_ids', None)
                    if line['low_consensus']:
                        low_consensus_lines += 1
                    number_of_lines += 1
            data['transcribed_lines'] = number_of_lines
            data['low_consensus_lines'] = low_consensus_lines
            data.setdefault('parameters', None)
            data.setdefault('reducer', None)
        subject_row = OrderedDict([
            ('zooniverse_subject_id', reduction.subject_id),
            ('number_of_pages', len(frames)),
            ('transcribed_lines', data['transcribed_lines']),
            ('low_consensus_lines', data['low_consensus_lines']),
            ('reducer', data['reducer']),
            ('reducer_parameters', data['parameters'])
        ])
        if metadata is not None:
            idx = (subjects.subject_id == reduction.subject_id) & (
                subjects.workflow_id == reduction.workflow_id)
            if idx.sum() > 0:
                subject_row['metadata'] = subjects[idx].iloc[0].metadata
        subject_csv.append(subject_row)
        line_counter = 0
        for frame in frames:
            page_number = int(frame[-1]) + 1
            lines = []
            for line in data[frame]:
                line_counter += 1
                if 'consensus_text' not in line:
                    # One version of aggregation defined `transcribed_lines` but not `consensus_text`
                    _, consensus_text = consensus_score(line['clusters_text'])
                    line['consensus_text'] = consensus_text
                    line.setdefault('low_consensus', line['consensus_score'] < 3)
                    line.setdefault('user_ids', None)
                text = line['consensus_text']
                if strip_sw:
                    text = text.replace('<sw-', '<')
                    text = text.replace('</sw-', '</')
                lines.append(text)
                page_row = OrderedDict([
                    ('line_number', line_counter),
                    ('page_number', page_number),
                    ('column_number', line['gutter_label'] + 1),
                    ('text', text),
                    ('slope', line['line_slope']),
                    ('consensus_score', line['consensus_score']),
                    ('number_transcribers', line['number_views']),
                    ('low_consensus', line['low_consensus']),
                    ('start', {
                        'x': line['clusters_x'][0],
                        'y': line['clusters_y'][0],
                    }),
                    ('end', {
                        'x': line['clusters_x'][1],
                        'y': line['clusters_y'][1],
                    }),
                    ('user_ids', line['user_ids'])
                ])
                page_csv.append(page_row)
            pages.append('\n'.join(lines))
        subject_dir = os.path.join(output_folder, str(reduction.subject_id))
        if not os.path.isdir(subject_dir):
            os.mkdir(subject_dir)
        transcription = '\n\n'.join(pages)
        with open(os.path.join(subject_dir, 'transcription.txt'), 'w') as transcription_out:
            transcription_out.write(transcription)
        page_dataframe = json_normalize(page_csv)
        page_csv_out = os.path.join(subject_dir, 'line_metadata.csv')
        page_dataframe.to_csv(page_csv_out, index=False)
        with open(os.path.join(subject_dir, 'aggregation_data.json'), 'w') as json_out:
            json.dump(data, json_out, cls=MyEncoder, indent=2)
        counter += 1
        pbar.update(counter)
    subject_dataframe = json_normalize(subject_csv)
    subject_csv_out = os.path.join(output_folder, 'subject_metadata.csv')
    subject_dataframe.to_csv(subject_csv_out, index=False)
    pbar.finish()
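# Illustrative usage sketch (not part of the library): export consensus
# transcriptions from a hypothetical reductions CSV. The file and folder
# names and the reducer key are assumptions made for this example only.
def _example_most_common_text():
    most_common_text(
        'text_reducer_reductions.csv',    # hypothetical reductions CSV
        'transcription_output',           # existing output folder
        reducer_key='T0',                 # hypothetical reducer key
        strip_sw=True,                    # strip <sw-*> markup from the text
        metadata='subject_metadata.csv'   # optional subject metadata export
    )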
def test_unflatten_data(self):
    '''Test unflattening of data without the renest keyword'''
    result = csv_utils.unflatten_data(flat_row, renest=False)
    self.assertDictEqual(result, expected_unflatten)
def reduce_csv(
    extracted_csv,
    reducer_config,
    filter='first',
    output_name='reductions',
    output_dir=os.path.abspath('.'),
    order=False,
    stream=False
):
    # Reduce an extracted CSV into a reductions CSV using the single reducer
    # named in the YAML config file.
    extracted_csv = get_file_instance(extracted_csv)
    with extracted_csv as extracted_csv_in:
        extracted = pandas.read_csv(
            extracted_csv_in,
            infer_datetime_format=True,
            parse_dates=['created_at'],
            encoding='utf-8'
        )
    extracted.sort_values(['subject_id', 'created_at'], inplace=True)
    resume = False
    subjects = extracted.subject_id.unique()
    tasks = extracted.task.unique()
    workflow_id = extracted.workflow_id.iloc[0]
    reducer_config = get_file_instance(reducer_config)
    with reducer_config as config:
        config_yaml = yaml.load(config, Loader=yaml.SafeLoader)
    assert (len(config_yaml['reducer_config']) == 1), \
        'There must be only one reducer in the config file.'
    for key, value in config_yaml['reducer_config'].items():
        reducer_name = key
        keywords = value
    assert (reducer_name in reducers.reducers), \
        'The reducer in the config file does not exist.'
    output_base_name, output_ext = os.path.splitext(output_name)
    output_path = os.path.join(
        output_dir,
        '{0}_{1}.csv'.format(reducer_name, output_base_name)
    )
    if stream:
        if os.path.isfile(output_path):
            # A previous streamed run exists; skip subjects already reduced
            print('resuming from last run')
            resume = True
            with open(output_path, 'r', encoding='utf-8') as reduced_file:
                reduced_csv = pandas.read_csv(reduced_file, encoding='utf-8')
                subjects = np.setdiff1d(subjects, reduced_csv.subject_id)
    blank_reduced_data = OrderedDict([
        ('subject_id', []),
        ('workflow_id', []),
        ('task', []),
        ('reducer', []),
        ('data', [])
    ])
    reduced_data = copy.deepcopy(blank_reduced_data)
    widgets = [
        'Reducing: ',
        progressbar.Percentage(),
        ' ', progressbar.Bar(),
        ' ', progressbar.ETA()
    ]
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=len(subjects))
    pbar.start()
    for sdx, subject in enumerate(subjects):
        idx = extracted.subject_id == subject
        for task in tasks:
            jdx = extracted.task == task
            classifications = extracted[idx & jdx]
            classifications = classifications.drop_duplicates()
            if filter in FILTER_TYPES:
                classifications = classifications.groupby(
                    ['user_name'],
                    group_keys=False
                ).apply(FILTER_TYPES[filter])
            data = [unflatten_data(c) for cdx, c in classifications.iterrows()]
            reduction = reducers.reducers[reducer_name](data, **keywords)
            # Some reducers return a list of reductions, others a single reduction
            if isinstance(reduction, list):
                for r in reduction:
                    reduced_data['subject_id'].append(subject)
                    reduced_data['workflow_id'].append(workflow_id)
                    reduced_data['task'].append(task)
                    reduced_data['reducer'].append(reducer_name)
                    reduced_data['data'].append(r)
            else:
                reduced_data['subject_id'].append(subject)
                reduced_data['workflow_id'].append(workflow_id)
                reduced_data['task'].append(task)
                reduced_data['reducer'].append(reducer_name)
                reduced_data['data'].append(reduction)
        if stream:
            # Write (or append) the rows for this subject and reset the buffer
            if (sdx == 0) and (not resume):
                pandas.DataFrame(reduced_data).to_csv(
                    output_path,
                    mode='w',
                    index=False,
                    encoding='utf-8'
                )
            else:
                pandas.DataFrame(reduced_data).to_csv(
                    output_path,
                    mode='a',
                    index=False,
                    header=False,
                    encoding='utf-8'
                )
            reduced_data = copy.deepcopy(blank_reduced_data)
        pbar.update(sdx + 1)
    pbar.finish()
    if stream:
        reduced_csv = pandas.read_csv(output_path, encoding='utf-8')
        if 'data' in reduced_csv:
            reduced_csv.data = reduced_csv.data.apply(eval)
            flat_reduced_data = flatten_data(reduced_csv)
        else:
            return output_path
    else:
        flat_reduced_data = flatten_data(reduced_data)
    if order:
        flat_reduced_data = order_columns(
            flat_reduced_data,
            front=['choice', 'total_vote_count', 'choice_count']
        )
    flat_reduced_data.to_csv(output_path, index=False, encoding='utf-8')
    return output_path
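# Illustrative usage sketch (not part of the library): reduce an extracted
# CSV with a single-reducer YAML config. The file names are assumptions made
# for this example only; stream=True writes rows per subject so an
# interrupted run can resume from the partial output file.
def _example_reduce_csv():
    return reduce_csv(
        'question_extractor_extractions.csv',    # hypothetical extract CSV
        'Reducer_config_workflow_1234_V1.yaml',  # hypothetical config file
        filter='first',
        output_name='reductions',
        stream=True
    )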
def test_unflatten_array(self):
    '''Test unflattening of data with an array stored as a string'''
    result = csv_utils.unflatten_data(flat_row_array)
    self.assertDictEqual(result, expected_unflatten_array)

def test_unflatten_text_2(self):
    '''Test unflattening of data with text containing brackets'''
    result = csv_utils.unflatten_data(flat_row_text_2)
    self.assertDictEqual(result, expected_unflatten_text_2)

def test_unflatten_text(self):
    '''Test unflattening of data with numbers as text'''
    result = csv_utils.unflatten_data(flat_row_text)
    self.assertDictEqual(result, expected_unflatten_text)