row_date = row[DATE_COLUMN] fix_date = [it['to'] for it in FIX_BAD_DATE if it['from'] == row_date] if len(fix_date) > 0: row_date = fix_date[0] try: row[DATE_COLUMN] = convert_to_uniform_date(row_date) except ValueError as e: # this is necessary as there are some invalid dates in the spread sheet logging.warning( 'Failed to convert: %s to a uniform date, Removing it. Full error: %s' % (row_date, e) ) del row[DATE_COLUMN] row[SOURCE_COLUMN] = 'the-marker' row[PROOF_COLUMN] = 'https://www.themarker.com/career/EXT-1.2577328' return row def convert_to_uniform_date(raw_date): for format in DATE_FORMATS: try: return datetime.datetime.strptime(raw_date, format).date() except ValueError: pass raise ValueError("Failed to convert date using all available date formats") process(process_row=process_row, modify_datapackage=modify_datapackage)
'has_article_46', {}).get( 'activities', []) ] } }, ] }, ] row['others'] = [x for x in all_districts if x != district] return row def modify_datapackage(dp, *_): dp['resources'][0]['schema']['fields'].extend([{ 'name': 'charts', 'type': 'array', 'es:itemType': 'object', 'es:index': False }, { 'name': 'others', 'type': 'array', 'es:index': False, 'es:itemType': 'string', }]) return dp if __name__ == '__main__': process(modify_datapackage=modify_datapackage, process_row=process_row)
import os
import json

from datapackage_pipelines.wrapper import process


def modify_datapackage(dp, parameters, _):
    """Persist the datapackage descriptor as JSON under parameters['out-path'].

    Creates the output directory if needed, then writes
    ``<out-path>/datapackage.json``. The write goes to a ``.tmp`` file first
    and is renamed into place, so readers never observe a partially written
    descriptor (os.rename replaces the target atomically on POSIX).

    Returns the descriptor unchanged so the pipeline can continue.
    """
    os.makedirs(parameters['out-path'], exist_ok=True)
    if dp:  # nothing to persist when the descriptor is empty/falsy
        filename = os.path.join(parameters['out-path'], 'datapackage.json')
        with open(filename + '.tmp', 'w') as tmp:
            json.dump(dp, tmp)
        os.rename(filename + '.tmp', filename)
    return dp


if __name__ == '__main__':
    process(modify_datapackage=modify_datapackage)
return try: for k, v in row.items(): if k in ['sensitive_order']: row[k] = boolean(v) elif k in ['budget_code']: row[k] = budget_code(v) elif k in ['end_date', 'order_date', 'start_date']: row[k] = date(v) elif k in ['volume']: row[k] = Decimal(v.replace(',', '') if v is not None and v != '' else 0) elif k in ['executed']: row[k] = Decimal(v.replace(',', '') if v is not None and v != '' else 0) elif isinstance(v, str): row[k] = v.strip() assert row['order_id'] stats['good-lines'] += 1 except Exception as e: stats['bad-lines'] += 1 logging.exception('ERROR in row %d: %r', row_index, row) bad_rows[row['report-url']] += 1 return elif resource_index == 1: # the errors row['report-rows'] = total_rows.get(row['report-url']) row['report-bad-rows'] = bad_rows.get(row['report-url']) return row process(process_row=process_row)
import os
import shutil
import logging

from datapackage_pipelines.wrapper import process


def cleanup(dp, parameters, *_):
    """Delete every artifact directory listed in parameters['dirs_to_clean'].

    Each entry is resolved to an absolute path before removal. A directory
    that is already gone is logged as a warning rather than raised, which
    keeps the step safe to re-run. The datapackage descriptor is returned
    untouched.
    """
    for entry in parameters['dirs_to_clean']:
        target = os.path.abspath(entry)
        logging.info('Cleaning artifact: {}'.format(target))
        try:
            shutil.rmtree(target)
        except FileNotFoundError:
            # best-effort cleanup: missing artifacts are not an error
            logging.warning('No artifact to clean: {}'.format(target))
    return dp


if __name__ == '__main__':
    process(modify_datapackage=cleanup)