def find_citations_for_cases(filename, case_citations_output_filename, case_citations_fieldnames,
                             legislation_citations_output_filename, legislation_citations_fieldnames):
    df_eclis = pd.read_csv(filename, usecols=[ECLI, RS_DATE, RS_RELATION])
    df_eclis = df_eclis.sort_values(by='date_decision').reset_index(drop=True)
    eclis = list(df_eclis[ECLI])
    if os.getenv('SAMPLE_TEST') == 'TRUE':
        eclis = eclis[-10:]
    if citation_type == "inkomende-links":
        print("Fetching incoming citations from LIDO...")
    else:
        print("Fetching outgoing citations from LIDO...")
    for i, ecli in enumerate(eclis):
        date = datetime.date.fromisoformat(df_eclis[RS_DATE][i])
        relation = df_eclis[RS_RELATION][i]
        if date >= last_updated:
            try:
                case_citations, legislation_citations = find_citations_for_case(
                    remove_spaces_from_ecli(ecli), date, relation,
                    case_citations_fieldnames, legislation_citations_fieldnames)
                write_incremental_rows(filename=case_citations_output_filename, data=case_citations)
                write_incremental_rows(filename=legislation_citations_output_filename, data=legislation_citations)
            except Exception as e:
                print(f'{ecli} failed: {e}')
                write_incremental_rows(
                    filename=get_path_raw(CSV_LIDO_ECLIS_FAILED),
                    data={ECLI: [ecli], RS_DATE: [date], RS_RELATION: [relation]})
        if (i + 1) % 100 == 0:
            print(f'{datetime.datetime.now().isoformat()}: {i + 1}/{len(eclis)} eclis processed.')
    print(f'{datetime.datetime.now().isoformat()}: {i + 1}/{len(eclis)} eclis processed.')
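# Illustrative sketch (assumption, not the repo's implementation): judging from the calls above,
# write_incremental_rows appends a dict of column -> list-of-values to an existing CSV. A minimal
# pandas-based stand-in, under a hypothetical name to avoid clashing with the real helper:
def _write_incremental_rows_sketch(filename, data):
    # append the rows to the CSV without repeating the header row
    pd.DataFrame(data).to_csv(filename, mode='a', header=False, index=False)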
        opinion_counter += 1
        # print("\033[95mOPINION\033[0m %s" % datarecord[IDENTIFIER])
        write_line_csv(output_path_opinions, datarecord)
        write_line_csv(output_path_index, {
            ECLI: datarecord[IDENTIFIER],
            RS_DATE: datarecord[DATE],
            RS_RELATION: datarecord[RELATION]
        })


start = time.time()

input_path = DIR_RECHTSPRAAK
output_path_cases = get_path_raw(CSV_RS_CASES)
output_path_opinions = get_path_raw(CSV_RS_OPINIONS)
output_path_index = get_path_raw(CSV_RS_INDEX)

parser = argparse.ArgumentParser()
parser.add_argument(
    'storage',
    choices=['local', 'aws'],
    help='location to take input data from and save output data to')
args = parser.parse_args()

print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print('OUTPUTS:\t\t\t',
      f'{basename(output_path_cases)}, {basename(output_path_opinions)}, {basename(output_path_index)}\n')
parser.add_argument(
    'storage',
    choices=['local', 'aws'],
    help='location to take input data from and save output data to')
parser.add_argument(
    '-f', '--failed',
    action='store_true',
    help='parse list of failed eclis instead of full list of eclis')
parser.add_argument(
    '-i', '--incoming',
    action='store_true',
    help='fetch incoming citations instead of outgoing')
args = parser.parse_args()

input_path = get_path_raw(CSV_LIDO_ECLIS_FAILED) if args.failed else get_path_raw(CSV_RS_INDEX)
output_path_c_citations = get_path_raw(CSV_CASE_CITATIONS)
output_path_l_citations = get_path_raw(CSV_LEGISLATION_CITATIONS)

print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print('OUTPUTS:\t\t\t',
      f'{basename(output_path_c_citations)}, {basename(output_path_l_citations)}\n')

storage = Storage(location=args.storage)
storage.setup_pipeline(
    output_paths=[output_path_c_citations, output_path_l_citations],
    input_path=input_path)

citation_type = "inkomende-links" if args.incoming else "uitgaande-links"
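# Hypothetical continuation (not shown in this fragment): with the paths prepared above, the
# extraction defined in find_citations_for_cases would presumably be started along these lines.
# The fieldname lists are illustrative placeholders, not the repo's actual column constants.
#
#   case_fieldnames = [ECLI, LIDO_JURISPRUDENTIE, RS_DATE]
#   legislation_fieldnames = [ECLI, LIDO_ARTIKEL_TITLE]
#   find_citations_for_cases(
#       filename=input_path,
#       case_citations_output_filename=output_path_c_citations,
#       case_citations_fieldnames=case_fieldnames,
#       legislation_citations_output_filename=output_path_l_citations,
#       legislation_citations_fieldnames=legislation_fieldnames)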
def _get_row_processor(self):
    def row_processor_rs_cases(row):
        update_items = []
        update_set_items = []
        # transform set attributes to lists
        # (domains, predecessor_successor_cases, references_legislation, alternative_sources)
        for attribute in [RS_RELATION, RS_REFERENCES, RS_SUBJECT, RS_HASVERSION]:
            if attribute in row:
                row[attribute] = row[attribute].split(SET_SEP)
        put_items = [row]
        return put_items, update_items, update_set_items

    def row_processor_rs_opinions(row):
        put_items, update_items, update_set_items = row_processor_rs_cases(row)
        if ECLI_DECISION in row:
            update_items.append({
                ECLI: row[ECLI_DECISION],
                ECLI_OPINION: row[ECLI]
            })
        return put_items, update_items, update_set_items

    def row_processor_li_cases(row):
        put_items = []
        update_set_items = []
        if LI_LAW_AREA in row:
            row[LI_LAW_AREA] = row[LI_LAW_AREA].split(SET_SEP)
        row_li = {ECLI: row[ECLI]}
        # copy all attributes except the ECLI key, suffixed with the LI marker
        for key in row.keys() - {ECLI}:
            row_li[key + LI] = row[key]
        update_items = [row_li]
        return put_items, update_items, update_set_items

    # @TODO: replace attribute names with global definition
    def row_processor_c_citations(row):
        put_items = []
        update_items = []
        update_set_items = []
        if row['keep1'] == 'True':
            update_set_items = [{
                ECLI: row[ECLI],
                'cites': row[LIDO_JURISPRUDENTIE]
            }, {
                ECLI: row[LIDO_JURISPRUDENTIE],
                'cited_by': row[ECLI]
            }]
        return put_items, update_items, update_set_items

    def row_processor_l_citations(row):
        put_items = []
        update_items = []
        update_set_items = [{
            ECLI: row[ECLI],
            'legal_provisions': row[LIDO_ARTIKEL_TITLE]
        }]
        return put_items, update_items, update_set_items

    processor_map = {
        get_path_processed(CSV_RS_CASES): row_processor_rs_cases,
        get_path_processed(CSV_RS_OPINIONS): row_processor_rs_opinions,
        get_path_processed(CSV_LI_CASES): row_processor_li_cases,
        get_path_raw(CSV_CASE_CITATIONS): row_processor_c_citations,
        get_path_raw(CSV_LEGISLATION_CITATIONS): row_processor_l_citations
    }
    return processor_map.get(self.path)
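# Hedged usage sketch (assumption, not shown in the source): inside the same client class, the
# selected row processor would typically be applied to every row of the CSV at self.path, e.g.:
#
#   processor = self._get_row_processor()
#   with open(self.path, newline='') as f:
#       for row in csv.DictReader(f):
#           put_items, update_items, update_set_items = processor(row)
#           # forward the three item lists to the client's insert/update routines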
    return entry


def get_ecli(case_number):
    elements = case_number.split(' ')
    for e in elements:
        if 'ECLI:' in e:
            return e
    return None


#
# Main Method

start = time.time()

output_path = get_path_raw(CSV_LI_CASES)

parser = argparse.ArgumentParser()
parser.add_argument(
    'storage',
    choices=['local', 'aws'],
    help='location to take input data from and save output data to')
args = parser.parse_args()

print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('OUTPUTS:\t\t\t', f'{basename(output_path)}\n')

storage = Storage(location=args.storage)
storage.setup_pipeline(output_paths=[output_path])
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

counter = 0
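# Illustrative behaviour of the get_ecli helper defined above (example inputs are not from the source):
#   get_ecli('Hoge Raad 12-03-2021 ECLI:NL:HR:2021:370')  returns  'ECLI:NL:HR:2021:370'
#   get_ecli('zaaknummer 19/04321')                       returns  None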
def _get_row_processor(self):
    def row_processor_rs_cases(row):
        """
        Turns a csv row (1 RS case) into item(s) for the DynamoDB table according to this schema.
        :param row: dict representation of csv row with RS case attributes
        :return: list of dict representations of items in schema format
        """
        put_items = []
        update_set_items = []
        # split set attributes (domain, case citations, legislation citations)
        if RS_SUBJECT in row:
            for val in row[RS_SUBJECT].split(SET_SEP):
                put_items.append({
                    self.pk: row[ECLI],
                    self.sk: ItemType.DOM.value + KEY_SEP + val,
                    key_sdd: DataSource.RS.value + KEY_SEP + DocType.DEC.value + KEY_SEP + row[RS_DATE],
                    RS_SUBJECT[:-1]: val
                })
        for attribute in [RS_RELATION, RS_REFERENCES, RS_SUBJECT]:
            if attribute in row:
                update_set_items.append({
                    self.pk: row[ECLI],
                    self.sk: ItemType.DATA.value,
                    attribute: set(row[attribute].split(SET_SEP))
                })
                row.pop(attribute)
        put_items.append({
            self.sk: ItemType.DATA.value,
            key_sdd: DataSource.RS.value + KEY_SEP + DocType.DEC.value + KEY_SEP + row[RS_DATE],
            **row
        })
        return put_items, [], update_set_items

    def row_processor_rs_opinions(row):
        put_items = []
        update_items = []
        update_set_items = []
        if RS_SUBJECT in row:
            for val in row[RS_SUBJECT].split(SET_SEP):
                put_items.append({
                    self.pk: row[ECLI],
                    self.sk: ItemType.DOM.value + KEY_SEP + val,
                    key_sdd: DataSource.RS.value + KEY_SEP + DocType.OPI.value + KEY_SEP + row[RS_DATE],
                    RS_SUBJECT[:-1]: val
                })
        # split set attributes (domain, case citations, legislation citations)
        for attribute in [RS_RELATION, RS_REFERENCES, RS_SUBJECT]:
            if attribute in row:
                update_set_items.append({
                    self.pk: row[ECLI],
                    self.sk: ItemType.DATA.value,
                    attribute: set(row[attribute].split(SET_SEP))
                })
                row.pop(attribute)
        put_items.append({
            self.sk: ItemType.DATA.value,
            key_sdd: DataSource.RS.value + KEY_SEP + DocType.OPI.value + KEY_SEP + row[RS_DATE],
            **row
        })
        if ECLI_DECISION in row:
            update_items.append({
                self.pk: row[ECLI_DECISION],
                self.sk: ItemType.DATA.value,
                ECLI_OPINION: row[ECLI]
            })
        return put_items, update_items, update_set_items

    def row_processor_li_cases(row):
        put_items = []
        update_items = []
        update_set_items = []
        row_li = dict()
        # copy all attributes except the ECLI key, suffixed with the LI marker
        for key in row.keys() - {ECLI}:
            row_li[key + LI] = row[key]
        if LI_LAW_AREA in row:
            for val in row[LI_LAW_AREA].split(SET_SEP):
                put_items.append({
                    self.pk: row[ECLI],
                    self.sk: ItemType.DOM_LI.value + KEY_SEP + val,
                    key_sdd: DataSource.RS.value + KEY_SEP + DocType.DEC.value + KEY_SEP + row[RS_DATE],
                    LI_LAW_AREA[:-1] + LI: val
                })
            update_set_items.append({
                self.pk: row[ECLI],
                self.sk: ItemType.DATA.value,
                LI_LAW_AREA + LI: set(row[LI_LAW_AREA].split(SET_SEP))
            })
            row_li.pop(LI_LAW_AREA + LI)
        update_items.append({
            self.pk: row[ECLI],
            self.sk: ItemType.DATA.value,
            key_sdd: DataSource.RS.value + KEY_SEP + DocType.DEC.value + KEY_SEP + row[RS_DATE],
            **row_li
        })
        return put_items, update_items, update_set_items

    # @TODO: replace attribute names with global definition
    def row_processor_c_citations(row):
        update_set_items = []
        if row['keep1'] == 'True':
            update_set_items = [{
                self.pk: row[ECLI],
                self.sk: ItemType.DATA.value,
                'cites': {row[LIDO_JURISPRUDENTIE]}
            }, {
                self.pk: row[LIDO_JURISPRUDENTIE],
                self.sk: ItemType.DATA.value,
                'cited_by': {row[ECLI]}
            }]
        return [], [], update_set_items

    def row_processor_l_citations(row):
        update_set_items = [{
            self.pk: row[ECLI],
            self.sk: ItemType.DATA.value,
            'legal_provisions': {row[LIDO_ARTIKEL_TITLE]}
        }]
        return [], [], update_set_items

    processor_map = {
        get_path_processed(CSV_RS_CASES): row_processor_rs_cases,
        get_path_processed(CSV_RS_OPINIONS): row_processor_rs_opinions,
        get_path_processed(CSV_LI_CASES): row_processor_li_cases,
        get_path_raw(CSV_CASE_CITATIONS): row_processor_c_citations,
        get_path_raw(CSV_LEGISLATION_CITATIONS): row_processor_l_citations
    }
    return processor_map.get(self.path)
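# Hedged sketch (assumption, not the repo's DynamoDBClient): the item lists produced above map
# naturally onto boto3 Table operations. Table name and key handling are illustrative only.
#
#   import boto3
#   table = boto3.resource('dynamodb').Table('caselaw')    # hypothetical table name
#   for item in put_items:
#       table.put_item(Item=item)
#   for item in update_set_items:
#       key = {k: item.pop(k) for k in (pk, sk)}           # pk/sk: the table's key attribute names
#       (attr, values), = item.items()                     # one remaining attribute holding a set
#       table.update_item(Key=key,
#                         UpdateExpression='ADD #a :v',
#                         ExpressionAttributeNames={'#a': attr},
#                         ExpressionAttributeValues={':v': values})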
    'full_text': format_rs_xml
}
tool_map_li = {
    'Jurisdiction': format_jurisdiction,
    'LawArea': format_li_domains,
    'IssuingInstitution': format_instance,
    'PublicationDate': format_li_date,
    'EnactmentDate': format_li_date,
    'DateAdded': format_li_date,
    'Sources': format_li_list,
    'SearchNumbers': format_li_list
}
tool_maps = {
    get_path_raw(CSV_RS_CASES): tool_map_rs,
    get_path_raw(CSV_RS_OPINIONS): tool_map_rs,
    get_path_raw(CSV_LI_CASES): tool_map_li
}
field_maps = {
    get_path_raw(CSV_RS_CASES): MAP_RS,
    get_path_raw(CSV_RS_OPINIONS): MAP_RS_OPINION,
    get_path_raw(CSV_LI_CASES): MAP_LI
}

"""
Start processing
"""
start = time.time()

input_paths = [
from data_loading.row_processors.opensearch import OpenSearchRowProcessor
from data_loading.clients.dynamodb import DynamoDBClient
from data_loading.clients.opensearch import OpenSearchClient
from definitions.storage_handler import Storage, CSV_RS_CASES, CSV_LI_CASES, CSV_RS_OPINIONS, CSV_CASE_CITATIONS, \
    CSV_LEGISLATION_CITATIONS, get_path_processed, get_path_raw
import time
import argparse
import csv
import sys

csv.field_size_limit(sys.maxsize)

start = time.time()

input_paths = [
    get_path_processed(CSV_RS_CASES),
    get_path_processed(CSV_RS_OPINIONS),
    get_path_processed(CSV_LI_CASES),
    get_path_raw(CSV_CASE_CITATIONS),
    get_path_raw(CSV_LEGISLATION_CITATIONS)
]

# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument(
    '-partial', '--partial',
    choices=['ddb', 'os'],
    help='load data only to either DynamoDB or OpenSearch, not both')
parser.add_argument(
    '-delete', '--delete',
    choices=['ddb', 'os'],
    help='delete content from DynamoDB table/OpenSearch index')
args = parser.parse_args()
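# Illustrative invocations (assumption; the script name below is hypothetical and the actual
# loading loop is not part of this fragment):
#
#   python load_data.py                  # load all input CSVs to both DynamoDB and OpenSearch
#   python load_data.py --partial ddb    # load only to DynamoDB
#   python load_data.py --delete os      # delete the OpenSearch index content instead of loading
#
#   load_ddb = args.partial in (None, 'ddb')   # one possible way to branch on the parsed flags
#   load_os = args.partial in (None, 'os')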