def find_citations_for_cases(filename, case_citations_output_filename,
                             case_citations_fieldnames,
                             legislation_citations_output_filename,
                             legislation_citations_fieldnames):
    df_eclis = pd.read_csv(filename, usecols=[ECLI, RS_DATE, RS_RELATION])
    df_eclis = df_eclis.sort_values(by=RS_DATE).reset_index(drop=True)

    eclis = list(df_eclis[ECLI])

    if os.getenv('SAMPLE_TEST') == 'TRUE':
        eclis = eclis[-10:]

    if citation_type == "inkomende-links":
        print("Fetching incoming citations from LIDO...")
    else:
        print("Fetching outgoing citations from LIDO...")

    for i, ecli in enumerate(eclis):
        date = datetime.date.fromisoformat(df_eclis[RS_DATE][i])
        relation = df_eclis[RS_RELATION][i]
        if date >= last_updated:
            try:
                case_citations, legislation_citations = find_citations_for_case(
                    remove_spaces_from_ecli(ecli), date, relation,
                    case_citations_fieldnames,
                    legislation_citations_fieldnames)
                write_incremental_rows(filename=case_citations_output_filename,
                                       data=case_citations)
                write_incremental_rows(
                    filename=legislation_citations_output_filename,
                    data=legislation_citations)
            except Exception as e:
                print(f'{ecli} failed: {e}')
                write_incremental_rows(
                    filename=get_path_raw(CSV_LIDO_ECLIS_FAILED),
                    data={
                        ECLI: [ecli],
                        RS_DATE: [date],
                        RS_RELATION: [relation]
                    })
        if (i + 1) % 100 == 0:
            print(
                f'{datetime.datetime.now().isoformat()}: {i + 1}/{len(eclis)} eclis processed.'
            )

    print(
        f'{datetime.datetime.now().isoformat()}: {i + 1}/{len(eclis)} eclis processed.'
    )
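
# write_incremental_rows is not defined in this snippet. A minimal sketch of
# the append-style CSV writer it is assumed to be, given the column-oriented
# dicts passed above (this implementation is an assumption, not the project's
# actual code; it reuses the pd/os imports this snippet already relies on):
def write_incremental_rows_sketch(filename, data):
    # append the rows; write the header only when the file does not exist yet
    pd.DataFrame(data).to_csv(filename, mode='a', index=False,
                              header=not os.path.exists(filename))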
        opinion_counter += 1
        # print("\033[95mOPINION\033[0m %s" % datarecord[IDENTIFIER])
        write_line_csv(output_path_opinions, datarecord)

    write_line_csv(
        output_path_index, {
            ECLI: datarecord[IDENTIFIER],
            RS_DATE: datarecord[DATE],
            RS_RELATION: datarecord[RELATION]
        })


start = time.time()

input_path = DIR_RECHTSPRAAK
output_path_cases = get_path_raw(CSV_RS_CASES)
output_path_opinions = get_path_raw(CSV_RS_OPINIONS)
output_path_index = get_path_raw(CSV_RS_INDEX)

parser = argparse.ArgumentParser()
parser.add_argument(
    'storage',
    choices=['local', 'aws'],
    help='location to take input data from and save output data to')
args = parser.parse_args()
print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print(
    'OUTPUTS:\t\t\t',
    f'{basename(output_path_cases)}, {basename(output_path_opinions)}, {basename(output_path_index)}\n'
)

parser = argparse.ArgumentParser()
parser.add_argument(
    'storage',
    choices=['local', 'aws'],
    help='location to take input data from and save output data to')
parser.add_argument(
    '-f',
    '--failed',
    action='store_true',
    help='parse list of failed eclis instead of full list of eclis')
parser.add_argument('-i',
                    '--incoming',
                    action='store_true',
                    help='fetch incoming citations instead of outgoing')
args = parser.parse_args()

input_path = get_path_raw(
    CSV_LIDO_ECLIS_FAILED) if args.failed else get_path_raw(CSV_RS_INDEX)
output_path_c_citations = get_path_raw(CSV_CASE_CITATIONS)
output_path_l_citations = get_path_raw(CSV_LEGISLATION_CITATIONS)

print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('INPUT:\t\t\t\t', basename(input_path))
print(
    'OUTPUTS:\t\t\t',
    f'{basename(output_path_c_citations)}, {basename(output_path_l_citations)}\n'
)
storage = Storage(location=args.storage)
storage.setup_pipeline(
    output_paths=[output_path_c_citations, output_path_l_citations],
    input_path=input_path)
citation_type = "inkomende-links" if args.incoming else "uitgaande-links"
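
# A hedged sketch of how this setup presumably feeds find_citations_for_cases
# defined above; the fieldname lists here are hypothetical stand-ins, the real
# column constants are not shown in this snippet:
case_fieldnames = [ECLI, 'target_ecli', RS_DATE]             # assumed columns
legislation_fieldnames = [ECLI, 'legal_provision', RS_DATE]  # assumed columns
find_citations_for_cases(input_path,
                         output_path_c_citations, case_fieldnames,
                         output_path_l_citations, legislation_fieldnames)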
Example #4
    def _get_row_processor(self):
        def row_processor_rs_cases(row):
            update_items = []
            update_set_items = []
            # transform set attributes to lists
            # (domains, predecessor_successor_cases, references_legislation, alternative_sources)
            for attribute in [RS_RELATION, RS_REFERENCES, RS_SUBJECT, RS_HASVERSION]:
                if attribute in row:
                    row[attribute] = row[attribute].split(SET_SEP)
            put_items = [row]
            return put_items, update_items, update_set_items

        def row_processor_rs_opinions(row):
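            # reuse the cases transformation, then link the parent decision to
            # this opinion via an update item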
            put_items, update_items, update_set_items = row_processor_rs_cases(row)
            if ECLI_DECISION in row:
                update_items.append({
                    ECLI: row[ECLI_DECISION],
                    ECLI_OPINION: row[ECLI]
                })
            return put_items, update_items, update_set_items

        def row_processor_li_cases(row):
            put_items = []
            update_set_items = []
            if LI_LAW_AREA in row:
                row[LI_LAW_AREA] = row[LI_LAW_AREA].split(SET_SEP)
            row_li = {ECLI: row[ECLI]}
            for key in row.keys() - {ECLI}:  # all attributes except the ECLI itself
                row_li[key + LI] = row[key]
            update_items = [row_li]
            return put_items, update_items, update_set_items

        # @TODO: replace attribute names with global definition
        def row_processor_c_citations(row):
            put_items = []
            update_items = []
            update_set_items = []
            if row['keep1'] == 'True':
                update_set_items = [{
                    ECLI: row[ECLI],
                    'cites': row[LIDO_JURISPRUDENTIE]
                }, {
                    ECLI: row[LIDO_JURISPRUDENTIE],
                    'cited_by': row[ECLI]
                }]
            return put_items, update_items, update_set_items

        def row_processor_l_citations(row):
            put_items = []
            update_items = []
            update_set_items = [{
                ECLI: row[ECLI],
                'legal_provisions': row[LIDO_ARTIKEL_TITLE]
            }]
            return put_items, update_items, update_set_items

        processor_map = {
            get_path_processed(CSV_RS_CASES): row_processor_rs_cases,
            get_path_processed(CSV_RS_OPINIONS): row_processor_rs_opinions,
            get_path_processed(CSV_LI_CASES): row_processor_li_cases,
            get_path_raw(CSV_CASE_CITATIONS): row_processor_c_citations,
            get_path_raw(CSV_LEGISLATION_CITATIONS): row_processor_l_citations
        }
        return processor_map.get(self.path)
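
# All row processors above share one contract: a csv row (dict) in, a triple
# (put_items, update_items, update_set_items) out, so a caller can do
# `processor = self._get_row_processor()` and then unpack `processor(row)`.
# A standalone illustration of the set-splitting idiom, assuming SET_SEP is
# '; ' (the real separator constant is not shown in this snippet):
row = {'ecli': 'ECLI:NL:HR:2021:1',
       'domains': 'Civiel recht; Verbintenissenrecht'}
row['domains'] = row['domains'].split('; ')
# row['domains'] is now ['Civiel recht', 'Verbintenissenrecht'], ready to be
# stored as a list (this processor version) or a set (the DynamoDB version below)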
Example #5
    return entry


def get_ecli(case_number):
    elements = case_number.split(' ')
    for e in elements:
        if 'ECLI:' in e:
            return e
    return None
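
# Examples: get_ecli scans the whitespace-separated tokens of a case number
# string and returns the first token containing an ECLI identifier.
#   get_ecli('HR 9 april 2021 ECLI:NL:HR:2021:527')  ->  'ECLI:NL:HR:2021:527'
#   get_ecli('zaaknummer 20/01234')                  ->  None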

# Main Method

start = time.time()

output_path = get_path_raw(CSV_LI_CASES)

parser = argparse.ArgumentParser()
parser.add_argument('storage', choices=['local', 'aws'], help='location to take input data from and save output data to')
args = parser.parse_args()
print('\n--- PREPARATION ---\n')
print('INPUT/OUTPUT DATA STORAGE:\t', args.storage)
print('OUTPUTS:\t\t\t', f'{basename(output_path)}\n')
storage = Storage(location=args.storage)
storage.setup_pipeline(output_paths=[output_path])
last_updated = storage.pipeline_last_updated
print('\nSTART DATE (LAST UPDATE):\t', last_updated.isoformat())

print('\n--- START ---\n')

counter = 0
    def _get_row_processor(self):
        def row_processor_rs_cases(row):
            """
            turns csv row (1 RS case) into item(s) for DynamoDB table according to this schema
            :param row: dict representation of csv row with RS case attributes
            :return: list of dict representation of items in schema format
            """
            put_items = []
            update_set_items = []
            # split set attributes (domain, case citations, legislation citations)
            if RS_SUBJECT in row:
                for val in row[RS_SUBJECT].split(SET_SEP):
                    put_items.append({
                        self.pk: row[ECLI],
                        self.sk: ItemType.DOM.value + KEY_SEP + val,
                        key_sdd: DataSource.RS.value + KEY_SEP + DocType.DEC.value + KEY_SEP + row[RS_DATE],
                        RS_SUBJECT[:-1]: val
                    })
            for attribute in [RS_RELATION, RS_REFERENCES, RS_SUBJECT]:
                if attribute in row:
                    update_set_items.append({
                        self.pk: row[ECLI],
                        self.sk: ItemType.DATA.value,
                        attribute: set(row[attribute].split(SET_SEP))
                    })
                    row.pop(attribute)
            put_items.append({
                self.sk: ItemType.DATA.value,
                key_sdd: DataSource.RS.value + KEY_SEP + DocType.DEC.value + KEY_SEP + row[RS_DATE],
                **row
            })
            return put_items, [], update_set_items

        def row_processor_rs_opinions(row):
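            # mirrors row_processor_rs_cases above, but tags items with
            # DocType.OPI and appends a decision -> opinion backlink at the end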
            put_items = []
            update_items = []
            update_set_items = []
            if RS_SUBJECT in row:
                for val in row[RS_SUBJECT].split(SET_SEP):
                    put_items.append({
                        self.pk: row[ECLI],
                        self.sk: ItemType.DOM.value + KEY_SEP + val,
                        key_sdd: DataSource.RS.value + KEY_SEP + DocType.OPI.value + KEY_SEP + row[RS_DATE],
                        RS_SUBJECT[:-1]: val
                    })
            # split set attributes (domain, case citations, legislation citations)
            for attribute in [RS_RELATION, RS_REFERENCES, RS_SUBJECT]:
                if attribute in row:
                    update_set_items.append({
                        self.pk: row[ECLI],
                        self.sk: ItemType.DATA.value,
                        attribute: set(row[attribute].split(SET_SEP))
                    })
                    row.pop(attribute)
            put_items.append({
                self.sk: ItemType.DATA.value,
                key_sdd: DataSource.RS.value + KEY_SEP + DocType.OPI.value + KEY_SEP + row[RS_DATE],
                **row
            })
            if ECLI_DECISION in row:
                update_items.append({
                    self.pk: row[ECLI_DECISION],
                    self.sk: ItemType.DATA.value,
                    ECLI_OPINION: row[ECLI]
                })
            return put_items, update_items, update_set_items

        def row_processor_li_cases(row):
            put_items = []
            update_items = []
            update_set_items = []
            row_li = dict()
            for key in row.keys() - {ECLI}:  # all attributes except the ECLI itself
                row_li[key + LI] = row[key]
            if LI_LAW_AREA in row:
                for val in row[LI_LAW_AREA].split(SET_SEP):
                    put_items.append({
                        self.pk: row[ECLI],
                        self.sk: ItemType.DOM_LI.value + KEY_SEP + val,
                        key_sdd: DataSource.RS.value + KEY_SEP + DocType.DEC.value + KEY_SEP + row[RS_DATE],
                        LI_LAW_AREA[:-1] + LI: val
                    })
                update_set_items.append({
                    self.pk: row[ECLI],
                    self.sk: ItemType.DATA.value,
                    LI_LAW_AREA + LI: set(row[LI_LAW_AREA].split(SET_SEP))
                })
                row_li.pop(LI_LAW_AREA + LI)
            update_items.append({
                self.pk: row[ECLI],
                self.sk: ItemType.DATA.value,
                key_sdd: DataSource.RS.value + KEY_SEP + DocType.DEC.value + KEY_SEP + row[RS_DATE],
                **row_li
            })
            return put_items, update_items, update_set_items

        # @TODO: replace attribute names with global definition
        def row_processor_c_citations(row):
            update_set_items = []
            if row['keep1'] == 'True':
                update_set_items = [{
                    self.pk: row[ECLI],
                    self.sk: ItemType.DATA.value,
                    'cites': {row[LIDO_JURISPRUDENTIE]}
                }, {
                    self.pk: row[LIDO_JURISPRUDENTIE],
                    self.sk: ItemType.DATA.value,
                    'cited_by': {row[ECLI]}
                }]
            return [], [], update_set_items

        def row_processor_l_citations(row):
            update_set_items = [{
                self.pk: row[ECLI],
                self.sk: ItemType.DATA.value,
                'legal_provisions': {row[LIDO_ARTIKEL_TITLE]}
            }]
            return [], [], update_set_items

        processor_map = {
            get_path_processed(CSV_RS_CASES): row_processor_rs_cases,
            get_path_processed(CSV_RS_OPINIONS): row_processor_rs_opinions,
            get_path_processed(CSV_LI_CASES): row_processor_li_cases,
            get_path_raw(CSV_CASE_CITATIONS): row_processor_c_citations,
            get_path_raw(CSV_LEGISLATION_CITATIONS): row_processor_l_citations
        }
        return processor_map.get(self.path)
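
# A hedged sketch of how the triples produced above might be written to
# DynamoDB with plain boto3 (the table name and the pk/sk attribute names are
# assumptions; the project's actual DynamoDBClient wrapper is not shown here):
import boto3

def load_row_sketch(table, processor, row, pk='ecli', sk='ItemType'):
    # process one csv row into the three item lists
    put_items, update_items, update_set_items = processor(row)
    with table.batch_writer() as batch:
        for item in put_items:
            batch.put_item(Item=item)                     # plain inserts
    # update_items would be applied analogously with a SET expression
    for item in update_set_items:
        key = {pk: item.pop(pk), sk: item.pop(sk)}
        (attr, values), = item.items()                    # one set-valued attribute left
        table.update_item(
            Key=key,
            UpdateExpression='ADD #a :v',                 # ADD merges into a DynamoDB set
            ExpressionAttributeNames={'#a': attr},
            ExpressionAttributeValues={':v': values})

# e.g. load_row_sketch(boto3.resource('dynamodb').Table('caselaw'), processor, row)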
    'full_text': format_rs_xml
}

tool_map_li = {
    'Jurisdiction': format_jurisdiction,
    'LawArea': format_li_domains,
    'IssuingInstitution': format_instance,
    'PublicationDate': format_li_date,
    'EnactmentDate': format_li_date,
    'DateAdded': format_li_date,
    'Sources': format_li_list,
    'SearchNumbers': format_li_list
}

tool_maps = {
    get_path_raw(CSV_RS_CASES): tool_map_rs,
    get_path_raw(CSV_RS_OPINIONS): tool_map_rs,
    get_path_raw(CSV_LI_CASES): tool_map_li
}

field_maps = {
    get_path_raw(CSV_RS_CASES): MAP_RS,
    get_path_raw(CSV_RS_OPINIONS): MAP_RS_OPINION,
    get_path_raw(CSV_LI_CASES): MAP_LI
}
"""
Start processing
"""
start = time.time()

from data_loading.row_processors.opensearch import OpenSearchRowProcessor
from data_loading.clients.dynamodb import DynamoDBClient
from data_loading.clients.opensearch import OpenSearchClient
from definitions.storage_handler import Storage, CSV_RS_CASES, CSV_LI_CASES, CSV_RS_OPINIONS, CSV_CASE_CITATIONS, \
    CSV_LEGISLATION_CITATIONS, get_path_processed, get_path_raw
import time
import argparse
import csv
import sys

csv.field_size_limit(sys.maxsize)

start = time.time()

input_paths = [
    get_path_processed(CSV_RS_CASES),
    get_path_processed(CSV_RS_OPINIONS),
    get_path_processed(CSV_LI_CASES),
    get_path_raw(CSV_CASE_CITATIONS),
    get_path_raw(CSV_LEGISLATION_CITATIONS)
]

# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument(
    '-partial',
    '--partial',
    choices=['ddb', 'os'],
    help='load data only to either DynamoDB or OpenSearch, not both')
parser.add_argument('-delete',
                    '--delete',
                    choices=['ddb', 'os'],
                    help='delete content from DynamoDB table/OpenSearch index')
args = parser.parse_args()
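
# Example invocations (the script name is hypothetical):
#   python load_data.py                 # load into both DynamoDB and OpenSearch
#   python load_data.py --partial ddb   # load into DynamoDB only
#   python load_data.py --delete os     # delete the OpenSearch index content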