Пример #1
0
def parse(context, data):
    """Feed every record of a tab-delimited file to ``parse_row``.

    The file is obtained via ``context.http.rehash(data)`` and read with
    ``csv.DictReader`` using a tab delimiter. A single ``EntityEmitter`` is
    shared by all rows and finalized after the file has been consumed.
    """
    emitter = EntityEmitter(context)
    # NOTE(review): the file is opened with the platform default encoding
    # and without ``newline=''`` -- confirm this matches the source data.
    with context.http.rehash(data) as res, \
            open(res.file_path, 'r') as handle:
        reader = csv.DictReader(handle, delimiter='\t')
        for record in reader:
            parse_row(emitter, record)
    emitter.finalize()
Пример #2
0
def parse(context, data):
    """Group the rows of an Excel sheet by reference and parse each group.

    The first worksheet of the workbook fetched through
    ``context.http.rehash(data)`` is read; the first row supplies the column
    headers (slugified with ``_`` as separator). Every following row becomes
    a ``dict`` of header -> converted cell value, and rows are bucketed by
    ``clean_reference(row.get('reference'))``. Each bucket is then passed to
    ``parse_reference`` and the emitter is finalized.
    """
    emitter = EntityEmitter(context)
    references = defaultdict(list)
    with context.http.rehash(data) as res:
        xls = xlrd.open_workbook(res.file_path)
        ws = xls.sheet_by_index(0)
        headers = [slugify(h, sep='_') for h in ws.row_values(0)]
        for r in range(1, ws.nrows):
            row = {}
            for header, cell in zip(headers, ws.row(r)):
                if cell.ctype == 2:
                    # xlrd XL_CELL_NUMBER: stored as float; keep the
                    # integer part as text.
                    value = str(int(cell.value))
                elif cell.ctype == 3:
                    # xlrd XL_CELL_DATE: convert the serial to ISO 8601.
                    date = xldate_as_datetime(cell.value, xls.datemode)
                    value = date.isoformat()
                elif cell.ctype == 0:
                    # xlrd XL_CELL_EMPTY
                    value = None
                else:
                    # Only fall back to the raw value when no conversion
                    # applied; assigning it unconditionally would discard
                    # the conversions above.
                    value = cell.value
                row[header] = value

            reference = clean_reference(row.get('reference'))
            references[reference].append(row)

    for ref, rows in references.items():
        parse_reference(emitter, ref, rows)
    emitter.finalize()
Пример #3
0
def parse(context, data):
    """Emit a ``LegalEntity`` and its ``Sanction`` from one debarment record.

    ``data`` is read as a mapping with the keys ``SUPP_NAME``, ``SUPP_ID``,
    ``DEBAR_REASON``, ``COUNTRY_NAME``, ``SUPP_CITY``, ``SUPP_ADDR``,
    ``DEBAR_FROM_DATE`` and ``DEBAR_TO_DATE``. The first name returned by
    ``clean_name`` becomes the entity name, the remaining ones aliases.

    If ``clean_name`` yields no names, a warning is logged and nothing is
    emitted (previously this raised ``IndexError``).
    """
    emitter = EntityEmitter(context)
    entity = emitter.make('LegalEntity')

    name = data.get('SUPP_NAME')
    ent_id = data.get('SUPP_ID')
    reason = data.get('DEBAR_REASON')
    country = data.get('COUNTRY_NAME')
    city = data.get('SUPP_CITY')
    address = data.get('SUPP_ADDR')
    start_date = data.get('DEBAR_FROM_DATE')
    end_date = data.get('DEBAR_TO_DATE')

    entity.make_id(name, ent_id, country)

    names = clean_name(name)
    if not names:
        # Without a usable name there is nothing meaningful to emit.
        context.log.warning("No name: %r", data)
        emitter.finalize()
        return

    entity.add('name', names[0])
    entity.add('address', address)
    entity.add('address', city)
    entity.add('country', normalize_country(country))
    # Use a dedicated loop variable so the raw ``name`` is not clobbered.
    for alias in names[1:]:
        entity.add('alias', alias)

    sanction = emitter.make('Sanction')
    sanction.make_id('Sanction', entity.id)
    sanction.add('authority', 'World Bank Debarrment')
    sanction.add('program', reason)
    sanction.add('startDate', clean_date(start_date))
    sanction.add('endDate', clean_date(end_date))
    sanction.add('sourceUrl', SOURCE)
    emitter.emit(entity)
    emitter.emit(sanction)

    emitter.finalize()
Пример #4
0
def parse(context, data):
    """Emit a ``Person`` for each titled name listed on a country page.

    The page is loaded via ``context.http.rehash(data)``. Every ``<li>``
    below the ``div#countryOutput`` element that has both a ``span.title``
    and a ``span.cos_name`` text yields one person; list items lacking
    either are skipped. Returns early when the container is absent.

    NOTE(review): ``emitter.finalize()`` is never called here -- confirm
    the emitter does not require it.
    """
    emitter = EntityEmitter(context)
    source_url = data.get('url')
    country = normalize_country(data.get('country'))
    with context.http.rehash(data) as res:
        container = res.html.find('.//div[@id="countryOutput"]')
        if container is None:
            return
        for item in container.findall('.//li'):
            position = element_text(item.find('.//span[@class="title"]'))
            if position is None:
                continue
            full_name = element_text(
                item.find('.//span[@class="cos_name"]'))
            if full_name is None:
                continue

            person = emitter.make('Person')
            person.make_id(country, full_name, position)
            person.add('name', full_name)
            person.add('country', country)
            person.add('position', position)
            person.add('sourceUrl', source_url)
            emitter.emit(person)
Пример #5
0
def parse(context, data):
    """Parse all ``INDIVIDUAL`` and then all ``ENTITY`` nodes of an XML file.

    The document comes from ``context.http.rehash(data)``. Individuals are
    handed to ``parse_individual``, entities to ``parse_entity``; the
    emitter is finalized afterwards.
    """
    emitter = EntityEmitter(context)
    handlers = (
        ('.//INDIVIDUAL', parse_individual),
        ('.//ENTITY', parse_entity),
    )
    with context.http.rehash(data) as res:
        for path, handler in handlers:
            for element in res.xml.findall(path):
                handler(emitter, element)
    emitter.finalize()
Пример #6
0
def parse(context, data):
    """Dispatch extracted rows to a parser chosen by the row length.

    Rows come from ``extract_rows(res.xml)``. A row of length 5 goes to
    ``parse_organisation`` and a row of length 9 to ``parse_individual``;
    rows of any other length are ignored. The emitter is finalized at the
    end.
    """
    emitter = EntityEmitter(context)
    by_width = (
        (5, parse_organisation),
        (9, parse_individual),
    )
    with context.http.rehash(data) as res:
        for cells in extract_rows(res.xml):
            for width, handler in by_width:
                if len(cells) == width:
                    handler(emitter, cells)
    emitter.finalize()
Пример #7
0
def parse(context, data):
    """Emit entities and sanctions from the "List of Debarred" HTML tables.

    Only tables whose ``summary`` attribute contains ``List of Debarred``
    are read, and within them only rows with exactly six ``<td>`` cells.
    The cleaned cell values are used positionally: name, address, country,
    start date, end date, program. Rows for which ``clean_name`` returns no
    names are logged and skipped.

    NOTE(review): ``emitter.finalize()` is never called here -- confirm
    the emitter does not require it.
    """
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        for table in res.html.findall('.//table'):
            if 'List of Debarred' not in table.get('summary', ''):
                continue
            for tr in table.findall('.//tr'):
                cells = tr.findall('./td')
                if len(cells) != 6:
                    continue
                values = [clean_value(cell) for cell in cells]
                entity = emitter.make('LegalEntity')
                entity.make_id(*values)

                names = clean_name(values[0])
                if len(names) == 0:
                    context.log.warning("No name: %r", values)
                    continue

                primary, aliases = names[0], names[1:]
                entity.add('name', primary)
                entity.add('address', values[1])
                entity.add('country', normalize_country(values[2]))
                for alias in aliases:
                    entity.add('alias', alias)

                sanction = emitter.make('Sanction')
                sanction.make_id('Sanction', entity.id)
                sanction.add('authority', 'World Bank Debarrment')
                sanction.add('program', values[5])
                sanction.add('startDate', clean_date(values[3]))
                sanction.add('endDate', clean_date(values[4]))
                sanction.add('sourceUrl', data.get('url'))
                emitter.emit(entity)
                emitter.emit(sanction)
Пример #8
0
def parse(context, data):
    """Parse parties, sanctions entries and relationships from an XML file.

    The document is processed in three passes, in this order:
    ``DistinctParty`` -> ``parse_party``, ``SanctionsEntry`` ->
    ``parse_entry``, ``ProfileRelationship`` -> ``parse_relation``. Element
    paths are built with ``qpath``. The emitter is finalized afterwards.
    """
    emitter = EntityEmitter(context)
    passes = (
        ('DistinctParty', parse_party),
        ('SanctionsEntry', parse_entry),
        ('ProfileRelationship', parse_relation),
    )
    with context.http.rehash(data) as res:
        doc = res.xml
        for tag, handler in passes:
            for element in doc.findall(qpath(tag)):
                handler(emitter, doc, element)

    emitter.finalize()
Пример #9
0
def parse(context, data):
    """Group CSV rows by their ``Group ID`` column and parse each group.

    The file is read as ISO-8859-1 and its first physical line is discarded
    before ``csv.DictReader`` takes the next line as the header. The
    ``Group ID`` value is removed from each row and used as the bucket key;
    rows whose value is ``None`` are dropped. Every bucket is passed to
    ``parse_entry`` and the emitter is finalized.
    """
    emitter = EntityEmitter(context)
    grouped = defaultdict(list)
    with context.http.rehash(data) as res, \
            open(res.file_path, 'r', encoding='iso-8859-1') as handle:
        # The first line is not part of the table; skip it.
        next(handle)
        for record in csv.DictReader(handle):
            group_id = record.pop('Group ID')
            if group_id is None:
                continue
            grouped[group_id].append(record)

    for group_id, records in grouped.items():
        parse_entry(emitter, group_id, records)

    emitter.finalize()
Пример #10
0
def parse(context, data):
    """Fetch one red-notice JSON document and emit it as a ``Person``.

    The URL is taken from ``data['url']`` and the response body is decoded
    with ``json.loads``. Nothing is emitted when the joined forename/name
    is ``None`` or ``'Identity unknown'``. Arrest warrants are folded into a
    single summary string, one ``[country] charge`` entry per warrant joined
    with ``\\r\\n``.

    NOTE(review): ``emitter.finalize()`` is never called here, and the full
    response text is logged at info level -- confirm both are intended.
    """
    emitter = EntityEmitter(context)

    url = data.get('url')
    context.log.info("Interpol Red Notices URL: %s", url)

    with context.http.get(url) as result:
        context.log.info("result.text=%s", result.text)

        notice = json.loads(result.text)
        name = jointext(notice['forename'], notice['name'])
        if name is None or name == 'Identity unknown':
            return

        entity = emitter.make('Person')
        entity.make_id(url)
        entity.add('name', name)
        entity.add('sourceUrl', url)
        entity.add('description', notice['distinguishing_marks'])
        for keyword in ('REDNOTICE', 'CRIME'):
            entity.add('keywords', keyword)

        # "Last, First" names also get a "First Last" alias.
        if ', ' in name:
            last, first = name.split(', ', 1)
            entity.add('alias', jointext(first, last))

        summary = ''
        for warrant in notice['arrest_warrants']:
            country_tag = jointext('[', warrant['issuing_country_id'], ']',
                                   sep='')
            charge = jointext(country_tag, warrant['charge'])
            summary = jointext(summary, charge, sep='\r\n')
        entity.add('summary', summary)

        entity.add('lastName', notice['name'])
        entity.add('firstName', notice['forename'])
        entity.add('nationality', notice['nationalities'])
        entity.add('gender', SEXES[notice['sex_id']])
        entity.add('birthDate', notice['date_of_birth'])
        entity.add('birthPlace', notice['place_of_birth'])

        emitter.emit(entity)
Пример #11
0
def seco_parse(context, data):
    """Parse sanction targets from an XML file with program/place lookups.

    Two lookup tables are built first: ``programs`` maps the ``ssid`` of each
    program's ``sanctions-set`` to its English ``program-name``; ``places``
    maps each ``place`` ``ssid`` to ``parse_address(place)``. Every
    ``./target`` element is then passed to ``parse_entry`` together with
    both tables and the root element's ``date`` attribute.
    """
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        doc = res.xml
        updated_at = doc.getroot().get('date')

        programs = {}
        for program in doc.findall('.//sanctions-program'):
            set_id = program.find('./sanctions-set').get('ssid')
            programs[set_id] = program.findtext(
                './program-name[@lang="eng"]')

        places = {
            place.get('ssid'): parse_address(place)
            for place in doc.findall('.//place')
        }

        for target in doc.findall('./target'):
            parse_entry(emitter, target, programs, places, updated_at)
    emitter.finalize()
Пример #12
0
def parse(context, data):
    """Parse persons, organizations and memberships from a JSON document.

    ``parse_person`` and ``parse_organization`` each return an
    ``(ep_id, ftm_id)`` pair; the pairs are collected into lookup dicts that
    ``parse_membership`` receives for every entry under ``memberships``.
    The country code is read from ``data['country']['code']`` when present.
    """
    emitter = EntityEmitter(context)
    country = data.get('country', {}).get('code')
    with context.http.rehash(data) as res:
        persons = dict(
            parse_person(emitter, item, country)
            for item in res.json.get('persons', [])
        )
        organizations = dict(
            parse_organization(emitter, item, country)
            for item in res.json.get('organizations', [])
        )
        for item in res.json.get('memberships', []):
            parse_membership(emitter, item, persons, organizations)
    emitter.finalize()
Пример #13
0
def parse_notice(context, data):
    """Emit a ``Person`` built from one notice's JSON payload.

    The payload is ``res.json`` of the response obtained through
    ``context.http.rehash(data)``. Missing forename/name values fall back to
    the empty string. Each arrest warrant contributes its issuing country
    as ``program`` and its charge as ``summary``. The emitter is created,
    used and finalized inside this call.
    """
    with context.http.rehash(data) as res:
        notice = res.json
        first_name = notice['forename'] or ''
        last_name = notice['name'] or ''
        birth_date = notice['date_of_birth']
        nationalities = notice['nationalities']
        birth_place = notice['place_of_birth']
        warrants = []
        for warrant in notice['arrest_warrants']:
            warrants.append(
                (warrant['charge'], warrant['issuing_country_id']))
        gender = SEXES.get(notice['sex_id'])

        emitter = EntityEmitter(context)
        entity = emitter.make('Person')
        entity.make_id(first_name, last_name, notice['entity_id'])
        entity.add('name', first_name + ' ' + last_name)
        entity.add('firstName', first_name)
        entity.add('lastName', last_name)
        entity.add('nationality', nationalities)
        for charge, issuing_country in warrants:
            entity.add('program', issuing_country)
            entity.add('summary', charge)
        entity.add('gender', gender)
        entity.add('birthPlace', birth_place)
        entity.add('birthDate', parse_date(birth_date))
        entity.add('sourceUrl', notice['_links']['self']['href'])
        for keyword in ('REDNOTICE', 'CRIME'):
            entity.add('keywords', keyword)
        emitter.emit(entity)
        emitter.finalize()
Пример #14
0
def parse(context, data):
    """Scrape one wanted-person HTML page and emit it as a ``Person``.

    The name comes from ``div.nom_fugitif_wanted``; pages without a name or
    named ``'Identity unknown'`` are skipped. Detail rows under
    ``div.bloc_detail`` are read as (label, value) cell pairs; the slugified
    label selects which property the value is added to. Rows with no value
    text or an unrecognised label are ignored.

    NOTE(review): ``emitter.finalize()`` is never called here -- confirm
    the emitter does not require it.
    """
    # Labels whose value is stored unchanged under a single property.
    plain_fields = {
        'charges': 'summary',
        'present_family_name': 'lastName',
        'forename': 'firstName',
        'place_of_birth': 'birthPlace',
    }
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as result:
        doc = result.html
        name = element_text(doc.find('.//div[@class="nom_fugitif_wanted"]'))
        if name is None or name == 'Identity unknown':
            return
        entity = emitter.make('Person')
        entity.make_id(data.get('url'))
        entity.add('name', name)
        entity.add('sourceUrl', data.get('url'))
        wanted_by = element_text(
            doc.find('.//span[@class="nom_fugitif_wanted_small"]'))
        entity.add('program', wanted_by)
        for keyword in ('REDNOTICE', 'CRIME'):
            entity.add('keywords', keyword)

        # "Last, First" names also get a "First Last" alias.
        if ', ' in name:
            last, first = name.split(', ', 1)
            entity.add('alias', jointext(first, last))

        for row in doc.findall('.//div[@class="bloc_detail"]//tr'):
            # Each detail row is expected to hold exactly two cells.
            label_cell, value_cell = row.findall('./td')
            field = slugify(element_text(label_cell), sep='_')
            text = element_text(value_cell)
            if text is None:
                continue
            if field in plain_fields:
                entity.add(plain_fields[field], text)
            elif field == 'nationality':
                for country in text.split(', '):
                    entity.add('nationality', country)
            elif field == 'sex':
                entity.add('gender', SEXES[text])
            elif field == 'date_of_birth':
                # Keep only the part before any parenthesised suffix.
                entity.add('birthDate', text.split('(')[0])
        emitter.emit(entity)
Пример #15
0
def parse(context, data):
    """Crawl the A-Z member listings, following pagination links.

    For every uppercase ASCII letter the listing at ``URL % letter`` is
    fetched and each ``ul.member-results > li`` item is passed to
    ``parse_entry``. After a page is processed, the last pagination link
    not yet visited becomes the next page; the letter is done when no
    unvisited link remains. The visited set is shared across letters.
    """
    emitter = EntityEmitter(context)
    visited = set()
    for letter in string.ascii_uppercase:
        page_url = URL % letter
        while page_url is not None:
            context.log.info("URL: %s", page_url)
            doc = context.http.get(page_url).html
            for member in doc.findall('.//ul[@class="member-results"]/li'):
                parse_entry(emitter, member)

            visited.add(page_url)
            page_url = None
            for link in doc.findall('.//div[@id="pagination"]//a'):
                candidate = urljoin(URL, link.get('href'))
                if candidate not in visited:
                    page_url = candidate
    emitter.finalize()
Пример #16
0
def parse(context, data):
    """Parse parties, sanctions entries and relationships, with progress logs.

    Logs before and after ``res.xml`` is accessed, then processes the
    document in three passes, in this order: ``DistinctParty`` ->
    ``parse_party``, ``SanctionsEntry`` -> ``parse_entry``,
    ``ProfileRelationship`` -> ``parse_relation``.

    NOTE(review): ``emitter.finalize()`` is never called here -- confirm
    the emitter does not require it.
    """
    emitter = EntityEmitter(context)
    passes = (
        ('DistinctParty', parse_party),
        ('SanctionsEntry', parse_entry),
        ('ProfileRelationship', parse_relation),
    )
    with context.http.rehash(data) as res:
        context.log.info("LOADED XML")
        doc = res.xml
        context.log.info("PARSED XML")
        for tag, handler in passes:
            for element in doc.findall(qpath(tag)):
                handler(emitter, doc, element)
Пример #17
0
def officer(context, data):
    """Fetch one officer record and emit the person plus their sanctions.

    The record is requested from ``API_URL % officer_id`` using ``AUTH``;
    any status other than 200 ends the call without emitting. One
    ``Sanction`` is emitted per entry in ``disqualifications``, and each
    entry's address is flattened to a comma-separated string added to the
    person. The person is emitted last, after all addresses are attached.

    NOTE(review): ``emitter.finalize()`` is never called here -- confirm
    the emitter does not require it.
    """
    emitter = EntityEmitter(context)
    officer_id = data.get('officer_id')
    url = API_URL % officer_id
    with context.http.get(url, auth=AUTH) as res:
        if res.status_code != 200:
            return
        record = res.json
        person = emitter.make('Person')
        person.make_id(officer_id)
        self_link = record.get('links', {}).get('self', '/')
        person.add('sourceUrl', urljoin(WEB_URL, self_link))

        surname = record.pop('surname', None)
        person.add('lastName', surname)
        forename = record.pop('forename', None)
        person.add('firstName', forename)
        middle_names = record.pop('other_forenames', None)
        person.add('middleName', middle_names)
        person.add('name', jointext(forename, middle_names, surname))
        person.add('title', record.pop('title', None))

        person.add('nationality',
                   normalize_country(record.pop('nationality', None)))
        person.add('birthDate', record.pop('date_of_birth', None))

        for disqual in record.pop('disqualifications', []):
            case = disqual.get('case_identifier')
            sanction = emitter.make('Sanction')
            sanction.make_id(person.id, case)
            sanction.add('entity', person)
            sanction.add('authority', 'UK Companies House')
            sanction.add('program', case)
            sanction.add('startDate', disqual.pop('disqualified_from', None))
            sanction.add('endDate', disqual.pop('disqualified_until', None))
            emitter.emit(sanction)

            # Flatten the structured address into one display string.
            parts = disqual.pop('address', {})
            locality = jointext(parts.get('locality'),
                                parts.get('postal_code'))
            street = jointext(parts.get('address_line_1'),
                              parts.get('premises'))
            full_address = jointext(street, parts.get('address_line_2'),
                                    locality, parts.get('region'), sep=', ')
            person.add('address', full_address)
        emitter.emit(person)
Пример #18
0
def eeas_parse(context, data):
    """Pass every ``ENTITY`` element of the XML document to ``parse_entry``.

    NOTE(review): ``emitter.finalize()`` is never called here -- confirm
    the emitter does not require it.
    """
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        entities = res.xml.findall('.//ENTITY')
        for element in entities:
            parse_entry(emitter, element)
Пример #19
0
def parse(context, data):
    """Pass every ``acount-list`` element to ``parse_entry`` and finalize.

    NOTE(review): the tag is spelled ``acount-list`` in the query --
    presumably that matches the source XML; verify against a sample file.
    """
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        elements = res.xml.findall('.//acount-list')
        for element in elements:
            parse_entry(emitter, element)
    emitter.finalize()
Пример #20
0
def parse(context, data):
    """Pass every ``mep`` element of the XML document to ``parse_node``.

    The emitter is finalized once all elements have been processed.
    """
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        elements = res.xml.findall('.//mep')
        for element in elements:
            parse_node(emitter, element)
    emitter.finalize()
Пример #21
0
def parse(context, data):
    """Pass every ``record`` element of the XML document to ``parse_entry``.

    NOTE(review): no ``emitter.finalize()`` call is visible in this
    function -- confirm the emitter does not require it.
    """
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as result:
        records = result.xml.findall('.//record')
        for record in records:
            parse_entry(emitter, record)