def parse(context, data):
    """Stream the cached tab-separated file and emit entities row by row."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res, open(res.file_path, "r") as fh:
        # The source file is tab-delimited, not comma-delimited.
        reader = csv.DictReader(fh, delimiter="\t")
        for record in reader:
            parse_row(emitter, record)
    emitter.finalize()
def parse(context, data):
    """Emit a LegalEntity and its Sanction from one World Bank debarment record."""
    emitter = EntityEmitter(context)

    raw_name = data.get("SUPP_NAME")
    entity = emitter.make("LegalEntity")
    entity.make_id("WBDEBAR", raw_name, data.get("SUPP_ID"))
    # The first cleaned form is the primary name; remaining forms are aliases.
    name_forms = clean_name(raw_name)
    entity.add("name", name_forms[0])
    entity.add("address", data.get("SUPP_ADDR"))
    entity.add("address", data.get("SUPP_CITY"))
    entity.add("country", data.get("COUNTRY_NAME"))
    for alias in name_forms[1:]:
        entity.add("alias", alias)

    sanction = emitter.make("Sanction")
    sanction.make_id("Sanction", entity.id)
    sanction.add("authority", "World Bank Debarrment")
    sanction.add("program", data.get("DEBAR_REASON"))
    sanction.add("startDate", clean_date(data.get("DEBAR_FROM_DATE")))
    sanction.add("endDate", clean_date(data.get("DEBAR_TO_DATE")))
    sanction.add("sourceUrl", SOURCE)

    emitter.emit(entity)
    emitter.emit(sanction)
    emitter.finalize()
def parse(context, data):
    """Emit a LegalEntity plus Sanction for a single debarment row."""
    emitter = EntityEmitter(context)
    raw_name = data.get('SUPP_NAME')
    country = data.get('COUNTRY_NAME')

    entity = emitter.make('LegalEntity')
    entity.make_id(raw_name, data.get('SUPP_ID'), country)
    # clean_name() yields the primary name first, then alternate spellings.
    name_forms = clean_name(raw_name)
    entity.add('name', name_forms[0])
    entity.add('address', data.get('SUPP_ADDR'))
    entity.add('address', data.get('SUPP_CITY'))
    entity.add('country', normalize_country(country))
    for alias in name_forms[1:]:
        entity.add('alias', alias)

    sanction = emitter.make('Sanction')
    sanction.make_id('Sanction', entity.id)
    sanction.add('authority', 'World Bank Debarrment')
    sanction.add('program', data.get('DEBAR_REASON'))
    sanction.add('startDate', clean_date(data.get('DEBAR_FROM_DATE')))
    sanction.add('endDate', clean_date(data.get('DEBAR_TO_DATE')))
    sanction.add('sourceUrl', SOURCE)

    emitter.emit(entity)
    emitter.emit(sanction)
    emitter.finalize()
def parse(context, data):
    """Parse the downloaded XLS workbook, grouping rows by cleaned reference.

    Cells are normalised by xlrd ctype: numbers (2) become integer strings,
    dates (3) become ISO timestamps, empty cells (0) become None, everything
    else keeps its raw value.
    """
    emitter = EntityEmitter(context)
    references = defaultdict(list)
    with context.http.rehash(data) as res:
        xls = xlrd.open_workbook(res.file_path)
        ws = xls.sheet_by_index(0)
        # First sheet row holds the column names; slugify them for dict keys.
        headers = [slugify(h, sep="_") for h in ws.row_values(0)]
        for r in range(1, ws.nrows):
            row = dict(zip(headers, ws.row(r)))
            for header, cell in row.items():
                if cell.ctype == 2:
                    row[header] = str(int(cell.value))
                elif cell.ctype == 3:
                    date = xldate_as_datetime(cell.value, xls.datemode)
                    row[header] = date.isoformat()
                elif cell.ctype == 0:
                    row[header] = None
                else:
                    # BUG FIX: previously this assignment ran unconditionally
                    # (no `else:`), clobbering the converted number/date/None
                    # values above. The sibling XLS parser uses `else:` here.
                    row[header] = cell.value
            reference = clean_reference(row.get("reference"))
            references[reference].append(row)
        for ref, rows in references.items():
            parse_reference(emitter, ref, rows)
    emitter.finalize()
def fsf_parse(context, data):
    """Walk every sanctionEntity element of the EU FSF XML feed."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        # NS maps the "default" prefix for the namespaced findall query.
        entries = res.xml.findall(".//default:sanctionEntity", NS)
        for entry in entries:
            parse_entry(emitter, entry)
    emitter.finalize()
def parse(context, data):
    """Dispatch natural and legal persons from the Kyrgyz sanctions XML."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        root = res.xml
        for node in root.findall(".//KyrgyzPhysicPerson"):
            parse_person(emitter, node)
        for node in root.findall(".//KyrgyzLegalPerson"):
            parse_legal(emitter, node)
    emitter.finalize()
def parse(context, data):
    """Process the consolidated list: individuals first, then entities."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        # Same traversal for both record types, different handlers.
        handlers = (
            ('.//INDIVIDUAL', parse_individual),
            ('.//ENTITY', parse_entity),
        )
        for query, handler in handlers:
            for node in res.xml.findall(query):
                handler(emitter, node)
    emitter.finalize()
def parse(context, data):
    """Route table rows by width: 5 columns = organisation, 9 = individual."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        for row in extract_rows(res.xml):
            width = len(row)
            if width == 5:
                parse_organisation(emitter, row)
            if width == 9:
                parse_individual(emitter, row)
    emitter.finalize()
def parse(context, data):
    """Parse the namespaced sanctions XML: parties, entries, relationships."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        doc = res.xml
        for node in doc.findall(qpath('DistinctParty')):
            parse_party(emitter, doc, node)
        for node in doc.findall(qpath('SanctionsEntry')):
            parse_entry(emitter, doc, node)
        for node in doc.findall(qpath('ProfileRelationship')):
            parse_relation(emitter, doc, node)
    emitter.finalize()
def parse(context, data):
    """Group CSV rows by their 'Group ID' column, then parse each group."""
    emitter = EntityEmitter(context)
    grouped = defaultdict(list)
    with context.http.rehash(data) as res:
        with open(res.file_path, 'r', encoding='iso-8859-1') as fh:
            # The first physical line precedes the real CSV header; skip it.
            next(fh)
            for record in csv.DictReader(fh):
                group_id = record.pop('Group ID')
                if group_id is not None:
                    grouped[group_id].append(record)
    for group_id, records in grouped.items():
        parse_entry(emitter, group_id, records)
    emitter.finalize()
def seco_parse(context, data):
    """Parse the Swiss SECO sanctions XML: programs, places, then targets."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        updated_at = res.xml.getroot().get("date")
        # Map sanctions-set ssid -> English program name.
        programs = {}
        for node in res.xml.findall(".//sanctions-program"):
            key = node.find("./sanctions-set").get("ssid")
            programs[key] = node.findtext('./program-name[@lang="eng"]')
        # Map place ssid -> parsed address.
        places = {
            node.get("ssid"): parse_address(node)
            for node in res.xml.findall(".//place")
        }
        for target in res.xml.findall("./target"):
            parse_entry(emitter, target, programs, places, updated_at)
    emitter.finalize()
def parse(context, data):
    """Parse a JSON dump of persons, organizations and their memberships."""
    emitter = EntityEmitter(context)
    country = data.get('country', {}).get('code')
    with context.http.rehash(data) as res:
        payload = res.json
        # Record source-ID -> FtM-ID mappings so memberships can resolve
        # both endpoints after the fact.
        persons = {}
        for person in payload.get('persons', []):
            src_id, ftm_id = parse_person(emitter, person, country)
            persons[src_id] = ftm_id
        organizations = {}
        for org in payload.get('organizations', []):
            src_id, ftm_id = parse_organization(emitter, org, country)
            organizations[src_id] = ftm_id
        for membership in payload.get('memberships', []):
            parse_membership(emitter, membership, persons, organizations)
    emitter.finalize()
def parse_reference(emitter: EntityEmitter, row: dict):
    """Emit a Person entity for one assets-freezing record."""
    first_name = row.pop("firstname")
    last_name = row.pop("lastname")
    birth_date = row.pop("birthdate")
    person = emitter.make("Person")
    # Name and birth date together make the record identifier.
    person.make_id("FREEZING", "{}{}{}".format(first_name, last_name, birth_date))
    person.add("status", NATURES.get(row.pop("nature")))
    person.add("birthDate", birth_date)
    person.add("birthPlace", row.pop("birthplace"))
    person.add("name", "{} {}".format(first_name, last_name))
    person.add("alias", row.pop("aliases"))
    person.add("keywords", "ASSETS_FREEZING")
    emitter.emit(person)
def officer(context, data):
    """Fetch one UK Companies House officer record and emit a Person,
    plus one Sanction per disqualification.

    data: expects an "officer_id" key used to build the API URL.
    """
    emitter = EntityEmitter(context)
    officer_id = data.get("officer_id")
    url = API_URL % officer_id
    with context.http.get(url, auth=AUTH) as res:
        # Any non-200 response is logged and the record skipped entirely.
        if res.status_code != 200:
            context.log.info("CoH error: %r", res.json)
            return
        data = res.json
        person = emitter.make("Person")
        person.make_id(officer_id)
        source_url = urljoin(WEB_URL, data.get("links", {}).get("self", "/"))
        person.add("sourceUrl", source_url)
        last_name = data.pop("surname", None)
        person.add("lastName", last_name)
        forename = data.pop("forename", None)
        person.add("firstName", forename)
        other_forenames = data.pop("other_forenames", None)
        person.add("middleName", other_forenames)
        person.add("name", jointext(forename, other_forenames, last_name))
        person.add("title", data.pop("title", None))
        person.add("nationality", data.pop("nationality", None))
        person.add("birthDate", data.pop("date_of_birth", None))
        person.add("topics", "crime")
        for disqual in data.pop("disqualifications", []):
            case = disqual.get("case_identifier")
            sanction = emitter.make("Sanction")
            sanction.make_id(person.id, case)
            sanction.add("entity", person)
            sanction.add("authority", "UK Companies House")
            sanction.add("program", case)
            from_date = disqual.pop("disqualified_from", None)
            # NOTE(review): with several disqualifications, created_at keeps
            # the start date of the last one iterated — confirm intended.
            person.context["created_at"] = from_date
            sanction.add("startDate", from_date)
            sanction.add("endDate", disqual.pop("disqualified_until", None))
            emitter.emit(sanction)
            # Each disqualification carries its own address; all of them are
            # folded onto the person as extra address values.
            address = disqual.pop("address", {})
            locality = address.get("locality")
            locality = jointext(locality, address.get("postal_code"))
            street = address.get("address_line_1")
            premises = address.get("premises")
            street = jointext(street, premises)
            address = jointext(
                street,
                address.get("address_line_2"),
                locality,
                address.get("region"),
                sep=", ",
            )
            person.add("address", address)
        emitter.emit(person)
def parse_notice(context, data):
    """Turn one Interpol red-notice JSON record into a Person entity."""
    with context.http.rehash(data) as res:
        notice = res.json
        first_name = notice["forename"] or ""
        last_name = notice["name"] or ""
        # Pair every arrest warrant's charge with its issuing country.
        warrants = [
            (w["charge"], w["issuing_country_id"])
            for w in notice["arrest_warrants"]
        ]
        emitter = EntityEmitter(context)
        entity = emitter.make("Person")
        entity.make_id("INTERPOL", first_name, last_name, notice["entity_id"])
        entity.add("name", first_name + " " + last_name)
        entity.add("firstName", first_name)
        entity.add("lastName", last_name)
        entity.add("nationality", notice["nationalities"])
        for charge, country in warrants:
            entity.add("program", country)
            entity.add("summary", charge)
        entity.add("gender", SEXES.get(notice["sex_id"]))
        entity.add("birthPlace", notice["place_of_birth"])
        entity.add("birthDate", parse_date(notice["date_of_birth"]))
        entity.add("sourceUrl", notice["_links"]["self"]["href"])
        entity.add("keywords", "REDNOTICE")
        entity.add("topics", "crime")
        emitter.emit(entity)
        emitter.finalize()
def parse_notice(context, data):
    """Turn one Interpol notice JSON record into a Person entity."""
    with context.http.rehash(data) as res:
        notice = res.json
        first_name = notice['forename'] or ''
        last_name = notice['name'] or ''
        # Pair every arrest warrant's charge with its issuing country.
        warrants = [
            (w['charge'], w['issuing_country_id'])
            for w in notice['arrest_warrants']
        ]
        emitter = EntityEmitter(context)
        entity = emitter.make('Person')
        entity.make_id(first_name, last_name, notice['entity_id'])
        entity.add('name', first_name + ' ' + last_name)
        entity.add('firstName', first_name)
        entity.add('lastName', last_name)
        entity.add('nationality', notice['nationalities'])
        for charge, country in warrants:
            entity.add('program', country)
            entity.add('summary', charge)
        entity.add('gender', SEXES.get(notice['sex_id']))
        entity.add('birthPlace', notice['place_of_birth'])
        entity.add('birthDate', parse_date(notice['date_of_birth']))
        entity.add('sourceUrl', notice['_links']['self']['href'])
        entity.add('keywords', 'REDNOTICE')
        entity.add('keywords', 'CRIME')
        emitter.emit(entity)
        emitter.finalize()
def parse(context, data):
    """Scrape a country leadership page, emitting a Person per officeholder."""
    emitter = EntityEmitter(context)
    url = data.get('url')
    country = normalize_country(data.get('country'))
    with context.http.rehash(data) as res:
        listing = res.html.find('.//div[@id="countryOutput"]')
        # No listing container means there is nothing to parse on this page.
        if listing is None:
            return
        for item in listing.findall('.//li'):
            function = element_text(item.find('.//span[@class="title"]'))
            if function is None:
                continue
            name = element_text(item.find('.//span[@class="cos_name"]'))
            if name is None:
                continue
            person = emitter.make('Person')
            person.make_id(country, name, function)
            person.add('name', name)
            person.add('country', country)
            person.add('position', function)
            person.add('sourceUrl', url)
            emitter.emit(person)
    emitter.finalize()
def crawl_country(context, path, country):
    """Scrape one country page of the leadership site and emit its officials."""
    emitter = EntityEmitter(context)
    source_url = UI_URL % path
    context.log.info("Crawling country: %s", country)
    res = requests.get(DATA_URL % path)
    page = res.json().get("result", {}).get("data", {}).get("page", {})
    block = page.get("acf", {}).get("blocks", [{}])[0]
    markup = block.get("free_form_content", []).get("content")
    tree = html.fromstring(markup)
    role = None
    for node in tree.getchildren():
        label = node.text_content().strip()
        if node.tag == "h2":
            # Top-level section headings carry no person data.
            continue
        if node.tag == "h3":
            # h3 headings name the office held by the people that follow.
            role = label
            continue
        holder = label.replace("(Acting)", "")
        person = emitter.make("Person")
        person.make_id(source_url, holder, role)
        person.add("name", holder)
        person.add("country", country)
        person.add("position", role)
        person.add("sourceUrl", source_url)
        person.add("topics", "role.pep")
        emitter.emit(person)
    emitter.finalize()
def parse(context, data):
    """Follow the page's download link, read the XLS inside the ZIP, and
    parse each data row into a freezing-list reference.

    Cells are normalised by xlrd ctype: numbers (2) -> integer strings,
    dates (3) -> ISO timestamps, empty (0) -> None, others keep raw value.
    """
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        doc = res.html
        url_file = doc.find(".//main//ul//li//a[@href]").get("href")
        content = requests.get(url_file).content
        # FIX: open the archive in a context manager so the ZipFile handle
        # is always closed; previously it was left open.
        with zipfile.ZipFile(io.BytesIO(content), "r") as zfp:
            fp = zfp.read(zfp.filelist[0])
        xls = xlrd.open_workbook(file_contents=fp)
        ws = xls.sheet_by_index(1)
        headers = [
            "firstname",
            "lastname",
            "aliases",
            "birthdate",
            "birthplace",
            "other informations",
            "nature",
        ]
        # Data starts on the fourth sheet row; earlier rows are titles.
        for r in range(3, ws.nrows):
            row = dict(zip(headers, ws.row(r)))
            for header, cell in row.items():
                if cell.ctype == 2:
                    row[header] = str(int(cell.value))
                elif cell.ctype == 3:
                    date = xldate_as_datetime(cell.value, xls.datemode)
                    row[header] = date.isoformat()
                elif cell.ctype == 0:
                    row[header] = None
                else:
                    row[header] = cell.value
            parse_reference(emitter, row)
    emitter.finalize()
def parse(context, data):
    """Parse the OFAC advanced XML: load reference tables, then all records."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        doc = remove_namespace(res.xml)
        # Reference values, locations and ID documents must be loaded before
        # any record parsing, since the parsers look values up in them.
        context.log.info("Loading reference values...")
        load_ref_values(doc)
        context.log.info("Loading locations...")
        locations = load_locations(doc)
        context.log.info("Loading ID reg documents...")
        documents = load_documents(doc)
        for node in doc.findall(".//DistinctParty"):
            parse_party(emitter, doc, node, locations, documents)
        for node in doc.findall(".//SanctionsEntry"):
            parse_entry(emitter, doc, node)
        for node in doc.findall(".//ProfileRelationship"):
            parse_relation(emitter, doc, node)
    emitter.finalize()
def store(context, data):
    """Link an entity to the Aleph document that evidences it."""
    emitter = EntityEmitter(context)
    entity = model.get_proxy(data["entity"])
    doc_link = emitter.make("Documentation")
    doc_link.make_id("Documentation", data["entity"]["id"], data["aleph_id"])
    doc_link.add("entity", entity)
    doc_link.add("document", data["aleph_id"])
    emitter.emit(doc_link)
    emitter.finalize()
def parse(context, data):
    """Crawl the A-Z member index, visiting each pagination link once."""
    emitter = EntityEmitter(context)
    seen = set()
    for letter in string.ascii_uppercase:
        url = URL % letter
        while url is not None:
            context.log.info("URL: %s", url)
            doc = context.http.get(url).html
            for member in doc.findall('.//ul[@class="member-results"]/li'):
                parse_entry(emitter, member)
            seen.add(url)
            # Continue with any pagination link not yet visited, or stop.
            url = None
            for a in doc.findall('.//div[@id="pagination"]//a'):
                next_url = urljoin(URL, a.get("href"))
                if next_url not in seen:
                    url = next_url
    emitter.finalize()
def officer(context, data):
    """Fetch one UK Companies House officer record and emit a Person,
    plus one Sanction per disqualification.

    data: expects an "officer_id" key used to build the API URL.
    """
    emitter = EntityEmitter(context)
    officer_id = data.get('officer_id')
    url = API_URL % officer_id
    with context.http.get(url, auth=AUTH) as res:
        # Any non-200 response is skipped without logging.
        if res.status_code != 200:
            return
        data = res.json
        person = emitter.make('Person')
        person.make_id(officer_id)
        source_url = urljoin(WEB_URL, data.get('links', {}).get('self', '/'))
        person.add('sourceUrl', source_url)
        last_name = data.pop('surname', None)
        person.add('lastName', last_name)
        forename = data.pop('forename', None)
        person.add('firstName', forename)
        other_forenames = data.pop('other_forenames', None)
        person.add('middleName', other_forenames)
        person.add('name', jointext(forename, other_forenames, last_name))
        person.add('title', data.pop('title', None))
        nationality = normalize_country(data.pop('nationality', None))
        person.add('nationality', nationality)
        person.add('birthDate', data.pop('date_of_birth', None))
        for disqual in data.pop('disqualifications', []):
            case = disqual.get('case_identifier')
            sanction = emitter.make('Sanction')
            sanction.make_id(person.id, case)
            sanction.add('entity', person)
            sanction.add('authority', 'UK Companies House')
            sanction.add('program', case)
            sanction.add('startDate', disqual.pop('disqualified_from', None))
            sanction.add('endDate', disqual.pop('disqualified_until', None))
            emitter.emit(sanction)
            # Each disqualification carries its own address; all of them
            # end up as extra address values on the person.
            address = disqual.pop('address', {})
            locality = address.get('locality')
            locality = jointext(locality, address.get('postal_code'))
            street = address.get('address_line_1')
            premises = address.get('premises')
            street = jointext(street, premises)
            address = jointext(street, address.get('address_line_2'),
                               locality, address.get('region'), sep=', ')
            person.add('address', address)
        emitter.emit(person)
def parse(context, data):
    """Scrape a leadership listing page; emit a Person per officeholder,
    stamped with the page's last-update date when it parses."""
    emitter = EntityEmitter(context)
    url = data.get("url")
    country = data.get("country")
    with context.http.rehash(data) as res:
        page = res.html
        listing = page.find('.//div[@id="countryOutput"]')
        # Without the listing container there is nothing to parse.
        if listing is None:
            return
        for item in listing.findall(".//li"):
            function = element_text(item.find('.//span[@class="title"]'))
            if function is None:
                continue
            name = element_text(item.find('.//span[@class="cos_name"]'))
            if name is None:
                continue
            person = emitter.make("Person")
            person.make_id(url, country, name, function)
            person.add("name", name)
            person.add("country", country)
            person.add("position", function)
            person.add("sourceUrl", url)
            person.add("topics", "role.pep")
            raw_updated = page.findtext('.//span[@id="lastUpdateDate"]')
            updated_at = parse_updated(raw_updated)
            if updated_at is not None:
                person.add("modifiedAt", updated_at)
                person.context["updated_at"] = updated_at.isoformat()
            emitter.emit(person)
    emitter.finalize()
def parse_notice(context, data):
    """Turn one Interpol yellow-notice JSON record into a Person entity."""
    with context.http.rehash(data) as res:
        notice = res.json
        first_name = notice["forename"] or ""
        last_name = notice["name"] or ""
        emitter = EntityEmitter(context)
        entity = emitter.make("Person")
        entity.make_id("INTERPOL", first_name, last_name, notice["entity_id"])
        entity.add("name", first_name + " " + last_name)
        entity.add("firstName", first_name)
        entity.add("lastName", last_name)
        entity.add("nationality", notice["nationalities"])
        entity.add("gender", SEXES.get(notice["sex_id"]))
        entity.add("birthPlace", notice["place_of_birth"])
        entity.add("birthDate", parse_date(notice["date_of_birth"]))
        entity.add("sourceUrl", notice["_links"]["self"]["href"])
        entity.add("keywords", "YELLOWNOTICE")
        entity.add("topics", "researched")
        emitter.emit(entity)
        emitter.finalize()
def parse(context, data):
    """Scrape one diplomat detail page into a Person entity.

    The page's detail list is folded into the `infos` dict (keyed by the
    label text with its trailing colon stripped), from which name, address,
    contact and country fields are extracted; whatever remains is serialised
    into the notes property as JSON.
    """
    emitter = EntityEmitter(context)
    url = data.get("url")
    with context.http.rehash(data) as res:
        doc = res.html
        divs = doc.findall('.//div[@class="regular-details"]/div')
        # First detail column holds the portrait image.
        image_link = "{}{}".format(ROOT_URL, divs[0].find(".//img").attrib["src"])
        infos = {}
        infos["phone"] = []
        # Second column is a label/value list; the first child of each <li>
        # is the label element, the rest (if any) are the values.
        for li in divs[1].findall('.//ul[@class="no-bullet"]/li'):
            children = li.getchildren()
            title = children[0]
            if len(children) > 1:
                # Multi-child item: collect one value per child by tag type.
                # title.text.strip()[0:-1] drops the label's trailing colon.
                infos[title.text.strip()[0:-1]] = []
                for child in children:
                    if child.tag == "a":
                        infos[title.text.strip()[0:-1]].append(child.text)
                    if child.tag == "ul":
                        for li_in in child.findall("./li"):
                            infos[title.text.strip()[0:-1]].append(li_in.text)
                    if child.tag == "b":
                        # Values live in the text nodes that follow the label.
                        for item in title.xpath(
                                "following-sibling::*/text()|following-sibling::text()"
                        ):
                            item = item.strip()
                            if item:
                                infos[title.text.strip()[0:-1]].append(item)
                    if child.tag == "img":
                        infos[title.text.strip()[0:-1]].append(
                            "image: {}{}".format(ROOT_URL, child.attrib["src"]))
            elif title.tag == "b" or title.tag == "i":
                if title.tag == "i" and not title.attrib:
                    # A bare <i> with no attributes is free-form description.
                    infos["description"] = title.text
                else:
                    for item in title.xpath(
                            "following-sibling::*/text()|following-sibling::text()"
                    ):
                        item = item.strip()
                        if item:
                            if title.tag == "b":
                                infos[title.text.strip()[0:-1]] = item
                            elif title.tag == "i":
                                # Icon class distinguishes fax from phone.
                                phone_type = "phone"
                                if title.attrib["class"] == "fa fa-fax":
                                    phone_type = "fax"
                                infos["phone"].append("{}: {}".format(
                                    phone_type, item))
        # "Full name" is rendered "Lastname, Firstname".
        first_name = infos["Full name"].split(", ")[1]
        last_name = infos["Full name"].split(", ")[0]
        if "Languages" in infos:
            infos["Languages"] = [
                info.strip() for info in infos["Languages"].split(",")
            ]
        person = emitter.make("Person")
        person.make_id(url, first_name, last_name)
        person.add("sourceUrl", url)
        person.add("firstName", first_name)
        person.add("lastName", last_name)
        person.add("name", infos.pop("Full name"))
        person.add("description", infos.get("description"))
        street = infos.get("Street", "")
        city = infos.get("City", "")
        postal_code = infos.get("Postal code", "")
        country = infos.get("Country", "")
        person.add("address", "{} {} {} {}".format(street, city, postal_code,
                                                   country))
        # NOTE(review): the `email=` keyword argument looks wrong — other
        # calls pass values positionally; confirm add() accepts this kwarg.
        person.add("email", email=infos.get("Emails"))
        person.add("country", infos.get("Represented Country"))
        person.add("phone", infos.get("phone"))
        # TODO: make political party into an entity
        # TODO: don't have any left-over JSON stuff :)
        infos["Photo"] = image_link
        person.add("notes", json.dumps({key: value
                                        for key, value in infos.items()}))
        emitter.emit(person)
    emitter.finalize()
def parse(context, data):
    """Scrape one politician profile page (schema.org microdata) into a
    Person plus, when present, their party and a Membership linking them.
    """
    url = data["url"]
    response = context.http.rehash(data)
    html = response.html
    emitter = EntityEmitter(context)
    person = emitter.make("Person")
    title = _get_itemprop(html, 'http://schema.org/honorificPrefix')
    firstName = _get_itemprop(html, "http://schema.org/givenName")
    familyName = _get_itemprop(html, "http://schema.org/familyName")
    # Pages without both names are skipped entirely (nothing is emitted).
    if not firstName or not familyName:
        return
    context.log.info("Parsing Person '" + firstName + " " + familyName +
                     "' found at: " + url)
    birthDate = _extract_birth_date(_get_itemprop(html, "birthDate"))
    birthPlace = _get_itemprop(html, "birthPlace")
    telephone = _get_itemprop(html, "http://schema.org/telephone")
    # NOTE(review): faxNumber and image are extracted but never added to the
    # entity below — confirm whether that is intentional.
    faxNumber = _get_itemprop(html, "http://schema.org/faxNumber")
    image = _extract_img_src(html)
    email = _get_itemprop(html, "http://schema.org/email", "*")
    _extract_personal_websites(context, person, html)
    person.add("title", title)
    person.add("firstName", firstName)
    person.add("lastName", familyName)
    person.add("name", " ".join([firstName, familyName]))
    person.add("birthDate", birthDate)
    person.add("birthPlace", birthPlace)
    # Source is Austrian; country is hard-coded.
    person.add("country", "at")
    _extract_social_media(html, person, context)
    _extract_addresses(context, html, person)
    person.add("phone", telephone)
    person.add("email", email)
    person.add("sourceUrl", url)
    person.make_id(url)
    # Side tables: mandates, clubs/societies, companies and affiliations.
    _parse_info_table(emitter, context, person, html, make_mandates, "mandate")
    _parse_info_table(emitter, context, person, html, _make_societies,
                      "vereine")
    _parse_info_table(emitter, context, person, html,
                      _make_work_and_affiliates, "firmenfunktionen")
    party = _make_party(context, data, emitter, html)
    emitter.emit(person)
    # Without a party there is no membership to emit; finish early.
    if not party:
        emitter.finalize()
        return
    emitter.emit(party)
    membership = emitter.make("Membership")
    membership.make_id(person.id, party.id)
    membership.add("member", person.id)
    membership.add("organization", party.id)
    membership.add("sourceUrl", url)
    emitter.emit(membership)
    emitter.finalize()
def parse(context, data):
    """Process each acount-list element of the source XML."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        # "acount-list" spelling matches the tag used by the source feed.
        for node in res.xml.findall(".//acount-list"):
            parse_entry(emitter, node)
    emitter.finalize()
def eeas_parse(context, data):
    """Process each ENTITY element of the EEAS sanctions XML."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        for node in res.xml.findall(".//ENTITY"):
            parse_entry(emitter, node)
    emitter.finalize()
def parse(context, data):
    """Process each mep (Member of the European Parliament) element."""
    emitter = EntityEmitter(context)
    with context.http.rehash(data) as res:
        for entry in res.xml.findall('.//mep'):
            parse_node(emitter, entry)
    emitter.finalize()