def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser") div = doc.find('div', id="print_content") u = div.find("u").get_text().strip() name = u.split(':')[-1] year = u.split(':')[0][-4:] p = div.find_all("p")[1] text = p.get_text().strip() fields = [ {"tag": "url", "value": url}, {"tag": "text", "value": text}, {"tag": "year", "value": year} ] yield { "_meta": { "id": helpers.make_id(name), "entity_type": "company" }, "fields": fields, "name": name }
def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser") div = doc.find('div', id="print_content") u = div.find("u").get_text().strip() name = u.split(':')[-1] year = u.split(':')[0][-4:] p = div.find_all("p")[1] text = p.get_text().strip() fields = [{ "tag": "url", "value": url }, { "tag": "text", "value": text }, { "tag": "year", "value": year }] yield { "_meta": { "id": helpers.make_id(name), "entity_type": "company" }, "fields": fields, "name": name }
def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url), "html.parser") lis = doc.find_all("li", class_="card card--horizontal") for li in lis: name = get_name(li.find("span", class_="card__name").get_text()) try: mydate = get_date( li.find("span", class_="card__date").get_text()) except AttributeError: mydate = '' try: place = li.find("span", class_="card__place").get_text() except AttributeError: place = "" picture_url = urljoin(_site_url, li.find("img")["src"]) fields = [ { "name": "Date", "value": mydate }, { "name": "Place", "value": place }, { "tag": "picture_url", "value": picture_url }, ] try: entity_url = urljoin(_entity_base_url, li.find("a", class_="card__box")["href"]) doc2 = BeautifulSoup(helpers.fetch_string(entity_url), "html.parser") div = doc2.find("div", class_="article__text") ps = div.find_all("p") header = ps[0].get_text().strip() text = ' '.join([p.get_text().strip() for p in ps[1:]]) fields.append({"name": header, "value": text}) except TypeError: entity_url = '' fields.append({"tag": "url", "value": entity_url}) yield { "_meta": { "id": helpers.make_id(name), "entity_type": "person" }, "fields": fields, "name": name }
def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url), "html.parser") table = doc.find('table', class_='views-table').find('tbody') trs = table.find_all('tr') for tr in trs: td = tr.find_all('td') href = td[0].find_all('a')[1]['href'] name = td[1].get_text().strip() matter_type = td[2].get_text().strip() matter_type = " ".join([word.capitalize() for word in matter_type.split()]) date_failed = td[3].get_text().strip() date_failed = "{}-{}-{}".format(date_failed[:4], date_failed[4:6], date_failed[6:]) fields = [{"name": "Matter Type", "value": matter_type}, {"name": "Docket Number", "value": href}, {"name": "Date Failed", "value": date_failed}] names = _get_name(name): if len(names) > 1: name = names[0] aka = [] for aka_name in names: aka.append({'name': aka_name}) else: name = names[0] my_id = helpers.make_id(new_name) if len(my_id) > 99: my_id = my_id[:99] if any(word in name for word in company): entity_type = "company" else: entity_type = "person" if aka: yield { "_meta": { "id": my_id, "entity_type": entity_type }, "fields": fields, "aka": aka, "name": name, } else: yield { "_meta": { "id": my_id, "entity_type": entity_type }, "fields": fields, "name": name, }
def _generate_entities(url, name, office, years_active=None, parish=None):
    """Build and return the entity for one member's page."""
    doc = BeautifulSoup(helpers.fetch_string(url), "html.parser")
    img = urljoin(_site_url, doc.find("img", class_="ms-rteImage-2")['src'])
    h3 = doc.find_all('h3', class_="ms-rteElement-H3")
    # Everything between the first <h3> and the next one describes the
    # member's current posts.
    div = h3[0].find_next_sibling()
    current = 'Current Posts ' + div.get_text().strip()
    while div.name != 'h3':
        div = div.find_next_sibling()
        if div:
            current += ' ' + div.get_text().strip()
        else:
            break
    current = current.replace('dateMember', 'date Member')
    fields = [
        {"tag": "url", "value": url},
        {"tag": "Current Posts", "value": current},
        {"tag": "picture_url", "value": img},
        {"tag": "Office", "value": office},
    ]
    if years_active:
        fields.append({"tag": "Years Active", "value": years_active})
    if parish:
        fields.append({"tag": "Parish", "value": parish})
    try:
        # Consecutive <p> elements after the second <h3> form the career.
        p = h3[1].find_next_sibling()
        career = p.get_text().strip()
        while p.name == 'p':
            p = p.find_next_sibling()
            if p is None:
                break
            career += ' ' + p.get_text().strip()
        fields.append({"tag": "Parliamentary Career", "value": career})
    except IndexError:
        pass
    ps = doc.find_all("p", class_="ms-rteElement-P")
    for p in ps:
        p = p.get_text().strip()
        if 'Born' in p:
            fields.append({"tag": "date_of_birth",
                           "value": get_date(p.split(':')[-1].strip())})
        elif 'Parents' in p:
            fields.append({"tag": "Parents",
                           "value": p.split(':')[-1].strip()})
    return {
        "_meta": {
            "id": helpers.make_id(name),
            "entity_type": "person"
        },
        "fields": fields,
        "name": name
    }
def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser") section = doc.find('section', id="pageContent") h1 = section.find("h1").get_text().strip() if '(' in h1: h1_without_bracket = h1.split('(')[0] + h1.split(')')[-1] h1_without_bracket = h1_without_bracket.strip() else: h1_without_bracket = h1 names1 = h1_without_bracket.split(',') names2 = [] for name in names1: for new_name in divide_name(name, ' & '): names2.append(new_name) new_names = [] for name in names2: for new_name in divide_name(name, ' and '): new_names.append(new_name) text = section.find("p").get_text().strip() fields = [ {"tag": "url", "value": url}, {"name": "text", "value": text} ] custom_fields = section.find_all("h2") for custom_field in custom_fields: field_name = custom_field.get_text().strip() if field_name == 'Defendants': values1 = section.find_all('div', class_="chargeDefendant") values2 = section.find_all('div', class_="chargeCharge") values = zip(values1, values2) field_value = ' '.join([value[0].get_text().strip() + ' ' + value[1].get_text().strip() for value in values]) else: field_value = custom_field.find_next_sibling('p').get_text().strip() fields.append({"tag": field_name, "value": field_value}) for name in new_names: name = name.strip() yield { "_meta": { "id": helpers.make_id(name), "entity_type": "company" }, "fields": fields, "name": name }
def _generate_entities(): """for each scrapable page, yield an entity""" run = True page = 0 while run: url = _base_url + str(page) page += 1 doc = BeautifulSoup(helpers.fetch_string(url), "html.parser") div = doc.find('div', id="resultsSearchBox") all_h3 = div.find_all("h3", id='') if not all_h3: run = False return for h3 in all_h3: a = h3.find('a') href = urljoin(_site_url, a['href']) name = a.get_text().split(':')[1].strip() sub = h3.find_next_sibling('sub') spans = sub.find_all('span') if spans: published = get_date(spans[0].get_text().strip()) modified = get_date(spans[1].get_text().strip()) else: sub = sub.get_text().strip() published = get_date(sub[11:21]) modified = get_date(sub[-10:]) if any(company in name.lower() for company in companies): entity_type = "company" else: entity_type = "person" fields = [{ "tag": "url", "value": href }, { "tag": "Published", "value": published }, { "tag": "Last Modified", "value": modified }] yield { "_meta": { "id": helpers.make_id(name), "entity_type": entity_type }, "fields": fields, "name": name }
def _generate_entities(data):
    """For each scrapable row triple, build an entity and emit it."""
    i = 0
    while i < len(data):
        # Rows arrive as flat triples: date, name cell, related-docs cell.
        release_date = datetime.strptime(data[i].text, '%m/%d/%Y')
        release_date = release_date.strftime('%Y-%m-%d')
        name = data[i + 1].text
        url = data[i + 1].find_element_by_tag_name('a').get_attribute("href")
        href = data[i + 2].find_element_by_tag_name('a').get_attribute("href")
        related = []
        if href:
            doc = BeautifulSoup(helpers.fetch_string(href), "html.parser")
            tds = doc.find_all("td", class_='ms-vb')
            for td in tds:
                try:
                    related.append(td.find('a')['href'])
                except (TypeError, KeyError):
                    # Cell has no link.
                    pass
        related_documents = ' '.join(related)
        fields = [{"name": "Release date", "value": release_date},
                  {"tag": "url", "value": url},
                  {"name": "Related documents", "value": related_documents}]
        i += 3
        my_id = helpers.make_id(name)
        if len(my_id) > 99:
            my_id = my_id[:99]
        entity = {
            "_meta": {
                "id": my_id,
                "entity_type": "company"
            },
            "fields": fields,
            "name": name,
        }
        helpers.emit(entity)
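find_element_by_tag_name was removed in Selenium 4, so the lookups above only work against Selenium 3. On a newer driver the equivalent is the By locator form (standard Selenium API, shown against the same data rows):

from selenium.webdriver.common.by import By

url = data[i + 1].find_element(By.TAG_NAME, 'a').get_attribute('href')
href = data[i + 2].find_element(By.TAG_NAME, 'a').get_attribute('href')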
def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url), "html.parser") lis = doc.find_all("li", class_="card card--horizontal") for li in lis: name = get_name(li.find("span", class_="card__name").get_text()) try: mydate = get_date(li.find("span", class_="card__date").get_text()) except AttributeError: mydate = '' try: place = li.find("span", class_="card__place").get_text() except AttributeError: place = "" picture_url = urljoin(_site_url, li.find("img")["src"]) fields = [ {"name": "Date", "value": mydate}, {"name": "Place", "value": place}, {"tag": "picture_url", "value": picture_url}, ] try: entity_url = urljoin(_entity_base_url, li.find("a", class_="card__box")["href"]) doc2 = BeautifulSoup(helpers.fetch_string(entity_url), "html.parser") div = doc2.find("div", class_="article__text") ps = div.find_all("p") header = ps[0].get_text().strip() text = ' '.join([p.get_text().strip() for p in ps[1:]]) fields.append({"name": header, "value": text}) except TypeError: entity_url = '' fields.append({"tag": "url", "value": entity_url}) yield { "_meta": { "id": helpers.make_id(name), "entity_type": "person" }, "fields": fields, "name": name }
def _generate_entities(): """for each scrapable page, yield an entity""" doc = BeautifulSoup(helpers.fetch_string(_base_url[0]), "html.parser") form = doc.find('form', {'name': 'criminalqueryeng_p2'}) tables = form.find_all('table', {'bgcolor': '#84BD00'}) tr = tables[0].find_all('tr') i = 1 while i < len(tr): td = tr[i].find_all('td') name = _get_name(td[2].get_text().strip()) date_filing = _get_date(td[1].get_text().strip()) try: url = td[6].find('a')['href'] except TypeError: url = '' summarized_facts = td[5].get_text().strip() fields = [{"name": "Summarized Facts", "value": summarized_facts}, {"name": "Press Release", "value": url}, {"name": "Date of Complaint Filing", "value": date_filing}] yield { "_meta": { "id": helpers.make_id(name), "entity_type": "company" }, "fields": fields, "name": name, } i += 2 tr = tables[1].find_all('tr') i = 1 while i < len(tr): td = tr[i].find_all('td') name = _get_name(td[4].get_text().strip()) date_filing = _get_date(td[1].get_text().strip()) try: url = td[8].find('a')['href'] except TypeError: url = '' summarized_facts = td[7].get_text().strip() baht = td[9].get_text().strip() section = td[5].get_text().strip() law = td[6].get_text().strip() nomer = td[3].get_text().strip() fields = [{"name": "Summarized Facts", "value": summarized_facts}, {"name": "Press Release", "value": url}, {"name": "Date of Complaint Filing", "value": date_filing}, {"name": "Amount of Fines (Baht)", "value": baht}, {"name": "Section", "value": section}, {"name": "Relevant Law", "value": law}, {"name": "Order Number", "value": nomer}, ] yield { "_meta": { "id": helpers.make_id(name), "entity_type": "company" }, "fields": fields, "name": name, } i += 2 doc = BeautifulSoup(helpers.fetch_string(_base_url[1]), "html.parser") tr = doc.find_all('tr') i = 0 while i < len(tr): try: td=tr[i].find_all('td') name = _get_name(td[1].get_text().strip()) type_personal = td[2].get_text().strip() try: url = td[3].find('a')['href'] except TypeError: url = '' summarized_facts = td[4].get_text().strip() administrative_orders = td[5].get_text().strip() effective_date = td[6].get_text().strip() fields = [{"name": "Type of Personal", "value": type_personal}, {"name": "Press Release", "value": url}, {"name": "Date of Complaint Filing", "value": date_filing}, {"name": "Administrative Orders", "value": administrative_orders}, {"name": "Summarized Facts", "value": summarized_facts}, {"name": "Effective Date", "value": effective_date}, ] my_id = helpers.make_id(name) if len(my_id) > 99: my_id = my_id[:99] yield { "_meta": { "id": my_id, "entity_type": "company" }, "fields": fields, "name": name, } i += 2 except: i += 1 doc = BeautifulSoup(helpers.fetch_string(_base_url[2]), "html.parser") tr = doc.find_all('tr') i = 0 while i < len(tr): try: td=tr[i].find_all('td') name = _get_name(td[3].get_text().strip()) sanction = _get_date(td[1].get_text().strip()) summarized_facts = td[7].get_text().strip() nomer = td[2].get_text().strip() types_business = td[4].get_text().strip() relevant_law = td[5].get_text().strip() section = td[6].get_text().strip() baht = td[10].get_text().strip() fields = [{"name": "Date of Imposing the Administrative Sanction", "value": sanction}, {"name": "Types of Business", "value": types_business}, {"name": "Summarized Facts", "value": summarized_facts}, {"name": "Order Number", "value": nomer}, {"name": "Relevant Law", "value": relevant_law}, {"name": "Section", "value": section}, {"name": "Amount of Fines (Baht)", "value": baht}, ] my_id = helpers.make_id(name) if len(my_id) > 99: 
my_id = my_id[:99] yield { "_meta": { "id": my_id, "entity_type": "company" }, "fields": fields, "name": name, } i += 2 except: i += 1
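Three of these scrapers repeat the same pattern of building an id and truncating it to 99 characters. A small hypothetical wrapper would keep that cap in one place; the limit comes from the code above, the name _make_capped_id does not:

def _make_capped_id(name, limit=99):
    """Build an entity id and enforce the pipeline's assumed length cap."""
    return helpers.make_id(name)[:limit]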
def _get_scrape_urls():
    """Find all scrapable links on the main page and yield an entity for each."""
    doc = BeautifulSoup(helpers.fetch_string(_base_url), "html.parser")
    council = doc.find('div', id='div_938d72bb-8154-4b6a-bd71-8144ca6bf1a0')
    house = doc.find('div', id='div_47793ec9-3449-46a3-9095-f2eb8c475846')
    council_lis = council.find_all("div", class_="link-item")
    house_lis = house.find_all("li", class_="dfwp-item")
    for li in council_lis:
        person = li.find("a")
        link = person["href"]
        name = _get_name(person.get_text())
        office = "Legislative Council"
        entity = _generate_entities(link, name, office)
        yield entity
    for li in house_lis:
        try:
            parish = li.find("div", class_="groupheader").get_text().strip()
        except AttributeError:
            continue
        all_div = li.find_all("div", class_="link-item")
        for div in all_div:
            person = div.find("a")
            link = person["href"]
            name = _get_name(person.get_text())
            office = "House of Keys"
            years_active = None
            entity = _generate_entities(link, name, office, years_active,
                                        parish)
            yield entity
    # Former members are listed in a table on a second page.
    doc = BeautifulSoup(helpers.fetch_string(_base_url2), "html.parser")
    div = doc.find('div', id='div_a1526572-2de9-494b-a410-6fdc17d3b84e')
    trs = div.find_all('tr', class_='ms-itmhover')
    for tr in trs:
        try:
            td = tr.find_all('td')
            name = _get_name(td[1].get_text())
            office = "House of Keys"
            link = urljoin(_site_url, td[3].find('a')['href'])
            years_active = td[2].get_text().strip()
            try:
                # Skip members whose service ended before 1990.
                date = int(years_active.split()[-1])
                if date < 1990:
                    continue
            except ValueError:
                pass
            if '.pdf' in link:
                # PDF profiles can't be scraped; emit a minimal entity.
                fields = [{"tag": "url", "value": link},
                          {"tag": "Years Active", "value": years_active},
                          {"tag": "Office", "value": office}]
                yield {
                    "_meta": {
                        "id": helpers.make_id(name),
                        "entity_type": "person"
                    },
                    "fields": fields,
                    "name": name
                }
                continue
            entity = _generate_entities(link, name, office, years_active)
            yield entity
        except TypeError:
            pass
def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser") section = doc.find('section', id="pageContent") h1 = section.find("h1").get_text().strip() if '(' in h1: h1_without_bracket = h1.split('(')[0] + h1.split(')')[-1] h1_without_bracket = h1_without_bracket.strip() else: h1_without_bracket = h1 names1 = h1_without_bracket.split(',') names2 = [] for name in names1: for new_name in divide_name(name, ' & '): names2.append(new_name) new_names = [] for name in names2: for new_name in divide_name(name, ' and '): new_names.append(new_name) text = section.find("p").get_text().strip() fields = [{ "tag": "url", "value": url }, { "name": "text", "value": text }] custom_fields = section.find_all("h2") for custom_field in custom_fields: field_name = custom_field.get_text().strip() if field_name == 'Defendants': values1 = section.find_all('div', class_="chargeDefendant") values2 = section.find_all('div', class_="chargeCharge") values = zip(values1, values2) field_value = ' '.join([ value[0].get_text().strip() + ' ' + value[1].get_text().strip() for value in values ]) else: field_value = custom_field.find_next_sibling( 'p').get_text().strip() fields.append({"tag": field_name, "value": field_value}) for name in new_names: name = name.strip() yield { "_meta": { "id": helpers.make_id(name), "entity_type": "company" }, "fields": fields, "name": name }