def main():
    """Emit every entity derived from the listing at ``_main_url``.

    The result of ``get_all_persons(_main_url)`` is handed to
    ``get_entities`` and each item it produces is passed to
    ``helpers.emit``.
    """
    persons = get_all_persons(_main_url)
    entities = get_entities(persons)
    for record in entities:
        # Validation through helpers.check(record) is currently disabled.
        helpers.emit(record)
def main():
    """Emit every entity derived from each address listed in ``URLS``.

    For each URL, the result of ``get_all_persons`` is handed to
    ``get_entities`` and each item it produces is passed to
    ``helpers.emit``.
    """
    for source_url in URLS:
        persons = get_all_persons(source_url)
        entities = get_entities(persons)
        for record in entities:
            # Validation through helpers.check(record) is currently disabled.
            helpers.emit(record)
def _generate_entities(data):
    """Emit one "company" entity for every group of three cells in ``data``.

    ``data`` is consumed in consecutive triples:

    * cell 0 -- its text is the release date in ``%m/%d/%Y`` form;
    * cell 1 -- its text is the entity name and its ``<a>`` is the entity URL;
    * cell 2 -- its ``<a>`` links to a page that lists related documents.

    Each entity is passed to ``helpers.emit``; nothing is returned.

    Raises:
        IndexError: if ``len(data)`` is not a multiple of three.
        ValueError: if a date cell does not match ``%m/%d/%Y``.
    """
    for start in range(0, len(data), 3):
        date_cell = data[start]
        name_cell = data[start + 1]
        docs_cell = data[start + 2]

        release_date = datetime.strptime(date_cell.text, '%m/%d/%Y')
        release_date = release_date.strftime('%Y-%m-%d')
        name = name_cell.text

        # The attribute is named "href"; the previous "/href" is not a valid
        # attribute name, so both lookups came back empty and the related
        # documents page was never fetched.
        url = name_cell.find_element_by_tag_name('a').get_attribute("href")
        href = docs_cell.find_element_by_tag_name('a').get_attribute("href")

        related = []
        if href:
            doc = BeautifulSoup(helpers.fetch_string(href), "html.parser")
            for td in doc.find_all("td", class_='ms-vb'):
                anchor = td.find('a')
                # find() returns None for a cell without a link, and an
                # anchor may carry no href.  Subscripting in those cases
                # raises TypeError/KeyError (not AttributeError), so test
                # explicitly and skip such cells.
                if anchor is not None and anchor.has_attr('href'):
                    related.append(anchor['href'])
        # Always computed, so an entity without a documents page gets an
        # empty string rather than a stale or undefined value.
        related_documents = ' '.join(related)

        fields = [
            {"name": "Release date", "value": release_date},
            {"tag": "url", "value": url},
            {"name": "Related documents", "value": related_documents},
        ]

        # Identifiers are capped at 99 characters.
        my_id = helpers.make_id(name)
        if len(my_id) > 99:
            my_id = my_id[:99]

        entity = {
            "_meta": {
                "id": my_id,
                "entity_type": "company"
            },
            "fields": fields,
            "name": name,
        }
        helpers.emit(entity)
def _generate_entities(data):
    """Emit one "company" entity per triple of cells found in ``data``.

    The cells are read three at a time: the first cell's text is a release
    date formatted ``%m/%d/%Y``, the second cell supplies the name (text)
    and the entity URL (its ``<a>``), and the third cell's ``<a>`` points to
    a page whose ``td.ms-vb`` cells link to related documents.

    Every entity is handed to ``helpers.emit``; the function returns nothing.

    Raises:
        IndexError: if ``len(data)`` is not a multiple of three.
        ValueError: if a date cell does not match ``%m/%d/%Y``.
    """
    i = 0
    while i < len(data):
        parsed_date = datetime.strptime(data[i].text, '%m/%d/%Y')
        release_date = parsed_date.strftime('%Y-%m-%d')
        name = data[i + 1].text

        # Look up "href" -- "/href" is not a valid attribute name, so the
        # old lookups never produced a link and no related page was fetched.
        url = data[i + 1].find_element_by_tag_name('a').get_attribute("href")
        href = data[i + 2].find_element_by_tag_name('a').get_attribute("href")

        related = []
        if href:
            doc = BeautifulSoup(helpers.fetch_string(href), "html.parser")
            for td in doc.find_all("td", class_='ms-vb'):
                link = td.find('a')
                # A cell without an anchor yields None and an anchor can
                # lack href; indexing would raise TypeError/KeyError, which
                # the former ``except AttributeError`` did not catch.
                if link is None or not link.has_attr('href'):
                    continue
                related.append(link['href'])
        # Joined outside the ``if`` so the value is defined for every row.
        related_documents = ' '.join(related)

        fields = [
            {"name": "Release date", "value": release_date},
            {"tag": "url", "value": url},
            {"name": "Related documents", "value": related_documents},
        ]
        i += 3

        # Identifiers are capped at 99 characters.
        my_id = helpers.make_id(name)
        if len(my_id) > 99:
            my_id = my_id[:99]

        entity = {
            "_meta": {
                "id": my_id,
                "entity_type": "company"
            },
            "fields": fields,
            "name": name,
        }
        helpers.emit(entity)
def build_document(member):
    """Build a "person" entity from one member element and emit it.

    ``member`` must support CSS ``select`` queries.  The heading link text
    is split by ``role_name_splitter`` into a role (first item) and name
    parts (remaining items).  Years served, party and state are read from
    the member's profile markup.  The entity is passed to ``helpers.emit``;
    nothing is returned.

    Raises:
        IndexError: if an expected element is missing from ``member``.
    """
    role_and_name = role_name_splitter(member.select("h2 a")[0].get_text())
    years_served = member.select("ul.memberServed li")[0].get_text()

    # Query the profile rows once; party is in the third row and state in
    # the first.
    profile_rows = member.select("div.memberProfile table tr")
    party = profile_rows[2].select("td")[0].get_text()
    serve_state = profile_rows[0].select("td")[0].get_text()

    # The id hashes the first name part plus the state, reduced to ASCII
    # letters and digits.  hashlib needs bytes on Python 3, so encode first;
    # the input is pure ASCII here, so the digest is unchanged.
    id_source = re.sub("[^a-zA-Z0-9]", "", role_and_name[1] + serve_state)
    entity_id = hashlib.sha224(id_source.encode("ascii")).hexdigest()

    entity = {
        "_meta": {
            "id": entity_id,
            "entity_type": "person"
        },
        "name": " ".join(role_and_name[1:]).strip(),
        "types": ["pep"],
        "fields": [
            {"name": "Comment", "value": "Member of US Congress"},
            {"name": "Role", "value": role_and_name[0]},
            {"name": "Years Served", "value": years_served},
            {"name": "Political Party", "value": party},
            {"name": "State", "value": serve_state}
        ]
    }
    helpers.emit(entity)
def build_document(member):
    """Assemble and emit the "person" entity for a single member element.

    The heading link text of ``member`` goes through ``role_name_splitter``;
    item 0 of the result is used as the role and the remaining items are
    joined to form the name.  Years served, party and state come from the
    member's profile markup.  The finished entity is handed to
    ``helpers.emit`` and nothing is returned.

    Raises:
        IndexError: if an expected element is missing from ``member``.
    """
    heading_text = member.select("h2 a")[0].get_text()
    role_and_name = role_name_splitter(heading_text)
    years_served = member.select("ul.memberServed li")[0].get_text()

    # One lookup serves both fields: row 2 holds the party, row 0 the state.
    rows = member.select("div.memberProfile table tr")
    party = rows[2].select("td")[0].get_text()
    serve_state = rows[0].select("td")[0].get_text()

    # Only ASCII letters and digits survive the substitution, so encoding
    # to bytes cannot fail and leaves the digest unchanged.  Python 3's
    # hashlib rejects text strings, hence the explicit encode.
    digest_input = re.sub("[^a-zA-Z0-9]", "", role_and_name[1] + serve_state)
    member_id = hashlib.sha224(digest_input.encode("ascii")).hexdigest()

    fields = [
        {"name": "Comment", "value": "Member of US Congress"},
        {"name": "Role", "value": role_and_name[0]},
        {"name": "Years Served", "value": years_served},
        {"name": "Political Party", "value": party},
        {"name": "State", "value": serve_state},
    ]
    entity = {
        "_meta": {
            "id": member_id,
            "entity_type": "person"
        },
        "name": " ".join(role_and_name[1:]).strip(),
        "types": ["pep"],
        "fields": fields,
    }
    helpers.emit(entity)
def emit_meminfo():
    """Run ``free --human`` and emit its output through ``helpers.emit``."""
    command = ['free', '--human']
    # universal_newlines=True makes check_output return text, not bytes.
    report = subprocess.check_output(command, universal_newlines=True)
    helpers.emit(report)
def emit_meminfo():
    """Emit the text printed by the ``free --human`` command.

    The command's output is captured with ``subprocess.check_output`` in
    text mode and handed to ``helpers.emit``.
    """
    memory_report = subprocess.check_output(
        ['free', '--human'],
        universal_newlines=True,
    )
    helpers.emit(memory_report)
def main():
    """Emit every entity derived from the rows at ``_base_url``.

    The result of ``get_rows(_base_url)`` is handed to ``get_entities`` and
    each item it produces is passed to ``helpers.emit``.
    """
    rows = get_rows(_base_url)
    entities = get_entities(rows)
    for record in entities:
        # Validation through helpers.check(record) is currently disabled.
        helpers.emit(record)
def main():
    """Emit every entity derived from the listing at ``_DOMAIN``.

    The result of ``get_all_persons(_DOMAIN)`` is handed to
    ``get_entities`` and each item it produces is passed to
    ``helpers.emit``.
    """
    persons = get_all_persons(_DOMAIN)
    entities = get_entities(persons)
    for record in entities:
        helpers.emit(record)
def main():
    """Emit every entity produced from the parties found at ``_base_url``.

    ``_get_parties(_base_url)`` is evaluated first, its result is handed to
    ``_get_people``, and each item that yields is passed to
    ``helpers.emit``.
    """
    parties = _get_parties(_base_url)
    people = _get_people(parties)
    for person in people:
        # Validation through helpers.check(person) is currently disabled.
        helpers.emit(person)
def main():
    """Emit each entity produced by ``_generate_entities``."""
    entities = _generate_entities()
    for record in entities:
        helpers.emit(record)
def main():
    """Emit every entity derived from the rows of both source URLs.

    ``get_rows`` receives ``_main_url`` and ``_second_url`` together in one
    list; its result is handed to ``get_entities`` and each item that
    produces is passed to ``helpers.emit``.
    """
    source_urls = [_main_url, _second_url]
    rows = get_rows(source_urls)
    for record in get_entities(rows):
        helpers.emit(record)