def find_daily_links(soup, last_seen_ref):
    '''
    Get the list of daily links; the current and previous month of links
    should be available. Older ones are available in zip format by month.
    '''
    turbotlib.log('Getting daily links')
    re_target_pattern = re.compile(r'.*FAC(?P<date>[0-9]{4}).HTML', re.IGNORECASE)
    re_year_pattern = re.compile(r'[a-z]+ ([0-9]{4})$', re.IGNORECASE)
    day_links = []
    for table in soup.find_all('table', {'class': 'telerik-reTable-1'}):
        table_header = table.find_next('td')
        try:
            # A hack to get the year from the table header, since it looks
            # like they reuse the same URLs every year
            year = re.match(re_year_pattern, table_header.string).groups()[0]
        except AttributeError:
            # This must be the day-of-week table
            continue
        for link in table.find_all('a'):
            href = link.get('href')
            interesting_link = re.match(re_target_pattern, href)
            if interesting_link:
                date_ref = year + interesting_link.groups()[0]
                if date_ref > last_seen_ref:
                    day_links.append((date_ref, href))
    turbotlib.log('There are %s links to check' % len(day_links))
    return day_links
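# A note on the date_ref > last_seen_ref test above: date_ref is a
# fixed-width, zero-padded 'YYYYMMDD' string, so plain lexicographic
# comparison orders dates correctly, and the '00000000' sentinel used
# for a fresh run (see the script body further down) sorts before any
# real date. Illustrative check:
assert '20150102' > '20141231'
assert '20140101' > '00000000'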
def retrieve(url, method, data, attempt=1):
    response = None
    connection_exception = False
    headers = {
        "X-MicrosoftAjax": "Delta=true",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Accept": "*/*",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache"
    }
    try:
        req = requests.Request(method, url, data=data, headers=headers)
        prepared = req.prepare()
        # Relies on a module-level requests.Session named `session`
        response = session.send(prepared)
    except requests.exceptions.RequestException:
        connection_exception = True
    # Retry up to five times with a linear back-off (5s, 10s, 15s, ...)
    if (connection_exception or response.status_code != requests.codes.ok) and attempt <= 5:
        turbotlib.log("There was a failure reaching or understanding the host, "
                      "waiting and retrying...")
        if response is not None and response.text is not None:
            turbotlib.log("Failure was: " + response.text)
        time.sleep(attempt * 5)
        return retrieve(url, method, data, attempt + 1)
    return response
def get_soup(url, session=None):
    turbotlib.log('Fetching %s' % url)
    if not session:
        session = requests.Session()
    response = session.get(url)
    html = response.content
    return BeautifulSoup(html)
def get_registered_individuals(url, control_href, view_state, firm_jurisdiction, firm_name):
    return_array = []
    turbotlib.log("Retrieving individuals for current or historical firm: "
                  + firm_name + " in: " + firm_jurisdiction)
    control_id = urllib.quote(
        control_href.replace("javascript:__doPostBack('", '').replace("','')", ''))
    individuals_page_req = retrieve(url, "POST", generate_body_control(control_id, view_state))
    if "Your search returned no records, please try searching again" in individuals_page_req.text:
        return []
    num_individuals = get_record_count(individuals_page_req.text)
    processed_individuals = 0
    last_processed_individuals = 0
    ind_page = 1
    while True:
        individuals_view_state = {
            'view': urllib.quote(get_asp_resp_var(individuals_page_req.text, "__VIEWSTATE")),
            'validation': urllib.quote(get_asp_resp_var(individuals_page_req.text, "__EVENTVALIDATION")),
            'generator': urllib.quote(get_asp_resp_var(individuals_page_req.text, "__VIEWSTATEGENERATOR"))
        }
        individual_links = BeautifulSoup(individuals_page_req.text).select('tr > td > a')
        for link in individual_links:
            try:
                if "lbtnIndDetail" not in link['href']:
                    continue
            except KeyError:  # anchor without an href attribute
                continue
            processed_individuals += 1
            name = link.text.strip()
            individual_dict = get_individual(name, firm_jurisdiction, firm_name)
            if individual_dict is None:
                get_and_store_individuals_for_firm(link, url, individuals_view_state, name)
                individual_dict = get_individual(name, firm_jurisdiction, firm_name)
                if individual_dict is not None:
                    return_array.append(individual_dict)
            else:
                return_array.append(individual_dict)
        if processed_individuals < num_individuals:
            if last_processed_individuals == processed_individuals:
                turbotlib.log('Warning: broke out of possible infinite loop '
                              'trying to retrieve all individuals for firm.')
                break
            ind_page += 1
            control_id = urllib.quote('ctl00$bodyContent$lbtnPager{0}'.format(ind_page))
            individuals_page_req = retrieve(
                url, "POST", generate_body_control(control_id, individuals_view_state))
            last_processed_individuals = processed_individuals
        else:
            break
    return return_array
def unwrap(response, identifier):
    try:
        r1 = getChunk(response, "<fragment><![CDATA[", identifier)
    except Exception:
        turbotlib.log("Didn't find fragment start")
        return None
    try:
        r2 = getChunk(r1, "]]></fragment>", identifier)
        return r2
    except Exception:
        turbotlib.log("Didn't find fragment end")
        return None
def date_formatter(date):
    # Check for None before calling len(), otherwise this raises TypeError
    if date is None or len(date) == 0:
        return None
    try:
        time = datetime.strptime(date, "%B %d, %Y").isoformat()[:-9]
        if len(time) <= 1:
            turbotlib.log("Failure parsing date: " + date)
            return None
        return time
    except ValueError:
        turbotlib.log("Failure parsing date: " + date)
        return None
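# For reference, date_formatter works like this: strptime yields a
# datetime, isoformat() gives e.g. "2014-03-05T00:00:00", and [:-9]
# drops the constant "T00:00:00" suffix, leaving just the date part.
print(date_formatter("March 5, 2014"))  # "2014-03-05"
print(date_formatter("not a date"))     # None (and logs the failure)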
def run_scraper():
    "Initialize and run"
    links = CompanyLinks("%s%s%s" % ("https://www.og.decc.gov.uk/",
                                     "eng/fox/decc/PED301X/",
                                     "companyBlocksNav"))
    for i, j in enumerate(links.get_clean_links()):
        info = CompanyInfo(j)
        try:
            turbotlib.log("progress: %s" % i)
            for record in info.scrape():
                print json.dumps(record)
        except AttributeError as e:
            ## Uncomment the following to see the empty entries
            ## that cause failures
            #print "Fails at %s as %s" % (i, e)
            pass
def main():
    """
    Scrape licensed mortgage lenders data from extranet.dfi.in.gov
    """
    turbotlib.log("Starting run...")  # Optional debug logging
    source_url = 'http://extranet.dfi.in.gov/dfidb/mortgage.aspx'
    r = requests.get(source_url)
    etree = lxml.html.fromstring(r.content)
    column_names = get_column_names(etree)
    assert len(column_names) == 9, 'Number of columns has changed on site'
    sample_date = datetime.datetime.now().isoformat()
    for column_data in yield_row_data(etree):
        collected_data = dict(zip(column_names, column_data))
        collected_data['source_url'] = source_url
        collected_data['sample_date'] = sample_date
        print json.dumps(collected_data)
    turbotlib.log("Run finished")
def parse_table(url):
    turbotlib.log("Parse " + url)
    # Load the page
    doc = BeautifulSoup(requests.get(HOST + url).content)
    # Find the right table for the data and the one for the links
    target_main = None
    for main in doc.find_all('td', class_='maincontent'):
        if len(main.find_all('table')):
            target_main = main
    target_tables = target_main.find_all('table')
    target_table = target_tables[0]
    links_table = target_tables[1]
    # Parse the data, skipping the header and footer rows
    for tr in target_table.find_all('tr')[1:-3]:
        tds = tr.find_all('td')
        # One field per column, indices 0-8 across the nine columns.
        # (The original reused tds[2] for both the owner and type/serial
        # fields and stopped at tds[7], which looks like an off-by-one.)
        record = {
            'Reg_and_C_of_R_Number': clean_up(tds[0].text),
            'Date_of_Issue_of_C_of_R': clean_up(tds[1].text),
            'Name_of_Owner_and_Address': clean_up(tds[2].text),
            'type_and_serial_number': clean_up(tds[3].text),
            'year_of_manufacture': clean_up(tds[4].text),
            'all_up_mass_LBS_KGS': clean_up(tds[5].text),
            'category': clean_up(tds[6].text),
            'engine_type': clean_up(tds[7].text),
            'certificate_expiry_date': clean_up(tds[8].text),
            'sample_date': datetime.datetime.now().isoformat(),
            'source_url': TARGET
        }
        print(json.dumps(record))
    # Follow the 'Next' pagination link, if any
    for td in links_table.find_all('td'):
        if clean_up(td.text) == 'Next':
            next_page = td.find('a', href=True)['href']
            parse_table(next_page)
def process_pages(url):
    # Attempt to resume if we can
    try:
        page_number = turbotlib.get_var("page")
        record_count = turbotlib.get_var("check_count")
    except KeyError:
        page_number = 1
        record_count = None
    if page_number > 1:
        turbotlib.log("Resuming run from page {0}".format(page_number))
        # Replay the records already collected on previous runs
        with open('%s/records.dump' % turbotlib.data_dir(), "r") as dump:
            for record in dump:
                print record
    # Iterate over the whole or remaining data set, 100 rows per page
    while record_count is None or (page_number * 100 - 100) < record_count:
        turbotlib.log("Requesting rows %d - %d" % ((page_number * 100 - 100),
                                                   (page_number * 100)))
        # Strange behaviour on the server: the first call returns page 1
        # results, but the page number must be > 1 to avoid a null response.
        # Not a problem, and subsequent calls work as expected.
        response_text = process_page(url, 2 if page_number == 1 else page_number)
        # Ensure the number of records hasn't changed during the run
        check_count = get_record_count(response_text)
        turbotlib.save_var("check_count", check_count)
        if record_count is not None and record_count != check_count:
            reset_state()
            raise Exception("The data set changed during parsing, we need a re-run.")
        else:
            record_count = check_count
        if not record_count > 0:
            raise Exception("The data set is empty.")
        page_number += 1
        turbotlib.save_var("page", page_number)
    turbotlib.log("Run finished!")
    reset_state()
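# reset_state() is not shown in this excerpt. A minimal sketch of what it
# presumably does - rewinding the saved resume markers so the next run
# starts from scratch (hypothetical implementation):
def reset_state():
    turbotlib.save_var("page", 1)            # restart from the first page
    turbotlib.save_var("check_count", None)  # force a fresh record count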
    {'type': 'Exchange broker', 'value': 'corredores'}
]

#FUNCTIONS

#retrieve a document at a given URL as a parsed html tree
def get_doc(source_url, extra_parameters={}):
    post_value = {"Pagina": "1"}
    #need to override post values on certain pages to avoid automatic redirection
    post_value.update(extra_parameters)
    response = requests.post(source_url, post_value)
    html = response.content
    doc = BeautifulSoup(html)
    return doc

#get going
sample_date = unicode(date.today())
turbotlib.log("Starting run on " + sample_date)  # Optional debug logging

#****
#SECTION TWO: FINANCIAL INSTITUTIONS - COLLATE LIST OF ENTITIES AND THEN PROCESS DETAILS OF EACH ENTITY
turbotlib.log("")
turbotlib.log("**** FINANCIAL INSTITUTIONS ****")
turbotlib.log("")

#scrape list of financial institutions to look at
financial_institutions = []  #list to store the ones we find
for list_url in institution_urls:
    turbotlib.log("Loading list of " + list_url['type_of_institution'] + "s")
    financial_institution_list_page = get_doc(list_url['url'])
    financial_institution_list = financial_institution_list_page.find("table", attrs={"class": "Tabla_Borde"})
# -*- coding: utf-8 -*-
import json
import datetime
import turbotlib
import requests
import lxml.html

BASE_URL = "http://license.reg.state.ma.us/public/licque.asp?query=business&color=&board="
SEARCH_URL = "http://license.reg.state.ma.us/public/pubLicRange.asp?profession=%s&busname=_&buscity=&querytype=business"
URL_BASE = "http://license.reg.state.ma.us/public/"

turbotlib.log("Starting run...")

# Start a requests session
s = requests.session()


def get_business_types(url):
    """Gets the available business types in the search form. Returns a list."""
    response = s.get(url)
    root = lxml.html.fromstring(response.text)
    options = root.xpath("//select[@name='profession']/option")
    return [option.text.strip() for option in options]


def parse_business_licenses(html):
    root = lxml.html.fromstring(html)
    trs = root.xpath("//table[@id='tableresults']/tbody/tr")
    for tr in trs:
# -*- coding: utf-8 -*-
import json
import datetime
import turbotlib
import requests
import re
from bs4 import BeautifulSoup

turbotlib.log("Starting run...")  # Optional debug logging

HOST = "http://www.tcaa.go.tz/"
TARGET = "aircraft_register.php"


def clean_up(string):
    # Collapse runs of spaces and turn CR/LF line breaks into spaces
    return re.sub(' +', ' ', string.replace('\r\n', ' ')).strip()


def parse_table(url):
    turbotlib.log("Parse " + url)
    # Load the page
    doc = BeautifulSoup(requests.get(HOST + url).content)
    # Find the right table for the data and the one for the links
    target_main = None
    for main in doc.find_all('td', class_='maincontent'):
        if len(main.find_all('table')):
            target_main = main
    target_tables = target_main.find_all('table')
# -*- coding: utf-8 -*-
import codecs
import datetime
import json
import requests
import turbotlib
from bs4 import BeautifulSoup

turbotlib.log('Starting run...')


def get_soup(url, session=None):
    if not session:
        session = requests.Session()
    turbotlib.log('Getting soup for %s' % url)
    response = session.get(url)
    html = response.content
    return BeautifulSoup(html)

session = requests.Session()
sample_date = str(datetime.date.today())
base_url = 'http://www.knf.gov.pl'

# This is the first page of roughly 20 paginated tables.
# Obscure the url from search engines since the code may end up on GitHub.
target_url = codecs.decode('uggc://jjj.xas.tbi.cy/cbqzvbgl/svaqVaQrgnvy.npgvba?pglcr=Onaxv+fc%P3%O3%P5%82qmvrypmr&nwnk=gehr&enaqbz=0.20882477751001716&co.fgneg=0', 'rot_13')

while target_url:
    soup = get_soup(target_url, session)
    for tr in soup.find_all('tr')[1:]:
        data = {
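# The rot13 obscuring above is symmetric, so the same codec recovers the
# real URL at runtime while keeping it out of plain-text search results.
# A quick illustration (example URL, not one from this scraper):
import codecs
obscured = codecs.encode('http://example.com/register', 'rot_13')
print(obscured)                           # uggc://rknzcyr.pbz/ertvfgre
print(codecs.decode(obscured, 'rot_13'))  # round-trips to the original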
            td_index += 1
            if (len(item) > 0):
                items.append(item)
        except:
            pass
    return items

#urls to use
base_href = "http://www.cnvmr.ro/asf/registru/"
front_url = base_href + "lista.php?listasect=1&lng=2"

#get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date)  # Optional debug logging

#Step 1: extract list of categories from front page
try:
    categories = []  #store the list as we find them
    front_page = get_doc(front_url)
    category_list = front_page.find("table", id="listaEntitati")
    category_rows = category_list.find_all("tr")
    current_category = None  #maintain link to current category
    for row in category_rows:
        td_list = row.find_all("td")
        #deal only with non-empty rows
        if (len(td_list) > 0):
                #record anything with a non-blank value
                td_text = td.text.strip().replace("\n", " ").replace("\t", "")
                if (len(td_text) > 0):
                    td_index = td_list.index(td)
                    header = headers[td_index]
                    record[header] = td_text.strip().replace("\n", " ").replace("\t", "")
            #check we found something
            if (len(record) > 3):
                word_records.append(record)
    return word_records

#get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date)  # Optional debug logging

#first load the excel files
for source in sources:
    turbotlib.log("Loading file " + str(sources.index(source) + 1) + "/" + str(len(sources)))
    source_page = get_doc(source['url'])
    source_links = source_page.find_all("a")
    for link in source_links:
        #find the link on the page which leads to the right file
        if (source['file'] == "excel"):
            if ('.xls' in link['href']):
                #can't use formatting information for more recent file formats in xlrd
                if (('.xlsx' in link['href']) or ('.xlsb' in link['href'])):
                    formatting = False
                else:
                    formatting = True
def parse_entity(entity_id):
    turbotlib.log("Parsing entity " + entity_id)
    try:
        entity_page = get_doc(detail_url + entity_id)

        #create object to store output
        output = {
            'sample_date': sample_date,
            'source_url': detail_url + entity_id,
            'source': "Securities Commission, Malaysia"
        }
        added_info = False

        #now get general info
        name = entity_page.find(id="StdPageLayout1_lblLicenceHolder").text.strip()
        if (len(name) > 0):
            output['name'] = name
            added_info = True
        licence_number = entity_page.find(id="StdPageLayout1_lblLicenceNo").text.strip()
        if (len(licence_number) > 0):
            output['licence_number'] = licence_number
            added_info = True
        regulated_activity_list = entity_page.find(id="StdPageLayout1_lblRegulatedAct")
        regulated_activity = []
        for item in regulated_activity_list.stripped_strings:
            activity = item.replace("  -", "").strip()
            regulated_activity.append(activity)
        if (len(regulated_activity) > 0):
            output['regulated_activities'] = regulated_activity
            added_info = True
        start_date = entity_page.find(id="StdPageLayout1_lblLicenceSince").text.strip()
        if (len(start_date) > 0):
            output['start_date'] = start_date
            added_info = True
        anniversary_date = entity_page.find(id="StdPageLayout1_lblAnniversaryDate").text.strip()
        if (len(anniversary_date) > 0):
            output['anniversary_date'] = anniversary_date
            added_info = True
        status = entity_page.find(id="StdPageLayout1_lblStatus").text.strip()
        if (len(status) > 0):
            output['status'] = status
            added_info = True
        licensed_reps = entity_page.find(id="StdPageLayout1_lblNoOfLicenceRep").text.strip()
        if (len(licensed_reps) > 0):
            output['number_of_licensed_representatives'] = licensed_reps
            added_info = True

        #then go through the specific tabs. first up is licences
        licences = []
        licence_table = entity_page.find(id="tabs-1").table
        licence_rows = licence_table.find_all("tr")
        for tr in licence_rows[1:]:
            td_list = tr.find_all("td")
            licence = {}  #make an object, then see if we need to add it to the list
            #first off, licence number
            number = td_list[0].text.strip()
            if (len(number) > 0):
                licence['number'] = number
            #second cell: list of activities
            activities = []
            for item in td_list[1].stripped_strings:
                activity_string = item.replace("  -", "").replace(u"•", "").strip()
                if (len(item) > 0):
                    activities.append(activity_string)
            if (len(activities) > 0):
                licence['activities'] = activities
            #third cell: anniversary date
            anniversary_date = td_list[2].text.strip()
            if (len(anniversary_date) > 0):
                licence['anniversary_date'] = anniversary_date
            #fourth cell: status
            status = td_list[3].text.strip()
            if (len(status) > 0):
                licence['status'] = status
            #now add to the result
            if (len(licence) > 0):
                licences.append(licence)
        #now append to the list
        if (len(licences) > 0):
            output['licences'] = licences
            added_info = True

        #second tab: associate persons
        associates = []
        associates_table = entity_page.find(id="tabs-3").table
        associates_rows = associates_table.find_all("tr")
        for tr in associates_rows[1:]:
            associate = {}
            td_list = tr.find_all("td")
            #first cell = name, second cell = designation, third cell = sub-designation
            name = td_list[0].text.strip()
            if (len(name) > 0):
                associate['name'] = name
            designation = td_list[1].text.strip()
            if (len(designation) > 0):
                associate['designation'] = designation
            sub_designation = td_list[2].text.strip()
            if (len(sub_designation) > 0):
                associate['subdesignation'] = sub_designation
            if (len(associate) > 0):
                associates.append(associate)
        if (len(associates) > 0):
            output['associate_persons'] = associates
            added_info = True

        #third tab: business address
        address_table = entity_page.find(id="tabs-4").table
        address_rows = address_table.find_all("tr")
        for tr in address_rows[1:]:
            td_list = tr.find_all("td")
            label = td_list[0].text.strip().lower().replace(" ", "_")
            #address is different - need to extract it line by line
            if (label == "address"):
                address_lines = []
                for line in td_list[1].stripped_strings:
                    address_lines.append(line)
                address = ", ".join(address_lines)
                if (len(address) > 0):
                    output['address'] = address
                    added_info = True
            else:
                value = td_list[1].text.strip()
                if ((len(value) > 0) and (len(label) > 0)):
                    output[label] = value
                    added_info = True

        #fourth tab: name changes
        name_change_table = entity_page.find(id="tabs-5").table
        name_change_rows = name_change_table.find_all("tr")
        name_changes = []
        for tr in name_change_rows[1:]:
            td_list = tr.find_all("td")
            if (len(td_list) == 2):
                effective_date = td_list[0].text.strip()
                previous_name = td_list[1].text.strip()
                if ((len(previous_name) > 0) and (len(effective_date) > 0)):
                    name_change = {
                        'previous_name': previous_name,
                        'effective_date': effective_date
                    }
                    name_changes.append(name_change)
        if (len(name_changes) > 0):
            output['previous_names'] = name_changes
            added_info = True

        #fifth tab: licensed reps
        reps_table = entity_page.find(id="tabs-6").table
        reps_rows = reps_table.find_all("tr")
        reps = []
        for tr in reps_rows[1:]:
            rep = {}
            td_list = tr.find_all("td")
            name = td_list[0].text.strip()
            if (len(name) > 0):
                rep['name'] = name
            licence_number = td_list[1].text.strip()
            if (len(licence_number) > 0):
                rep['licence_number'] = licence_number
            #extra info in the hyperlink on the name
            if (td_list[0].a != None):
                href = td_list[0].a['href']
                licence_id_start = href.find("=")
                licence_id_end = href.find("&")
                licence_id = href[licence_id_start + 1:licence_id_end]
                if (len(licence_id) > 0):
                    rep['licence_id'] = licence_id
                    rep['detail_url'] = "http://ers.seccom.com.my/public/LicenceGeneralInfo.aspx?LicenceID=" + licence_id
            if (len(rep) > 0):
                reps.append(rep)
        if (len(reps) > 0):
            output['licensed_representatives'] = reps
            added_info = True

        #got to the end, save the results
        if (added_info):
            print(json.dumps(output))
    except:
        pass
def openPage(browser, visit, controls, identifier, parse):
    browser.form.set_all_readonly(False)
    # Remove controls whose requested value is None
    for cname in controls:
        c = controls[cname]
        if c is None:
            for control in browser.form.controls:
                if control.name == cname:
                    browser.form.controls.remove(control)
    # Set or create the remaining controls
    for cname in controls:
        c = controls[cname]
        if c is not None:
            exists = True
            try:
                find_ctrl = browser.form.find_control(cname)
            except mechanize._form.ControlNotFoundError:
                exists = False
            # Note: the original used `is -1` / `is not -1`, which only
            # worked by CPython small-int interning; use == / != instead
            if exists is True and cname.find('bnConnectionTemplate:r1:0:s11:selectedStatuses') == -1:
                try:
                    find_ctrl.value = c
                except Exception:
                    turbotlib.log("did not find form input " + cname)
            else:
                if cname.find('bnConnectionTemplate:r1:0:s11:selectedStatuses') != -1:
                    browser.form.new_control('hidden',
                                             'bnConnectionTemplate:r1:0:s11:selectedStatuses',
                                             {'value': c, 'checked': True})
                else:
                    browser.form.new_control('hidden', cname, {'value': c})
    browser.form.fixup()
    turbotlib.log("Starting Request...")
    try:
        if visit is True:
            response = browser.open(browser.form.click(), timeout=60)
        else:
            response = browser.open_novisit(browser.form.click(), timeout=60)
        response_content = response.read()
    except:
        # bankType comes from the surrounding script
        turbotlib.log("Bad Request. Starting over " + bankType + " Category")
        return False
    response.close()
    turbotlib.log("Response Received...")
    if parse is True:
        html = unwrap(response_content, identifier)
        if html is not None:
            result = parseResult(html)
            return result
        else:
            return None
    return None
# -*- coding: utf-8 -*-
import turbotlib

from scrape_banks import scrape_banks
from scrape_foreign import scrape_foreign
from scrape_imf import scrape_imf
from scrape_revoked import scrape_revoked

turbotlib.log("Starting run")

turbotlib.log("\nScraping banks:")
scrape_banks()

turbotlib.log("\nScraping IMFs:")
scrape_imf()

turbotlib.log("\nScraping foreign banks:")
scrape_foreign()

turbotlib.log("\nScraping revoked banks:")
scrape_revoked()
            #make an object and add to the list
            result = {
                'name': item_name,
                'idx': item_idx
            }
            results.append(result)
        except:
            continue
    return results

#get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date)  # Optional debug logging

#Base URLs we will want to use
category_url = "http://www.fi.se/Folder-EN/Startpage/Register/Company-register/Company-register-Company-per-category/?typ='"  #need to finish with category and "'"
company_url = "http://www.fi.se/Folder-EN/Startpage/Register/Company-register/Company-register-Details/?idx="  #need to finish with idx number
company_se_url = "http://www.fi.se/Register/Foretagsregistret/Foretagsregistret-Detaljerad-information/?idx="  #need to finish with idx number
overseas_permissions_url = "http://www.fi.se/Register/Foretagsregistret/Foretagsregistret-Gransoverskridande-handel/?idx="  #need to finish with idx number

#keep track of progress
count = 1

#These are the categories of institution - use these to populate category urls and find the relevant entries
categories = [
    "BANK++",  #Banking companies (limited liability company)
    "FILB++",  #Foreign branches of Swedish chartered banks
    "MBANK+",  #Members-bank
{'url': "http://www.garfin.org/agents.html", 'category': "Insurance agent"}, {'url': "http://www.garfin.org/salespersons.html", 'category': "Insurance salesperson"}, {'url': "http://www.garfin.org/moneyservices.html", 'category': "Money services"} ] #FUNCTIONS #retrieve a document at a given URL as parsed html tree def get_doc(source_url): response = requests.get(source_url) html = response.content doc = BeautifulSoup(html) return doc #get going sample_date = str(date.today()) turbotlib.log("Starting run on " + sample_date) # Optional debug logging #go through the pages one by one for source in source_urls: #monitor progress count = source_urls.index(source) + 1 turbotlib.log("Parsing category " + str(count) + "/" + str(len(source_urls))) #load page source_page = get_doc(source['url']) entities = source_page.find(attrs={"class": "deptContent"}).ol.find_all("li") #now go through the names one by one for entity in entities: #make object to store result temporarily output = {
{'url': "http://www.nbs.rs/static/nbs_site/gen/english/60/60b4_en.htm", 'category': "Agency or outlet whose operating licence has been revoked", 'basehref': None}, {'url': "http://www.nbs.rs/static/nbs_site/gen/english/60/tagencije.htm", 'category': "Legal entity in charge of insurance agency and brokerage pursuant to special law", 'basehref': None} ] #translation countries = { 'MOSKVA': "Russia", 'PODGORICA': "Montenegro", 'SOUTH DAKOTA': "South Dakota", 'FRANKFURT/MAIN': "Germany", 'SKOPJE': "Republic of Macedonia" } #get going sample_date = str(date.today()) turbotlib.log("Starting run on " + sample_date) # Optional debug logging #Step 1: load page for each category to identify who needs looking at for entity_list in entity_lists: #monitor progress list_count = entity_lists.index(entity_list) + 1 turbotlib.log("Starting category " + str(list_count) + "/" + str(len(entity_lists))) #load page list_page = get_doc(entity_list['url']) list_table = list_page.table #first off, if this category doesn't have links on its front page, there's nothing more to do if (entity_list['basehref'] == None): #deal with one weird template first if (entity_list['category'] == "Legal entity in charge of insurance agency and brokerage pursuant to special law"):
    return name.strip()


def parse_governors(cell):
    governors = dict()
    for p in cell("p"):
        title_and_name = detag(p).split(':', 1)
        #print(title_and_name)  # debug output would pollute the JSON stream
        governors[title_and_name[0].strip()] = title_and_name[1].strip()
    return governors

# The printer-friendly version of the page is much easier to parse
source_url = "http://nbt.tj/en/banking_system/credit_org.php?print=Y"
sample_date = str(date.today())

turbotlib.log("Starting scrape...")  # Optional debug logging

response = requests.get(source_url)
html = response.content
doc = BeautifulSoup(html, "lxml")
tables = [table for table in doc.table.table("table")]
rows = [tr for table in tables for tr in table("tr")][1:]  # skip the header

institution_type = "Bank"
for row in rows:
    cells = row("td")
    if len(cells) == 2:  # `is 2` in the original relied on int interning
        institution_type = cells[1].find(BOLD_RE).string
        continue
    # cells[0] just contains a line number, skipping
# -*- coding: utf-8 -*-
import codecs
import datetime
import json
import re
import requests
import turbotlib

turbotlib.log('Starting run...')

# Obscure the url from search engines since the code may end up on GitHub
source_url = codecs.decode('uggc://jjj.nre.pn/qngn/pbajryy/PbaJryy.gkg', 'rot_13')
response = requests.get(source_url, timeout=20)

# This is a plain text file with no delimiters at all. Luckily it has a
# semi-header made of runs of equals signs, one run per column, which we
# can use to work out the fixed column offsets.
column_starts = []
column_names = (
    'Well Location',
    'Licence Number',
    'Licensee Code and Name',
    'Confidential Type',
    'Conf. Below Frmtn',
    'Conf. Release Date',
)

if response.status_code == 200:
    turbotlib.log('Valid response received...')
    for line in response.iter_lines():
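# The rest of the loop is truncated above. A minimal sketch of how the
# '=' ruler line could be turned into the column_starts offsets
# (hypothetical helper; assumes one run of '=' per column, separated by
# spaces, as described in the comment above):
def find_column_starts(ruler_line):
    return [m.start() for m in re.finditer(r'=+', ruler_line)]

# e.g. find_column_starts('======  ===  ========') returns [0, 8, 13];
# each data line can then be sliced between consecutive offsets.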
# -*- coding: utf-8 -*-
import json
import datetime
import re

import turbotlib
from bs4 import BeautifulSoup
import requests

turbotlib.log("Starting run...")


class Entry(object):
    def __init__(self):
        self.sample_date = str(datetime.date.today())
        self.source_url = ''
        self.name = ''
        self.tel = ''
        self.fax = ''
        self.telex = ''
        self.swift = ''
        self.box = ''
        self.reuters = ''
        self.url = ''
        self.ceo_tel = ''
        self.ceo_fax = ''
        self.country = ''

patterns = (
#get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date)  # Optional debug logging

#Step 1: make a list of letter pairs to iterate through, and work out which entities are in the database
entities = []  #store results so we don't do them twice
letter_pairs = []  #store what we're iterating over
for first_letter in string.lowercase:
    for second_letter in string.lowercase:
        letter_pair = first_letter + second_letter
        turbotlib.log("Now searching for pair '" + letter_pair + "'")
        try:
            search_page = get_doc(search_url + letter_pair)
            rows = search_page.find_all(attrs={"class": "stdrow"}) + search_page.find_all(attrs={"class": "stdrow-1"})
            for row in rows:
                td_list = row.find_all("td")
                for td in td_list:
def parse_page(layout, config=None):
    xset, yset = set(), set()
    tlines = []
    objstack = list(reversed(layout._objs))
    while objstack:
        b = objstack.pop()
        if type(b) in [LTFigure, LTTextBox, LTTextLine, LTTextBoxHorizontal]:
            # put contents of aggregate object into the stack
            objstack.extend(reversed(b._objs))
        elif type(b) == LTTextLineHorizontal:
            tlines.append(b)
        elif type(b) in [LTLine]:
            if b.x0 == b.x1:
                xset.add(b.x0)
            elif b.y0 == b.y1:
                yset.add(b.y0)
            else:
                print "sloped line", b
        elif type(b) in [LTRect]:
            # Thin rects stand in for ruling lines: narrow ones are vertical
            # rulings (an x coordinate), wide ones horizontal (a y coordinate).
            # The original added b.y0/b.x0 to the opposite sets, which looks
            # like a transposition bug.
            if b.x1 - b.x0 < 2.0:
                xset.add(b.x0)
            else:
                yset.add(b.y0)
        elif type(b) == LTImage:
            continue
        else:
            turbotlib.log('Unrecognized type: %s' % type(b))
    xlist = sorted(list(xset))
    ylist = sorted(list(yset))
    # initialize the output array of table text boxes
    boxes = [[[] for xl in xlist] for yl in ylist]
    for lt in tlines:
        y = (lt.y0 + lt.y1) / 2
        iy = Wposition(ylist, y)
        previx = None
        for lct in lt:
            if type(lct) == LTAnno:
                continue  # a junk element in LTTextLineHorizontal
            x = (lct.x0 + lct.x1) / 2
            ix = Wposition(xlist, x)
            if previx != ix:
                boxes[iy][ix].append([])  # begin new chain of characters
                previx = ix
            boxes[iy][ix][-1].append(lct.get_text())
    for iy in range(len(ylist)):
        for ix in range(len(xlist)):
            boxes[iy][ix] = ["".join(s) for s in boxes[iy][ix]]
    if 'remove' in config:
        del boxes[config['remove']:]
    # pdfminer y-coordinates grow upward, so the last row is the top of the
    # page, i.e. the header row
    headers = ["".join(lh.strip() for lh in h).strip() for h in boxes.pop()]
    try:
        assert headers == config['headers']
    except AssertionError:
        turbotlib.log('Headers: %s' % headers)
        turbotlib.log('Headers (config): %s' % config['headers'])
    # merge entries where needed
    if config['merge']:
        name_column_index = headers.index(config['name_column_name'])
        unique_column_index = headers.index(config['unique_column_name'])
        for i, entry in enumerate(boxes):
            if headers[name_column_index + 1] == '' and entry[name_column_index + 1]:
                boxes[i][name_column_index][1:1] = boxes[i][name_column_index + 1]
        for i, entry in enumerate(boxes):
            if (len(entry[unique_column_index]) == 0
                    or entry[unique_column_index][0].strip() == '') and boxes[i + 1]:
                if (len(entry[name_column_index]) > 0
                        and entry[name_column_index][0] != config['total_title']):
                    boxes[i + 1][name_column_index].extend(entry[name_column_index])
                for idx in config['merge_indexes']:
                    if len(entry[idx]) > 0:
                        boxes[i + 1][idx].extend(entry[idx])
    box_list = []
    for row in boxes:
        if (row[0] != ''):
            box_list.append(dict(zip(headers, ["".join(s) for s in row])))
    return box_list
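# Wposition is not defined in this excerpt; from the call sites it maps a
# text midpoint coordinate onto the index of the enclosing cell in the
# sorted list of ruling-line coordinates. A sketch under that assumption
# (hypothetical implementation):
import bisect

def Wposition(sorted_coords, value):
    # number of rulings at or below `value`, minus one -> cell index
    return bisect.bisect(sorted_coords, value) - 1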
turbotlib.log("Response Received...") if parse is True: html = unwrap(response_content, identifier) if html is not None: result = parseResult(html) return result else: #print response_content return None return None ########################### turbotlib.log("Starting run...") # Optional debug logging country = "Australia" # First, get the list of categories from the form search dropdown user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers = { 'User-Agent' : user_agent, 'Accept' : "*/*" } start_url = "https://connectonline.asic.gov.au/RegistrySearch/faces/landing/ProfessionalRegisters.jspx" url_domain = 'https://connectonline.asic.gov.au' browser = mechanize.Browser() browser.set_handle_robots(False) # ignore robots
# -*- coding: utf-8 -*-
import json
import datetime
import turbotlib

turbotlib.log("Starting run...")  # Optional debug logging

for n in range(0, 20):
    data = {"number": n,
            "message": "Hello %s" % n,
            "sample_date": datetime.datetime.now().isoformat(),
            "source_url": "http://somewhere.com/%s" % n}
    # The Turbot specification simply requires us to output lines of JSON
    print json.dumps(data)
def parse_excel(excel_content, formatting, source_url, category):
    #load in the document
    document = xlrd.open_workbook(file_contents=excel_content, formatting_info=formatting)
    excel_records = []
    for sheet_num in xrange(0, document.nsheets):
        sheet = document.sheet_by_index(sheet_num)
        turbotlib.log("Processing sheet " + str(sheet_num + 1) + "/" + str(document.nsheets))

        #skip the sheet if the top-left area is entirely blank
        check_cells = []
        for row in xrange(0, min(10, sheet.nrows)):
            for col in xrange(0, min(10, sheet.ncols)):
                check_cell = unicode(sheet.cell_value(row, col)).strip()
                if (len(check_cell) > 0):
                    check_cells.append(check_cell)
        if (len(check_cells) == 0):
            continue

        #find the start of the headers - where there is something in column A and column B
        header_start_row = -1
        for row in xrange(0, sheet.nrows):
            if (len(sheet.cell_value(row, 0)) > 0):
                if (len(sheet.cell_value(row, 1)) > 0):
                    header_start_row = row
                    break

        #try to find the end of the headers using formatting information;
        #this won't work for xlsx or xlsb files due to xlrd's limitations
        header_end_row = -1
        if (formatting):
            #first attempt: find where the background colour changes
            header_xf_index = sheet.cell_xf_index(header_start_row, 0)
            header_xf = document.xf_list[header_xf_index]
            header_background = header_xf.background.background_colour_index
            #go through all subsequent rows until we find one with a different background colour
            for row in xrange(header_start_row, sheet.nrows):
                row_xf_index = sheet.cell_xf_index(row, 0)
                row_xf = document.xf_list[row_xf_index]
                row_background = row_xf.background.background_colour_index
                if (row_background != header_background):
                    header_end_row = row - 1
                    break
            #second attempt: if that didn't work, find where the font changes from bold to unbold
            if (header_end_row == -1):
                for row in xrange(header_start_row, sheet.nrows):
                    row_xf_index = sheet.cell_xf_index(row, 0)
                    row_xf = document.xf_list[row_xf_index]
                    row_font_index = row_xf.font_index
                    row_font = document.font_list[row_font_index]
                    if (row_font.weight == 400):  #standard weight is 400 for normal, 700 for bold
                        header_end_row = row - 1
                        break
        #otherwise we have to infer it from other factors - the first use of the number 1 in column A
        else:
            for row in xrange(header_start_row, sheet.nrows):
                if (sheet.cell_value(row, 0) == 1):
                    header_end_row = row - 1

        #now work out how many columns the final headers have - look for a column with no values in the header rows
        header_end_col = -1
        for col in xrange(0, sheet.ncols):
            #check if all header rows in this column are blank (which they will be for merged cells, except for the first column)
            cell_contents = []
            for row in xrange(header_start_row, header_end_row + 1):
                cell_content = unicode(sheet.cell_value(row, col)).strip()
                #if we have formatting information and a blank cell, check for merged cells
                if (formatting and (len(cell_content) == 0)):
                    for merge_range in sheet.merged_cells:
                        r_low, r_high, c_low, c_high = merge_range
                        #check if our cell is in the middle of a merged range
                        if ((row >= r_low) and (row <= r_high)):
                            if ((col >= c_low) and (col <= c_high)):
                                cell_content = unicode(sheet.cell_value(r_low, c_low)).strip()
                #add our result to the list
                cell_contents.append(cell_content)
            #check if all are empty/blank
            empty_count = 0
            for cell in cell_contents:
                if (len(cell) == 0):
                    empty_count += 1
            if (empty_count == len(cell_contents)):
                header_end_col = col - 1
                break
        #fallback - if we didn't find an end, use ncols
        if (header_end_col == -1):
            header_end_col = sheet.ncols - 1

        #now we know where the headers are - time to find the end of the data
        data_start_row = header_end_row + 1  #starts one after the headers, unsurprisingly
        data_end_row = -1
        for row in xrange(data_start_row, sheet.nrows):
            #go through all columns - the first row where they're all blank marks the end
            row_contents = []
            for col in xrange(0, header_end_col):
                cell_content = unicode(sheet.cell_value(row, col)).strip()
                row_contents.append(cell_content)
            empty_count = 0
            for cell in row_contents:
                if (len(cell) == 0):
                    empty_count += 1
            if (empty_count == len(row_contents)):
                data_end_row = row - 1
                break
        if (data_end_row == -1):
            data_end_row = sheet.nrows - 1

        #extract the headers - taking account of merged cells
        headers = []
        for col in xrange(0, header_end_col):
            #combine all headers in a column into one string
            header_cells = []
            for row in xrange(header_start_row, header_end_row + 1):
                check_col = col
                header_string = ""
                while ((len(header_string) == 0) and (check_col > 0)):
                    header_string = unicode(sheet.cell_value(row, check_col)).strip()
                    check_col -= 1  #go back a column to get the value of a merged cell
                if (len(header_string) > 0):
                    header_cells.append(header_string)
            #cope with merged cells at the end - just take the main value
            if ((len(header_cells) > 0) and (header_cells[0] == header_cells[-1])):
                header = header_cells[0]
            else:
                if (category == "Banking operation"):
                    end = -1
                else:
                    end = len(header_cells)
                header = " - ".join(header_cells[:end])
            headers.append(header)

        #now get the data
        for row in xrange(data_start_row, data_end_row + 1):
            #one record per row - with metadata
            record = {
                'sample_date': sample_date,
                'source_url': source_url,
                'source_sheet': sheet.name,
                'category': category
            }
            #load in the value for each column
            for col in xrange(0, header_end_col):
                label = headers[col].replace("\n", " ").replace("\t", "").replace("  ", " ")
                if (len(label) == 0):
                    label = "id"
                if (label == "Name"):
                    label = "name"
                value = unicode(sheet.cell_value(row, col)).strip().replace("\n", " ").replace("\t", "").replace("  ", " ")
                if (len(value) > 0):
                    record[label] = value
            excel_records.append(record)

    #spit it out at the end
    return excel_records
{'url': "http://asfro.ro/em/ra/registru_en.php?reg=as", 'country': "Romania", 'category': "Section A - Insurance undertakings"}, {'url': "http://asfro.ro/em/ra/registru_en.php?reg=ab", 'country': "Romania", 'category': "Section A - Intermediaries"}, {'url': "http://asfro.ro/em/ra/registru_en.php?reg=bs", 'country': "Romania", 'category': "Section B - Insurance undertakings"}, {'url': "http://asfro.ro/em/ra/registru_en.php?reg=bb", 'country': "Romania", 'category': "Section B - Intermediaries"}, {'url': "http://asfro.ro/em/cs/cautare_en.php?tip=s&mod=d", 'country': "Overseas", 'category': "Insurance undertakings and intermediaries from EEA"} ] #different pages needed depending on registry used detail_urls = { 'Romania': "http://asfro.ro/em/ra/detalii_en.php?cod=", 'Overseas': "http://asfro.ro/em/cs/detalii_en.php?cod=" } #get going sample_date = str(date.today()) turbotlib.log("Starting run on " + sample_date) # Optional debug logging #Step 1: load index page and see how many pages there are in this category for category in entity_lists: try: turbotlib.log("Parsing category " + category['category']) category_page = get_doc(category['url']) #identify the relevant links and then see which has the highest number link_list = category_page.find_all("a") highest_link = 1 for link in link_list: if (link['href'][0] == "?"): #these ones start with a question mark link_text = link.string.strip() if (link_text.isnumeric()): page_number = int(link_text)
try:
    last_seen_ref = turbotlib.get_var('last_seen_ref')
    turbotlib.log('last_seen_ref: ' + last_seen_ref)
except KeyError:
    turbotlib.log('Unknown last_seen_ref, starting from the beginning')
    last_seen_ref = '00000000'

source_url = codecs.decode('uggc://jjj.nre.pn/qngn-naq-choyvpngvbaf/npgvivgl-naq-qngn/fg97', 'rot_13')
session = requests.Session()
source_soup = get_soup(source_url, session)
daily_links = find_daily_links(source_soup, last_seen_ref)
for date_ref, url in daily_links:
    data = {
        'sample_date': str(datetime.date.today()),
        'source_url': url,
        # this is the date the data was released, not when it was scraped
        'date': datetime.datetime.strptime(date_ref, '%Y%m%d').strftime('%Y-%m-%d')
# converts m/d/yyyy dates to yyyy-mm-dd, or returns the original string
def parse_date(value):
    if "/" in value:
        value_parts = value.split("/")
        if len(value_parts) == 3:
            return value_parts[2] + "-" + value_parts[0].zfill(2) + "-" + value_parts[1].zfill(2)
        else:
            return value
    else:
        return value

# get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date)  # Optional debug logging

# load the main details from the csv file
csv_doc = requests.get(csv_url).content.splitlines()
reader = csv.DictReader(csv_doc)
for row in reader:
    # create an object to store the details
    output = {"source_url": csv_url,
              "sample_date": sample_date,
              "source": "Central Bank of Nigeria"}
    # add new data to the object
    for field, value in row.items():
        if field is not None:
            if len(str(value)) > 0:
                if value != "1/1/1900":  # skip what appears to be a placeholder date
                    # convert key to a standard jsonish style
                    key_name = str(field).lower().replace(" ", "_")
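# A quick illustration of parse_date above, which assumes US-style
# m/d/yyyy input and passes anything else through unchanged:
print(parse_date("1/5/2014"))    # "2014-01-05"
print(parse_date("12/31/2013"))  # "2013-12-31"
print(parse_date("31-12-2013"))  # returned as-is: no "/" separator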
turbotlib.save_var("page", page_number) turbotlib.log("Run finished!") reset_state() def dict_factory(cursor, row): d = {} for idx, col in enumerate(cursor.description): d[col[0]] = row[idx] return d # ---------------------------------------------------------------------------------------------------------------------- turbotlib.log("Starting run...") # create individuals cache usersDB = sqlite3.connect('%s/individuals.db' % turbotlib.data_dir()) usersDB.row_factory = dict_factory usersDB.execute( "CREATE TABLE IF NOT EXISTS individuals(jurisdiction, name, firm, terms, contact, categories)" ) usersDB.commit() turbotlib.log("Getting initial view state...") init_req = retrieve(url_start, "GET", "") document = BeautifulSoup(init_req.text) last_view_state = urllib.quote(document.find(id='__VIEWSTATE')['value']) last_validation = urllib.quote(document.find(id='__EVENTVALIDATION')['value'])
import datetime
import turbotlib
import requests
import urlparse
import re
from BeautifulSoup import BeautifulSoup
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, PDFPageAggregator
from pdfminer.layout import (LAParams, LTTextBox, LTTextLine, LTFigure, LTImage,
                             LTTextLineHorizontal, LTTextBoxHorizontal, LTChar,
                             LTRect, LTLine, LTAnno, LTCurve)
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
from pprint import pprint

turbotlib.log("Starting run...")  # Optional debug logging

URL_WITH_PDF_LINKS = 'http://www.ocif.gobierno.pr/concesionariosbusqueda_eng.htm'

# Basic idea of the pdf parser is from
# https://blog.scraperwiki.com/2012/06/pdf-table-extraction-of-a-table/
config = {
    u'documents/cons/IA.pdf': {
        'enabled': False,
        'unique_column_name': 'LIC.NUM.',
        'name_column_name': u'NAME',
        'merge': False,
    if (len(naughty_object_list) > 0):
        print("")
        print_objects(naughty_object_list)

#print all the unique values for a given field
def print_field(objects, field):
    field_values = []
    for item in objects:
        if (field in item):
            if (item[field] not in field_values):
                field_values.append(item[field])
    for value in field_values:
        print(value)

#START DOING STUFF
turbotlib.log("Starting run on " + sample_date + "...")  # Optional debug logging

#SOURCE URLS
#list of links to navigation pages with a list of regulated entities, stored as [url, description] pairs
detailURLs = [
    ["http://www.nbs.sk/en/financial-market-supervision/banking-sector-supervision/List-of-credit-institutions/commercial-and-savings-banks",
     "Commercial and Savings Banks in Slovakia"],
    ["http://www.nbs.sk/en/financial-market-supervision/banking-sector-supervision/List-of-credit-institutions/branch-offices-of-foreign-banks/banks",
     "Branch offices of Foreign Banks in Slovakia"],
    ["http://www.nbs.sk/en/financial-market-supervision/banking-sector-supervision/List-of-credit-institutions/branch-offices-of-foreign-banks/credit-cooperatives",
     "Branch offices of Foreign Credit Cooperatives in Slovakia"],
    ["http://www.nbs.sk/en/financial-market-supervision/banking-sector-supervision/List-of-credit-institutions/branch-offices-of-slovak-banks-operating-abroad",
     "Branch offices of Slovak Banks operating abroad"],
    ["http://www.nbs.sk/en/financial-market-supervision/banking-sector-supervision/List-of-credit-institutions/slovak-banks-providers-of-services-on-the-cross-border-basis-abroad",
     "Slovak banks providing services on the cross-border basis abroad"],
    ["http://www.nbs.sk/en/financial-market-supervision/banking-sector-supervision/List-of-credit-institutions/banks-in-special-proceedings",
     "Banks in special proceedings"]
]

#list of links to pages which contain details of more than one regulated entity, stored as [url, description, whether the entity name is in an h4 above the table]
multipleDetailURLs = [