def base():
    # 1. Initialize queue
    cpa_queue = url_queue.URLSearchQueue()
    # 1a. Initialize firm_list (consider making this a MySQL d/b)
    set_of_emails = set()
    driver = webdriver.PhantomJS()

    # 3. Add START_URL(s) to queue
    for start_url in start_url_list:
        cpa_queue.enqueue(start_url)

    # 5b. scrape tree
    external_sites, list_of_firms = scrape_cpa_tree(cpa_queue, driver)
    with open('firm_list.csv', 'w') as csvfile:
        fieldnames = ['firm_details', 'firm_url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for firm in list_of_firms:
            writer.writerow(firm)

    # 8. Go through each external URL and scrape emails
    while len(external_sites) > 0:
        active_queue = external_sites.pop()
        set_of_emails.update(process_external_url_queue(active_queue, driver))

    with open('email_list.csv', 'w') as csvfile:
        fieldnames = ['email']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for email in set_of_emails:
            try:
                # DictWriter expects a dict keyed by fieldnames, not a bare string
                writer.writerow({'email': email})
            except Exception as exc:
                print('failed to write email %s: %s' % (email, exc))
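# url_queue.URLSearchQueue is imported from another module and not shown
# here. The callers above only need FIFO enqueue/dequeue, a queue_len()
# accessor, an optional seed URL (see main(argv) below), and, given how the
# crawl loops rely on it to terminate, de-duplication of URLs. This class is
# a hypothetical stand-in written against those assumptions, not the
# project's actual implementation.
from collections import deque


class URLSearchQueue(object):
    """FIFO queue of URLs that silently drops anything seen before."""

    def __init__(self, start_url=None):
        self._queue = deque()
        self._seen = set()
        if start_url is not None:
            self.enqueue(start_url)

    def enqueue(self, url):
        # Ignore URLs that were already queued or visited
        if url not in self._seen:
            self._seen.add(url)
            self._queue.append(url)

    def dequeue(self):
        return self._queue.popleft()

    def queue_len(self):
        return len(self._queue)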
def main():
    # 1. instantiate stuff
    canada_queue = url_queue.URLSearchQueue()
    list_of_external_queues = []
    set_of_external_base_urls = set()
    email_set = set()
    list_of_firms = []
    driver = webdriver.PhantomJS()
    url = 'https://www.cpacanada.ca/en/the-cpa-profession/cpas-and-what-we-do/find-an-accounting-firm'
    url_start = '/en/the-cpa-profession/cpas-and-what-we-do/find-an-accounting-firm'

    # 2. get internal firm links from base page
    driver.get(url)
    tree = BeautifulSoup(driver.page_source, 'lxml')
    for firm_url in extract_firm_page_urls(tree, url_start):
        canada_queue.enqueue(firm_url)

    # 3. grab relevant info from each individual firm listing
    n = 0
    while canada_queue.queue_len() > 0:
        n += 1
        if n % 100 == 0:
            print('processed %s cpacanada pages' % n)
        curr_url = canada_queue.dequeue()
        firm_name, firm_details, email_list, web_list = scrape_for_firm_info(
            curr_url, driver)
        for site in web_list:
            # Skip social-media profiles; only crawl the firm's own site
            if site is not None and 'linkedin' not in site \
                    and 'facebook' not in site:
                if not site.startswith('http'):
                    site = 'http://' + site
                update_external_queue(list_of_external_queues,
                                      set_of_external_base_urls, site)
        if len(email_list) > 0:
            email_set.update(email_list)
        list_of_firms.append({
            'firm_name': firm_name,
            'firm_details': firm_details
        })

    connection = pymysql.connect(host=HOST, password=PASSWORD, port=PORT,
                                 user=USER, db=DB)
    sql = 'INSERT INTO emails VALUES (%s)'
    try:
        with connection.cursor() as cursor:
            for email in email_set:
                try:
                    cursor.execute(sql, (email,))
                except Exception as exc:
                    print('Error: %s\nfailed to write %s' % (exc, email))
        connection.commit()
    finally:
        connection.close()
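# The INSERT above assumes a single-column `emails` table already exists;
# its schema isn't shown anywhere in the source. A plausible one-time setup
# (an assumption, not the project's actual DDL) that also deduplicates at
# the database level, so re-inserting a known address raises an error the
# except block above already absorbs:
def create_emails_table():
    connection = pymysql.connect(host=HOST, password=PASSWORD, port=PORT,
                                 user=USER, db=DB)
    try:
        with connection.cursor() as cursor:
            cursor.execute(
                'CREATE TABLE IF NOT EXISTS emails ('
                '  email VARCHAR(254) NOT NULL,'
                '  PRIMARY KEY (email)'
                ')')
        connection.commit()
    finally:
        connection.close()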
def update_external_queue(list_of_queues, set_of_urls, new_url):
    parsed_url = urlparse(new_url)
    base_url = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_url)
    new_queue = url_queue.URLSearchQueue()
    if base_url not in set_of_urls:
        new_queue.enqueue(base_url)
        set_of_urls.add(base_url)
    if new_url != base_url and new_url not in set_of_urls:
        new_queue.enqueue(new_url)
        set_of_urls.add(new_url)
    # Only track queues that actually received a URL, so already-seen sites
    # don't leave empty queues in the list
    if new_queue.queue_len() > 0:
        list_of_queues.append(new_queue)
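# How update_external_queue behaves, step by step (illustrative only): the
# first URL seen on a new domain yields a queue holding both the base URL
# and the deep link; repeats of either URL yield nothing new.
#
#     queues, seen = [], set()
#     update_external_queue(queues, seen, 'http://example.com/contact')
#     # queues[0] now holds 'http://example.com/' and
#     # 'http://example.com/contact'
#     update_external_queue(queues, seen, 'http://example.com/')
#     # no new queue appended: both URLs were already in `seen`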
def main():
    # 1. instantiate stuff
    canada_queue = url_queue.URLSearchQueue()
    list_of_external_queues = []
    set_of_external_base_urls = set()
    email_set = set()
    list_of_firms = []
    driver = webdriver.PhantomJS()
    url = 'https://www.cpacanada.ca/en/the-cpa-profession/cpas-and-what-we-do/find-an-accounting-firm'
    url_start = '/en/the-cpa-profession/cpas-and-what-we-do/find-an-accounting-firm'

    # 2. get internal firm links from base page
    driver.get(url)
    tree = BeautifulSoup(driver.page_source, 'lxml')
    for firm_url in extract_firm_page_urls(tree, url_start):
        canada_queue.enqueue(firm_url)

    # 3. grab relevant info from each individual firm listing
    n = 0
    while canada_queue.queue_len() > 0:
        n += 1
        if n % 100 == 0:
            print('processed %s cpacanada pages' % n)
        curr_url = canada_queue.dequeue()
        firm_name, firm_details, email_list, web_list = scrape_for_firm_info(
            curr_url, driver)
        for site in web_list:
            # Skip social-media profiles; only crawl the firm's own site
            if site is not None and 'linkedin' not in site \
                    and 'facebook' not in site:
                if not site.startswith('http'):
                    site = 'http://' + site
                update_external_queue(list_of_external_queues,
                                      set_of_external_base_urls, site)
        if len(email_list) > 0:
            email_set.update(email_list)
        list_of_firms.append({
            'firm_name': firm_name,
            'firm_details': firm_details
        })

    # with open('canada_firm_list.csv', 'w') as csvfile:
    #     fieldnames = ['firm_name', 'firm_details']
    #     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    #     writer.writeheader()
    #     for firm in list_of_firms:
    #         writer.writerow(firm)

    # 4. crawl each firm site for emails
    while len(list_of_external_queues) > 0:
        active_queue = list_of_external_queues.pop()
        email_set.update(process_external_url_queue(active_queue, driver))
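# process_external_url_queue is defined elsewhere; the callers above treat
# it as: drain one firm site's queue, follow same-domain links, and return
# whatever email addresses turn up. This is a rough sketch under those
# assumptions (the regex, the on-domain rule, and the fetch-via-driver
# details are guesses, not the project's code; urlparse and BeautifulSoup
# come from the module's existing imports).
import re

EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')


def process_external_url_queue_sketch(queue, driver):
    emails = set()
    base_netloc = None
    while queue.queue_len() > 0:
        page_url = queue.dequeue()
        if base_netloc is None:
            # Treat the first dequeued URL's domain as "this firm's site"
            base_netloc = urlparse(page_url).netloc
        driver.get(page_url)
        html = driver.page_source
        emails.update(EMAIL_RE.findall(html))
        tree = BeautifulSoup(html, 'lxml')
        for link in tree.find_all('a', href=True):
            href = link['href']
            # Stay on the firm's own domain; the queue dedupes repeats
            if urlparse(href).netloc == base_netloc:
                queue.enqueue(href)
    return emails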
def main(argv):
    driver = webdriver.PhantomJS()
    file_name = argv[1] if len(argv) > 1 else FILENAME
    url_set = create_site_set(file_name)
    total_urls = len(url_set)
    print('======================\nTotal urls to crawl: %s'
          '\n======================' % total_urls)
    url_queue_list = [url_queue.URLSearchQueue(url) for url in url_set]
    n = 0
    for queue in url_queue_list:
        n += 1
        process_external_url_queue(queue, driver)
        if n % 100 == 0:
            print('\n%s urls to go!\n' % (total_urls - n))
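# create_site_set isn't shown here; from its use above it only needs to read
# the given file (FILENAME by default, or argv[1]) and return a set of site
# URLs. A minimal sketch assuming one URL per line, normalizing bare domains
# the same way the CPA Canada crawler above does:
def create_site_set_sketch(file_name):
    urls = set()
    with open(file_name) as handle:
        for line in handle:
            url = line.strip()
            if url:
                # Give schemeless entries a scheme so driver.get() works
                if not url.startswith('http'):
                    url = 'http://' + url
                urls.add(url)
    return urls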
def scrape_cpa_tree(queue, driver=None):
    firm_list = []
    set_of_external_urls = set()
    list_of_external_url_queues = []
    n = 0
    while queue.queue_len() > 0:
        curr_url = queue.dequeue()
        n += 1
        if n % 100 == 0:
            print('%s base site pages scraped' % n)
        page_tree = parse_page.fetch_page(curr_url)
        if page_tree is None:
            continue
        java_crawled = False
        is_detail_page = 'details.aspx?searchnumber=' in curr_url
        url_list = parse_page.extract_urls(page_tree)
        for url in url_list:
            # This is a link to firm details
            if url.startswith("javascript:open_window('details.aspx"):
                # drop the javascript:open_window(' wrapper and trailing ')
                queue.enqueue(JAVA_PREFIX + url[24:-2])
            # Deal with paginated lists
            elif url.startswith('javascript:__doPostBack') and not java_crawled:
                java_crawled = True
                java_urls = java_page_scraper.load_javascript_page(
                    curr_url, 'javascript:__doPostBack', driver)
                for new_url in java_urls:
                    queue.enqueue(new_url)
            # Enqueue links to same site
            elif url.startswith(SITE_PREFIX):
                queue.enqueue(url)
            # Put external links found on detail pages into a separate queue
            elif is_detail_page:
                parsed_url = urlparse(url)
                external_base_url = \
                    '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_url)
                external_url_queue = url_queue.URLSearchQueue()
                if external_base_url not in set_of_external_urls:
                    external_url_queue.enqueue(external_base_url)
                    set_of_external_urls.add(external_base_url)
                if url != external_base_url and \
                        url not in set_of_external_urls:
                    external_url_queue.enqueue(url)
                    set_of_external_urls.add(url)
                if external_url_queue.queue_len() > 0:
                    list_of_external_url_queues.append(external_url_queue)
        # if curr_url is a detail page, extract firm info and add to firm_list
        if is_detail_page:
            firm_list.append(extract_firm_info(page_tree))
    return list_of_external_url_queues, firm_list
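# parse_page.fetch_page and parse_page.extract_urls live in another module.
# scrape_cpa_tree only needs fetch_page to return a parsed tree (or None on
# failure) and extract_urls to return every href on the page. A hypothetical
# sketch of that contract using requests plus BeautifulSoup; the timeout and
# error handling are assumptions, not the project's code:
import requests


def fetch_page_sketch(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        # Callers check for None and skip pages that failed to load
        return None
    return BeautifulSoup(response.text, 'lxml')


def extract_urls_sketch(page_tree):
    return [link['href'] for link in page_tree.find_all('a', href=True)]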