def main(roster_row):
    """Save the roster page described by ``roster_row`` via the shared crawler.

    Checks that the row's lower-cased ``'State'`` and ``'County'`` values match
    this module's ``THIS_STATE`` / ``THIS_COUNTY`` and then hands the row to
    ``crawlers.save_single_page`` with ``filetype='html'``.

    Args:
        roster_row: Mapping for one roster entry. Must provide string values
            under ``'State'`` and ``'County'``; it is passed unchanged to the
            logger, crawler and error-recording helpers.

    Raises:
        SystemExit: With status 1 after any exception has been recorded.
    """
    # Bound before the ``try`` so the handler can tell whether logger creation
    # itself was the step that failed.
    logger = None
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        ##########
        # Begin core specific scraping code
        if (roster_row['State'].lower() != THIS_STATE
                or roster_row['County'].lower() != THIS_COUNTY):
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        # try to call a known crawler if possible
        crawlers.save_single_page(roster_row, filetype='html')
        # End core specific scraping code
        ##########

        logger.info('complete!')

    except Exception as errorMessage:
        message = str(errorMessage)
        try:
            # NOTE(review): this function never creates a browser, so this
            # branch only succeeds if a module-level ``browser`` exists.
            # Otherwise the NameError is caught below and the error is
            # recorded without one.
            browser.close()
            record_error(message=message, roster_row=roster_row,
                         browser=browser)
        except Exception:
            # Narrowed from a bare ``except:`` so SystemExit and
            # KeyboardInterrupt are no longer swallowed here.
            record_error(message=message, roster_row=roster_row)

        # Record error in S3 for a general error (done above), then log it.
        if logger is not None:
            logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    """Save the roster page described by ``roster_row`` via the shared crawler.

    Checks that the row's lower-cased ``'State'`` and ``'County'`` values match
    this module's ``THIS_STATE`` / ``THIS_COUNTY`` and then hands the row to
    ``crawlers.save_single_page``.

    Args:
        roster_row: Mapping for one roster entry. Must provide string values
            under ``'State'`` and ``'County'``; it is passed unchanged to the
            logger, crawler and error-recording helpers.

    Raises:
        SystemExit: With status 1 after any exception has been recorded.
    """
    # Bound before the ``try`` so the handler can tell whether logger creation
    # itself was the step that failed.
    logger = None
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser() # Get a standard browser
        #urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        #page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        #logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if (roster_row['State'].lower() != THIS_STATE
                or roster_row['County'].lower() != THIS_COUNTY):
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.save_single_page(roster_row)  # try to call a known crawler if possible
        ## Code to save a page and log appropriately
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        logger.info('complete!')

    except Exception as errorMessage:
        message = str(errorMessage)
        try:
            # The ``browser = get_browser()`` line above is commented out, so
            # ``browser`` is normally undefined here. Attempt the
            # browser-assisted report only as best effort; an unguarded
            # ``browser.close()`` would raise NameError inside this handler
            # and skip error recording and the exit below.
            browser.close()
            record_error(message=message, roster_row=roster_row,
                         browser=browser)
        except Exception:
            record_error(message=message, roster_row=roster_row)

        # Record error in S3 for a general error (done above), then log it.
        if logger is not None:
            logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    """Scrape the roster page at ``roster_row['Working Link']`` and store it.

    Opens the working link in a browser, types ``'All'`` into the element with
    id ``inmatesPerPage`` so every inmate is listed on one page, waits, and
    passes the resulting page source to ``save_to_s3`` as page index 0.

    Args:
        roster_row: Mapping for one roster entry. Must provide ``'State'``,
            ``'County'`` (strings) and ``'Working Link'``; it is passed
            unchanged to the logger, crawler, storage and error-recording
            helpers.

    Raises:
        SystemExit: With status 1 after any exception has been recorded.
    """
    # Bound before the ``try`` so the handler can tell which setup steps
    # completed before the failure.
    logger = None
    browser = None
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if (roster_row['State'].lower() != THIS_STATE
                or roster_row['County'].lower() != THIS_COUNTY):
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        # NOTE(review): the generic crawler runs here *and* the custom scrape
        # below also saves a page. This looks like template residue -- confirm
        # whether both saves are intended.
        crawlers.save_single_page(roster_row)  # try to call a known crawler if possible

        browser.get(urlAddress)

        # Show all inmates instead of 6 per page
        # Without a ``size`` argument uniform() returns a plain float, so
        # time.sleep() no longer depends on the deprecated conversion of a
        # 1-element array to a scalar.
        time.sleep(np.random.uniform(5, 10))
        show_all = browser.find_element_by_xpath('//*[@id="inmatesPerPage"]')
        show_all.send_keys('All')
        logger.info('clicked "All"')

        # Wait
        time.sleep(np.random.uniform(15, 20))

        # Extract the HTML
        store_source = browser.page_source

        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        # Close the browser
        browser.close()
        logger.info('complete!')

    except Exception as errorMessage:
        message = str(errorMessage)
        if browser is None:
            # Failure happened before a browser existed.
            record_error(message=message, roster_row=roster_row)
        else:
            try:
                browser.close()
                record_error(message=message, roster_row=roster_row,
                             browser=browser)
            except Exception:
                # Browser-assisted reporting failed; still record the
                # original error. Narrowed from a bare ``except:`` so
                # SystemExit and KeyboardInterrupt are no longer swallowed.
                record_error(message=message, roster_row=roster_row)

        # Record error in S3 for a general error (done above), then log it.
        if logger is not None:
            logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)