# get_logger, get_browser, record_error, save_to_s3, crawlers, THIS_STATE and
# THIS_COUNTY are assumed to come from the repo's shared scraper utilities.
import sys


def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Standard variable values/how to initialize them. They aren't
        # initialized here since, in the save_single_page case, they can be
        # done in the called function.
        #browser = get_browser()  # Get a standard browser
        #urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        #page_index = 0  # Initial value of "page_index", used to separate output pages
        #logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if (roster_row['State'].lower() != THIS_STATE
                or roster_row['County'].lower() != THIS_COUNTY):
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.roster_php(roster_row)  # Delegate to a known crawler if possible
        # End core specific scraping code
        ####################################

        logger.info('complete!')
    except Exception as errorMessage:
        # Record the error in S3 for a general error
        record_error(message=str(errorMessage), roster_row=roster_row)
        logger.error('Error: %s', errorMessage)  # Log the error
        sys.exit(1)
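# Hypothetical helper sketch (not part of the original file): it factors the
# county/state guard above into a reusable function. THIS_STATE and
# THIS_COUNTY are assumed to be lowercase module-level constants, as implied
# by the .lower() comparisons in main(); the helper name is an assumption.
def check_county(roster_row, expected_state, expected_county):
    """Raise if the spreadsheet row does not match this scraper's county."""
    if (roster_row['State'].lower() != expected_state
            or roster_row['County'].lower() != expected_county):
        raise Exception(
            "Expected county definition info from _%s, %s_, but found info: _%s_"
            % (expected_county, expected_state, roster_row))

# Usage inside main() would then be a single call, e.g.
# check_county(roster_row, THIS_STATE, THIS_COUNTY), keeping the per-county
# scripts identical apart from their constants.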
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Standard variable values/initialization. Unlike the save_single_page
        # case, this variant initializes them here rather than in the called
        # function.
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Initial value of "page_index", used to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################
        # Begin core specific scraping code
        if (roster_row['State'].lower() != THIS_STATE
                or roster_row['County'].lower() != THIS_COUNTY):
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.roster_php(roster_row)

        # The manual fallback below is kept for reference; it would also need:
        #   import math, re, time
        #   import numpy as np
        #   from bs4 import BeautifulSoup

        ## Open the browser
        #browser.get(urlAddress)
        #time.sleep(np.random.uniform(7, 10))

        ## Extract the HTML
        #store_source = browser.page_source

        ## Save the first page and log appropriately
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)

        ## Find the last page
        #soup = BeautifulSoup(store_source, 'lxml')
        #list_text = []
        #for link in soup.findAll("span", {"class": "ptitles"}):
        #    list_text.append(str(link.text))
        #page = list_text[0]
        #page = re.sub('Inmate Roster ', "", page)
        #page = re.sub(' ', "", page)
        #page = re.sub(r'\)', "", page)
        #page = re.sub(r'\(', "", page)
        #page = math.ceil(int(page) / 10)

        ## Crawl through the remaining pages
        #for i in range(2, page + 1):
        #    browser.get('https://www.prattcountysheriff.com/roster.php?grp=' + str(i * 10))
        #    time.sleep(np.random.uniform(7, 10))
        #    store_source = browser.page_source
        #    ## Save the page and log appropriately
        #    page_index = i - 1
        #    save_to_s3(store_source, page_index, roster_row)
        #    logger.info('Saved page _%s_', page_index)

        # End core specific scraping code
        ####################################

        browser.close()  # Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row, browser=browser)
        except Exception:
            # Record the error in S3 for a general error
            record_error(message=str(errorMessage), roster_row=roster_row)
        logger.error('Error: %s', errorMessage)  # Log the error
        sys.exit(1)
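# Standalone sketch of the commented-out page-count logic above. It assumes
# the first span.ptitles element reads like 'Inmate Roster (137)' and that the
# site shows 10 inmates per page; both details are inferred from the original
# commented code, not verified against the live site. The function name is an
# assumption for illustration.
import math
import re

from bs4 import BeautifulSoup


def count_roster_pages(html):
    """Return the number of roster pages implied by the first page's HTML."""
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find('span', {'class': 'ptitles'}).text  # e.g. 'Inmate Roster (137)'
    total = int(re.sub(r'[^0-9]', '', title))             # keep only the digits
    return math.ceil(total / 10)                          # 10 inmates per page

# Example: count_roster_pages('<span class="ptitles">Inmate Roster (137)</span>')
# returns 14; the commented-out loop above would then fetch
# roster.php?grp=20 through grp=140 for pages 2 through 14.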