def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'faulkner'
        suffix = '?grp={}'
        req = requests.get(urlAddress)
        #Extract the HTML
        store_source = req.content
        soup = BeautifulSoup(store_source, 'lxml')
        #Extract number of inmates:
        inmate_roster = int(re.sub(r"\D", "", soup.find('h2', {"class": "large-6 columns ptitles"}).text))
        #10 entries per page; get number of pages by dividing by 10, rounding up.
        num_pages = math.ceil(inmate_roster / 10)
        for page in range(0, num_pages):
            page_index += 1
            time.sleep(np.random.uniform(5, 10, 1))
            req = requests.get(urlAddress + suffix.format((page + 1) * 10))
            store_source = req.content
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
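# A minimal, standalone sketch of the pagination scheme used above: the roster
# shows 10 entries per page, so the page count comes from math.ceil and each
# page is requested via the '?grp=' offset. The inmate total and URL below are
# made-up illustration values, not data from the site.
import math

total_inmates = 137  # hypothetical count parsed from the <h2> header
num_pages = math.ceil(total_inmates / 10)  # 14 pages of 10 entries
offsets = [(page + 1) * 10 for page in range(num_pages)]  # 10, 20, ..., 140
urls = ['http://example.com/roster?grp={}'.format(o) for o in offsets]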
def main(roster_row): try: """ IFRAME SITE OLD URL: http://sheriff.oktibbeha.ms.us/public/arrests NEW URL: http://sheriff.oktibbeha.ms.us/webinfo/arrests.html """ logger = get_logger(roster_row) # Get a standard logger # Here are standard variable values/how to initialize them. # These aren't initialized here since in the save_single_page # case, they can be done in the called function #browser = get_browser() # Get a standard browser #urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet #page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages #logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL ########## # Begin core specific scraping code if roster_row['State'].lower( ) != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY: raise Exception( "Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row)) crawlers.save_single_page( roster_row) # try to call a known crawler if possible ## Code to save a page and log appropriately #save_to_s3(store_source, page_index, roster_row) #logger.info('Saved page _%s_', page_index) # End core specific scraping code ########## #Close the browser logger.info('complete!') except Exception as errorMessage: browser.close() record_error(message=str(errorMessage), roster_row=roster_row, browser=browser) # Record error in S3 for a general error logger.error('Error: %s', errorMessage) # Log error sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        days = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY',
                'FRIDAY', 'SATURDAY', 'SUNDAY']
        for day in days:
            req = requests.get(urlAddress.replace('FRIDAY', day))
            pdf_data = req.content
            save_to_s3(pdf_data, day, roster_row, filetype='pdf')
            logger.info('saved page _%s_', day)
            #Wait
            time.sleep(np.random.uniform(5, 10, 1))
        # End core specific scraping code
        ##########
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
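# Standalone illustration of the weekday substitution above: the working link
# embeds the literal string 'FRIDAY', and one PDF per weekday is fetched by
# swapping in each day name. The URL below is a placeholder, not the real site.
base_url = 'http://example.com/roster/FRIDAY.pdf'  # hypothetical link pattern
days = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
day_urls = [base_url.replace('FRIDAY', day) for day in days]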
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(5, 7, 1))
        #Extract the HTML
        store_source = browser.page_source
        ## Code to save the first page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row): try: """ OLD URL: https://omsweb.public-safety-cloud.com/jtclientweb/(S(3zu0pgqfrm1j0s501pktqn15))/jailtracker/index/Burleigh_County_ND UPDATED URL: https://omsweb.public-safety-cloud.com/jtclientweb/jailtracker/index/Burleigh_County_ND""" logger = get_logger(roster_row) # Get a standard logger # Here are standard variable values/how to initialize them. # These aren't initialized here since in the save_single_page # case, they can be done in the called function #browser = get_browser() # Get a standard browser #urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet #page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages #logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL ########## # Begin core specific scraping code if roster_row['State'].lower( ) != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY: raise Exception( "Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row)) crawlers.omsweb_crawler( roster_row) # try to call a known crawler if possible ## Code to save a page and log appropriately #save_to_s3(store_source, page_index, roster_row) #logger.info('Saved page _%s_', page_index) # End core specific scraping code ########## #Close the browser logger.info('complete!') except Exception as errorMessage: try: browser.close() record_error(message=str(errorMessage), roster_row=roster_row, browser=browser) except: record_error(message=str(errorMessage), roster_row=roster_row) # Record error in S3 for a general error logger.error('Error: %s', errorMessage) # Log error sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        ####################################
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.basic_multipage(roster_row, next_type='ptext', next_string='»')
        # End core specific scraping code
        ####################################
        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'norman'
        #Wait
        time.sleep(np.random.uniform(5, 10, 1))
        pdf_link = browser.find_element_by_xpath("//*[contains(@href, 'Inmate_Roster')]")
        pdf_link.click()
        #new_tab = browser.find_element_by_xpath('/html/body/div/div/div/table/tbody/tr/td[2]/div/div[2]/div/a')
        #new_tab.click()
        #Wait
        time.sleep(np.random.uniform(5, 10, 1))
        pdf_data = browser.page_source
        save_to_s3(pdf_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        #browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'ouachita'
        suffix = '?id={}'
        letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
        pages = []
        for letter in letters:
            req = requests.get('http://' + urlAddress + suffix.format(letter))
            store_source = req.content
            pages.append(store_source)
            save_to_s3(store_source, letter, roster_row)
            logger.info('Saved page _%s_', letter)
            time.sleep(np.random.uniform(5, 10, 1))
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
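# Aside: the hand-written A-Z list above can also come from the standard
# library. An equivalent one-liner, shown only for reference; the crawler
# itself is unchanged.
import string

letters = list(string.ascii_uppercase)  # ['A', 'B', ..., 'Z']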
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        req = requests.get(urlAddress)
        store_source = req.content
        soup = BeautifulSoup(store_source, 'lxml')
        pdf_links = soup.find_all("a", href=re.compile("pdf"))
        new_url = pdf_links[4]['href']  # the fifth PDF link on the page is the roster
        req2 = requests.get(new_url)
        pdf_data = req2.content
        ## Code to save a page and log appropriately
        save_to_s3(pdf_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
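# The crawler above assumes the matched href is an absolute URL. If the site
# ever serves relative links, urljoin would resolve them against the page URL.
# A sketch under that assumption; page_url and href here are made-up values:
from urllib.parse import urljoin

page_url = 'http://example.com/rosters/'  # hypothetical page the links came from
href = 'files/current_roster.pdf'         # hypothetical relative href
absolute = urljoin(page_url, href)        # 'http://example.com/rosters/files/current_roster.pdf'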
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        browser.get(urlAddress)
        date_str = datetime.strftime(datetime.today(), '%b %d, %Y')
        #date_str = 'May 12, 2020'
        logger.info('set date string to _%s_', date_str)
        link = browser.find_element_by_partial_link_text(date_str)
        link.click()
        pdf_data = browser.page_source
        ## Code to save a page and log appropriately
        save_to_s3(pdf_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
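# Quick standalone check of what the '%b %d, %Y' format used above produces,
# with a fixed example date rather than today():
from datetime import datetime

date_str = datetime.strftime(datetime(2020, 5, 12), '%b %d, %Y')
assert date_str == 'May 12, 2020'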
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'amador'
        time.sleep(np.random.uniform(5, 10, 1))
        elem = browser.find_element_by_xpath('//*[@id="cmdCloseMessage"]')
        elem.click()
        time.sleep(np.random.uniform(5, 10, 1))
        inmates_link = browser.find_element_by_xpath('//*[@id="InmatesMenu"]')
        inmates_link.click()
        time.sleep(np.random.uniform(5, 10, 1))
        save_to_s3(browser.page_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row): try: """ Page has been moved. OLD URL: http://www.leecosheriff.com/Inmates/ICURRENT.HTM NEW URL: https://tcsi-roster.azurewebsites.net/default.aspx?i=26&code=Lee&type=roster """ logger = get_logger(roster_row) browser = get_browser() urlAddress = roster_row['Working Link'] page_index = 0 logger.info('Set working link to _%s_', urlAddress) #Boilerplate code setting up logger, getting initial URL #Given the urlAddress passed to the function we will navigate to the page browser.get(urlAddress) ########## # Begin core specific scraping code assert roster_row['State'].lower() == 'arkansas' assert roster_row['County'].lower() == 'lee' req = requests.get(urlAddress) store_source = req.content save_to_s3(store_source, page_index, roster_row) logger.info('Saved page _%s_', page_index) # End core specific scraping code ########## #Close the browser logger.info('complete!') browser.close() except Exception as errorMessage: browser.close() record_error(str(errorMessage), page_index, roster_row) # Record error in S3 logger.error('Error: %s', errorMessage) # Log error pass
def main(roster_row): try: """ IFRAME SITE OLD URL: https://www.co.washington.ar.us/government/departments-f-z/sheriff/detention-information/detainee-roster-detailed NEW URL: https://www.so.washington.ar.us/res/DAlphaRoster.aspx """ logger = get_logger(roster_row) browser = get_browser() urlAddress = roster_row['Working Link'] page_index = 0 logger.info('Set working link to _%s_', urlAddress) #Boilerplate code setting up logger, getting initial URL #Given the urlAddress passed to the function we will navigate to the page browser.get(urlAddress) ########## # Begin core specific scraping code assert roster_row['State'].lower() == 'arkansas' assert roster_row['County'].lower() == 'washington' req = requests.get(urlAddress) store_source = req.content save_to_s3(store_source, page_index, roster_row) logger.info('Saved page _%s_', page_index) # End core specific scraping code ########## #Close the browser logger.info('complete!') browser.close() except Exception as errorMessage: browser.close() record_error(str(errorMessage), page_index, roster_row) # Record error in S3 logger.error('Error: %s', errorMessage) # Log error pass
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'scott'
        time.sleep(np.random.uniform(5, 10, 1))
        """
        IFRAME
        OLD URL: https://www.scottcountymn.gov/1583/Jail-Roster
        NEW URL: https://www2.co.scott.mn.us/jailroster/custody1.html
        """
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row): try: """ IFRAME SITE OLD URL: http://randolphcountyso.org/inmates.html NEW URL: http://randolphcountyso.org/cur_inmates.html """ logger = get_logger(roster_row) browser = get_browser() urlAddress = roster_row['Working Link'] logger.info('Set working link to _%s_', urlAddress) #Boilerplate code setting up logger, getting initial URL #Given the urlAddress passed to the function we will navigate to the page browser.get(urlAddress) ########## # Begin core specific scraping code assert roster_row['State'].lower() == 'alabama' assert roster_row['County'].lower() == 'randolph' page_index = 0 req = requests.get(urlAddress) store_source = req.content save_to_s3(store_source, page_index, roster_row) logger.info('Saved page _%s_', page_index) # End core specific scraping code ########## #Close the browser logger.info('complete!') browser.close() except Exception as errorMessage: browser.close() record_error(str(errorMessage), page_index, roster_row) # Record error in S3 logger.error('Error: %s', errorMessage) # Log error pass
def main(roster_row): try: """ OLD URL: http://www.calcoso.org/divisions-jail-inmate-roster/ UPDATED URL: https://www.calcoso.org/inmate-roster/ """ logger = get_logger(roster_row) browser = get_browser() urlAddress = roster_row['Working Link'] logger.info('Set working link to _%s_', urlAddress) #Boilerplate code setting up logger, getting initial URL #Given the urlAddress passed to the function we will navigate to the page browser.get(urlAddress) ########## # Begin core specific scraping code page_index = 1 assert roster_row['State'].lower() == 'alabama' assert roster_row['County'].lower() == 'clarke' req = requests.get(urlAddress) save_to_s3(req.content, page_index, roster_row) logger.info('Saved page _%s_', page_index) store_source = req.content # End core specific scraping code ########## #Close the browser logger.info('complete!') browser.close() except Exception as errorMessage: browser.close() record_error(str(errorMessage), page_index, roster_row) # Record error in S3 logger.error('Error: %s', errorMessage) # Log error pass
def main(roster_row): try: """ IFRAME SITE OLD URL: http://phillipscosheriff.com/Inmates NEW URL: http://phillips.pixelpowerhaus.net/ """ logger = get_logger(roster_row) browser = get_browser() urlAddress = roster_row['Working Link'] page_index = 0 logger.info('Set working link to _%s_', urlAddress) #Boilerplate code setting up logger, getting initial URL #Given the urlAddress passed to the function we will navigate to the page browser.get(urlAddress) ########## # Begin core specific scraping code assert roster_row['State'].lower() == 'colorado' assert roster_row['County'].lower() == 'phillips' store_source = browser.page_source save_to_s3(store_source, page_index, roster_row) logger.info('Saved page _%s_', page_index) # End core specific scraping code ########## #Close the browser logger.info('complete!') browser.close() except Exception as errorMessage: browser.close() record_error(str(errorMessage), page_index, roster_row) # Record error in S3 logger.error('Error: %s', errorMessage) # Log error pass
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.omsweb_crawler(roster_row)
        # End core specific scraping code
        ##########
        logger.info('complete!')
    except Exception as errorMessage:
        # No browser is opened in this module, so referencing one here would
        # raise a NameError inside the handler; record the error directly.
        record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'morrison'
        """
        Old URL: https://www.co.morrison.mn.us/?SEC=35BA1570-F608-40A9-9571-6968DD357BF6
        New URL: https://incustody.co.morrison.mn.us/
        """
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        # Extract the HTML using basic_multipage
        crawlers.basic_multipage(
            roster_row, next_type="xpath",
            next_string='//*[@id="JailRosterbuttondiv"]/a[8]')  # try to call a known crawler if possible
        # End core specific scraping code
        ##########
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'otter tail'
        """
        OLD URL: http://www.co.otter-tail.mn.us/991/In-Custody-List
        NEW URL: https://www.ottertailcounty.us/sheriff/report/custody%20list.rpt.html
        """
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'kings'
        time.sleep(np.random.uniform(5, 10, 1))
        dropdown = browser.find_element_by_xpath(
            '//*[@id="DataTables_Table_0_length"]/label/select')
        dropdown.send_keys('All', Keys.RETURN)
        time.sleep(np.random.uniform(5, 10, 1))
        save_to_s3(browser.page_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
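# Sending keystrokes to the <select> element works, but Selenium's Select
# wrapper is the more explicit idiom for dropdowns. A sketch, assuming the
# same element and an option whose visible text is 'All':
from selenium.webdriver.support.ui import Select

def choose_all_rows(browser):
    elem = browser.find_element_by_xpath(
        '//*[@id="DataTables_Table_0_length"]/label/select')
    Select(elem).select_by_visible_text('All')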
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'iberville'
        time.sleep(np.random.uniform(5, 10, 1))
        search = browser.find_element_by_xpath(
            '//*[@id="right"]/center/form/table/tbody/tr[14]/td[2]/input')
        search.click()
        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row): try: """ OLD URL: https://www.elbertcountysheriff.com/detention/inmateinfo/ UPDATED URL: https://www.inmateinfo.net/inmateinfo.php?org=ecso """ logger = get_logger(roster_row) browser = get_browser() urlAddress = roster_row['Working Link'] page_index = 0 logger.info('Set working link to _%s_', urlAddress) #Boilerplate code setting up logger, getting initial URL #Given the urlAddress passed to the function we will navigate to the page browser.get(urlAddress) ########## # Begin core specific scraping code assert roster_row['State'].lower() == 'colorado' assert roster_row['County'].lower() == 'elbert' store_source = browser.page_source save_to_s3(store_source, page_index, roster_row) logger.info('Saved page _%s_', page_index) # End core specific scraping code ########## #Close the browser logger.info('complete!') browser.close() except Exception as errorMessage: browser.close() record_error(str(errorMessage), page_index, roster_row) # Record error in S3 logger.error('Error: %s', errorMessage) # Log error pass
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'crow wing'
        #Use elements like below to find xpath keys and click through
        # NOTE: Looks like there's a site problem with a 404; the roster page
        # currently points to this FAQ entry:
        # https://www.crowwing.us/Faq.aspx?QID=297
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'cottonwood'
        time.sleep(np.random.uniform(5, 10, 1))
        expand = browser.find_element_by_xpath('//*[@title="Internet_Jail_Roster.pdf"]')
        expand.click()
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.omsweb_crawler(roster_row)  # try to call a known crawler if possible
        # End core specific scraping code
        ##########
        logger.info('complete!')
    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'isanti'
        """SITE USES IFRAME
        ORIGINAL: 'https://www.co.isanti.mn.us/425/In-Custody'
        SOURCE: 'https://sheriff.co.isanti.mn.us/letg/custody.html'
        """
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL
        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)
        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'faribault'
        """SITE USES IFRAME
        OLD URL: http://www.frcsd.org/index.php?option=com_wrapper&view=wrapper&Itemid=7
        NEW URL: http://www.bevcommasp.com/fcjail/custodylistFar.rpt.html
        """
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########
        #Close the browser
        logger.info('complete!')
        browser.close()
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)  # Log error
def save_single_page(roster_row, filetype='html'):
    try:
        logger = get_logger(roster_row)  # Get a standard logger
        browser = get_browser()  # Get a standard browser
        logger.info('using save_single_html_page for _%s, %s',
                    roster_row['County'], roster_row['State'])  # Log the county being scraped
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL
        #Boilerplate code setting up logger, getting initial URL
        time.sleep(np.random.uniform(5, 10, 1))
        #Given the urlAddress passed to the function we will navigate to the page
        if filetype in ('html', 'xls'):
            # Both filetypes are rendered in the browser and saved as page source
            browser.get(urlAddress)
            store_source = browser.page_source
        else:
            response = requests.get(urlAddress)
            response.raise_for_status()
            store_source = response.content
        save_to_s3(store_source, page_index, roster_row, filetype=filetype)
        # Save result to s3. This call includes logging and file formatting
        logger.info('Saved page _%s_', page_index)
        return True
    except Exception as errorMessage:
        try:
            record_error(message=str(errorMessage), roster_row=roster_row,
                         page_number_within_scrape=page_index, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        browser.close()
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)  # Log error
        sys.exit(1)
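# Example call for the helper above, with a hypothetical spreadsheet row
# showing the keys save_single_page actually reads ('Working Link', 'County',
# 'State'). The values are placeholders, not a real roster entry.
if __name__ == '__main__':
    example_row = {
        'State': 'Alabama',  # made-up row for illustration only
        'County': 'Randolph',
        'Working Link': 'http://example.com/roster.html',
    }
    save_single_page(example_row, filetype='html')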