Example #1
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress) 

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'faulkner'
        suffix = '?grp={}'
        req = requests.get(urlAddress)
        #Extract the HTML
        store_source = req.content
        soup = BeautifulSoup(store_source, 'lxml')
        #Extract number of inmates:
        inmate_roster = int(re.sub(r"\D", "", soup.find('h2', {"class":"large-6 columns ptitles"}).text))
        #10 entries per page; get number of pages by dividing by 10, rounding up.
        num_pages = math.ceil(inmate_roster/10)
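        # e.g., a 73-inmate roster needs math.ceil(73/10) = 8 pages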
        #Loop over the roster pages, fetching and saving each one
        for page in range(0, num_pages):
            page_index += 1
            time.sleep(np.random.uniform(5,10,1))
            req = requests.get(urlAddress+suffix.format((page+1)*10))
            store_source = req.content
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
        
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
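
All of the examples in this collection lean on a shared set of helpers (get_logger, get_browser, save_to_s3, record_error) whose definitions are not shown. The sketch below is only a plausible reconstruction of two of them, writing output to local disk instead of S3; the real implementations will differ.

import logging
import os

def get_logger(roster_row):
    # Name the logger after the state/county so log lines identify the scrape
    logger = logging.getLogger('%s_%s' % (roster_row['State'], roster_row['County']))
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    return logger

def save_to_s3(store_source, page_index, roster_row, filetype='html'):
    # Local-disk stand-in for the real S3 upload used by the scrapers
    fname = '%s_%s_page_%s.%s' % (roster_row['State'], roster_row['County'],
                                  page_index, filetype)
    mode = 'wb' if isinstance(store_source, bytes) else 'w'
    with open(os.path.join('.', fname), mode) as f:
        f.write(store_source)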
Example #2
def main(roster_row):
    try:
        """
        IFRAME SITE
        
        OLD URL: http://sheriff.oktibbeha.ms.us/public/arrests
        NEW URL: http://sheriff.oktibbeha.ms.us/webinfo/arrests.html
        
        
        """
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser() # Get a standard browser
        #urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        #page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        #logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.save_single_page(roster_row)  # try to call a known crawler if possible
        ## Code to save a page and log appropriately
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        # No browser is opened in this scraper, so there is nothing to close here
        record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #3
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
        days = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY',
                'FRIDAY', 'SATURDAY', 'SUNDAY']
        
        for day in days:
            req = requests.get(urlAddress.replace('FRIDAY', day))
            pdf_data = req.content
            save_to_s3(pdf_data, day, roster_row, filetype='pdf')
            logger.info('Saved page _%s_', day)
            #Wait
            time.sleep(np.random.uniform(5,10,1))
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #4
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL
        
        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
        
        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(5,7,1))
        
        #Extract the HTML#
        store_source = browser.page_source
        
        ## Code to save the first page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #5
def main(roster_row):
    try:
        """
        OLD URL: https://omsweb.public-safety-cloud.com/jtclientweb/(S(3zu0pgqfrm1j0s501pktqn15))/jailtracker/index/Burleigh_County_ND
        UPDATED URL: https://omsweb.public-safety-cloud.com/jtclientweb/jailtracker/index/Burleigh_County_ND"""
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser() # Get a standard browser
        #urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        #page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        #logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.omsweb_crawler(roster_row)  # try to call a known crawler if possible
        ## Code to save a page and log appropriately
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #6
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function

        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################

        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
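        # Page through the roster via the known crawler, advancing by the
        # link whose visible text is the '»' symbol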
        crawlers.basic_multipage(roster_row,
                                 next_type='ptext',
                                 next_string='»')
        # End core specific scraping code

        ####################################

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #7
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'norman'
        #Wait
        time.sleep(np.random.uniform(5, 10, 1))

        pdf_link = browser.find_element_by_xpath(
            "//*[contains(@href, 'Inmate_Roster')]")
        pdf_link.click()

        #new_tab = browser.find_element_by_xpath('/html/body/div/div/div/table/tbody/tr/td[2]/div/div[2]/div/a')
        #new_tab.click()

        #Wait
        time.sleep(np.random.uniform(5, 10, 1))

        pdf_data = browser.page_source
        save_to_s3(pdf_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #8
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        #browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'ouachita'
        suffix = '?id={}'

        letters = [
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
            'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
        ]
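        # The site appears to filter inmates by surname initial via the ?id=
        # parameter, so fetch one page per letter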
        pages = []

        for letter in letters:
            req = requests.get('http://' + urlAddress + suffix.format(letter))
            store_source = req.content
            pages.append(store_source)
            save_to_s3(store_source, letter, roster_row)
            logger.info('Saved page _%s_', letter)
            time.sleep(np.random.uniform(5, 10, 1))
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #9
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        #browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
        req = requests.get(urlAddress)
        store_source = req.content
        
        soup = BeautifulSoup(store_source, 'lxml')
        pdf_links = soup.find_all("a", href=re.compile("pdf"))
        new_url = pdf_links[4]['href']
        
        req2 = requests.get(new_url)
        pdf_data = req2.content
        ## Code to save a page and log appropriately
        save_to_s3(pdf_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #10
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
        browser.get(urlAddress)
        date_str = datetime.strftime(datetime.today(), '%b %d, %Y')
        #date_str = 'May 12, 2020'
        logger.info('set date string to _%s_',date_str)
        link = browser.find_element_by_partial_link_text(date_str)
        link.click()
        pdf_data = browser.page_source
        
        ## Code to save a page and log appropriately
        save_to_s3(pdf_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #11
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'amador'

        time.sleep(np.random.uniform(5, 10, 1))
        elem = browser.find_element_by_xpath('//*[@id="cmdCloseMessage"]')
        elem.click()

        time.sleep(np.random.uniform(5, 10, 1))
        inmates_link = browser.find_element_by_xpath('//*[@id="InmatesMenu"]')
        inmates_link.click()

        time.sleep(np.random.uniform(5, 10, 1))
        save_to_s3(browser.page_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #12
def main(roster_row):
    try:
        """
        Page has been moved.
        
        OLD URL: http://www.leecosheriff.com/Inmates/ICURRENT.HTM
        NEW URL: https://tcsi-roster.azurewebsites.net/default.aspx?i=26&code=Lee&type=roster
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'lee'

        req = requests.get(urlAddress)
        store_source = req.content
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #13
def main(roster_row):
    try:
        """
        IFRAME SITE
        
        OLD URL: https://www.co.washington.ar.us/government/departments-f-z/sheriff/detention-information/detainee-roster-detailed
        NEW URL: https://www.so.washington.ar.us/res/DAlphaRoster.aspx
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress) 

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'washington'
        req = requests.get(urlAddress)
        store_source = req.content
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
        
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #14
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'scott'
        time.sleep(np.random.uniform(5, 10, 1))
        """
        IFRAME
        
        OLD URL: https://www.scottcountymn.gov/1583/Jail-Roster
        NEW URL: https://www2.co.scott.mn.us/jailroster/custody1.html
        
        """
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #15
def main(roster_row):
    try:
        """
        IFRAME SITE
        
        OLD URL: http://randolphcountyso.org/inmates.html
        NEW URL: http://randolphcountyso.org/cur_inmates.html
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress) 

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'randolph'
        page_index = 0
        req = requests.get(urlAddress)
        store_source = req.content
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
        
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #16
def main(roster_row):
    try:
        """
        OLD URL: http://www.calcoso.org/divisions-jail-inmate-roster/
        UPDATED URL: https://www.calcoso.org/inmate-roster/
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        page_index = 1

        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'clarke'

        req = requests.get(urlAddress)
        store_source = req.content
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #17
def main(roster_row):
    try:
        """
        IFRAME SITE
        
        OLD URL: http://phillipscosheriff.com/Inmates
        NEW URL: http://phillips.pixelpowerhaus.net/
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'colorado'
        assert roster_row['County'].lower() == 'phillips'
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #18
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.omsweb_crawler(roster_row)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        # No browser is opened in this scraper, so there is nothing to close here
        record_error(message=str(errorMessage), roster_row=roster_row)
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #19
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'morrison'
        """
        Old URL: https://www.co.morrison.mn.us/?SEC=35BA1570-F608-40A9-9571-6968DD357BF6
        New URL: https://incustody.co.morrison.mn.us/
            
        """
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #20
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        ##########
        # Begin core specific scraping code

        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))

        # Extract the HTML using basic_multipage
        crawlers.basic_multipage(
            roster_row,
            next_type="xpath",
            next_string='//*[@id="JailRosterbuttondiv"]/a[8]'
        )  # try to call a known crawler if possible

        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #21
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'otter tail'
        """
        OLD URL: http://www.co.otter-tail.mn.us/991/In-Custody-List
        NEW URL: https://www.ottertailcounty.us/sheriff/report/custody%20list.rpt.html
        
        """
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #22
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'kings'

        time.sleep(np.random.uniform(5, 10, 1))
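        # The roster is a DataTables grid; choose 'All' in the page-length
        # dropdown so every inmate is rendered on a single page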
        dropdown = browser.find_element_by_xpath(
            '//*[@id="DataTables_Table_0_length"]/label/select')
        dropdown.send_keys('All', Keys.RETURN)
        time.sleep(np.random.uniform(5, 10, 1))

        save_to_s3(browser.page_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #23
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'iberville'
        time.sleep(np.random.uniform(5, 10, 1))
        search = browser.find_element_by_xpath(
            '//*[@id="right"]/center/form/table/tbody/tr[14]/td[2]/input')
        search.click()

        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #24
def main(roster_row):
    try:
        """
        OLD URL: https://www.elbertcountysheriff.com/detention/inmateinfo/
        UPDATED URL: https://www.inmateinfo.net/inmateinfo.php?org=ecso
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'colorado'
        assert roster_row['County'].lower() == 'elbert'
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #25
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'crow wing'
        #Use elements like below to find xpath keys and click through
        # NOTE: The site appears to be returning a 404; see the county FAQ:
        # https://www.crowwing.us/Faq.aspx?QID=297

        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #26
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'cottonwood'
        time.sleep(np.random.uniform(5, 10, 1))
        expand = browser.find_element_by_xpath(
            '//*[@title="Internet_Jail_Roster.pdf"]')
        expand.click()

        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #27
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.omsweb_crawler(roster_row)  # try to call a known crawler if possible

        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #28
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'isanti'
        """SITE USES IFRAME
        ORIGINAL: 'https://www.co.isanti.mn.us/425/In-Custody'
        SOURCE: 'https://sheriff.co.isanti.mn.us/letg/custody.html'
        """
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #29
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress) 

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'faribault'
        """SITE USES IFRAME
        OLD URL: http://www.frcsd.org/index.php?option=com_wrapper&view=wrapper&Itemid=7
        NEW URL: http://www.bevcommasp.com/fcjail/custodylistFar.rpt.html
        """
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
        
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #30
def save_single_page(roster_row, filetype='html'):
    try:
        logger = get_logger(roster_row) # Get a standard logger
        browser = get_browser() # Get a standard browser
        logger.info('using save_single_page for _%s, %s_', roster_row['County'], roster_row['State']) # Log the chosen county and state

        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL
        #Boilerplate code setting up logger, getting initial URL
        time.sleep(np.random.uniform(5,10,1))

        #Given the urlAddress passed to the function we will navigate to the page
        if filetype in ('html', 'xls'):
            browser.get(urlAddress)
            store_source = browser.page_source
        else:
            response = requests.get(urlAddress)
            response.raise_for_status()
            store_source = response.content
        save_to_s3(store_source, page_index, roster_row, filetype=filetype) # Save result to S3. This call includes logging and file formatting
        logger.info('Saved page _%s_', page_index)
        return True
    except Exception as errorMessage:
        try:
            record_error(message=str(errorMessage), roster_row=roster_row, page_number_within_scrape=page_index, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        browser.close()
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
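
For reference, a hypothetical invocation of save_single_page, assuming roster_row is a dict-like spreadsheet row with the columns the examples above read (values borrowed from Example #15):

if __name__ == '__main__':
    # Hypothetical row; real rows come from the project's roster spreadsheet
    roster_row = {
        'State': 'alabama',
        'County': 'randolph',
        'Working Link': 'http://randolphcountyso.org/cur_inmates.html',
    }
    save_single_page(roster_row)                  # save the page HTML
    save_single_page(roster_row, filetype='pdf')  # non-HTML types are fetched with requests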