Example #1
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'carroll'
        pages = []
        #Use elements like the 'Next >' link below to find xpath keys and click through
        time.sleep(np.random.uniform(5, 10, 1))
        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        pages.append(store_source)
        page_index += 1
        finished = False
        while not finished:
            try:
                nextpage = browser.find_element_by_link_text('Next >')
                nextpage.click()
                time.sleep(np.random.uniform(5, 10, 1))

                #Extract the HTML
                store_source = browser.page_source
                if store_source not in pages:
                    pages.append(store_source)
                    save_to_s3(store_source, page_index, roster_row)
                    logger.info('Saved page _%s_', page_index)
                    page_index += 1
                else:
                    finished = True

            except Exception:
                #No further 'Next >' link to click: pagination is finished
                finished = True
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
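
These examples all lean on shared project helpers (get_logger, get_browser, save_to_s3, record_error) whose definitions are not shown. The sketch below is only an inference of that interface from the call sites in the examples, with local stand-ins in place of the real S3 and error-recording back ends; the actual project implementations are assumed to live elsewhere.

# Hypothetical local stand-ins for the shared helpers; signatures are inferred
# from how the examples call them and are not the project's real definitions.
import logging
from pathlib import Path

from selenium import webdriver


def get_logger(roster_row):
    """Return a logger named for the county/state in the roster row."""
    name = '%s_%s' % (roster_row.get('County', 'unknown'), roster_row.get('State', 'unknown'))
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger


def get_browser():
    """Return a headless Selenium Chrome browser (the real helper may configure more options)."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    return webdriver.Chrome(options=options)


def save_to_s3(page_data, page_index, roster_row, filetype='html'):
    """Stand-in that writes a scraped page to a local file instead of S3."""
    if isinstance(page_data, str):
        page_data = page_data.encode('utf-8')
    out = Path('%s_%s_%s.%s' % (roster_row['County'], roster_row['State'], page_index, filetype))
    out.write_bytes(page_data)


def record_error(message, page_number_within_scrape=None, roster_row=None, browser=None):
    """Stand-in that logs a failure; accepts both the positional and keyword call styles used in the examples."""
    logging.getLogger('errors').error('Scrape failed for %s (page %s): %s',
                                      roster_row, page_number_within_scrape, message)
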
Example #2
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'mower'
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #3
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress) 

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'maine'
        assert roster_row['County'].lower() == 'penobscot'
        req = requests.get(urlAddress)
        page_data = req.content
        save_to_s3(page_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
        
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #4
def main(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if roster_row['State'].lower() != THIS_STATE or roster_row['County'].lower() != THIS_COUNTY:
            raise Exception("Expected county definition info from _%s, %s_, but found info: _%s_" % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.jailinmates_aspx(roster_row)

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
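
THIS_STATE and THIS_COUNTY are used here but defined nowhere in the snippet; they appear to be module-level constants set once per county script. A minimal sketch of that assumed preamble (the values are placeholders, not taken from the original scripts):

# Assumed per-county module constants; the names are inferred from usage and
# the values below are placeholders only.
THIS_STATE = 'alabama'    # lower-cased state expected in roster_row['State']
THIS_COUNTY = 'example'   # lower-cased county expected in roster_row['County']
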
Example #5
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row[
            'Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_',
                    urlAddress)  # Log the chosen URL

        ####################################

        # Begin core specific scraping code
        if (roster_row['State'].lower() != THIS_STATE
                or roster_row['County'].lower() != THIS_COUNTY):
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))

        # Open Browser
        browser.get(urlAddress)
        time.sleep(np.random.uniform(5, 10, 1))

        #Use elements like below to find xpath keys and click through
        #Click "SELECT AN INMATE"
        elem = browser.find_element_by_xpath('//*[@id="dropdownMenuButton"]')
        elem.click()
        time.sleep(np.random.uniform(2, 4, 1))

        #Extract the HTML#
        store_source = browser.page_source

        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)

        # End core specific scraping code

        ####################################

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #6
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row[
            'Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_',
                    urlAddress)  # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if (roster_row['State'].lower() != THIS_STATE
                or roster_row['County'].lower() != THIS_COUNTY):
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        browser.get(urlAddress)

        #Wait
        time.sleep(np.random.uniform(5, 10, 1))
        #Extract the HTML

        link = browser.find_element_by_partial_link_text("Inmate List")
        link.click()

        time.sleep(np.random.uniform(5, 10, 1))
        link = browser.find_element_by_partial_link_text("Jail Listing")
        link.click()

        pdf_data = browser.page_source

        ## Code to save a page and log appropriately
        save_to_s3(pdf_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #7
def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Here are standard variable values/how to initialize them.
        # These aren't initialized here since in the save_single_page
        # case, they can be done in the called function
        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row[
            'Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Set working link to _%s_',
                    urlAddress)  # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if (roster_row['State'].lower() != THIS_STATE
                or roster_row['County'].lower() != THIS_COUNTY):
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.save_single_page(
            roster_row)  # try to call a known crawler if possible
        browser.get(urlAddress)

        #Show all inmates instead of 6 per page
        time.sleep(np.random.uniform(5, 10, 1))
        show_all = browser.find_element_by_xpath('//*[@id="inmatesPerPage"]')
        show_all.send_keys('All')
        logger.info('clicked "All"')

        #Wait
        time.sleep(np.random.uniform(15, 20, 1))

        #Extract the HTML
        store_source = browser.page_source
        ## Code to save a page and log appropriately
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')

    except Exception as errorMessage:
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row)

        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #8
def main(roster_row):
    try:
        """
        Marion County Alabama, not Missouri
        
        OLD URL: http://mcsomo.com/current-inmates/
        NEW URL: http://www.marionsoal.com/roster.php
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'marion'
        suffix = '?grp={}'
        req = requests.get(urlAddress)

        #Extract the HTML
        store_source = req.content
        soup = BeautifulSoup(store_source, 'lxml')
        #Extract number of inmates:
        inmate_roster = int(
            re.sub(r"\D", "",
                   soup.find('span', {"class": "ptitles"}).text))
        #20 entries per page; get number of pages by dividing by 20 rounding up.
        num_pages = math.ceil(inmate_roster / 20)

        for page_index in range(0, num_pages):

            time.sleep(np.random.uniform(5, 10, 1))
            req = requests.get(urlAddress +
                               suffix.format((page_index + 1) * 20))
            store_source = req.content
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #9
def omsweb_crawler(roster_row):
    try:
        logger = get_logger(roster_row) # Get a standard logger
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        if 'omsweb' not in urlAddress:
            raise Exception("Appears that this site _%s_ is not a public safety web site" % urlAddress)
        logger.info('using omsweb_crawler for _%s, %s_', roster_row['County'], roster_row['State']) # Log the chosen URL
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        browser.get(urlAddress)  
        time.sleep(np.random.uniform(5,10,1))
        
        pages = []
        
        store_source = browser.page_source
        pages.append(store_source)

        finished = False
        
        while not finished:
            
            try:
                nextpage = browser.find_element_by_xpath('//*[@id="ext-gen110"]')
                nextpage.click()
                time.sleep(np.random.uniform(5,10,1))
                store_source = browser.page_source
                if store_source not in pages:
                    pages.append(store_source)
                else:
                    finished = True
                
            except Exception:
                #Pagination element missing or click failed: stop paging
                finished = True

        #Close the browser
        browser.close()

        for store_source, page_index in zip(pages, range(len(pages))):
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)

        logger.info('complete!')
    except Exception as errorMessage:
        try:
            try:
                page_index = len(pages)
            except:
                page_index = 0
            record_error(message=str(errorMessage), roster_row=roster_row, page_number_within_scrape=page_index, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        browser.close()
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
Example #10
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'washington'
        suffix = '?grp={}'
        req = requests.get(urlAddress)
        #Extract the HTML
        store_source = req.content
        soup = BeautifulSoup(store_source, 'lxml')
        #Extract number of inmates:
        inmate_roster = int(
            re.sub(r"\D", "",
                   soup.find('span', {"class": "ptitles"}).text))
        #10 entries per page; get number of pages by dividing by 10, rounding up.
        num_pages = math.ceil(inmate_roster / 10)
        for page in range(0, num_pages):

            page_index += 1
            time.sleep(np.random.uniform(5, 10, 1))
            req = requests.get(urlAddress + suffix.format((page + 1) * 10))
            store_source = req.content
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
        #Also save the base (unsuffixed) page under its own index
        page_index += 1
        req = requests.get(urlAddress)
        page_data = req.content
        save_to_s3(page_data, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #11
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 1
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code

        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'houston'

        time.sleep(np.random.uniform(5, 10, 1))
        dropdown = browser.find_element_by_xpath(
            '//*[@id="gvInmates_DXPagerBottom_PSI"]')
        dropdown.click()
        dropdown.send_keys(Keys.DOWN)
        time.sleep(np.random.uniform(5, 10, 1))

        dropdown = browser.find_element_by_xpath(
            '//*[@id="gvInmates_DXPagerBottom_PSI"]')
        dropdown.send_keys(Keys.DOWN)
        time.sleep(np.random.uniform(5, 10, 1))

        dropdown = browser.find_element_by_xpath(
            '//*[@id="gvInmates_DXPagerBottom_PSI"]')
        dropdown.send_keys(Keys.RETURN)
        time.sleep(np.random.uniform(5, 10, 1))

        #Extract the HTML
        store_source = browser.page_source

        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #12
def main(roster_row):
    try:
        """
        OLD URL: https://www.boonesheriff.com/mobile/roster.php
        UPDATED URL: https://www.boonesheriff.com/roster.php
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress) 

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'boone'
        suffix = '?grp={}'
        req = requests.get(urlAddress)
        #Extract the HTML
        store_source = req.content
        soup = BeautifulSoup(store_source, 'lxml')
        #Extract number of inmates:
        inmate_roster = int(re.sub(r"\D", "", soup.find('h2', {"class": "large-6 columns ptitles"}).text))
        #10 entries per page; get number of pages by dividing by 10, rounding up.
        num_pages = math.ceil(inmate_roster/10)
        #Mark the time the file is collected
        for page in range(0, num_pages):
            page_index += 1
            time.sleep(np.random.uniform(5,10,1))
            req = requests.get(urlAddress+suffix.format((page+1)*10))
            store_source = req.content
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
        
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #13
def main(roster_row):
    try:
        """
        PDF LINK IS OUT OF DATE
        
        OLD URL: http://www.yolocountysheriff.com/wp-content/uploads/2019/02/January-2019-webiste-upload.pdf
            
        NEW URL: https://www.yolocountysheriff.com/services/jail/booking-statistics/
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'yolo'

        req = requests.get(urlAddress)
        store_source = req.content

        soup = BeautifulSoup(store_source, 'lxml')
        link_to_pdf = soup.find('article', {'id': 'post-356'})
        pdf_url = link_to_pdf.find_all('a')[1]['href']

        time.sleep(np.random.uniform(5, 10, 1))
        req2 = requests.get(pdf_url)
        pdf_data = req2.content
        save_to_s3(pdf_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #14
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress) 

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'caldwell'
        time.sleep(np.random.uniform(5,10,1))
        
        """This agency is providing this roster of incarcerated offenders to
        the public and law enforcement in the interest of public safety. This
        information shall not be considered, or used as, a public document, or
        official document, and no other publication or copying of this
        information is allowed without the express written consent of this
        agency. Any unauthorized use of this information is forbidden and
        subject to criminal prosecution."""
        
        #Click I agree to terms
        elem = browser.find_element_by_xpath('//*[@id="OkButton"]')
        elem.click()
         
        #Wait
        time.sleep(np.random.uniform(10,15,1))
        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
        
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #15
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'st. tammany'
        pages = []
        finished = False
        #Pagination is done by number
        page_index = 1

        while not finished:
            try:
                nextpage = browser.find_element_by_link_text('›')
                nextpage.click()
                page_index += 1
                time.sleep(np.random.uniform(5, 10, 1))

                #Extract the HTML
                store_source = browser.page_source
                save_to_s3(store_source, page_index, roster_row)
                logger.info('Saved page _%s_', page_index)
            except Exception:
                #No further '›' link to click: pagination is finished
                finished = True
        ###
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #16
def main(roster_row):
    try:
        """
        If there are no inmates' names for a particular letter, the previous
        letter will be stored.
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0  # used only by the error handler below
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'shelby'
        pages = []
        letters = [
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
            'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
        ]
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        pages.append(store_source)
        for letter in letters:
            pagelink = browser.find_element_by_xpath(
                '//*[@id="btn_{}"]'.format(letter))
            pagelink.click()
            time.sleep(np.random.uniform(5, 10, 1))
            store_source = browser.page_source
            save_to_s3(store_source, letter, roster_row)
            logger.info('Saved page _%s_', letter)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #17
def roster_php(roster_row, num_per_page=20):
    try:
        logger = get_logger(roster_row) # Get a standard logger
        browser = get_browser() # Get a standard browser
        urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        if 'roster.php' not in urlAddress:
            raise Exception("Appears that this site _%s_ is not a roster.php website - using the wrong crawler" % urlAddress)
        page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        logger.info('Choosing roster_php crawler') # Name crawler
        logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        suffix = '?grp={}'
        browser.get(urlAddress) 
        #Wait for the roster page to load fully before parsing
        time.sleep(np.random.uniform(300,400,1))
        store_source = browser.page_source
        soup = BeautifulSoup(store_source, 'lxml')
        #Extract the inmate count from the page title, e.g. ">Inmate Roster (151)</h2>"
        try:
            inmate_roster = int(re.sub(r"\D", "", soup.find('span', {"class": "ptitles"}).text))
        except Exception:
            inmate_roster = int(re.sub(r"\D", "", soup.find('h2', {"class": "large-6 columns ptitles"}).text))
        #num_per_page entries per page; get number of pages by dividing and rounding up.
        num_pages = math.ceil(inmate_roster/num_per_page)
        pages = []

        for page in range(0, num_pages):
            
            time.sleep(np.random.uniform(5,10,1))
            url = urlAddress+suffix.format((page+1)*num_per_page)
            logger.info('getting url _%s_', url)
            browser.get(url)
            store_source = browser.page_source
            logger.info('Found page _%s_', page)
            pages.append(store_source)
        for store_source in pages:
            page_index += 1
            save_to_s3(store_source, page_index, roster_row)
            logger.info('Saved page _%s_', page_index)
    except Exception as errorMessage:
        try:
            record_error(message=str(errorMessage), roster_row=roster_row, page_number_within_scrape=page_index, browser=browser)
        except:
            record_error(message=str(errorMessage), roster_row=roster_row, browser=browser)
        browser.close()
        # Record error in S3 for a general error
        logger.error('Error: %s', errorMessage)
        # Log error
        sys.exit(1)
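
Like crawlers.jailinmates_aspx in Example #4, roster_php is intended as a reusable crawler for the roster.php family of sites. A hypothetical per-county wrapper that delegates to it might look like this (the 10-per-page setting is an assumption; pass whatever the target site actually uses):

# Hypothetical wrapper showing how a county script could delegate to roster_php.
def main(roster_row):
    logger = get_logger(roster_row)
    logger.info('Delegating to the shared roster.php crawler')
    roster_php(roster_row, num_per_page=10)  # assumed page size; not from the source
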
Example #18
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'norman'
        #Wait
        time.sleep(np.random.uniform(5, 10, 1))

        pdf_link = browser.find_element_by_xpath(
            "//*[contains(@href, 'Inmate_Roster')]")
        pdf_link.click()

        #new_tab = browser.find_element_by_xpath('/html/body/div/div/div/table/tbody/tr/td[2]/div/div[2]/div/a')
        #new_tab.click()

        #Wait
        time.sleep(np.random.uniform(5, 10, 1))

        pdf_data = browser.page_source
        save_to_s3(pdf_data, page_index, roster_row, filetype='pdf')
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #19
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        #browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'ouachita'
        suffix = '?id={}'

        letters = [
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
            'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
        ]
        pages = []

        for letter in letters:
            req = requests.get('http://' + urlAddress + suffix.format(letter))
            store_source = req.content
            pages.append(store_source)
            save_to_s3(store_source, letter, roster_row)
            logger.info('Saved page _%s_', letter)
            time.sleep(np.random.uniform(5, 10, 1))
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #20
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'amador'

        time.sleep(np.random.uniform(5, 10, 1))
        elem = browser.find_element_by_xpath('//*[@id="cmdCloseMessage"]')
        elem.click()

        time.sleep(np.random.uniform(5, 10, 1))
        inmates_link = browser.find_element_by_xpath('//*[@id="InmatesMenu"]')
        inmates_link.click()

        time.sleep(np.random.uniform(5, 10, 1))
        save_to_s3(browser.page_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #21
def main(roster_row):
    try:
        """
        OLD URL: http://www.calcoso.org/divisions-jail-inmate-roster/
        UPDATED URL: https://www.calcoso.org/inmate-roster/
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 1
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code

        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'clarke'

        req = requests.get(urlAddress)
        save_to_s3(req.content, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        store_source = req.content
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #22
def main(roster_row):
    try:
        """
        IFRAME SITE
        
        OLD URL: https://www.co.washington.ar.us/government/departments-f-z/sheriff/detention-information/detainee-roster-detailed
        NEW URL: https://www.so.washington.ar.us/res/DAlphaRoster.aspx
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress) 

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'washington'
        req = requests.get(urlAddress)
        store_source = req.content
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
        
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #23
def main(roster_row):
    try:
        """
        IFRAME SITE
        
        OLD URL: http://randolphcountyso.org/inmates.html
        NEW URL: http://randolphcountyso.org/cur_inmates.html
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress) 

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'alabama'
        assert roster_row['County'].lower() == 'randolph'
        req = requests.get(urlAddress)
        store_source = req.content
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()
        
    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #24
def main(roster_row):
    try:
        """
        Page has been moved.
        
        OLD URL: http://www.leecosheriff.com/Inmates/ICURRENT.HTM
        NEW URL: https://tcsi-roster.azurewebsites.net/default.aspx?i=26&code=Lee&type=roster
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'arkansas'
        assert roster_row['County'].lower() == 'lee'

        req = requests.get(urlAddress)
        store_source = req.content
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #25
def main(roster_row):
    try:
        """
        IFRAME SITE
        
        OLD URL: http://phillipscosheriff.com/Inmates
        NEW URL: http://phillips.pixelpowerhaus.net/
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'colorado'
        assert roster_row['County'].lower() == 'phillips'
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #26
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'otter tail'
        """
        OLD URL: http://www.co.otter-tail.mn.us/991/In-Custody-List
        NEW URL: https://www.ottertailcounty.us/sheriff/report/custody%20list.rpt.html
        
        """
        time.sleep(np.random.uniform(5, 10, 1))
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #27
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'california'
        assert roster_row['County'].lower() == 'kings'

        time.sleep(np.random.uniform(5, 10, 1))
        dropdown = browser.find_element_by_xpath(
            '//*[@id="DataTables_Table_0_length"]/label/select')
        dropdown.send_keys('All', Keys.RETURN)
        time.sleep(np.random.uniform(5, 10, 1))

        save_to_s3(browser.page_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #28
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'louisiana'
        assert roster_row['County'].lower() == 'iberville'
        time.sleep(np.random.uniform(5, 10, 1))
        search = browser.find_element_by_xpath(
            '//*[@id="right"]/center/form/table/tbody/tr[14]/td[2]/input')
        search.click()

        #Extract the HTML
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #29
def main(roster_row):
    try:
        """
        OLD URL: https://www.elbertcountysheriff.com/detention/inmateinfo/
        UPDATED URL: https://www.inmateinfo.net/inmateinfo.php?org=ecso
        
        """
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'colorado'
        assert roster_row['County'].lower() == 'elbert'
        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass
Example #30
def main(roster_row):
    try:
        logger = get_logger(roster_row)
        browser = get_browser()
        urlAddress = roster_row['Working Link']
        page_index = 0
        logger.info('Set working link to _%s_', urlAddress)
        #Boilerplate code setting up logger, getting initial URL

        #Given the urlAddress passed to the function we will navigate to the page
        browser.get(urlAddress)

        ##########
        # Begin core specific scraping code
        assert roster_row['State'].lower() == 'minnesota'
        assert roster_row['County'].lower() == 'crow wing'
        # NOTE: Looks like there's a site problem with a 404; see the FAQ:
        # https://www.crowwing.us/Faq.aspx?QID=297

        store_source = browser.page_source
        save_to_s3(store_source, page_index, roster_row)
        logger.info('Saved page _%s_', page_index)
        # End core specific scraping code
        ##########

        #Close the browser
        logger.info('complete!')
        browser.close()

    except Exception as errorMessage:
        browser.close()
        record_error(str(errorMessage), page_index, roster_row)
        # Record error in S3
        logger.error('Error: %s', errorMessage)
        # Log error
        pass