Example #1
import sys

import crawlers

def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Standard variable values and how to initialize them. They are
        # left commented out here because, in the save_single_page case,
        # they can be set up inside the called function.
        #browser = get_browser() # Get a standard browser
        #urlAddress = roster_row['Working Link'] # Set the main URL from the spreadsheet
        #page_index = 0 # Set an initial value of "page_index", which we will use to separate output pages
        #logger.info('Set working link to _%s_', urlAddress) # Log the chosen URL

        ##########
        # Begin core specific scraping code
        if (roster_row['State'].lower() != THIS_STATE
                or roster_row['County'].lower() != THIS_COUNTY):
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.roster_php(roster_row)  # Try to call a known crawler if possible
        # End core specific scraping code
        ##########

        # Log completion; no browser was opened here, so there is nothing to close
        logger.info('complete!')

    except Exception as errorMessage:
        # Record error in S3 for a general error
        record_error(message=str(errorMessage), roster_row=roster_row)
        # Log the error and exit with a failure code
        logger.error('Error: %s', errorMessage)
        sys.exit(1)
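
The helpers referenced above (`get_logger`, `record_error`, and `crawlers.roster_php`) belong to the surrounding scraper framework and are not shown in this example. A minimal sketch of stand-in stubs, assuming `get_logger` names a logger after the roster row and `record_error` ultimately writes to S3 (the stub below only prints; the module constants and their values are likewise assumptions):

import logging
import sys

# Hypothetical values; the real module defines these constants elsewhere.
THIS_STATE = 'kansas'
THIS_COUNTY = 'pratt'

def get_logger(roster_row):
    # Assumed behavior: one logger per county roster, at INFO level.
    logging.basicConfig(level=logging.INFO)
    name = '%s-%s' % (roster_row.get('State', '?'), roster_row.get('County', '?'))
    return logging.getLogger(name)

def record_error(message, roster_row, browser=None):
    # Stand-in only: the real helper presumably records the error in S3.
    print('ERROR for %s: %s' % (roster_row.get('County'), message), file=sys.stderr)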
Example #2
import sys

import crawlers
# The commented-out scraping code below additionally relies on:
#import math
#import re
#import time
#import numpy as np
#from bs4 import BeautifulSoup

def main(roster_row):
    try:
        logger = get_logger(roster_row)  # Get a standard logger

        # Standard variables and how to initialize them. In the
        # save_single_page case they could be set up inside the called
        # function, but here they are initialized directly.

        browser = get_browser()  # Get a standard browser
        urlAddress = roster_row['Working Link']  # Set the main URL from the spreadsheet
        page_index = 0  # Initial value of "page_index", used to separate output pages
        logger.info('Set working link to _%s_', urlAddress)  # Log the chosen URL

        ####################################

        # Begin core specific scraping code
        if (roster_row['State'].lower() != THIS_STATE
                or roster_row['County'].lower() != THIS_COUNTY):
            raise Exception(
                "Expected county definition info from _%s, %s_, but found info: _%s_"
                % (THIS_COUNTY, THIS_STATE, roster_row))
        crawlers.roster_php(roster_row)

        ## Open the browser and wait a polite, randomized interval
        #browser.get(urlAddress)
        #time.sleep(np.random.uniform(7, 10))

        ## Extract the HTML
        #store_source = browser.page_source

        ## Code to save the first page and log appropriately
        #save_to_s3(store_source, page_index, roster_row)
        #logger.info('Saved page _%s_', page_index)

        ## Finding the last page: the total inmate count is embedded in the
        ## "ptitles" span, e.g. "Inmate Roster (137)"
        #soup = BeautifulSoup(store_source, 'lxml')
        #page = 0
        #list_text = []
        #for link in soup.findAll("span", {"class": "ptitles"}):
        #    page = str(link.text)
        #    list_text.append(page)
        #page = list_text[0]
        #page = re.sub('Inmate Roster ', "", page)
        #page = re.sub(' ', "", page)
        #page = re.sub(r'\)', "", page)
        #page = re.sub(r'\(', "", page)
        #page = int(page) / 10   # ten inmates per page
        #page = math.ceil(page)

        ## Crawling through all the remaining pages
        #for i in range(2, page + 1):
        #    browser.get('https://www.prattcountysheriff.com/roster.php?grp='+str(i*10))
        #    time.sleep(np.random.uniform(7, 10))
        #    store_source = browser.page_source
        #    ## Code to save a page and log appropriately
        #    page_index = i - 1
        #    save_to_s3(store_source, page_index, roster_row)
        #    logger.info('Saved page _%s_', page_index)

        ## End core specific scraping code

        #####################################

        # Close the browser and log completion
        browser.close()
        logger.info('complete!')

    except Exception as errorMessage:
        # Record the error in S3, passing the browser along when one is open
        try:
            browser.close()
            record_error(message=str(errorMessage),
                         roster_row=roster_row,
                         browser=browser)
        except Exception:
            # Fall back to a general error record without the browser
            record_error(message=str(errorMessage), roster_row=roster_row)
        # Log the error and exit with a failure code
        logger.error('Error: %s', errorMessage)
        sys.exit(1)
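
For reference, the commented-out pagination above boils down to: read the total inmate count from the "ptitles" span (text like "Inmate Roster (137)"), divide by ten entries per page, then fetch each further page at a "?grp=" offset. A standalone sketch of that flow, assuming the same page structure and mirroring the original's i * 10 offsets, with requests and a single regex standing in for the Selenium browser and the chained re.sub calls:

import math
import random
import re
import time

import requests
from bs4 import BeautifulSoup

ROSTER_URL = 'https://www.prattcountysheriff.com/roster.php'

def iter_roster_pages(base_url=ROSTER_URL, per_page=10):
    """Yield (page_index, html) for each page of the roster."""
    html = requests.get(base_url).text
    yield 0, html

    # The total count sits in a span like: Inmate Roster (137)
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find('span', {'class': 'ptitles'})
    total = int(re.search(r'\((\d+)\)', title.text).group(1))
    last_page = math.ceil(total / per_page)

    # Remaining pages are addressed by a grp= offset, as in the original loop
    for i in range(2, last_page + 1):
        time.sleep(random.uniform(7, 10))  # polite randomized delay
        html = requests.get('%s?grp=%d' % (base_url, i * per_page)).text
        yield i - 1, html

Each yielded (page_index, html) pair would then go through save_to_s3 and the 'Saved page' log line, exactly as in the commented loop.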